Ignore columnar alignment requirement. (#4928)

* Better error message for wrong type.
* Fix stride size.
This commit is contained in:
Jiaming Yuan 2019-10-13 06:41:43 -04:00 committed by GitHub
parent 05d4751540
commit 3d46bd0fa5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 183 additions and 79 deletions

View File

@ -19,6 +19,7 @@
#endif // defined(__CUDACC__)
#include "xgboost/span.h"
#include "common.h"
namespace xgboost {
@ -84,17 +85,11 @@ struct BitFieldContainer {
XGBOOST_DEVICE BitFieldContainer(common::Span<value_type> bits) : bits_{bits} {}
XGBOOST_DEVICE BitFieldContainer(BitFieldContainer const& other) : bits_{other.bits_} {}
/*!
 * \brief Compute the size of needed memory allocation.  The returned value is in
 *        terms of number of elements with `BitFieldContainer::value_type'.
 *
 * \param size Number of bits the container has to cover.
 *
 * \return `common::DivRoundUp(size, kValueSize)`, i.e. `size' divided by
 *         `kValueSize' (presumably the number of bits held by one `value_type'
 *         element -- confirm against its definition) and rounded up.
 */
static size_t ComputeStorageSize(size_t size) {
  // A single rounded-up division.  The earlier position-based computation
  // returned on every path, which left this statement unreachable.
  return common::DivRoundUp(size, kValueSize);
}
#if defined(__CUDA_ARCH__)
__device__ BitFieldContainer& operator|=(BitFieldContainer const& rhs) {
@ -216,9 +211,9 @@ struct RBitsPolicy : public BitFieldContainer<VT, RBitsPolicy<VT>> {
}
};
// Format: <Direction>BitField<size of underlying type>, underlying type must be unsigned.
// Format: <Direction>BitField<size of underlying type in bits>, underlying type must be unsigned.
using LBitField64 = BitFieldContainer<uint64_t, LBitsPolicy<uint64_t>>;
using RBitField8 = BitFieldContainer<unsigned char, RBitsPolicy<unsigned char>>;
using RBitField8 = BitFieldContainer<uint8_t, RBitsPolicy<unsigned char>>;
#if defined(__CUDACC__)

View File

@ -35,7 +35,7 @@ struct ColumnarErrors {
return "Memory should be contigious.";
}
/*!
 * \brief Error message used when `typestr' does not have the expected format.
 *
 * \return Pointer to a string literal; valid for the lifetime of the program.
 */
static char const* TypestrFormat() {
  // State explicitly that the trailing <size> component is counted in bytes.
  return "`typestr' should be of format <endian><type><size of type in bytes>.";
}
// Not supported in Apache Arrow.
static char const* BigEndian() {
@ -50,7 +50,7 @@ struct ColumnarErrors {
return str.c_str();
}
/*!
 * \brief Error message used when the `version' field of the array interface is
 *        not the supported one.
 *
 * \return Pointer to a string literal; valid for the lifetime of the program.
 */
static char const* Version() {
  return "Only version 1 of `__cuda_array_interface__' is supported.";
}
static char const* ofType(std::string const& type) {
static std::string str;
@ -60,22 +60,6 @@ struct ColumnarErrors {
str += " type.";
return str.c_str();
}
/*!
 * \brief Build the message reporting that a `typestr' taken from an array
 *        interface is not supported.
 *
 * \param typestr The offending type string, embedded verbatim in the message.
 */
static std::string UnknownTypeStr(std::string const& typestr) {
  std::string msg = "typestr from array interface: ";
  msg += typestr;
  msg += " is not supported.";
  return msg;
}
};
// TODO(trivialfis): Abstract this into a class that accept a json
// object and turn it into an array (for cupy and numba).
class ArrayInterfaceHandler {
public:
/*!
 * \brief Map a C++ type to a one-character type code.
 *
 * \tparam T Type to classify.
 *
 * \return 'f' for floating point types, 'i' for signed integral types, 'u' for
 *         unsigned integral types and '\0' for everything else.
 */
template <typename T>
static constexpr char TypeChar() {
  // Kept as a single return expression so the function remains a valid C++11
  // constexpr function.
  return std::is_floating_point<T>::value ? 'f'
       : !std::is_integral<T>::value      ? '\0'
       : std::is_signed<T>::value         ? 'i'
                                          : 'u';
}
static std::string TypeStr(char c) {
switch (c) {
@ -89,12 +73,47 @@ class ArrayInterfaceHandler {
return "Unsigned integer";
case 'f':
return "Floating point";
case 'c':
return "Complex floating point";
case 'm':
return "Timedelta";
case 'M':
return "Datetime";
case 'O':
return "Object";
case 'S':
return "String";
case 'U':
return "Unicode";
case 'V':
return "Other";
default:
LOG(FATAL) << "Invalid type code: " << c << " in typestr of input array interface.";
LOG(FATAL) << "Invalid type code: " << c << " in `typestr' of input array."
<< "\nPlease verify the `__cuda_array_interface__' "
<< "of your input data complies to: "
<< "https://docs.scipy.org/doc/numpy/reference/arrays.interface.html"
<< "\nOr open an issue.";
return "";
}
}
/*!
 * \brief Build the message reporting that the type named by `typestr' is not
 *        supported.
 *
 * \param typestr Type string; its second character is passed to `TypeStr' to
 *                obtain the readable type name.  `std::string::at' throws
 *                `std::out_of_range' when the string has fewer than two
 *                characters.
 */
static std::string UnSupportedType(std::string const& typestr) {
  char const type_code = typestr.at(1);
  std::string msg = TypeStr(type_code);
  msg += " is not supported.";
  return msg;
}
};
// TODO(trivialfis): Abstract this into a class that accept a json
// object and turn it into an array (for cupy and numba).
class ArrayInterfaceHandler {
public:
/*!
 * \brief Map a C++ type to a one-character type code.
 *
 * \tparam T Type to classify.
 *
 * \return 'f' for floating point types, 'i' for signed integral types, 'u' for
 *         unsigned integral types and '\0' for everything else.
 */
template <typename T>
static constexpr char TypeChar() {
  // Kept as a single return expression so the function remains a valid C++11
  // constexpr function.
  return std::is_floating_point<T>::value ? 'f'
       : !std::is_integral<T>::value      ? '\0'
       : std::is_signed<T>::value         ? 'i'
                                          : 'u';
}
template <typename PtrType>
static PtrType GetPtrFromArrayData(std::map<std::string, Json> const& obj) {
if (obj.find("data") == obj.cend()) {
@ -110,30 +129,30 @@ class ArrayInterfaceHandler {
/*!
 * \brief Check that an array interface object carries the fields required here.
 *
 * Verifies, in order: `version' is present and equal to 1; `typestr' is
 * present, has exactly 3 characters and does not start with '>' (big endian);
 * `shape' and `data' are present.  Any violation is reported through
 * `LOG(FATAL)' / `CHECK_*'.
 *
 * \param array Key/value view of the array interface object.
 */
static void Validate(std::map<std::string, Json> const& array) {
  // Each missing-field branch used to carry two consecutive fatal logs; only
  // the first could ever be reported (assuming LOG(FATAL) does not return), so
  // a single message with the field name quoted is kept.
  if (array.find("version") == array.cend()) {
    LOG(FATAL) << "Missing `version' field for array interface";
  }
  auto version = get<Integer const>(array.at("version"));
  CHECK_EQ(version, 1) << ColumnarErrors::Version();

  if (array.find("typestr") == array.cend()) {
    LOG(FATAL) << "Missing `typestr' field for array interface";
  }
  auto typestr = get<String const>(array.at("typestr"));
  CHECK_EQ(typestr.size(), 3) << ColumnarErrors::TypestrFormat();
  CHECK_NE(typestr.front(), '>') << ColumnarErrors::BigEndian();

  if (array.find("shape") == array.cend()) {
    LOG(FATAL) << "Missing `shape' field for array interface";
  }
  if (array.find("data") == array.cend()) {
    LOG(FATAL) << "Missing `data' field for array interface";
  }
}
// Find null mask (validity mask) field
// Mask object is also an array interface, but with different requirements.
static void ExtractMask(std::map<std::string, Json> const& column,
common::Span<RBitField8::value_type>* p_out) {
static size_t ExtractMask(std::map<std::string, Json> const &column,
common::Span<RBitField8::value_type> *p_out) {
auto& s_mask = *p_out;
if (column.find("mask") != column.cend()) {
auto const& j_mask = get<Object const>(column.at("mask"));
@ -143,24 +162,42 @@ class ArrayInterfaceHandler {
auto j_shape = get<Array const>(j_mask.at("shape"));
CHECK_EQ(j_shape.size(), 1) << ColumnarErrors::Dimension(1);
CHECK_EQ(get<Integer>(j_shape.front()) % 8, 0) <<
"Length of validity mask must be a multiple of 8 bytes.";
int64_t size = get<Integer>(j_shape.at(0)) *
sizeof(unsigned char) / sizeof(RBitField8::value_type);
auto typestr = get<String const>(j_mask.at("typestr"));
// For now this is just 1, we can support different size of integer in mask.
int64_t const type_length = typestr.at(2) - 48;
/*
 * shape represents how many bits are in the mask. (This is a grey area, don't be
 * surprised if it suddenly represents something else when supporting a new
 * implementation). Quoting from numpy array interface:
 *
 * The shape of this object should be "broadcastable" to the shape of the original
 * array.
 *
 * And that's the only requirement.
 */
int64_t const n_bits = get<Integer>(j_shape.at(0));
// The size of span required to cover all bits. Here with 8 bits bitfield, we
// assume 1 byte alignment.
int64_t const span_size = RBitField8::ComputeStorageSize(n_bits);
if (j_mask.find("strides") != j_mask.cend()) {
auto strides = get<Array const>(column.at("strides"));
CHECK_EQ(strides.size(), 1) << ColumnarErrors::Dimension(1);
CHECK_EQ(get<Integer>(strides.at(0)), type_length) << ColumnarErrors::Contigious();
}
if (typestr.at(1) == 't') {
CHECK_EQ(typestr.at(2), '1') << "There can be only 1 bit in each entry of bitfield.";
CHECK_EQ(typestr.at(2), '1') << "mask with bitfield type should be of 1 byte per bitfield.";
} else if (typestr.at(1) == 'i') {
CHECK_EQ(typestr.at(2), '1') << "mask with integer type should be of 1 byte per integer.";
CHECK_EQ(typestr.at(2), '1') << "mask with integer type should be of 1 byte per integer.";
} else {
LOG(FATAL) << "mask must be of integer type or bit field type.";
}
// For now this is just 1
int64_t const type_length = typestr.at(2) - 48;
s_mask = {p_mask, size / type_length};
s_mask = {p_mask, span_size};
return n_bits;
}
return 0;
}
template <typename T>
@ -178,8 +215,8 @@ class ArrayInterfaceHandler {
if (column.find("strides") != column.cend()) {
auto strides = get<Array const>(column.at("strides"));
CHECK_EQ(strides.size(), 1) << ColumnarErrors::Dimension(1);
CHECK_EQ(get<Integer>(strides.at(0)), 4) << ColumnarErrors::Contigious();
CHECK_EQ(strides.size(), 1) << ColumnarErrors::Dimension(1);
CHECK_EQ(get<Integer>(strides.at(0)), sizeof(T)) << ColumnarErrors::Contigious();
}
auto length = get<Integer const>(j_shape.at(0));
@ -197,15 +234,22 @@ class ArrayInterfaceHandler {
foreign_col.size = s_data.size();
common::Span<RBitField8::value_type> s_mask;
ArrayInterfaceHandler::ExtractMask(column, &s_mask);
size_t n_bits = ArrayInterfaceHandler::ExtractMask(column, &s_mask);
foreign_col.valid = RBitField8(s_mask);
if (s_mask.data()) {
CHECK_EQ(n_bits, foreign_col.data.size())
<< "Shape of bit mask doesn't match data shape. "
<< "XGBoost doesn't support internal broadcasting.";
}
return foreign_col;
}
};
#define DISPATCH_TYPE(__dispatched_func, __typestr, ...) { \
CHECK_EQ(__typestr.size(), 3) << ColumnarErrors::TypestrFormat(); \
if (__typestr.at(1) == 'f' && __typestr.at(2) == '4') { \
__dispatched_func<float>(__VA_ARGS__); \
} else if (__typestr.at(1) == 'f' && __typestr.at(2) == '8') { \
@ -227,7 +271,7 @@ class ArrayInterfaceHandler {
} else if (__typestr.at(1) == 'u' && __typestr.at(2) == '8') { \
__dispatched_func<uint64_t>(__VA_ARGS__); \
} else { \
LOG(FATAL) << ColumnarErrors::UnknownTypeStr(__typestr); \
LOG(FATAL) << ColumnarErrors::UnSupportedType(__typestr); \
} \
}

View File

@ -186,7 +186,8 @@ void SimpleCSRSource::FromDeviceColumnar(std::vector<Json> const& columns,
// one copy seems easier.
this->info.num_nonzero_ = tmp_offset.back();
int device = this->page_.offset.DeviceIdx();
// Device is obtained and set in `CountValid'
int32_t const device = this->page_.offset.DeviceIdx();
this->page_.data.SetDevice(device);
this->page_.data.Resize(this->info.num_nonzero_);
auto s_data = this->page_.data.DeviceSpan();

View File

@ -16,9 +16,10 @@
#include <vector>
#include <limits>
#include "columnar.h"
namespace xgboost {
class Json;
namespace data {
/*!
* \brief The simplest form of data holder, can be used to create DMatrix.

View File

@ -17,6 +17,16 @@ __global__ void TestSetKernel(LBitField64 bits) {
}
}
// ComputeStorageSize reports its result in elements of the underlying type.
TEST(BitField, StorageSize) {
  size_t constexpr kBits { 16 };
  // 16 bits are expected to fit in one element of the uint64_t-backed field.
  ASSERT_EQ(1, LBitField64::ComputeStorageSize(kBits));
  // 4 bits are expected to need one element of the 8-bit field.
  ASSERT_EQ(1, RBitField8::ComputeStorageSize(4));
  // 16 bits are expected to need two elements of the 8-bit field.
  ASSERT_EQ(2, RBitField8::ComputeStorageSize(kBits));
}
TEST(BitField, GPU_Set) {
dh::device_vector<LBitField64::value_type> storage;
uint32_t constexpr kBits = 128;

View File

@ -38,6 +38,15 @@ TEST(ArrayInterfaceHandler, Error) {
Json(Boolean(false))};
column["data"] = j_data;
EXPECT_NO_THROW(ArrayInterfaceHandler::ExtractArray<float>(column_obj));
std::vector<Json> j_mask_shape {Json(Integer(static_cast<Integer::Int>(kRows - 1)))};
column["mask"] = Object();
column["mask"]["shape"] = j_mask_shape;
column["mask"]["data"] = j_data;
column["mask"]["typestr"] = String("<i1");
column["mask"]["version"] = Integer(static_cast<Integer::Int>(1));
// shape of mask and data doesn't match.
EXPECT_THROW(ArrayInterfaceHandler::ExtractArray<float>(column_obj), dmlc::Error);
}
template <typename T>
@ -47,7 +56,7 @@ Json GenerateDenseColumn(std::string const& typestr, size_t kRows,
Json column { Object() };
std::vector<Json> j_shape {Json(Integer(static_cast<Integer::Int>(kRows)))};
column["shape"] = Array(j_shape);
column["strides"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(4)))});
column["strides"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(sizeof(T))))});
d_data.resize(kRows);
for (size_t i = 0; i < d_data.size(); ++i) {
@ -66,6 +75,29 @@ Json GenerateDenseColumn(std::string const& typestr, size_t kRows,
return column;
}
/*!
 * \brief Check that `source' holds the dense matrix the columnar tests expect.
 *
 * For every row the first two stored entries must belong to features 0 and 1
 * and both carry the value `row * 2.0` (within `kRtEps').  The row offsets
 * must advance by `n_cols' per row, and the recorded number of rows and
 * columns must match the arguments.
 *
 * \param source Data source under test.
 * \param n_rows Expected number of rows.
 * \param n_cols Expected number of columns.
 */
void TestDenseColumn(std::unique_ptr<data::SimpleCSRSource> const& source,
                     size_t n_rows, size_t n_cols) {
  auto const& entries = source->page_.data.HostVector();
  auto const& row_ptr = source->page_.offset.HostVector();

  for (size_t row = 0; row < n_rows; ++row) {
    auto const first = row * n_cols;

    auto const feat_0 = entries.at(first);
    ASSERT_NEAR(feat_0.fvalue, row * 2.0, kRtEps) << "idx: " << first;
    ASSERT_EQ(feat_0.index, 0);  // feature 0

    auto feat_1 = entries.at(first + 1);
    ASSERT_NEAR(feat_1.fvalue, row * 2.0, kRtEps);
    ASSERT_EQ(feat_1.index, 1);  // feature 1
  }

  // The last offset equals the total number of stored entries.
  ASSERT_EQ(row_ptr.back(), n_rows * n_cols);
  size_t const n_offsets = n_rows + 1;
  for (size_t k = 0; k < n_offsets; ++k) {
    ASSERT_EQ(row_ptr[k], k * n_cols);
  }

  ASSERT_EQ(source->info.num_row_, n_rows);
  ASSERT_EQ(source->info.num_col_, n_cols);
}
TEST(SimpleCSRSource, FromColumnarDense) {
constexpr size_t kRows {16};
constexpr size_t kCols {2};
@ -85,25 +117,7 @@ TEST(SimpleCSRSource, FromColumnarDense) {
{
std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
source->CopyFrom(str.c_str(), false);
auto const& data = source->page_.data.HostVector();
auto const& offset = source->page_.offset.HostVector();
for (size_t i = 0; i < kRows; i++) {
auto const idx = i * kCols;
auto const e_0 = data.at(idx);
ASSERT_NEAR(e_0.fvalue, i * 2.0, kRtEps) << "idx: " << idx;
ASSERT_EQ(e_0.index, 0); // feature 0
auto e_1 = data.at(idx+1);
ASSERT_NEAR(e_1.fvalue, i * 2.0, kRtEps);
ASSERT_EQ(e_1.index, 1); // feature 1
}
ASSERT_EQ(offset.back(), kRows * kCols);
for (size_t i = 0; i < kRows + 1; ++i) {
ASSERT_EQ(offset[i], i * kCols);
}
ASSERT_EQ(source->info.num_row_, kRows);
ASSERT_EQ(source->info.num_col_, kCols);
TestDenseColumn(source, kRows, kCols);
}
// with missing value specified
@ -145,9 +159,9 @@ TEST(SimpleCSRSource, FromColumnarWithEmptyRows) {
std::vector<Json> v_columns (kCols);
std::vector<dh::device_vector<float>> columns_data(kCols);
std::vector<dh::device_vector<unsigned char>> column_bitfields(kCols);
std::vector<dh::device_vector<RBitField8::value_type>> column_bitfields(kCols);
unsigned char constexpr kUCOne = 1;
RBitField8::value_type constexpr kUCOne = 1;
for (size_t i = 0; i < kCols; ++i) {
auto& col = v_columns[i];
@ -193,7 +207,7 @@ TEST(SimpleCSRSource, FromColumnarWithEmptyRows) {
j_mask["data"] = std::vector<Json>{
Json(Integer(reinterpret_cast<Integer::Int>(mask_storage.data().get()))),
Json(Boolean(false))};
j_mask["shape"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(16)))});
j_mask["shape"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(kRows)))});
j_mask["typestr"] = String("|i1");
}
@ -220,10 +234,10 @@ TEST(SimpleCSRSource, FromColumnarWithEmptyRows) {
TEST(SimpleCSRSource, FromColumnarSparse) {
constexpr size_t kRows = 32;
constexpr size_t kCols = 2;
unsigned char constexpr kUCOne = 1;
RBitField8::value_type constexpr kUCOne = 1;
std::vector<dh::device_vector<float>> columns_data(kCols);
std::vector<dh::device_vector<unsigned char>> column_bitfields(kCols);
std::vector<dh::device_vector<RBitField8::value_type>> column_bitfields(kCols);
{
// column 0
@ -278,7 +292,7 @@ TEST(SimpleCSRSource, FromColumnarSparse) {
j_mask["data"] = std::vector<Json>{
Json(Integer(reinterpret_cast<Integer::Int>(column_bitfields[c].data().get()))),
Json(Boolean(false))};
j_mask["shape"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(8)))});
j_mask["shape"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(kRows)))});
j_mask["typestr"] = String("|i1");
}
@ -348,4 +362,26 @@ TEST(SimpleCSRSource, FromColumnarSparse) {
}
}
// Columns whose element types differ in kind and width (double and uint32_t)
// go through the same columnar input path and the same dense checks.
TEST(SimpleCSRSource, Types) {
  constexpr size_t kRows {16};
  constexpr size_t kCols {2};

  std::vector<Json> columns;
  thrust::device_vector<double> d_f8(kRows);
  thrust::device_vector<uint32_t> d_u4(kRows);
  columns.emplace_back(GenerateDenseColumn<double>("<f8", kRows, &d_f8));
  columns.emplace_back(GenerateDenseColumn<uint32_t>("<u4", kRows, &d_u4));
  Json column_arr {columns};

  // Serialise the column descriptions to the JSON text consumed by CopyFrom.
  std::stringstream ss;
  Json::Dump(column_arr, &ss);
  std::string const interfaces_str = ss.str();

  std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
  source->CopyFrom(interfaces_str.c_str(), false);
  TestDenseColumn(source, kRows, kCols);
}
} // namespace xgboost

View File

@ -69,3 +69,20 @@ Arrow specification.'''
with pytest.raises(Exception):
dtrain = xgb.DMatrix(cd, label=cd)
# Test when number of elements is less than 8
X = cudf.DataFrame({'x': cudf.Series([0, 1, 2, np.NAN, 4],
dtype=np.int32)})
dtrain = xgb.DMatrix(X)
assert dtrain.num_col() == 1
assert dtrain.num_row() == 5
# Boolean is not supported.
X_boolean = cudf.DataFrame({'x': cudf.Series([True, False])})
with pytest.raises(Exception):
dtrain = xgb.DMatrix(X_boolean)
y_boolean = cudf.DataFrame({
'x': cudf.Series([True, False, True, True, True])})
with pytest.raises(Exception):
dtrain = xgb.DMatrix(X_boolean, label=y_boolean)