Ignore columnar alignment requirement. (#4928)

* Better error message for wrong type. * Fix stride size.
2019-10-13 06:41:43 -04:00 · 2019-10-13 06:41:43 -04:00 · 3d46bd0fa5
commit 3d46bd0fa5
parent 05d4751540
7 changed files with 183 additions and 79 deletions
--- a/src/common/bitfield.h
+++ b/src/common/bitfield.h
@ -19,6 +19,7 @@
 #endif  // defined(__CUDACC__)

 #include "xgboost/span.h"
+#include "common.h"

 namespace xgboost {

@ -84,17 +85,11 @@ struct BitFieldContainer {
  XGBOOST_DEVICE BitFieldContainer(common::Span<value_type> bits) : bits_{bits} {}
  XGBOOST_DEVICE BitFieldContainer(BitFieldContainer const& other) : bits_{other.bits_} {}

+  /*\brief Compute the size of needed memory allocation.  The returned value is in terms
+   *       of number of elements with `BitFieldContainer::value_type'.
+   */
  static size_t ComputeStorageSize(size_t size) {
-    auto pos = ToBitPos(size);
-    if (size < kValueSize) {
-      return 1;
-    }
-
-    if (pos.bit_pos != 0) {
-      return pos.int_pos + 2;
-    } else {
-      return pos.int_pos + 1;
-    }
+    return common::DivRoundUp(size, kValueSize);
  }
 #if defined(__CUDA_ARCH__)
  __device__ BitFieldContainer& operator|=(BitFieldContainer const& rhs) {
@ -216,9 +211,9 @@ struct RBitsPolicy : public BitFieldContainer<VT, RBitsPolicy<VT>> {
  }
 };

-// Format: <Direction>BitField<size of underlying type>, underlying type must be unsigned.
+// Format: <Direction>BitField<size of underlying type in bits>, underlying type must be unsigned.
 using LBitField64 = BitFieldContainer<uint64_t, LBitsPolicy<uint64_t>>;
-using RBitField8 = BitFieldContainer<unsigned char, RBitsPolicy<unsigned char>>;
+using RBitField8 = BitFieldContainer<uint8_t, RBitsPolicy<unsigned char>>;

 #if defined(__CUDACC__)

--- a/src/data/columnar.h
+++ b/src/data/columnar.h
@ -35,7 +35,7 @@ struct ColumnarErrors {
    return "Memory should be contigious.";
  }
  static char const* TypestrFormat() {
-    return "`typestr` should be of format <endian><type><size>.";
+    return "`typestr' should be of format <endian><type><size of type in bytes>.";
  }
  // Not supported in Apache Arrow.
  static char const* BigEndian() {
@ -50,7 +50,7 @@ struct ColumnarErrors {
    return str.c_str();
  }
  static char const* Version() {
-    return "Only version 1 of __cuda_array_interface__ is being supported.";
+    return "Only version 1 of `__cuda_array_interface__' is supported.";
  }
  static char const* ofType(std::string const& type) {
    static std::string str;
@ -60,22 +60,6 @@ struct ColumnarErrors {
    str += " type.";
    return str.c_str();
  }
-  static std::string UnknownTypeStr(std::string const& typestr) {
-    return "typestr from array interface: " + typestr + " is not supported.";
-  }
-};
-
-// TODO(trivialfis): Abstract this into a class that accept a json
-// object and turn it into an array (for cupy and numba).
-class ArrayInterfaceHandler {
- public:
-  template <typename T>
-  static constexpr char TypeChar() {
-    return
-        (std::is_floating_point<T>::value ? 'f' :
-         (std::is_integral<T>::value ?
-          (std::is_signed<T>::value ? 'i' : 'u') : '\0'));
-  }

  static std::string TypeStr(char c) {
    switch (c) {
@ -89,12 +73,47 @@ class ArrayInterfaceHandler {
        return "Unsigned integer";
      case 'f':
        return "Floating point";
+      case 'c':
+        return "Complex floating point";
+      case 'm':
+        return "Timedelta";
+      case 'M':
+        return "Datetime";
+      case 'O':
+        return "Object";
+      case 'S':
+        return "String";
+      case 'U':
+        return "Unicode";
+      case 'V':
+        return "Other";
      default:
-        LOG(FATAL) << "Invalid type code: " << c << " in typestr of input array interface.";
+        LOG(FATAL) << "Invalid type code: " << c << " in `typestr' of input array."
+                   << "\nPlease verify the `__cuda_array_interface__' "
+                   << "of your input data complies to: "
+                   << "https://docs.scipy.org/doc/numpy/reference/arrays.interface.html"
+                   << "\nOr open an issue.";
        return "";
    }
  }

+  static std::string UnSupportedType(std::string const& typestr) {
+    return TypeStr(typestr.at(1)) + " is not supported.";
+  }
+};
+
+// TODO(trivialfis): Abstract this into a class that accept a json
+// object and turn it into an array (for cupy and numba).
+class ArrayInterfaceHandler {
+ public:
+  template <typename T>
+  static constexpr char TypeChar() {
+    return
+        (std::is_floating_point<T>::value ? 'f' :
+         (std::is_integral<T>::value ?
+          (std::is_signed<T>::value ? 'i' : 'u') : '\0'));
+  }
+
  template <typename PtrType>
  static PtrType GetPtrFromArrayData(std::map<std::string, Json> const& obj) {
    if (obj.find("data") == obj.cend()) {
@ -110,30 +129,30 @@ class ArrayInterfaceHandler {

  static void Validate(std::map<std::string, Json> const& array) {
    if (array.find("version") == array.cend()) {
-      LOG(FATAL) << "Missing version field for array interface";
+      LOG(FATAL) << "Missing `version' field for array interface";
    }
    auto version = get<Integer const>(array.at("version"));
    CHECK_EQ(version, 1) << ColumnarErrors::Version();

    if (array.find("typestr") == array.cend()) {
-      LOG(FATAL) << "Missing typestr field for array interface";
+      LOG(FATAL) << "Missing `typestr' field for array interface";
    }
    auto typestr = get<String const>(array.at("typestr"));
    CHECK_EQ(typestr.size(),    3) << ColumnarErrors::TypestrFormat();
    CHECK_NE(typestr.front(), '>') << ColumnarErrors::BigEndian();

    if (array.find("shape") == array.cend()) {
-      LOG(FATAL) << "Missing shape field for array interface";
+      LOG(FATAL) << "Missing `shape' field for array interface";
    }
    if (array.find("data") == array.cend()) {
-      LOG(FATAL) << "Missing data field for array interface";
+      LOG(FATAL) << "Missing `data' field for array interface";
    }
  }

  // Find null mask (validity mask) field
  // Mask object is also an array interface, but with different requirements.
-  static void ExtractMask(std::map<std::string, Json> const& column,
-                             common::Span<RBitField8::value_type>* p_out) {
+  static size_t ExtractMask(std::map<std::string, Json> const &column,
+                            common::Span<RBitField8::value_type> *p_out) {
    auto& s_mask = *p_out;
    if (column.find("mask") != column.cend()) {
      auto const& j_mask = get<Object const>(column.at("mask"));
@ -143,24 +162,42 @@ class ArrayInterfaceHandler {

      auto j_shape = get<Array const>(j_mask.at("shape"));
      CHECK_EQ(j_shape.size(), 1) << ColumnarErrors::Dimension(1);
-      CHECK_EQ(get<Integer>(j_shape.front()) % 8, 0) <<
-          "Length of validity mask must be a multiple of 8 bytes.";
-      int64_t size = get<Integer>(j_shape.at(0)) *
-                     sizeof(unsigned char) / sizeof(RBitField8::value_type);
      auto typestr = get<String const>(j_mask.at("typestr"));
+      // For now this is just 1, we can support different size of interger in mask.
+      int64_t const type_length = typestr.at(2) - 48;
+      /*
+       * shape represents how many bits is in the mask. (This is a grey area, don't be
+       * suprised if it suddently represents something else when supporting a new
+       * implementation).  Quoting from numpy array interface:
+       *
+       *   The shape of this object should be "broadcastable" to the shape of the original
+       *   array.
+       *
+       * And that's the only requirement.
+       */
+      int64_t const n_bits = get<Integer>(j_shape.at(0));
+      // The size of span required to cover all bits.  Here with 8 bits bitfield, we
+      // assume 1 byte alignment.
+      int64_t const span_size = RBitField8::ComputeStorageSize(n_bits);
+
+      if (j_mask.find("strides") != j_mask.cend()) {
+        auto strides = get<Array const>(column.at("strides"));
+        CHECK_EQ(strides.size(),                        1) << ColumnarErrors::Dimension(1);
+        CHECK_EQ(get<Integer>(strides.at(0)), type_length) << ColumnarErrors::Contigious();
+      }

      if (typestr.at(1) == 't') {
-        CHECK_EQ(typestr.at(2),   '1') << "There can be only 1 bit in each entry of bitfield.";
+        CHECK_EQ(typestr.at(2), '1') << "mask with bitfield type should be of 1 byte per bitfield.";
      } else if (typestr.at(1) == 'i') {
-        CHECK_EQ(typestr.at(2),   '1') << "mask with integer type should be of 1 byte per integer.";
+        CHECK_EQ(typestr.at(2), '1') << "mask with integer type should be of 1 byte per integer.";
      } else {
        LOG(FATAL) << "mask must be of integer type or bit field type.";
      }

-      // For now this is just 1
-      int64_t const type_length = typestr.at(2) - 48;
-      s_mask = {p_mask, size / type_length};
+      s_mask = {p_mask, span_size};
+      return n_bits;
    }
+    return 0;
  }

  template <typename T>
@ -178,8 +215,8 @@ class ArrayInterfaceHandler {

    if (column.find("strides") != column.cend()) {
      auto strides = get<Array const>(column.at("strides"));
-      CHECK_EQ(strides.size(), 1)              << ColumnarErrors::Dimension(1);
-      CHECK_EQ(get<Integer>(strides.at(0)), 4) << ColumnarErrors::Contigious();
+      CHECK_EQ(strides.size(),         1)              << ColumnarErrors::Dimension(1);
+      CHECK_EQ(get<Integer>(strides.at(0)), sizeof(T)) << ColumnarErrors::Contigious();
    }

    auto length = get<Integer const>(j_shape.at(0));
@ -197,15 +234,22 @@ class ArrayInterfaceHandler {
    foreign_col.size  = s_data.size();

    common::Span<RBitField8::value_type> s_mask;
-    ArrayInterfaceHandler::ExtractMask(column, &s_mask);
+    size_t n_bits = ArrayInterfaceHandler::ExtractMask(column, &s_mask);

    foreign_col.valid = RBitField8(s_mask);

+    if (s_mask.data()) {
+      CHECK_EQ(n_bits, foreign_col.data.size())
+          << "Shape of bit mask doesn't match data shape. "
+          << "XGBoost doesn't support internal broadcasting.";
+    }
+
    return foreign_col;
  }
 };

 #define DISPATCH_TYPE(__dispatched_func, __typestr, ...) {              \
+    CHECK_EQ(__typestr.size(), 3) << ColumnarErrors::TypestrFormat();   \
    if (__typestr.at(1) == 'f' && __typestr.at(2) == '4') {             \
      __dispatched_func<float>(__VA_ARGS__);                            \
    } else if (__typestr.at(1) == 'f' && __typestr.at(2) == '8') {      \
@ -227,7 +271,7 @@ class ArrayInterfaceHandler {
    } else if (__typestr.at(1) == 'u' && __typestr.at(2) == '8') {      \
      __dispatched_func<uint64_t>(__VA_ARGS__);                         \
    } else {                                                            \
-      LOG(FATAL) << ColumnarErrors::UnknownTypeStr(__typestr);          \
+      LOG(FATAL) << ColumnarErrors::UnSupportedType(__typestr);         \
    }                                                                   \
  }

--- a/src/data/simple_csr_source.cu
+++ b/src/data/simple_csr_source.cu
@ -186,7 +186,8 @@ void SimpleCSRSource::FromDeviceColumnar(std::vector<Json> const& columns,
  // one copy seems easier.
  this->info.num_nonzero_ = tmp_offset.back();

-  int device = this->page_.offset.DeviceIdx();
+  // Device is obtained and set in `CountValid'
+  int32_t const device = this->page_.offset.DeviceIdx();
  this->page_.data.SetDevice(device);
  this->page_.data.Resize(this->info.num_nonzero_);
  auto s_data = this->page_.data.DeviceSpan();
--- a/src/data/simple_csr_source.h
+++ b/src/data/simple_csr_source.h
@ -16,9 +16,10 @@
 #include <vector>
 #include <limits>

-#include "columnar.h"
-
 namespace xgboost {
+
+class Json;
+
 namespace data {
 /*!
 * \brief The simplest form of data holder, can be used to create DMatrix.
--- a/tests/cpp/common/test_bitfield.cu
+++ b/tests/cpp/common/test_bitfield.cu
@ -17,6 +17,16 @@ __global__ void TestSetKernel(LBitField64 bits) {
  }
 }

+TEST(BitField, StorageSize) {
+  size_t constexpr kElements { 16 };
+  size_t size = LBitField64::ComputeStorageSize(kElements);
+  ASSERT_EQ(1, size);
+  size = RBitField8::ComputeStorageSize(4);
+  ASSERT_EQ(1, size);
+  size = RBitField8::ComputeStorageSize(kElements);
+  ASSERT_EQ(2, size);
+}
+
 TEST(BitField, GPU_Set) {
  dh::device_vector<LBitField64::value_type> storage;
  uint32_t constexpr kBits = 128;
--- a/tests/cpp/data/test_simple_csr_source.cu
+++ b/tests/cpp/data/test_simple_csr_source.cu
@ -38,6 +38,15 @@ TEST(ArrayInterfaceHandler, Error) {
            Json(Boolean(false))};
  column["data"] = j_data;
  EXPECT_NO_THROW(ArrayInterfaceHandler::ExtractArray<float>(column_obj));
+
+  std::vector<Json> j_mask_shape {Json(Integer(static_cast<Integer::Int>(kRows - 1)))};
+  column["mask"] = Object();
+  column["mask"]["shape"] = j_mask_shape;
+  column["mask"]["data"] = j_data;
+  column["mask"]["typestr"] = String("<i1");
+  column["mask"]["version"] = Integer(static_cast<Integer::Int>(1));
+  // shape of mask and data doesn't match.
+  EXPECT_THROW(ArrayInterfaceHandler::ExtractArray<float>(column_obj), dmlc::Error);
 }

 template <typename T>
@ -47,7 +56,7 @@ Json GenerateDenseColumn(std::string const& typestr, size_t kRows,
  Json column { Object() };
  std::vector<Json> j_shape {Json(Integer(static_cast<Integer::Int>(kRows)))};
  column["shape"] = Array(j_shape);
-  column["strides"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(4)))});
+  column["strides"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(sizeof(T))))});

  d_data.resize(kRows);
  for (size_t i = 0; i < d_data.size(); ++i) {
@ -66,6 +75,29 @@ Json GenerateDenseColumn(std::string const& typestr, size_t kRows,
  return column;
 }

+void TestDenseColumn(std::unique_ptr<data::SimpleCSRSource> const& source,
+                     size_t n_rows, size_t n_cols) {
+  auto const& data = source->page_.data.HostVector();
+  auto const& offset = source->page_.offset.HostVector();
+
+  for (size_t i = 0; i < n_rows; i++) {
+    auto const idx = i * n_cols;
+    auto const e_0 = data.at(idx);
+    ASSERT_NEAR(e_0.fvalue, i * 2.0, kRtEps) << "idx: " << idx;
+    ASSERT_EQ(e_0.index, 0);  // feature 0
+
+    auto e_1 = data.at(idx+1);
+    ASSERT_NEAR(e_1.fvalue, i * 2.0, kRtEps);
+    ASSERT_EQ(e_1.index, 1);  // feature 1
+  }
+  ASSERT_EQ(offset.back(), n_rows * n_cols);
+  for (size_t i = 0; i < n_rows + 1; ++i) {
+    ASSERT_EQ(offset[i], i * n_cols);
+  }
+  ASSERT_EQ(source->info.num_row_, n_rows);
+  ASSERT_EQ(source->info.num_col_, n_cols);
+}
+
 TEST(SimpleCSRSource, FromColumnarDense) {
  constexpr size_t kRows {16};
  constexpr size_t kCols {2};
@ -85,25 +117,7 @@ TEST(SimpleCSRSource, FromColumnarDense) {
  {
    std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
    source->CopyFrom(str.c_str(), false);
-
-    auto const& data = source->page_.data.HostVector();
-    auto const& offset = source->page_.offset.HostVector();
-    for (size_t i = 0; i < kRows; i++) {
-      auto const idx = i * kCols;
-      auto const e_0 = data.at(idx);
-      ASSERT_NEAR(e_0.fvalue, i * 2.0, kRtEps) << "idx: " << idx;
-      ASSERT_EQ(e_0.index, 0);  // feature 0
-
-      auto e_1 = data.at(idx+1);
-      ASSERT_NEAR(e_1.fvalue, i * 2.0, kRtEps);
-      ASSERT_EQ(e_1.index, 1);  // feature 1
-    }
-    ASSERT_EQ(offset.back(), kRows * kCols);
-    for (size_t i = 0; i < kRows + 1; ++i) {
-      ASSERT_EQ(offset[i], i * kCols);
-    }
-    ASSERT_EQ(source->info.num_row_, kRows);
-    ASSERT_EQ(source->info.num_col_, kCols);
+    TestDenseColumn(source, kRows, kCols);
  }

  // with missing value specified
@ -145,9 +159,9 @@ TEST(SimpleCSRSource, FromColumnarWithEmptyRows) {

  std::vector<Json> v_columns (kCols);
  std::vector<dh::device_vector<float>> columns_data(kCols);
-  std::vector<dh::device_vector<unsigned char>> column_bitfields(kCols);
+  std::vector<dh::device_vector<RBitField8::value_type>> column_bitfields(kCols);

-  unsigned char constexpr kUCOne = 1;
+  RBitField8::value_type constexpr kUCOne = 1;

  for (size_t i = 0; i < kCols; ++i) {
    auto& col = v_columns[i];
@ -193,7 +207,7 @@ TEST(SimpleCSRSource, FromColumnarWithEmptyRows) {
    j_mask["data"] = std::vector<Json>{
      Json(Integer(reinterpret_cast<Integer::Int>(mask_storage.data().get()))),
      Json(Boolean(false))};
-    j_mask["shape"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(16)))});
+    j_mask["shape"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(kRows)))});
    j_mask["typestr"] = String("|i1");
  }

@ -220,10 +234,10 @@ TEST(SimpleCSRSource, FromColumnarWithEmptyRows) {
 TEST(SimpleCSRSource, FromColumnarSparse) {
  constexpr size_t kRows = 32;
  constexpr size_t kCols = 2;
-  unsigned char constexpr kUCOne = 1;
+  RBitField8::value_type constexpr kUCOne = 1;

  std::vector<dh::device_vector<float>> columns_data(kCols);
-  std::vector<dh::device_vector<unsigned char>> column_bitfields(kCols);
+  std::vector<dh::device_vector<RBitField8::value_type>> column_bitfields(kCols);

  {
    // column 0
@ -278,7 +292,7 @@ TEST(SimpleCSRSource, FromColumnarSparse) {
    j_mask["data"] = std::vector<Json>{
      Json(Integer(reinterpret_cast<Integer::Int>(column_bitfields[c].data().get()))),
      Json(Boolean(false))};
-    j_mask["shape"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(8)))});
+    j_mask["shape"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(kRows)))});
    j_mask["typestr"] = String("|i1");
  }

@ -348,4 +362,26 @@ TEST(SimpleCSRSource, FromColumnarSparse) {
  }
 }

+TEST(SimpleCSRSource, Types) {
+  // Test with different types of different size
+  constexpr size_t kRows {16};
+  constexpr size_t kCols {2};
+  std::vector<Json> columns;
+  thrust::device_vector<double> d_data_0(kRows);
+  thrust::device_vector<uint32_t> d_data_1(kRows);
+
+  columns.emplace_back(GenerateDenseColumn<double>("<f8", kRows, &d_data_0));
+  columns.emplace_back(GenerateDenseColumn<uint32_t>("<u4", kRows, &d_data_1));
+
+  Json column_arr {columns};
+
+  std::stringstream ss;
+  Json::Dump(column_arr, &ss);
+  std::string str = ss.str();
+
+  std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
+  source->CopyFrom(str.c_str(), false);
+  TestDenseColumn(source, kRows, kCols);
+}
+
 }  // namespace xgboost
--- a/tests/python-gpu/test_from_columnar.py
+++ b/tests/python-gpu/test_from_columnar.py
@ -69,3 +69,20 @@ Arrow specification.'''

        with pytest.raises(Exception):
            dtrain = xgb.DMatrix(cd, label=cd)
+
+        # Test when number of elements is less than 8
+        X = cudf.DataFrame({'x': cudf.Series([0, 1, 2, np.NAN, 4],
+                                             dtype=np.int32)})
+        dtrain = xgb.DMatrix(X)
+        assert dtrain.num_col() == 1
+        assert dtrain.num_row() == 5
+
+        # Boolean is not supported.
+        X_boolean = cudf.DataFrame({'x': cudf.Series([True, False])})
+        with pytest.raises(Exception):
+            dtrain = xgb.DMatrix(X_boolean)
+
+        y_boolean = cudf.DataFrame({
+            'x': cudf.Series([True, False, True, True, True])})
+        with pytest.raises(Exception):
+            dtrain = xgb.DMatrix(X_boolean, label=y_boolean)