Support building SimpleDMatrix from Arrow data format (#7512)

* Integrate with Arrow C data API. * Support Arrow dataset. * Support Arrow table. Co-authored-by: Xiaochang Wu <xiaochang.wu@intel.com> Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com> Co-authored-by: Zhang Zhang <zhang.zhang@intel.com>
2022-03-14 22:25:19 -07:00
parent 6b6849b001
commit 613ec36c5a
14 changed files with 732 additions and 10 deletions
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -416,6 +416,27 @@ XGB_DLL int XGDMatrixCreateFromDT(void** data, const char** feature_stypes,
  API_END();
 }

+XGB_DLL int XGImportArrowRecordBatch(DataIterHandle data_handle, void *ptr_array,
+                                     void *ptr_schema) {
+  API_BEGIN();
+  static_cast<data::RecordBatchesIterAdapter *>(data_handle)
+      ->SetData(static_cast<struct ArrowArray *>(ptr_array),
+                static_cast<struct ArrowSchema *>(ptr_schema));
+  API_END();
+}
+
+XGB_DLL int XGDMatrixCreateFromArrowCallback(XGDMatrixCallbackNext *next, char const *json_config,
+                                             DMatrixHandle *out) {
+  API_BEGIN();
+  auto config = Json::Load(StringView{json_config});
+  auto missing = GetMissing(config);
+  int32_t n_threads = get<Integer const>(config["nthread"]);
+  n_threads = common::OmpGetNumThreads(n_threads);
+  data::RecordBatchesIterAdapter adapter(next, n_threads);
+  *out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, n_threads));
+  API_END();
+}
+
 XGB_DLL int XGDMatrixSliceDMatrix(DMatrixHandle handle,
                                  const int* idxset,
                                  xgboost::bst_ulong len,
--- a/src/data/adapter.h
+++ b/src/data/adapter.h
@@ -13,6 +13,8 @@
 #include <string>
 #include <utility>
 #include <vector>
+#include <map>
+#include <algorithm>

 #include "xgboost/logging.h"
 #include "xgboost/base.h"
@@ -22,6 +24,7 @@
 #include "array_interface.h"
 #include "../c_api/c_api_error.h"
 #include "../common/math.h"
+#include "arrow-cdi.h"

 namespace xgboost {
 namespace data {
@@ -676,11 +679,10 @@ class FileAdapter : dmlc::DataIter<FileAdapterBatch> {
 template <typename DataIterHandle, typename XGBCallbackDataIterNext, typename XGBoostBatchCSR>
 class IteratorAdapter : public dmlc::DataIter<FileAdapterBatch> {
 public:
-  IteratorAdapter(DataIterHandle data_handle,
-                  XGBCallbackDataIterNext* next_callback)
-      :  columns_{data::kAdapterUnknownSize}, row_offset_{0},
-         at_first_(true),
-         data_handle_(data_handle), next_callback_(next_callback) {}
+  IteratorAdapter(DataIterHandle data_handle, XGBCallbackDataIterNext* next_callback)
+      : columns_{data::kAdapterUnknownSize},
+        data_handle_(data_handle),
+        next_callback_(next_callback) {}

  // override functions
  void BeforeFirst() override {
@@ -766,9 +768,9 @@ class IteratorAdapter : public dmlc::DataIter<FileAdapterBatch> {
  std::vector<dmlc::real_t> value_;

  size_t columns_;
-  size_t row_offset_;
+  size_t row_offset_{0};
  // at the beginning.
-  bool at_first_;
+  bool at_first_{true};
  // handle to the iterator,
  DataIterHandle data_handle_;
  // call back to get the data.
@@ -777,6 +779,358 @@ class IteratorAdapter : public dmlc::DataIter<FileAdapterBatch> {
  dmlc::RowBlock<uint32_t> block_;
  std::unique_ptr<FileAdapterBatch> batch_;
 };
+
+enum ColumnDType : uint8_t {
+  kUnknown,
+  kInt8,
+  kUInt8,
+  kInt16,
+  kUInt16,
+  kInt32,
+  kUInt32,
+  kInt64,
+  kUInt64,
+  kFloat,
+  kDouble
+};
+
+class Column {
+ public:
+  Column() = default;
+
+  Column(size_t col_idx, size_t length, size_t null_count, const uint8_t* bitmap)
+    : col_idx_{col_idx}, length_{length}, null_count_{null_count}, bitmap_{bitmap} {}
+
+  virtual ~Column() = default;
+
+  Column(const Column&) = delete;
+  Column& operator=(const Column&) = delete;
+  Column(Column&&) = delete;
+  Column& operator=(Column&&) = delete;
+
+  // whether the valid bit is set for this element
+  bool IsValid(size_t row_idx) const {
+    return (!bitmap_ || (bitmap_[row_idx/8] & (1 << (row_idx%8))));
+  }
+
+  virtual COOTuple GetElement(size_t row_idx) const = 0;
+
+  virtual bool IsValidElement(size_t row_idx) const = 0;
+
+  virtual std::vector<float> AsFloatVector() const = 0;
+
+  virtual std::vector<uint64_t> AsUint64Vector() const = 0;
+
+  size_t Length() const { return length_; }
+
+ protected:
+  size_t col_idx_;
+  size_t length_;
+  size_t null_count_;
+  const uint8_t* bitmap_;
+};
+
+// Only columns of primitive types are supported. An ArrowColumnarBatch is a
+// collection of std::shared_ptr<PrimitiveColumn>. These columns can be of different data types.
+// Hence, PrimitiveColumn is a class template; and all concrete PrimitiveColumns
+// derive from the abstract class Column.
+template <typename T>
+class PrimitiveColumn : public Column {
+  static constexpr float kNaN = std::numeric_limits<float>::quiet_NaN();
+
+ public:
+  PrimitiveColumn(size_t idx, size_t length, size_t null_count,
+                  const uint8_t* bitmap, const T* data, float missing)
+    : Column{idx, length, null_count, bitmap}, data_{data}, missing_{missing} {}
+
+  COOTuple GetElement(size_t row_idx) const override {
+    CHECK(data_ && row_idx < length_) << "Column is empty or out-of-bound index of the column";
+    return { row_idx, col_idx_, IsValidElement(row_idx) ?
+                  static_cast<float>(data_[row_idx]) : kNaN };
+  }
+
+  bool IsValidElement(size_t row_idx) const override {
+    // std::isfinite needs to cast to double to prevent msvc report error
+    return IsValid(row_idx)
+            && std::isfinite(static_cast<double>(data_[row_idx]))
+            && static_cast<float>(data_[row_idx]) != missing_;
+  }
+
+  std::vector<float> AsFloatVector() const override {
+    CHECK(data_) << "Column is empty";
+    std::vector<float> fv(length_);
+    std::transform(data_, data_ + length_, fv.begin(),
+        [](T v) { return static_cast<float>(v); });
+    return fv;
+  }
+
+  std::vector<uint64_t> AsUint64Vector() const override {
+    CHECK(data_) << "Column is empty";
+    std::vector<uint64_t> iv(length_);
+    std::transform(data_, data_ + length_, iv.begin(),
+        [](T v) { return static_cast<uint64_t>(v); });
+    return iv;
+  }
+
+ private:
+  const T* data_;
+  float missing_;  // user specified missing value
+};
+
+struct ColumnarMetaInfo {
+  // data type of the column
+  ColumnDType type{ColumnDType::kUnknown};
+  // location of the column in an Arrow record batch
+  int64_t loc{-1};
+};
+
+struct ArrowSchemaImporter {
+  std::vector<ColumnarMetaInfo> columns;
+
+  // map Arrow format strings to types
+  static ColumnDType FormatMap(char const* format_str) {
+    CHECK(format_str) << "Format string cannot be empty";
+    switch (format_str[0]) {
+      case 'c':
+        return ColumnDType::kInt8;
+      case 'C':
+        return ColumnDType::kUInt8;
+      case 's':
+        return ColumnDType::kInt16;
+      case 'S':
+        return ColumnDType::kUInt16;
+      case 'i':
+        return ColumnDType::kInt32;
+      case 'I':
+        return ColumnDType::kUInt32;
+      case 'l':
+        return ColumnDType::kInt64;
+      case 'L':
+        return ColumnDType::kUInt64;
+      case 'f':
+        return ColumnDType::kFloat;
+      case 'g':
+        return ColumnDType::kDouble;
+      default:
+        CHECK(false) << "Column data type not supported by XGBoost";
+        return ColumnDType::kUnknown;
+    }
+  }
+
+  void Import(struct ArrowSchema *schema) {
+    if (schema) {
+      CHECK(std::string(schema->format) == "+s"); // NOLINT
+      CHECK(columns.empty());
+      for (auto i = 0; i < schema->n_children; ++i) {
+        std::string name{schema->children[i]->name};
+        ColumnDType type = FormatMap(schema->children[i]->format);
+        ColumnarMetaInfo col_info{type, i};
+        columns.push_back(col_info);
+      }
+      if (schema->release) {
+        schema->release(schema);
+      }
+    }
+  }
+};
+
+class ArrowColumnarBatch {
+ public:
+  ArrowColumnarBatch(struct ArrowArray *rb, struct ArrowSchemaImporter* schema)
+    : rb_{rb}, schema_{schema} {
+    CHECK(rb_) << "Cannot import non-existent record batch";
+    CHECK(!schema_->columns.empty()) << "Cannot import record batch without a schema";
+  }
+
+  size_t Import(float missing) {
+    auto& infov = schema_->columns;
+    for (size_t i = 0; i < infov.size(); ++i) {
+      columns_.push_back(CreateColumn(i, infov[i], missing));
+    }
+
+    // Compute the starting location for every row in this batch
+    auto batch_size = rb_->length;
+    auto num_columns = columns_.size();
+    row_offsets_.resize(batch_size + 1, 0);
+    for (auto i = 0; i < batch_size; ++i) {
+      row_offsets_[i+1] = row_offsets_[i];
+      for (size_t j = 0; j < num_columns; ++j) {
+        if (GetColumn(j).IsValidElement(i)) {
+          row_offsets_[i+1]++;
+        }
+      }
+    }
+    // return number of elements in the batch
+    return row_offsets_.back();
+  }
+
+  ArrowColumnarBatch(const ArrowColumnarBatch&) = delete;
+  ArrowColumnarBatch& operator=(const ArrowColumnarBatch&) = delete;
+  ArrowColumnarBatch(ArrowColumnarBatch&&) = delete;
+  ArrowColumnarBatch& operator=(ArrowColumnarBatch&&) = delete;
+
+  virtual ~ArrowColumnarBatch() {
+    if (rb_ && rb_->release) {
+      rb_->release(rb_);
+      rb_ = nullptr;
+    }
+    columns_.clear();
+  }
+
+  size_t Size() const { return rb_ ? rb_->length : 0; }
+
+  size_t NumColumns() const { return columns_.size(); }
+
+  size_t NumElements() const { return row_offsets_.back(); }
+
+  const Column& GetColumn(size_t col_idx) const {
+    return *columns_[col_idx];
+  }
+
+  void ShiftRowOffsets(size_t batch_offset) {
+    std::transform(row_offsets_.begin(), row_offsets_.end(), row_offsets_.begin(),
+        [=](size_t c) { return c + batch_offset; });
+  }
+
+  const std::vector<size_t>& RowOffsets() const { return row_offsets_; }
+
+ private:
+  std::shared_ptr<Column> CreateColumn(size_t idx,
+                                      ColumnarMetaInfo info,
+                                      float missing) const {
+    if (info.loc < 0) {
+      return nullptr;
+    }
+
+    auto loc_in_batch = info.loc;
+    auto length = rb_->length;
+    auto null_count = rb_->null_count;
+    auto buffers0 = rb_->children[loc_in_batch]->buffers[0];
+    auto buffers1 = rb_->children[loc_in_batch]->buffers[1];
+    const uint8_t* bitmap = buffers0 ? reinterpret_cast<const uint8_t*>(buffers0) : nullptr;
+    const uint8_t* data = buffers1 ? reinterpret_cast<const uint8_t*>(buffers1) : nullptr;
+
+    // if null_count is not computed, compute it here
+    if (null_count < 0) {
+      if (!bitmap) {
+        null_count = 0;
+      } else {
+        null_count = length;
+        for (auto i = 0; i < length; ++i) {
+          if (bitmap[i/8] & (1 << (i%8))) {
+            null_count--;
+          }
+        }
+      }
+    }
+
+    switch (info.type) {
+      case ColumnDType::kInt8:
+        return std::make_shared<PrimitiveColumn<int8_t>>(
+            idx, length, null_count, bitmap,
+            reinterpret_cast<const int8_t*>(data), missing);
+      case ColumnDType::kUInt8:
+        return std::make_shared<PrimitiveColumn<uint8_t>>(
+            idx, length, null_count, bitmap, data, missing);
+      case ColumnDType::kInt16:
+        return std::make_shared<PrimitiveColumn<int16_t>>(
+            idx, length, null_count, bitmap,
+            reinterpret_cast<const int16_t*>(data), missing);
+      case ColumnDType::kUInt16:
+        return std::make_shared<PrimitiveColumn<uint16_t>>(
+            idx, length, null_count, bitmap,
+            reinterpret_cast<const uint16_t*>(data), missing);
+      case ColumnDType::kInt32:
+        return std::make_shared<PrimitiveColumn<int32_t>>(
+            idx, length, null_count, bitmap,
+            reinterpret_cast<const int32_t*>(data), missing);
+      case ColumnDType::kUInt32:
+        return std::make_shared<PrimitiveColumn<uint32_t>>(
+            idx, length, null_count, bitmap,
+            reinterpret_cast<const uint32_t*>(data), missing);
+      case ColumnDType::kInt64:
+        return std::make_shared<PrimitiveColumn<int64_t>>(
+            idx, length, null_count, bitmap,
+            reinterpret_cast<const int64_t*>(data), missing);
+      case ColumnDType::kUInt64:
+        return std::make_shared<PrimitiveColumn<uint64_t>>(
+            idx, length, null_count, bitmap,
+            reinterpret_cast<const uint64_t*>(data), missing);
+      case ColumnDType::kFloat:
+        return std::make_shared<PrimitiveColumn<float>>(
+            idx, length, null_count, bitmap,
+            reinterpret_cast<const float*>(data), missing);
+      case ColumnDType::kDouble:
+        return std::make_shared<PrimitiveColumn<double>>(
+            idx, length, null_count, bitmap,
+            reinterpret_cast<const double*>(data), missing);
+      default:
+        return nullptr;
+    }
+  }
+
+  struct ArrowArray* rb_;
+  struct ArrowSchemaImporter* schema_;
+  std::vector<std::shared_ptr<Column>> columns_;
+  std::vector<size_t> row_offsets_;
+};
+
+using ArrowColumnarBatchVec = std::vector<std::unique_ptr<ArrowColumnarBatch>>;
+class RecordBatchesIterAdapter: public dmlc::DataIter<ArrowColumnarBatchVec> {
+ public:
+  RecordBatchesIterAdapter(XGDMatrixCallbackNext *next_callback,
+                          int nthread)
+    : next_callback_{next_callback},
+      nbatches_{nthread} {}
+
+  void BeforeFirst() override {
+    CHECK(at_first_) << "Cannot reset RecordBatchesIterAdapter";
+  }
+
+  bool Next() override {
+    batches_.clear();
+    while (batches_.size() < static_cast<size_t>(nbatches_) && (*next_callback_)(this) != 0) {
+      at_first_ = false;
+    }
+
+    if (batches_.size() > 0) {
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  void SetData(struct ArrowArray* rb, struct ArrowSchema* schema) {
+    // Schema is only imported once at the beginning, regardless how many
+    // baches are comming.
+    // But even schema is not imported we still need to release its C data
+    // exported from Arrow.
+    if (at_first_ && schema) {
+      schema_.Import(schema);
+    } else {
+      if (schema && schema->release) {
+        schema->release(schema);
+      }
+    }
+    if (rb) {
+      batches_.push_back(std::make_unique<ArrowColumnarBatch>(rb, &schema_));
+    }
+  }
+
+  const ArrowColumnarBatchVec& Value() const override {
+    return batches_;
+  }
+
+  size_t NumColumns() const { return schema_.columns.size(); }
+  size_t NumRows() const { return kAdapterUnknownSize; }
+
+ private:
+  XGDMatrixCallbackNext *next_callback_;
+  bool at_first_{true};
+  int nbatches_;
+  struct ArrowSchemaImporter schema_;
+  ArrowColumnarBatchVec batches_;
+};
 };  // namespace data
 }  // namespace xgboost
 #endif  // XGBOOST_DATA_ADAPTER_H_
--- a/src/data/arrow-cdi.h
+++ b/src/data/arrow-cdi.h
@@ -0,0 +1,66 @@
+/* Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ARROW_FLAG_DICTIONARY_ORDERED 1
+#define ARROW_FLAG_NULLABLE 2
+#define ARROW_FLAG_MAP_KEYS_SORTED 4
+
+struct ArrowSchema {
+  // Array type description
+  const char* format;
+  const char* name;
+  const char* metadata;
+  int64_t flags;
+  int64_t n_children;
+  struct ArrowSchema** children;
+  struct ArrowSchema* dictionary;
+
+  // Release callback
+  void (*release)(struct ArrowSchema*);
+  // Opaque producer-specific data
+  void* private_data;
+};
+
+struct ArrowArray {
+  // Array data description
+  int64_t length;
+  int64_t null_count;
+  int64_t offset;
+  int64_t n_buffers;
+  int64_t n_children;
+  const void** buffers;
+  struct ArrowArray** children;
+  struct ArrowArray* dictionary;
+
+  // Release callback
+  void (*release)(struct ArrowArray*);
+  // Opaque producer-specific data
+  void* private_data;
+};
+
+#ifdef __cplusplus
+}
+#endif
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -1000,6 +1000,8 @@ template DMatrix *
 DMatrix::Create(data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext,
                                      XGBoostBatchCSR> *adapter,
                float missing, int nthread, const std::string &cache_prefix);
+template DMatrix* DMatrix::Create<data::RecordBatchesIterAdapter>(
+    data::RecordBatchesIterAdapter* adapter, float missing, int nthread, const std::string&);

 SparsePage SparsePage::GetTranspose(int num_columns, int32_t n_threads) const {
  SparsePage transpose;
--- a/src/data/simple_dmatrix.cc
+++ b/src/data/simple_dmatrix.cc
@@ -249,5 +249,70 @@ template SimpleDMatrix::SimpleDMatrix(
    IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>
        *adapter,
    float missing, int nthread);
+
+template <>
+SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, int nthread) {
+  auto& offset_vec = sparse_page_->offset.HostVector();
+  auto& data_vec = sparse_page_->data.HostVector();
+  uint64_t total_batch_size = 0;
+  uint64_t total_elements = 0;
+
+  adapter->BeforeFirst();
+  // Iterate over batches of input data
+  while (adapter->Next()) {
+    auto& batches = adapter->Value();
+    size_t num_elements = 0;
+    size_t num_rows = 0;
+    // Import Arrow RecordBatches
+#pragma omp parallel for reduction(+ : num_elements, num_rows) num_threads(nthread)
+    for (int i = 0; i < static_cast<int>(batches.size()); ++i) {  // NOLINT
+      num_elements += batches[i]->Import(missing);
+      num_rows += batches[i]->Size();
+    }
+    total_elements += num_elements;
+    total_batch_size += num_rows;
+    // Compute global offset for every row and starting row for every batch
+    std::vector<uint64_t> batch_offsets(batches.size());
+    for (size_t i = 0; i < batches.size(); ++i) {
+      if (i == 0) {
+        batch_offsets[i] = total_batch_size - num_rows;
+        batches[i]->ShiftRowOffsets(total_elements - num_elements);
+      } else {
+        batch_offsets[i] = batch_offsets[i - 1] + batches[i - 1]->Size();
+        batches[i]->ShiftRowOffsets(batches[i - 1]->RowOffsets().back());
+      }
+    }
+    // Pre-allocate DMatrix memory
+    data_vec.resize(total_elements);
+    offset_vec.resize(total_batch_size + 1);
+    // Copy data into DMatrix
+#pragma omp parallel num_threads(nthread)
+    {
+#pragma omp for nowait
+      for (int i = 0; i < static_cast<int>(batches.size()); ++i) {  // NOLINT
+        size_t begin = batches[i]->RowOffsets()[0];
+        for (size_t k = 0; k < batches[i]->Size(); ++k) {
+          for (size_t j = 0; j < batches[i]->NumColumns(); ++j) {
+            auto element = batches[i]->GetColumn(j).GetElement(k);
+            if (!std::isnan(element.value)) {
+              data_vec[begin++] = Entry(element.column_idx, element.value);
+            }
+          }
+        }
+      }
+#pragma omp for nowait
+      for (int i = 0; i < static_cast<int>(batches.size()); ++i) {
+        auto& offsets = batches[i]->RowOffsets();
+        std::copy(offsets.begin() + 1, offsets.end(), offset_vec.begin() + batch_offsets[i] + 1);
+      }
+    }
+  }
+  // Synchronise worker columns
+  info_.num_col_ = adapter->NumColumns();
+  rabit::Allreduce<rabit::op::Max>(&info_.num_col_, 1);
+  info_.num_row_ = total_batch_size;
+  info_.num_nonzero_ = data_vec.size();
+  CHECK_EQ(offset_vec.back(), info_.num_nonzero_);
+}
 }  // namespace data
 }  // namespace xgboost