/*!
 * Copyright (c) 2019~2021 by Contributors
 * \file adapter.h
 */
#ifndef XGBOOST_DATA_ADAPTER_H_
#define XGBOOST_DATA_ADAPTER_H_
#include <dmlc/data.h>

#include <algorithm>
#include <cstddef>  // for size_t
#include <functional>
#include <limits>
#include <map>
#include <memory>
#include <string>
#include <utility>  // std::move
#include <vector>

#include "../c_api/c_api_error.h"
#include "../common/error_msg.h"  // for MaxFeatureSize
#include "../common/math.h"
#include "array_interface.h"
#include "arrow-cdi.h"
#include "xgboost/base.h"
#include "xgboost/data.h"
#include "xgboost/logging.h"
#include "xgboost/span.h"
#include "xgboost/string_view.h"

namespace xgboost {
namespace data {

/** External data formats should implement an adapter as below. The
 * adapter provides uniform access to data outside xgboost, allowing
 * construction of DMatrix objects from a range of sources without duplicating
 * code.
 *
 * The adapter object is an iterator that returns batches of data. Each batch
 * contains a number of "lines". A line represents a set of elements from a
 * sparse input matrix, normally a row in the case of a CSR matrix or a column
 * for a CSC matrix. Typically in sparse matrix formats we can efficiently
 * access subsets of elements at a time, but cannot efficiently look up elements
 * by random access, hence the "line" abstraction, allowing the sparse matrix to
 * return subsets of elements efficiently. Individual elements are described by
 * a COO tuple (row index, column index, value).
 *
 * This abstraction allows us to read through different sparse matrix formats
 * using the same interface. In particular we can write a DMatrix constructor
 * that uses the same code to construct itself from a CSR matrix, CSC matrix,
 * dense matrix, CSV, LIBSVM file, or potentially other formats. To see why this
 * is necessary, imagine we have 5 external matrix formats and 5 internal
 * DMatrix types where each DMatrix needs a custom constructor for each possible
 * input. The number of constructors is 5*5=25. Using an abstraction over the
 * input data types, the number of constructors is reduced to 5, as each DMatrix
 * is oblivious to the external data format. Adding a new input source is simply
 * a case of implementing an adapter.
 *
 * Most of the adapters below do not need more than one batch as the data
 * originates from an in-memory source. The file adapter does require batches to
 * avoid loading the entire file into memory.
 *
 * An important detail is empty row/column handling. Files loaded from disk do
 * not provide meta information about the number of rows/columns to expect; this
 * needs to be inferred during construction. Other sparse formats may specify a
 * number of rows/columns, but we can encounter entirely sparse rows or columns,
 * leading to disagreement between the inferred number and the meta-info
 * provided. To resolve this, adapters have methods specifying the number of
 * rows/columns expected; these methods may return zero where these values must
 * be inferred from data. A constructed DMatrix should agree with the input
 * source on the number of rows/columns, appending empty rows if necessary.
 * */

/** \brief An adapter can return this value for number of rows or columns
 * indicating that this value is currently unknown and should be inferred while
 * passing over the data. */
constexpr size_t kAdapterUnknownSize = std::numeric_limits<size_t>::max();
struct COOTuple {
  COOTuple() = default;
  XGBOOST_DEVICE COOTuple(size_t row_idx, size_t column_idx, float value)
      : row_idx(row_idx), column_idx(column_idx), value(value) {}

  size_t row_idx{0};
  size_t column_idx{0};
  float value{0};
};

struct IsValidFunctor {
  float missing;

  XGBOOST_DEVICE explicit IsValidFunctor(float missing) : missing(missing) {}

  XGBOOST_DEVICE bool operator()(float value) const {
    return !(common::CheckNAN(value) || value == missing);
  }

  XGBOOST_DEVICE bool operator()(const data::COOTuple& e) const {
    return !(common::CheckNAN(e.value) || e.value == missing);
  }

  XGBOOST_DEVICE bool operator()(const Entry& e) const {
    return !(common::CheckNAN(e.fvalue) || e.fvalue == missing);
  }
};
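
/**
 * Editorial sketch (not part of the original header): how a consumer can walk
 * any of the adapters below through the batch/line/element interface described
 * above. Only `IsValidFunctor` and the adapter methods defined in this file are
 * used; `CountValidElements` itself is a hypothetical helper for illustration.
 *
 * \code{.cc}
 *   template <typename Adapter>
 *   std::size_t CountValidElements(Adapter* adapter, float missing) {
 *     IsValidFunctor is_valid{missing};
 *     std::size_t n_valid = 0;
 *     adapter->BeforeFirst();
 *     while (adapter->Next()) {                       // one batch per call
 *       auto const& batch = adapter->Value();
 *       for (std::size_t i = 0; i < batch.Size(); ++i) {
 *         auto line = batch.GetLine(i);               // a row (or a column)
 *         for (std::size_t j = 0; j < line.Size(); ++j) {
 *           n_valid += is_valid(line.GetElement(j));  // COO tuple check
 *         }
 *       }
 *     }
 *     return n_valid;
 *   }
 * \endcode
 */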

namespace detail {

/**
 * \brief Simplifies the use of DataIter when there is only one batch.
 */
template <typename DType>
class SingleBatchDataIter : dmlc::DataIter<DType> {
 public:
  void BeforeFirst() override { counter_ = 0; }
  bool Next() override {
    if (counter_ == 0) {
      counter_++;
      return true;
    }
    return false;
  }

 private:
  int counter_{0};
};

/** \brief Indicates this data source cannot contain meta-info such as labels,
 * weights or qid. */
class NoMetaInfo {
 public:
  const float* Labels() const { return nullptr; }
  const float* Weights() const { return nullptr; }
  const uint64_t* Qid() const { return nullptr; }
  const float* BaseMargin() const { return nullptr; }
};

}  // namespace detail
class CSRAdapterBatch : public detail::NoMetaInfo {
 public:
  class Line {
   public:
    Line(size_t row_idx, size_t size, const unsigned* feature_idx,
         const float* values)
        : row_idx_(row_idx),
          size_(size),
          feature_idx_(feature_idx),
          values_(values) {}

    size_t Size() const { return size_; }
    COOTuple GetElement(size_t idx) const {
      return COOTuple{row_idx_, feature_idx_[idx], values_[idx]};
    }

   private:
    size_t row_idx_;
    size_t size_;
    const unsigned* feature_idx_;
    const float* values_;
  };
  CSRAdapterBatch(const size_t* row_ptr, const unsigned* feature_idx,
                  const float* values, size_t num_rows, size_t, size_t)
      : row_ptr_(row_ptr),
        feature_idx_(feature_idx),
        values_(values),
        num_rows_(num_rows) {}
  const Line GetLine(size_t idx) const {
    size_t begin_offset = row_ptr_[idx];
    size_t end_offset = row_ptr_[idx + 1];
    return Line(idx, end_offset - begin_offset, &feature_idx_[begin_offset],
                &values_[begin_offset]);
  }
  size_t Size() const { return num_rows_; }
  static constexpr bool kIsRowMajor = true;

 private:
  const size_t* row_ptr_;
  const unsigned* feature_idx_;
  const float* values_;
  size_t num_rows_;
};

class CSRAdapter : public detail::SingleBatchDataIter<CSRAdapterBatch> {
 public:
  CSRAdapter(const size_t* row_ptr, const unsigned* feature_idx,
             const float* values, size_t num_rows, size_t num_elements,
             size_t num_features)
      : batch_(row_ptr, feature_idx, values, num_rows, num_elements,
               num_features),
        num_rows_(num_rows),
        num_columns_(num_features) {}
  const CSRAdapterBatch& Value() const override { return batch_; }
  size_t NumRows() const { return num_rows_; }
  size_t NumColumns() const { return num_columns_; }

 private:
  CSRAdapterBatch batch_;
  size_t num_rows_;
  size_t num_columns_;
};
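
/**
 * Editorial sketch (assumption: a 2x3 matrix held in caller-owned CSR buffers):
 * wrapping raw CSR arrays in a `CSRAdapter`. The adapter stores only the raw
 * pointers, so the buffers must outlive it.
 *
 * \code{.cc}
 *   std::vector<size_t> row_ptr{0, 2, 3};          // 2 rows
 *   std::vector<unsigned> feature_idx{0, 2, 1};    // column of each element
 *   std::vector<float> values{1.0f, 2.0f, 3.0f};
 *   CSRAdapter adapter{row_ptr.data(), feature_idx.data(), values.data(),
 *                      2, 3, 3};                   // rows, elements, features
 *   auto const& batch = adapter.Value();
 *   auto line = batch.GetLine(0);                  // first row, 2 elements
 * \endcode
 */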

class DenseAdapterBatch : public detail::NoMetaInfo {
 public:
  DenseAdapterBatch(const float* values, size_t num_rows, size_t num_features)
      : values_(values),
        num_rows_(num_rows),
        num_features_(num_features) {}

 private:
  class Line {
   public:
    Line(const float* values, size_t size, size_t row_idx)
        : row_idx_(row_idx), size_(size), values_(values) {}

    size_t Size() const { return size_; }
    COOTuple GetElement(size_t idx) const {
      return COOTuple{row_idx_, idx, values_[idx]};
    }

   private:
    size_t row_idx_;
    size_t size_;
    const float* values_;
  };

 public:
  size_t Size() const { return num_rows_; }
  const Line GetLine(size_t idx) const {
    return Line(values_ + idx * num_features_, num_features_, idx);
  }
  static constexpr bool kIsRowMajor = true;

 private:
  const float* values_;
  size_t num_rows_;
  size_t num_features_;
};

class DenseAdapter : public detail::SingleBatchDataIter<DenseAdapterBatch> {
 public:
  DenseAdapter(const float* values, size_t num_rows, size_t num_features)
      : batch_(values, num_rows, num_features),
        num_rows_(num_rows),
        num_columns_(num_features) {}
  const DenseAdapterBatch& Value() const override { return batch_; }

  size_t NumRows() const { return num_rows_; }
  size_t NumColumns() const { return num_columns_; }

 private:
  DenseAdapterBatch batch_;
  size_t num_rows_;
  size_t num_columns_;
};
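
/**
 * Editorial sketch (assumption: a caller-owned, row-major float buffer): the
 * dense adapter treats every element as present, so index (i, j) maps directly
 * to `values[i * num_features + j]`.
 *
 * \code{.cc}
 *   std::vector<float> values{1.f, 2.f, 3.f, 4.f};          // 2x2, row major
 *   DenseAdapter adapter{values.data(), 2, 2};
 *   COOTuple e = adapter.Value().GetLine(1).GetElement(0);  // row 1, col 0 -> 3.f
 * \endcode
 */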

class ArrayAdapterBatch : public detail::NoMetaInfo {
 public:
  static constexpr bool kIsRowMajor = true;

 private:
  ArrayInterface<2> array_interface_;

  class Line {
    ArrayInterface<2> array_interface_;
    size_t ridx_;

   public:
    Line(ArrayInterface<2> array_interface, size_t ridx)
        : array_interface_{std::move(array_interface)}, ridx_{ridx} {}

    size_t Size() const { return array_interface_.Shape(1); }

    COOTuple GetElement(size_t idx) const {
      return {ridx_, idx, array_interface_(ridx_, idx)};
    }
  };

 public:
  ArrayAdapterBatch() = default;
  Line const GetLine(size_t idx) const {
    return Line{array_interface_, idx};
  }

  size_t NumRows() const { return array_interface_.Shape(0); }
  size_t NumCols() const { return array_interface_.Shape(1); }
  size_t Size() const { return this->NumRows(); }

  explicit ArrayAdapterBatch(ArrayInterface<2> array_interface)
      : array_interface_{std::move(array_interface)} {}
};

/**
 * Adapter for a dense array on host; in Python that's `numpy.ndarray`. This is similar
 * to `DenseAdapter`, but supports __array_interface__ instead of raw pointers. An
 * advantage is that it can handle various data types without making a copy.
 */
class ArrayAdapter : public detail::SingleBatchDataIter<ArrayAdapterBatch> {
 public:
  explicit ArrayAdapter(StringView array_interface) {
    auto j = Json::Load(array_interface);
    array_interface_ = ArrayInterface<2>(get<Object const>(j));
    batch_ = ArrayAdapterBatch{array_interface_};
  }
  [[nodiscard]] ArrayAdapterBatch const& Value() const override { return batch_; }
  [[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape(0); }
  [[nodiscard]] std::size_t NumColumns() const { return array_interface_.Shape(1); }

 private:
  ArrayAdapterBatch batch_;
  ArrayInterface<2> array_interface_;
};
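
/**
 * Editorial sketch (assumptions: the standard __array_interface__ fields
 * `data`, `shape`, `typestr` and `version`, with the buffer address encoded as
 * an integer, and `<cstdint>` available for `std::uintptr_t`): constructing an
 * `ArrayAdapter` from a JSON document describing a caller-owned 2x2 float32
 * buffer.
 *
 * \code{.cc}
 *   std::vector<float> buf{1.f, 2.f, 3.f, 4.f};
 *   std::string jarr =
 *       R"({"data": [)" +
 *       std::to_string(reinterpret_cast<std::uintptr_t>(buf.data())) +
 *       R"(, true], "shape": [2, 2], "typestr": "<f4", "version": 3})";
 *   ArrayAdapter adapter{StringView{jarr}};
 *   auto n = adapter.NumRows();  // 2
 * \endcode
 */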

class CSRArrayAdapterBatch : public detail::NoMetaInfo {
  ArrayInterface<1> indptr_;
  ArrayInterface<1> indices_;
  ArrayInterface<1> values_;
  bst_feature_t n_features_;

  class Line {
    ArrayInterface<1> indices_;
    ArrayInterface<1> values_;
    size_t ridx_;
    size_t offset_;

   public:
    Line(ArrayInterface<1> indices, ArrayInterface<1> values, size_t ridx,
         size_t offset)
        : indices_{std::move(indices)}, values_{std::move(values)}, ridx_{ridx},
          offset_{offset} {}

    COOTuple GetElement(std::size_t idx) const {
      return {ridx_, TypedIndex<std::size_t, 1>{indices_}(offset_ + idx), values_(offset_ + idx)};
    }

    size_t Size() const {
      return values_.Shape(0);
    }
  };

 public:
  static constexpr bool kIsRowMajor = true;

 public:
  CSRArrayAdapterBatch() = default;
  CSRArrayAdapterBatch(ArrayInterface<1> indptr, ArrayInterface<1> indices,
                       ArrayInterface<1> values, bst_feature_t n_features)
      : indptr_{std::move(indptr)},
        indices_{std::move(indices)},
        values_{std::move(values)},
        n_features_{n_features} {}

  size_t NumRows() const {
    size_t size = indptr_.Shape(0);
    size = size == 0 ? 0 : size - 1;
    return size;
  }
  size_t NumCols() const { return n_features_; }
  size_t Size() const { return this->NumRows(); }

  Line const GetLine(size_t idx) const {
    auto begin_no_stride = TypedIndex<size_t, 1>{indptr_}(idx);
    auto end_no_stride = TypedIndex<size_t, 1>{indptr_}(idx + 1);

    auto indices = indices_;
    auto values = values_;
    // Slice indices and values, stride remains unchanged since this is slicing by
    // specific index.
    auto offset = indices.strides[0] * begin_no_stride;

    indices.shape[0] = end_no_stride - begin_no_stride;
    values.shape[0] = end_no_stride - begin_no_stride;

    return Line{indices, values, idx, offset};
  }
};

/**
 * Adapter for a CSR array on host; in Python that's `scipy.sparse.csr_matrix`. This is
 * similar to `CSRAdapter`, but supports __array_interface__ instead of raw pointers. An
 * advantage is that it can handle various data types without making a copy.
 */
class CSRArrayAdapter : public detail::SingleBatchDataIter<CSRArrayAdapterBatch> {
 public:
  CSRArrayAdapter(StringView indptr, StringView indices, StringView values,
                  size_t num_cols)
      : indptr_{indptr}, indices_{indices}, values_{values}, num_cols_{num_cols} {
    batch_ = CSRArrayAdapterBatch{indptr_, indices_, values_,
                                  static_cast<bst_feature_t>(num_cols_)};
  }

  CSRArrayAdapterBatch const& Value() const override {
    return batch_;
  }
  size_t NumRows() const {
    size_t size = indptr_.Shape(0);
    size = size == 0 ? 0 : size - 1;
    return size;
  }
  size_t NumColumns() const { return num_cols_; }

 private:
  CSRArrayAdapterBatch batch_;
  ArrayInterface<1> indptr_;
  ArrayInterface<1> indices_;
  ArrayInterface<1> values_;
  size_t num_cols_;
};
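
/**
 * Editorial note with a hypothetical usage sketch: `CSRArrayAdapter` takes
 * three separate __array_interface__ JSON documents, one each for `indptr`,
 * `indices` and `values`, mirroring the three buffers of a
 * `scipy.sparse.csr_matrix`.
 *
 * \code{.cc}
 *   // jindptr, jindices and jvalues are JSON strings built the same way as in
 *   // the ArrayAdapter sketch above, each describing a 1-d buffer.
 *   CSRArrayAdapter adapter{StringView{jindptr}, StringView{jindices},
 *                           StringView{jvalues}, num_features};
 *   auto n_rows = adapter.NumRows();  // len(indptr) - 1
 * \endcode
 */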

class CSCAdapterBatch : public detail::NoMetaInfo {
 public:
  CSCAdapterBatch(const size_t* col_ptr, const unsigned* row_idx,
                  const float* values, size_t num_features)
      : col_ptr_(col_ptr),
        row_idx_(row_idx),
        values_(values),
        num_features_(num_features) {}

 private:
  class Line {
   public:
    Line(size_t col_idx, size_t size, const unsigned* row_idx,
         const float* values)
        : col_idx_(col_idx), size_(size), row_idx_(row_idx), values_(values) {}

    size_t Size() const { return size_; }
    COOTuple GetElement(size_t idx) const {
      return COOTuple{row_idx_[idx], col_idx_, values_[idx]};
    }

   private:
    size_t col_idx_;
    size_t size_;
    const unsigned* row_idx_;
    const float* values_;
  };

 public:
  size_t Size() const { return num_features_; }
  const Line GetLine(size_t idx) const {
    size_t begin_offset = col_ptr_[idx];
    size_t end_offset = col_ptr_[idx + 1];
    return Line(idx, end_offset - begin_offset, &row_idx_[begin_offset],
                &values_[begin_offset]);
  }
  static constexpr bool kIsRowMajor = false;

 private:
  const size_t* col_ptr_;
  const unsigned* row_idx_;
  const float* values_;
  size_t num_features_;
};

class CSCAdapter : public detail::SingleBatchDataIter<CSCAdapterBatch> {
 public:
  CSCAdapter(const size_t* col_ptr, const unsigned* row_idx,
             const float* values, size_t num_features, size_t num_rows)
      : batch_(col_ptr, row_idx, values, num_features),
        num_rows_(num_rows),
        num_columns_(num_features) {}
  const CSCAdapterBatch& Value() const override { return batch_; }

  // JVM package sends 0 as unknown
  size_t NumRows() const {
    return num_rows_ == 0 ? kAdapterUnknownSize : num_rows_;
  }
  size_t NumColumns() const { return num_columns_; }

 private:
  CSCAdapterBatch batch_;
  size_t num_rows_;
  size_t num_columns_;
};
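
/**
 * Editorial note with a small sketch: because `kIsRowMajor` is false here, a
 * "line" of a CSC batch is a column, so `GetLine(idx)` returns the elements of
 * column `idx` and the row index of each element comes from the `row_idx`
 * buffer. The buffers below are hypothetical caller-owned CSC arrays for a 2x2
 * matrix with 3 stored elements.
 *
 * \code{.cc}
 *   std::vector<size_t> col_ptr{0, 2, 3};
 *   std::vector<unsigned> row_idx{0, 1, 1};
 *   std::vector<float> values{1.f, 2.f, 3.f};
 *   CSCAdapter adapter{col_ptr.data(), row_idx.data(), values.data(), 2, 2};
 *   auto col0 = adapter.Value().GetLine(0);  // column 0 holds rows 0 and 1
 * \endcode
 */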

class CSCArrayAdapterBatch : public detail::NoMetaInfo {
  ArrayInterface<1> indptr_;
  ArrayInterface<1> indices_;
  ArrayInterface<1> values_;
  bst_row_t n_rows_;

  class Line {
    std::size_t column_idx_;
    ArrayInterface<1> row_idx_;
    ArrayInterface<1> values_;
    std::size_t offset_;

   public:
    Line(std::size_t idx, ArrayInterface<1> row_idx, ArrayInterface<1> values, std::size_t offset)
        : column_idx_{idx},
          row_idx_{std::move(row_idx)},
          values_{std::move(values)},
          offset_{offset} {}

    std::size_t Size() const { return values_.Shape(0); }
    COOTuple GetElement(std::size_t idx) const {
      return {TypedIndex<std::size_t, 1>{row_idx_}(offset_ + idx), column_idx_,
              values_(offset_ + idx)};
    }
  };

 public:
  static constexpr bool kIsRowMajor = false;

  CSCArrayAdapterBatch(ArrayInterface<1> indptr, ArrayInterface<1> indices,
                       ArrayInterface<1> values, bst_row_t n_rows)
      : indptr_{std::move(indptr)},
        indices_{std::move(indices)},
        values_{std::move(values)},
        n_rows_{n_rows} {}

  std::size_t Size() const { return indptr_.n - 1; }
  Line GetLine(std::size_t idx) const {
    auto begin_no_stride = TypedIndex<std::size_t, 1>{indptr_}(idx);
    auto end_no_stride = TypedIndex<std::size_t, 1>{indptr_}(idx + 1);

    auto indices = indices_;
    auto values = values_;
    // Slice indices and values, stride remains unchanged since this is slicing by
    // specific index.
    auto offset = indices.strides[0] * begin_no_stride;
    indices.shape[0] = end_no_stride - begin_no_stride;
    values.shape[0] = end_no_stride - begin_no_stride;

    return Line{idx, indices, values, offset};
  }
};

/**
 * \brief CSC adapter with support for array interface.
 */
class CSCArrayAdapter : public detail::SingleBatchDataIter<CSCArrayAdapterBatch> {
  ArrayInterface<1> indptr_;
  ArrayInterface<1> indices_;
  ArrayInterface<1> values_;
  size_t num_rows_;
  CSCArrayAdapterBatch batch_;

 public:
  CSCArrayAdapter(StringView indptr, StringView indices, StringView values, std::size_t num_rows)
      : indptr_{indptr},
        indices_{indices},
        values_{values},
        num_rows_{num_rows},
        batch_{
            CSCArrayAdapterBatch{indptr_, indices_, values_, static_cast<bst_row_t>(num_rows_)}} {}

  // JVM package sends 0 as unknown
  size_t NumRows() const { return num_rows_ == 0 ? kAdapterUnknownSize : num_rows_; }
  size_t NumColumns() const { return indptr_.n - 1; }
  const CSCArrayAdapterBatch& Value() const override { return batch_; }
};

class DataTableAdapterBatch : public detail::NoMetaInfo {
  enum class DTType : std::uint8_t {
    kFloat32 = 0,
    kFloat64 = 1,
    kBool8 = 2,
    kInt32 = 3,
    kInt8 = 4,
    kInt16 = 5,
    kInt64 = 6,
    kUnknown = 7
  };

  static DTType DTGetType(std::string type_string) {
    if (type_string == "float32") {
      return DTType::kFloat32;
    } else if (type_string == "float64") {
      return DTType::kFloat64;
    } else if (type_string == "bool8") {
      return DTType::kBool8;
    } else if (type_string == "int32") {
      return DTType::kInt32;
    } else if (type_string == "int8") {
      return DTType::kInt8;
    } else if (type_string == "int16") {
      return DTType::kInt16;
    } else if (type_string == "int64") {
      return DTType::kInt64;
    } else {
      LOG(FATAL) << "Unknown data table type.";
      return DTType::kUnknown;
    }
  }

 public:
  DataTableAdapterBatch(void const* const* const data, char const* const* feature_stypes,
                        std::size_t num_rows, std::size_t num_features)
      : data_(data), num_rows_(num_rows) {
    CHECK(feature_types_.empty());
    std::transform(feature_stypes, feature_stypes + num_features,
                   std::back_inserter(feature_types_),
                   [](char const* stype) { return DTGetType(stype); });
  }

 private:
  class Line {
    std::size_t row_idx_;
    void const* const* const data_;
    std::vector<DTType> const& feature_types_;

    float DTGetValue(void const* column, DTType dt_type, std::size_t ridx) const {
      float missing = std::numeric_limits<float>::quiet_NaN();
      switch (dt_type) {
        case DTType::kFloat32: {
          float val = reinterpret_cast<const float*>(column)[ridx];
          return std::isfinite(val) ? val : missing;
        }
        case DTType::kFloat64: {
          double val = reinterpret_cast<const double*>(column)[ridx];
          return std::isfinite(val) ? static_cast<float>(val) : missing;
        }
        case DTType::kBool8: {
          bool val = reinterpret_cast<const bool*>(column)[ridx];
          return static_cast<float>(val);
        }
        // For integer columns, the minimum value of the type is the sentinel
        // for a missing entry.
        case DTType::kInt32: {
          int32_t val = reinterpret_cast<const int32_t*>(column)[ridx];
          return val != (-2147483647 - 1) ? static_cast<float>(val) : missing;
        }
        case DTType::kInt8: {
          int8_t val = reinterpret_cast<const int8_t*>(column)[ridx];
          return val != -128 ? static_cast<float>(val) : missing;
        }
        case DTType::kInt16: {
          int16_t val = reinterpret_cast<const int16_t*>(column)[ridx];
          return val != -32768 ? static_cast<float>(val) : missing;
        }
        case DTType::kInt64: {
          int64_t val = reinterpret_cast<const int64_t*>(column)[ridx];
          return val != -9223372036854775807 - 1 ? static_cast<float>(val) : missing;
        }
        default: {
          LOG(FATAL) << "Unknown data table type.";
          return 0.0f;
        }
      }
    }

   public:
    Line(std::size_t ridx, void const* const* const data, std::vector<DTType> const& ft)
        : row_idx_{ridx}, data_{data}, feature_types_{ft} {}
    std::size_t Size() const { return feature_types_.size(); }
    COOTuple GetElement(std::size_t idx) const {
      return COOTuple{row_idx_, idx, DTGetValue(data_[idx], feature_types_[idx], row_idx_)};
    }
  };

 public:
  size_t Size() const { return num_rows_; }
  const Line GetLine(std::size_t ridx) const { return {ridx, data_, feature_types_}; }
  static constexpr bool kIsRowMajor = true;

 private:
  void const* const* const data_;

  std::vector<DTType> feature_types_;
  std::size_t num_rows_;
};

class DataTableAdapter : public detail::SingleBatchDataIter<DataTableAdapterBatch> {
 public:
  DataTableAdapter(void** data, const char** feature_stypes, std::size_t num_rows,
                   std::size_t num_features)
      : batch_(data, feature_stypes, num_rows, num_features),
        num_rows_(num_rows),
        num_columns_(num_features) {}
  const DataTableAdapterBatch& Value() const override { return batch_; }
  std::size_t NumRows() const { return num_rows_; }
  std::size_t NumColumns() const { return num_columns_; }

 private:
  DataTableAdapterBatch batch_;
  std::size_t num_rows_;
  std::size_t num_columns_;
};
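
/**
 * Editorial sketch (assumption: two caller-owned columns in datatable's
 * columnar layout, addressed by per-column base pointers plus string type
 * names): the adapter reads each element through the column's declared type and
 * converts it to float on the fly.
 *
 * \code{.cc}
 *   std::vector<int32_t> c0{1, -2147483647 - 1};    // second entry is missing
 *   std::vector<double> c1{0.5, 1.5};
 *   std::vector<void*> data{c0.data(), c1.data()};
 *   std::vector<const char*> stypes{"int32", "float64"};
 *   DataTableAdapter adapter{data.data(), stypes.data(), 2, 2};
 *   auto row0 = adapter.Value().GetLine(0);         // 1.0f and 0.5f as COO tuples
 * \endcode
 */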

class FileAdapterBatch {
 public:
  class Line {
   public:
    Line(size_t row_idx, const uint32_t* feature_idx, const float* value,
         size_t size)
        : row_idx_(row_idx),
          feature_idx_(feature_idx),
          value_(value),
          size_(size) {}

    size_t Size() { return size_; }
    COOTuple GetElement(size_t idx) {
      float fvalue = value_ == nullptr ? 1.0f : value_[idx];
      return COOTuple{row_idx_, feature_idx_[idx], fvalue};
    }

   private:
    size_t row_idx_;
    const uint32_t* feature_idx_;
    const float* value_;
    size_t size_;
  };
  FileAdapterBatch(const dmlc::RowBlock<uint32_t>* block, size_t row_offset)
      : block_(block), row_offset_(row_offset) {}
  Line GetLine(size_t idx) const {
    auto begin = block_->offset[idx];
    auto end = block_->offset[idx + 1];
    return Line{idx + row_offset_, &block_->index[begin], &block_->value[begin],
                end - begin};
  }
  const float* Labels() const { return block_->label; }
  const float* Weights() const { return block_->weight; }
  const uint64_t* Qid() const { return block_->qid; }
  const float* BaseMargin() const { return nullptr; }

  size_t Size() const { return block_->size; }
  static constexpr bool kIsRowMajor = true;

 private:
  const dmlc::RowBlock<uint32_t>* block_;
  size_t row_offset_;
};

/** \brief FileAdapter wraps dmlc::parser to read files and provide access in a
 * common interface. */
class FileAdapter : dmlc::DataIter<FileAdapterBatch> {
 public:
  explicit FileAdapter(dmlc::Parser<uint32_t>* parser) : parser_(parser) {}

  const FileAdapterBatch& Value() const override { return *batch_.get(); }
  void BeforeFirst() override {
    batch_.reset();
    parser_->BeforeFirst();
    row_offset_ = 0;
  }
  bool Next() override {
    bool next = parser_->Next();
    batch_.reset(new FileAdapterBatch(&parser_->Value(), row_offset_));
    row_offset_ += parser_->Value().size;
    return next;
  }
  // Indicates a number of rows/columns must be inferred
  size_t NumRows() const { return kAdapterUnknownSize; }
  size_t NumColumns() const { return kAdapterUnknownSize; }

 private:
  size_t row_offset_{0};
  std::unique_ptr<FileAdapterBatch> batch_;
  dmlc::Parser<uint32_t>* parser_;
};
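
/**
 * Editorial sketch (assumptions: a LIBSVM file at "train.libsvm" and the
 * dmlc-core factory `dmlc::Parser<uint32_t>::Create(uri, part_index, num_parts,
 * type)`): unlike the in-memory adapters, `FileAdapter` yields multiple
 * batches, so callers loop over `Next()` and accumulate counts themselves.
 *
 * \code{.cc}
 *   std::unique_ptr<dmlc::Parser<uint32_t>> parser{
 *       dmlc::Parser<uint32_t>::Create("train.libsvm", 0, 1, "libsvm")};
 *   FileAdapter adapter{parser.get()};
 *   size_t n_rows = 0;
 *   adapter.BeforeFirst();
 *   while (adapter.Next()) {
 *     n_rows += adapter.Value().Size();  // rows in this batch
 *   }
 * \endcode
 */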

/*! \brief Data iterator that takes callback to return data, used in JVM package for
 * accepting data iterator. */
template <typename DataIterHandle, typename XGBCallbackDataIterNext, typename XGBoostBatchCSR>
class IteratorAdapter : public dmlc::DataIter<FileAdapterBatch> {
 public:
  IteratorAdapter(DataIterHandle data_handle, XGBCallbackDataIterNext* next_callback)
      : columns_{data::kAdapterUnknownSize},
        data_handle_(data_handle),
        next_callback_(next_callback) {}

  // override functions
  void BeforeFirst() override {
    CHECK(at_first_) << "Cannot reset IteratorAdapter";
  }

  bool Next() override {
    if ((*next_callback_)(
            data_handle_,
            [](void* handle, XGBoostBatchCSR batch) -> int {
              API_BEGIN();
              static_cast<IteratorAdapter*>(handle)->SetData(batch);
              API_END();
            },
            this) != 0) {
      at_first_ = false;
      return true;
    } else {
      return false;
    }
  }

  FileAdapterBatch const& Value() const override {
    return *batch_.get();
  }

  // callback to set the data
  void SetData(const XGBoostBatchCSR& batch) {
    offset_.clear();
    label_.clear();
    weight_.clear();
    index_.clear();
    value_.clear();
    offset_.insert(offset_.end(), batch.offset, batch.offset + batch.size + 1);

    if (batch.label != nullptr) {
      label_.insert(label_.end(), batch.label, batch.label + batch.size);
    }
    if (batch.weight != nullptr) {
      weight_.insert(weight_.end(), batch.weight, batch.weight + batch.size);
    }
    if (batch.index != nullptr) {
      index_.insert(index_.end(), batch.index + offset_[0],
                    batch.index + offset_.back());
    }
    if (batch.value != nullptr) {
      value_.insert(value_.end(), batch.value + offset_[0],
                    batch.value + offset_.back());
    }
    if (offset_[0] != 0) {
      size_t base = offset_[0];
      for (size_t& item : offset_) {
        item -= base;
      }
    }
    CHECK(columns_ == data::kAdapterUnknownSize || columns_ == batch.columns)
        << "Number of columns between batches changed from " << columns_
        << " to " << batch.columns;

    columns_ = batch.columns;
    block_.size = batch.size;

    block_.offset = dmlc::BeginPtr(offset_);
    block_.label = dmlc::BeginPtr(label_);
    block_.weight = dmlc::BeginPtr(weight_);
    block_.qid = nullptr;
    block_.field = nullptr;
    block_.index = dmlc::BeginPtr(index_);
    block_.value = dmlc::BeginPtr(value_);

    batch_.reset(new FileAdapterBatch(&block_, row_offset_));
    row_offset_ += offset_.size() - 1;
  }

  size_t NumColumns() const { return columns_; }
  size_t NumRows() const { return kAdapterUnknownSize; }

 private:
  std::vector<size_t> offset_;
  std::vector<dmlc::real_t> label_;
  std::vector<dmlc::real_t> weight_;
  std::vector<uint32_t> index_;
  std::vector<dmlc::real_t> value_;

  size_t columns_;
  size_t row_offset_{0};
  // whether the iterator is still at the beginning.
  bool at_first_{true};
  // handle to the iterator.
  DataIterHandle data_handle_;
  // callback to get the data.
  XGBCallbackDataIterNext* next_callback_;
  // internal RowBlock.
  dmlc::RowBlock<uint32_t> block_;
  std::unique_ptr<FileAdapterBatch> batch_;
};

enum ColumnDType : uint8_t {
  kUnknown,
  kInt8,
  kUInt8,
  kInt16,
  kUInt16,
  kInt32,
  kUInt32,
  kInt64,
  kUInt64,
  kFloat,
  kDouble
};

class Column {
 public:
  Column() = default;

  Column(size_t col_idx, size_t length, size_t null_count, const uint8_t* bitmap)
      : col_idx_{col_idx}, length_{length}, null_count_{null_count}, bitmap_{bitmap} {}

  virtual ~Column() = default;

  Column(const Column&) = delete;
  Column& operator=(const Column&) = delete;
  Column(Column&&) = delete;
  Column& operator=(Column&&) = delete;

  // whether the valid bit is set for this element
  bool IsValid(size_t row_idx) const {
    return (!bitmap_ || (bitmap_[row_idx / 8] & (1 << (row_idx % 8))));
  }

  virtual COOTuple GetElement(size_t row_idx) const = 0;

  virtual bool IsValidElement(size_t row_idx) const = 0;

  virtual std::vector<float> AsFloatVector() const = 0;

  virtual std::vector<uint64_t> AsUint64Vector() const = 0;

  size_t Length() const { return length_; }

 protected:
  size_t col_idx_;
  size_t length_;
  size_t null_count_;
  const uint8_t* bitmap_;
};

// Only columns of primitive types are supported. An ArrowColumnarBatch is a
// collection of std::shared_ptr<PrimitiveColumn>. These columns can be of different data types.
// Hence, PrimitiveColumn is a class template, and all concrete PrimitiveColumns
// derive from the abstract class Column.
template <typename T>
class PrimitiveColumn : public Column {
  static constexpr float kNaN = std::numeric_limits<float>::quiet_NaN();

 public:
  PrimitiveColumn(size_t idx, size_t length, size_t null_count,
                  const uint8_t* bitmap, const T* data, float missing)
      : Column{idx, length, null_count, bitmap}, data_{data}, missing_{missing} {}

  COOTuple GetElement(size_t row_idx) const override {
    CHECK(data_ && row_idx < length_) << "Column is empty or out-of-bound index of the column";
    return {row_idx, col_idx_, IsValidElement(row_idx) ?
            static_cast<float>(data_[row_idx]) : kNaN};
  }

  bool IsValidElement(size_t row_idx) const override {
    // std::isfinite needs a cast to double to prevent an MSVC error
    return IsValid(row_idx)
        && std::isfinite(static_cast<double>(data_[row_idx]))
        && static_cast<float>(data_[row_idx]) != missing_;
  }

  std::vector<float> AsFloatVector() const override {
    CHECK(data_) << "Column is empty";
    std::vector<float> fv(length_);
    std::transform(data_, data_ + length_, fv.begin(),
                   [](T v) { return static_cast<float>(v); });
    return fv;
  }

  std::vector<uint64_t> AsUint64Vector() const override {
    CHECK(data_) << "Column is empty";
    std::vector<uint64_t> iv(length_);
    std::transform(data_, data_ + length_, iv.begin(),
                   [](T v) { return static_cast<uint64_t>(v); });
    return iv;
  }

 private:
  const T* data_;
  float missing_;  // user specified missing value
};

struct ColumnarMetaInfo {
  // data type of the column
  ColumnDType type{ColumnDType::kUnknown};
  // location of the column in an Arrow record batch
  int64_t loc{-1};
};

struct ArrowSchemaImporter {
  std::vector<ColumnarMetaInfo> columns;

  // map Arrow format strings to types
  static ColumnDType FormatMap(char const* format_str) {
    CHECK(format_str) << "Format string cannot be empty";
    switch (format_str[0]) {
      case 'c':
        return ColumnDType::kInt8;
      case 'C':
        return ColumnDType::kUInt8;
      case 's':
        return ColumnDType::kInt16;
      case 'S':
        return ColumnDType::kUInt16;
      case 'i':
        return ColumnDType::kInt32;
      case 'I':
        return ColumnDType::kUInt32;
      case 'l':
        return ColumnDType::kInt64;
      case 'L':
        return ColumnDType::kUInt64;
      case 'f':
        return ColumnDType::kFloat;
      case 'g':
        return ColumnDType::kDouble;
      default:
        CHECK(false) << "Column data type not supported by XGBoost";
        return ColumnDType::kUnknown;
    }
  }

  void Import(struct ArrowSchema* schema) {
    if (schema) {
      CHECK(std::string(schema->format) == "+s");  // NOLINT
      CHECK(columns.empty());
      for (auto i = 0; i < schema->n_children; ++i) {
        std::string name{schema->children[i]->name};
        ColumnDType type = FormatMap(schema->children[i]->format);
        ColumnarMetaInfo col_info{type, i};
        columns.push_back(col_info);
      }
      if (schema->release) {
        schema->release(schema);
      }
    }
  }
};

class ArrowColumnarBatch {
 public:
  ArrowColumnarBatch(struct ArrowArray* rb, struct ArrowSchemaImporter* schema)
      : rb_{rb}, schema_{schema} {
    CHECK(rb_) << "Cannot import non-existent record batch";
    CHECK(!schema_->columns.empty()) << "Cannot import record batch without a schema";
  }

  size_t Import(float missing) {
    auto& infov = schema_->columns;
    for (size_t i = 0; i < infov.size(); ++i) {
      columns_.push_back(CreateColumn(i, infov[i], missing));
    }

    // Compute the starting location for every row in this batch
    auto batch_size = rb_->length;
    auto num_columns = columns_.size();
    row_offsets_.resize(batch_size + 1, 0);
    for (auto i = 0; i < batch_size; ++i) {
      row_offsets_[i + 1] = row_offsets_[i];
      for (size_t j = 0; j < num_columns; ++j) {
        if (GetColumn(j).IsValidElement(i)) {
          row_offsets_[i + 1]++;
        }
      }
    }
    // return number of elements in the batch
    return row_offsets_.back();
  }

  ArrowColumnarBatch(const ArrowColumnarBatch&) = delete;
  ArrowColumnarBatch& operator=(const ArrowColumnarBatch&) = delete;
  ArrowColumnarBatch(ArrowColumnarBatch&&) = delete;
  ArrowColumnarBatch& operator=(ArrowColumnarBatch&&) = delete;

  virtual ~ArrowColumnarBatch() {
    if (rb_ && rb_->release) {
      rb_->release(rb_);
      rb_ = nullptr;
    }
    columns_.clear();
  }

  size_t Size() const { return rb_ ? rb_->length : 0; }

  size_t NumColumns() const { return columns_.size(); }

  size_t NumElements() const { return row_offsets_.back(); }

  const Column& GetColumn(size_t col_idx) const {
    return *columns_[col_idx];
  }

  void ShiftRowOffsets(size_t batch_offset) {
    std::transform(row_offsets_.begin(), row_offsets_.end(), row_offsets_.begin(),
                   [=](size_t c) { return c + batch_offset; });
  }

  const std::vector<size_t>& RowOffsets() const { return row_offsets_; }

 private:
  std::shared_ptr<Column> CreateColumn(size_t idx,
                                       ColumnarMetaInfo info,
                                       float missing) const {
    if (info.loc < 0) {
      return nullptr;
    }

    auto loc_in_batch = info.loc;
    auto length = rb_->length;
    auto null_count = rb_->null_count;
    auto buffers0 = rb_->children[loc_in_batch]->buffers[0];
    auto buffers1 = rb_->children[loc_in_batch]->buffers[1];
    const uint8_t* bitmap = buffers0 ? reinterpret_cast<const uint8_t*>(buffers0) : nullptr;
    const uint8_t* data = buffers1 ? reinterpret_cast<const uint8_t*>(buffers1) : nullptr;

    // if null_count is not computed, compute it here
    if (null_count < 0) {
      if (!bitmap) {
        null_count = 0;
      } else {
        null_count = length;
        for (auto i = 0; i < length; ++i) {
          if (bitmap[i / 8] & (1 << (i % 8))) {
            null_count--;
          }
        }
      }
    }

    switch (info.type) {
      case ColumnDType::kInt8:
        return std::make_shared<PrimitiveColumn<int8_t>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const int8_t*>(data), missing);
      case ColumnDType::kUInt8:
        return std::make_shared<PrimitiveColumn<uint8_t>>(
            idx, length, null_count, bitmap, data, missing);
      case ColumnDType::kInt16:
        return std::make_shared<PrimitiveColumn<int16_t>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const int16_t*>(data), missing);
      case ColumnDType::kUInt16:
        return std::make_shared<PrimitiveColumn<uint16_t>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const uint16_t*>(data), missing);
      case ColumnDType::kInt32:
        return std::make_shared<PrimitiveColumn<int32_t>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const int32_t*>(data), missing);
      case ColumnDType::kUInt32:
        return std::make_shared<PrimitiveColumn<uint32_t>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const uint32_t*>(data), missing);
      case ColumnDType::kInt64:
        return std::make_shared<PrimitiveColumn<int64_t>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const int64_t*>(data), missing);
      case ColumnDType::kUInt64:
        return std::make_shared<PrimitiveColumn<uint64_t>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const uint64_t*>(data), missing);
      case ColumnDType::kFloat:
        return std::make_shared<PrimitiveColumn<float>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const float*>(data), missing);
      case ColumnDType::kDouble:
        return std::make_shared<PrimitiveColumn<double>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const double*>(data), missing);
      default:
        return nullptr;
    }
  }

  struct ArrowArray* rb_;
  struct ArrowSchemaImporter* schema_;
  std::vector<std::shared_ptr<Column>> columns_;
  std::vector<size_t> row_offsets_;
};

using ArrowColumnarBatchVec = std::vector<std::unique_ptr<ArrowColumnarBatch>>;

class RecordBatchesIterAdapter : public dmlc::DataIter<ArrowColumnarBatchVec> {
 public:
  RecordBatchesIterAdapter(XGDMatrixCallbackNext* next_callback, int nbatch)
      : next_callback_{next_callback}, nbatches_{nbatch} {}

  void BeforeFirst() override {
    CHECK(at_first_) << "Cannot reset RecordBatchesIterAdapter";
  }

  bool Next() override {
    batches_.clear();
    while (batches_.size() < static_cast<size_t>(nbatches_) && (*next_callback_)(this) != 0) {
      at_first_ = false;
    }

    if (batches_.size() > 0) {
      return true;
    } else {
      return false;
    }
  }

  void SetData(struct ArrowArray* rb, struct ArrowSchema* schema) {
    // The schema is imported only once, at the beginning, regardless of how many
    // batches are coming. Even when the schema is not imported, we still need to
    // release its C data exported from Arrow.
    if (at_first_ && schema) {
      schema_.Import(schema);
    } else {
      if (schema && schema->release) {
        schema->release(schema);
      }
    }
    if (rb) {
      batches_.push_back(std::make_unique<ArrowColumnarBatch>(rb, &schema_));
    }
  }

  const ArrowColumnarBatchVec& Value() const override {
    return batches_;
  }

  size_t NumColumns() const { return schema_.columns.size(); }
  size_t NumRows() const { return kAdapterUnknownSize; }

 private:
  XGDMatrixCallbackNext* next_callback_;
  bool at_first_{true};
  int nbatches_;
  struct ArrowSchemaImporter schema_;
  ArrowColumnarBatchVec batches_;
};

class SparsePageAdapterBatch {
  HostSparsePageView page_;

 public:
  struct Line {
    Entry const* inst;
    size_t n;
    bst_row_t ridx;
    COOTuple GetElement(size_t idx) const { return {ridx, inst[idx].index, inst[idx].fvalue}; }
    size_t Size() const { return n; }
  };

  explicit SparsePageAdapterBatch(HostSparsePageView page) : page_{std::move(page)} {}
  Line GetLine(size_t ridx) const { return Line{page_[ridx].data(), page_[ridx].size(), ridx}; }
  size_t Size() const { return page_.Size(); }
};
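
/**
 * Editorial note with a minimal sketch (assumption: a `HostSparsePageView`
 * named `view` obtained from an in-memory sparse page): this batch type has no
 * companion adapter class; it simply re-exposes an existing sparse page through
 * the same line/element interface used by the other adapters.
 *
 * \code{.cc}
 *   SparsePageAdapterBatch batch{view};
 *   for (size_t i = 0; i < batch.Size(); ++i) {
 *     auto line = batch.GetLine(i);
 *     for (size_t j = 0; j < line.Size(); ++j) {
 *       COOTuple e = line.GetElement(j);  // row, feature index, value
 *     }
 *   }
 * \endcode
 */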
}  // namespace data
}  // namespace xgboost
#endif  // XGBOOST_DATA_ADAPTER_H_