Support dmatrix construction from cupy array (#5206)

This commit is contained in:
Rory Mitchell
2020-01-22 13:15:27 +13:00
committed by GitHub
parent 2a071cebc5
commit 9c56480c61
19 changed files with 522 additions and 158 deletions

View File

@@ -1,14 +1,15 @@
/*!
* Copyright 2019 by Contributors
* \file columnar.h
* \file array_interface.h
* \brief Basic structure holding a reference to arrow columnar data format.
*/
#ifndef XGBOOST_DATA_COLUMNAR_H_
#define XGBOOST_DATA_COLUMNAR_H_
#ifndef XGBOOST_DATA_ARRAY_INTERFACE_H_
#define XGBOOST_DATA_ARRAY_INTERFACE_H_
#include <cinttypes>
#include <map>
#include <string>
#include <utility>
#include "xgboost/data.h"
#include "xgboost/json.h"
@@ -18,7 +19,7 @@
namespace xgboost {
// Common errors in parsing columnar format.
struct ColumnarErrors {
struct ArrayInterfaceErrors {
static char const* Contigious() {
return "Memory should be contigious.";
}
@@ -119,15 +120,12 @@ class ArrayInterfaceHandler {
if (array.find("version") == array.cend()) {
LOG(FATAL) << "Missing `version' field for array interface";
}
auto version = get<Integer const>(array.at("version"));
CHECK_EQ(version, 1) << ColumnarErrors::Version();
if (array.find("typestr") == array.cend()) {
LOG(FATAL) << "Missing `typestr' field for array interface";
}
auto typestr = get<String const>(array.at("typestr"));
CHECK_EQ(typestr.size(), 3) << ColumnarErrors::TypestrFormat();
CHECK_NE(typestr.front(), '>') << ColumnarErrors::BigEndian();
CHECK_EQ(typestr.size(), 3) << ArrayInterfaceErrors::TypestrFormat();
CHECK_NE(typestr.front(), '>') << ArrayInterfaceErrors::BigEndian();
if (array.find("shape") == array.cend()) {
LOG(FATAL) << "Missing `shape' field for array interface";
@@ -149,7 +147,7 @@ class ArrayInterfaceHandler {
auto p_mask = GetPtrFromArrayData<RBitField8::value_type*>(j_mask);
auto j_shape = get<Array const>(j_mask.at("shape"));
CHECK_EQ(j_shape.size(), 1) << ColumnarErrors::Dimension(1);
CHECK_EQ(j_shape.size(), 1) << ArrayInterfaceErrors::Dimension(1);
auto typestr = get<String const>(j_mask.at("typestr"));
// For now this is just 1, we can support different size of interger in mask.
int64_t const type_length = typestr.at(2) - 48;
@@ -178,8 +176,8 @@ class ArrayInterfaceHandler {
if (j_mask.find("strides") != j_mask.cend()) {
auto strides = get<Array const>(column.at("strides"));
CHECK_EQ(strides.size(), 1) << ColumnarErrors::Dimension(1);
CHECK_EQ(get<Integer>(strides.at(0)), type_length) << ColumnarErrors::Contigious();
CHECK_EQ(strides.size(), 1) << ArrayInterfaceErrors::Dimension(1);
CHECK_EQ(get<Integer>(strides.at(0)), type_length) << ArrayInterfaceErrors::Contigious();
}
s_mask = {p_mask, span_size};
@@ -188,18 +186,28 @@ class ArrayInterfaceHandler {
return 0;
}
static size_t ExtractLength(std::map<std::string, Json> const& column) {
static std::pair<size_t, size_t> ExtractShape(
std::map<std::string, Json> const& column) {
auto j_shape = get<Array const>(column.at("shape"));
CHECK_EQ(j_shape.size(), 1) << ColumnarErrors::Dimension(1);
auto typestr = get<String const>(column.at("typestr"));
if (column.find("strides") != column.cend()) {
auto strides = get<Array const>(column.at("strides"));
CHECK_EQ(strides.size(), 1) << ColumnarErrors::Dimension(1);
CHECK_EQ(get<Integer>(strides.at(0)), typestr.at(2) - '0')
<< ColumnarErrors::Contigious();
if (!IsA<Null>(column.at("strides"))) {
auto strides = get<Array const>(column.at("strides"));
CHECK_EQ(strides.size(), j_shape.size())
<< ArrayInterfaceErrors::Dimension(1);
CHECK_EQ(get<Integer>(strides.at(0)), typestr.at(2) - '0')
<< ArrayInterfaceErrors::Contigious();
}
}
return static_cast<size_t>(get<Integer const>(j_shape.at(0)));
if (j_shape.size() == 1) {
return {static_cast<size_t>(get<Integer const>(j_shape.at(0))), 1};
} else {
CHECK_EQ(j_shape.size(), 2)
<< "Only 1D or 2-D arrays currently supported.";
return {static_cast<size_t>(get<Integer const>(j_shape.at(0))),
static_cast<size_t>(get<Integer const>(j_shape.at(1)))};
}
}
template <typename T>
static common::Span<T> ExtractData(std::map<std::string, Json> const& column) {
@@ -212,25 +220,27 @@ class ArrayInterfaceHandler {
<< "Input data type and typestr mismatch. typestr: " << typestr;
auto length = ExtractLength(column);
auto shape = ExtractShape(column);
T* p_data = ArrayInterfaceHandler::GetPtrFromArrayData<T*>(column);
return common::Span<T>{p_data, length};
return common::Span<T>{p_data, shape.first * shape.second};
}
};
// A view over __array_interface__
class Columnar {
class ArrayInterface {
using mask_type = unsigned char;
using index_type = int32_t;
public:
Columnar() = default;
explicit Columnar(std::map<std::string, Json> const& column) {
ArrayInterface() = default;
explicit ArrayInterface(std::map<std::string, Json> const& column) {
ArrayInterfaceHandler::Validate(column);
data = ArrayInterfaceHandler::GetPtrFromArrayData<void*>(column);
CHECK(data) << "Column is null";
size = ArrayInterfaceHandler::ExtractLength(column);
auto shape = ArrayInterfaceHandler::ExtractShape(column);
num_rows = shape.first;
num_cols = shape.second;
common::Span<RBitField8::value_type> s_mask;
size_t n_bits = ArrayInterfaceHandler::ExtractMask(column, &s_mask);
@@ -238,7 +248,7 @@ class Columnar {
valid = RBitField8(s_mask);
if (s_mask.data()) {
CHECK_EQ(n_bits, size)
CHECK_EQ(n_bits, num_rows)
<< "Shape of bit mask doesn't match data shape. "
<< "XGBoost doesn't support internal broadcasting.";
}
@@ -271,7 +281,7 @@ class Columnar {
} else if (type[1] == 'u' && type[2] == '8') {
return;
} else {
LOG(FATAL) << ColumnarErrors::UnSupportedType(type);
LOG(FATAL) << ArrayInterfaceErrors::UnSupportedType(type);
return;
}
}
@@ -304,10 +314,11 @@ class Columnar {
}
RBitField8 valid;
int32_t size;
int32_t num_rows;
int32_t num_cols;
void* data;
char type[3];
};
} // namespace xgboost
#endif // XGBOOST_DATA_COLUMNAR_H_
#endif // XGBOOST_DATA_ARRAY_INTERFACE_H_

View File

@@ -7,14 +7,14 @@
#include "xgboost/data.h"
#include "xgboost/logging.h"
#include "xgboost/json.h"
#include "columnar.h"
#include "array_interface.h"
#include "../common/device_helpers.cuh"
#include "device_adapter.cuh"
#include "simple_dmatrix.h"
namespace xgboost {
void CopyInfoImpl(std::map<std::string, Json> const& column, HostDeviceVector<float>* out) {
void CopyInfoImpl(ArrayInterface column, HostDeviceVector<float>* out) {
auto SetDeviceToPtr = [](void* ptr) {
cudaPointerAttributes attr;
dh::safe_cuda(cudaPointerGetAttributes(&attr, ptr));
@@ -22,43 +22,42 @@ void CopyInfoImpl(std::map<std::string, Json> const& column, HostDeviceVector<fl
dh::safe_cuda(cudaSetDevice(ptr_device));
return ptr_device;
};
Columnar foreign_column(column);
auto ptr_device = SetDeviceToPtr(foreign_column.data);
auto ptr_device = SetDeviceToPtr(column.data);
out->SetDevice(ptr_device);
out->Resize(foreign_column.size);
out->Resize(column.num_rows);
auto p_dst = thrust::device_pointer_cast(out->DevicePointer());
dh::LaunchN(ptr_device, foreign_column.size, [=] __device__(size_t idx) {
p_dst[idx] = foreign_column.GetElement(idx);
dh::LaunchN(ptr_device, column.num_rows, [=] __device__(size_t idx) {
p_dst[idx] = column.GetElement(idx);
});
}
void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
Json j_interface = Json::Load({interface_str.c_str(), interface_str.size()});
auto const& j_arr = get<Array>(j_interface);
CHECK_EQ(j_arr.size(), 1) << "MetaInfo: " << c_key << ". " << ColumnarErrors::Dimension(1);;
auto const& j_arr_obj = get<Object const>(j_arr[0]);
std::string key {c_key};
ArrayInterfaceHandler::Validate(j_arr_obj);
if (j_arr_obj.find("mask") != j_arr_obj.cend()) {
LOG(FATAL) << "Meta info " << key << " should be dense, found validity mask";
}
auto const& typestr = get<String const>(j_arr_obj.at("typestr"));
CHECK_EQ(j_arr.size(), 1)
<< "MetaInfo: " << c_key << ". " << ArrayInterfaceErrors::Dimension(1);
ArrayInterface array_interface(get<Object const>(j_arr[0]));
std::string key{c_key};
CHECK(!array_interface.valid.Data())
<< "Meta info " << key << " should be dense, found validity mask";
CHECK_EQ(array_interface.num_cols, 1)
<< "Meta info should be a single column.";
if (key == "label") {
CopyInfoImpl(j_arr_obj, &labels_);
CopyInfoImpl(array_interface, &labels_);
} else if (key == "weight") {
CopyInfoImpl(j_arr_obj, &weights_);
CopyInfoImpl(array_interface, &weights_);
} else if (key == "base_margin") {
CopyInfoImpl(j_arr_obj, &base_margin_);
CopyInfoImpl(array_interface, &base_margin_);
} else if (key == "group") {
// Ranking is not performed on device.
auto s_data = ArrayInterfaceHandler::ExtractData<uint32_t>(j_arr_obj);
thrust::device_ptr<uint32_t> p_src {s_data.data()};
thrust::device_ptr<uint32_t> p_src{
reinterpret_cast<uint32_t*>(array_interface.data)};
auto length = s_data.size();
auto length = array_interface.num_rows;
group_ptr_.resize(length + 1);
group_ptr_[0] = 0;
thrust::copy(p_src, p_src + length, group_ptr_.begin() + 1);
@@ -82,4 +81,7 @@ DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread,
template DMatrix* DMatrix::Create<data::CudfAdapter>(
data::CudfAdapter* adapter, float missing, int nthread,
const std::string& cache_prefix, size_t page_size);
template DMatrix* DMatrix::Create<data::CupyAdapter>(
data::CupyAdapter* adapter, float missing, int nthread,
const std::string& cache_prefix, size_t page_size);
} // namespace xgboost

View File

@@ -7,9 +7,9 @@
#include <limits>
#include <memory>
#include <string>
#include "columnar.h"
#include "adapter.h"
#include "../common/device_helpers.cuh"
#include "adapter.h"
#include "array_interface.h"
namespace xgboost {
namespace data {
@@ -17,12 +17,13 @@ namespace data {
class CudfAdapterBatch : public detail::NoMetaInfo {
public:
CudfAdapterBatch() = default;
CudfAdapterBatch(common::Span<Columnar> columns,
CudfAdapterBatch(common::Span<ArrayInterface> columns,
common::Span<size_t> column_ptr, size_t num_elements)
: columns_(columns),column_ptr_(column_ptr), num_elements(num_elements) {}
size_t Size()const { return num_elements; }
__device__ COOTuple GetElement(size_t idx)const
{
: columns_(columns),
column_ptr_(column_ptr),
num_elements(num_elements) {}
size_t Size() const { return num_elements; }
__device__ COOTuple GetElement(size_t idx) const {
size_t column_idx =
dh::UpperBound(column_ptr_.data(), column_ptr_.size(), idx) - 1;
auto& column = columns_[column_idx];
@@ -34,22 +35,23 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
}
private:
common::Span<Columnar> columns_;
common::Span<ArrayInterface> columns_;
common::Span<size_t> column_ptr_;
size_t num_elements;
};
/*!
* Please be careful that, in official specification, the only three required fields are
* `shape', `version' and `typestr'. Any other is optional, including `data'. But here
* we have one additional requirements for input data:
* Please be careful that, in official specification, the only three required
* fields are `shape', `version' and `typestr'. Any other is optional,
* including `data'. But here we have one additional requirements for input
* data:
*
* - `data' field is required, passing in an empty dataset is not accepted, as most (if
* not all) of our algorithms don't have test for empty dataset. An error is better
* than a crash.
* - `data' field is required, passing in an empty dataset is not accepted, as
* most (if not all) of our algorithms don't have test for empty dataset. An
* error is better than a crash.
*
* What if invalid value from dataframe is 0 but I specify missing=NaN in XGBoost? Since
* validity mask is ignored, all 0s are preserved in XGBoost.
* What if invalid value from dataframe is 0 but I specify missing=NaN in
* XGBoost? Since validity mask is ignored, all 0s are preserved in XGBoost.
*
* FIXME(trivialfis): Put above into document after we have a consistent way for
* processing input data.
@@ -96,23 +98,23 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
CHECK_GT(n_columns, 0) << "Number of columns must not equal to 0.";
auto const& typestr = get<String const>(json_columns[0]["typestr"]);
CHECK_EQ(typestr.size(), 3) << ColumnarErrors::TypestrFormat();
CHECK_NE(typestr.front(), '>') << ColumnarErrors::BigEndian();
std::vector<Columnar> columns;
CHECK_EQ(typestr.size(), 3) << ArrayInterfaceErrors::TypestrFormat();
CHECK_NE(typestr.front(), '>') << ArrayInterfaceErrors::BigEndian();
std::vector<ArrayInterface> columns;
std::vector<size_t> column_ptr({0});
auto first_column = Columnar(get<Object const>(json_columns[0]));
auto first_column = ArrayInterface(get<Object const>(json_columns[0]));
device_idx_ = dh::CudaGetPointerDevice(first_column.data);
CHECK_NE(device_idx_, -1);
dh::safe_cuda(cudaSetDevice(device_idx_));
num_rows_ = first_column.size;
num_rows_ = first_column.num_rows;
for (auto& json_col : json_columns) {
auto column = Columnar(get<Object const>(json_col));
auto column = ArrayInterface(get<Object const>(json_col));
columns.push_back(column);
column_ptr.emplace_back(column_ptr.back() + column.size);
num_rows_ = std::max(num_rows_, size_t(column.size));
column_ptr.emplace_back(column_ptr.back() + column.num_rows);
num_rows_ = std::max(num_rows_, size_t(column.num_rows));
CHECK_EQ(device_idx_, dh::CudaGetPointerDevice(column.data))
<< "All columns should use the same device.";
CHECK_EQ(num_rows_, column.size)
CHECK_EQ(num_rows_, column.num_rows)
<< "All columns should have same number of rows.";
}
columns_ = columns;
@@ -124,19 +126,65 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
size_t NumRows() const { return num_rows_; }
size_t NumColumns() const { return columns_.size(); }
size_t DeviceIdx()const {
return device_idx_;
}
size_t DeviceIdx() const { return device_idx_; }
// Cudf is column major
bool IsRowMajor() { return false; }
private:
CudfAdapterBatch batch;
dh::device_vector<Columnar> columns_;
dh::device_vector<ArrayInterface> columns_;
dh::device_vector<size_t> column_ptr_; // Exclusive scan of column sizes
size_t num_rows_{0};
int device_idx_;
};
class CupyAdapterBatch : public detail::NoMetaInfo {
public:
CupyAdapterBatch() = default;
CupyAdapterBatch(ArrayInterface array_interface)
: array_interface_(array_interface) {}
size_t Size() const {
return array_interface_.num_rows * array_interface_.num_cols;
}
__device__ COOTuple GetElement(size_t idx) const {
size_t column_idx = idx % array_interface_.num_cols;
size_t row_idx = idx / array_interface_.num_cols;
float value = array_interface_.valid.Data() == nullptr ||
array_interface_.valid.Check(row_idx)
? array_interface_.GetElement(idx)
: std::numeric_limits<float>::quiet_NaN();
return COOTuple(row_idx, column_idx, value);
}
private:
ArrayInterface array_interface_;
};
class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {
public:
explicit CupyAdapter(std::string cuda_interface_str) {
Json json_array_interface =
Json::Load({cuda_interface_str.c_str(), cuda_interface_str.size()});
array_interface = ArrayInterface(get<Object const>(json_array_interface));
device_idx_ = dh::CudaGetPointerDevice(array_interface.data);
CHECK_NE(device_idx_, -1);
batch = CupyAdapterBatch(array_interface);
}
const CupyAdapterBatch& Value() const override { return batch; }
size_t NumRows() const { return array_interface.num_rows; }
size_t NumColumns() const { return array_interface.num_cols; }
size_t DeviceIdx() const { return device_idx_; }
bool IsRowMajor() { return true; }
private:
ArrayInterface array_interface;
CupyAdapterBatch batch;
int device_idx_;
};
}; // namespace data
} // namespace xgboost
#endif // XGBOOST_DATA_DEVICE_ADAPTER_H_

View File

@@ -7,7 +7,6 @@
#include <xgboost/json.h>
#include "simple_csr_source.h"
#include "columnar.h"
namespace xgboost {
namespace data {

View File

@@ -78,6 +78,35 @@ void CopyDataColumnMajor(AdapterT* adapter, common::Span<Entry> data,
}
}
struct IsValidFunctor : public thrust::unary_function<Entry, bool> {
explicit IsValidFunctor(float missing) : missing(missing) {}
float missing;
__device__ bool operator()(const Entry& x) const {
return IsValid(x.fvalue, missing);
}
};
// Here the data is already correctly ordered and simply needs to be compacted
// to remove missing data
template <typename AdapterT>
void CopyDataRowMajor(AdapterT* adapter, common::Span<Entry> data,
int device_idx, float missing,
common::Span<size_t> row_ptr) {
auto& batch = adapter->Value();
auto transform_f = [=] __device__(size_t idx) {
const auto& e = batch.GetElement(idx);
return Entry(e.column_idx, e.value);
}; // NOLINT
auto counting = thrust::make_counting_iterator(0llu);
thrust::transform_iterator<decltype(transform_f), decltype(counting), Entry>
transform_iter(counting, transform_f);
dh::XGBCachingDeviceAllocator<char> alloc;
thrust::copy_if(
thrust::cuda::par(alloc), transform_iter, transform_iter + batch.Size(),
thrust::device_pointer_cast(data.data()), IsValidFunctor(missing));
}
// Does not currently support metainfo as no on-device data source contains this
// Current implementation assumes a single batch. More batches can
// be supported in future. Does not currently support inferring row/column size
@@ -102,11 +131,14 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
mat.info.num_nonzero_ = mat.page_.offset.HostVector().back();
mat.page_.data.Resize(mat.info.num_nonzero_);
if (adapter->IsRowMajor()) {
LOG(FATAL) << "Not implemented.";
CopyDataRowMajor(adapter, mat.page_.data.DeviceSpan(),
adapter->DeviceIdx(), missing, s_offset);
} else {
CopyDataColumnMajor(adapter, mat.page_.data.DeviceSpan(),
adapter->DeviceIdx(), missing, s_offset);
}
// Sync
mat.page_.data.HostVector();
mat.info.num_col_ = adapter->NumColumns();
mat.info.num_row_ = adapter->NumRows();
@@ -116,5 +148,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
template SimpleDMatrix::SimpleDMatrix(CudfAdapter* adapter, float missing,
int nthread);
template SimpleDMatrix::SimpleDMatrix(CupyAdapter* adapter, float missing,
int nthread);
} // namespace data
} // namespace xgboost