Cudf support. (#4745)

* Initial support for cudf integration.

* Add two C APIs for consuming data and metainfo.

* Add CopyFrom for SimpleCSRSource as a generic function to consume the data.

* Add FromDeviceColumnar for consuming device data.

* Add new MetaInfo::SetInfo for consuming label, weight etc.
This commit is contained in:
Jiaming Yuan
2019-08-19 00:51:40 -04:00
committed by Rory Mitchell
parent ab357dd41c
commit 9700776597
26 changed files with 1385 additions and 287 deletions

82
src/data/columnar.h Normal file
View File

@@ -0,0 +1,82 @@
/*!
* Copyright 2019 by Contributors
* \file columnar.h
* \brief Basic structure holding a reference to arrow columnar data format.
*/
#ifndef XGBOOST_DATA_COLUMNAR_H_
#define XGBOOST_DATA_COLUMNAR_H_
#include <cinttypes>
#include <map>
#include <string>
#include "xgboost/data.h"
#include "xgboost/json.h"
#include "../common/span.h"
#include "../common/bitfield.h"
namespace xgboost {
struct Columnar {
using mask_type = unsigned char;
using index_type = int32_t;
common::Span<float> data;
RBitField8 valid;
int32_t size;
int32_t null_count;
};
// Common errors in parsing columnar format.
struct ColumnarErrors {
static char const* Contigious() {
return "Memory should be contigious.";
}
static char const* TypestrFormat() {
return "`typestr` should be of format <endian><type><size>.";
}
// Not supported in Apache Arrow.
static char const* BigEndian() {
return "Big endian is not supported.";
}
static char const* Dimension(int32_t d) {
static std::string str;
str.clear();
str += "Only ";
str += std::to_string(d);
str += " dimensional array is valid.";
return str.c_str();
}
static char const* Version() {
return "Only version 1 of __cuda_array_interface__ is being supported.";
}
static char const* toFloat() {
return "Please convert the input into float32 first.";
}
static char const* toUInt() {
return "Please convert the Group into unsigned 32 bit integers first.";
}
static char const* ofType(std::string type) {
static std::string str;
str.clear();
str += " should be of ";
str += type;
str += " type.";
return str.c_str();
}
};
template <typename PtrType>
PtrType GetPtrFromArrayData(std::map<std::string, Json> const& obj) {
if (obj.find("data") == obj.cend()) {
LOG(FATAL) << "Empty data passed in.";
}
auto p_data = reinterpret_cast<PtrType>(static_cast<size_t>(
get<Integer const>(
get<Array const>(
obj.at("data"))
.at(0))));
return p_data;
}
} // namespace xgboost
#endif // XGBOOST_DATA_COLUMNAR_H_

View File

@@ -1,11 +1,12 @@
/*!
* Copyright 2015 by Contributors
* Copyright 2015-2019 by Contributors
* \file data.cc
*/
#include <xgboost/data.h>
#include <xgboost/logging.h>
#include <dmlc/registry.h>
#include <cstring>
#include "./sparse_page_writer.h"
#include "./simple_dmatrix.h"
#include "./simple_csr_source.h"
@@ -110,7 +111,6 @@ inline bool MetaTryLoadFloatInfo(const std::string& fname,
default: LOG(FATAL) << "Unknown data type" << dtype; \
} \
void MetaInfo::SetInfo(const char* key, const void* dptr, DataType dtype, size_t num) {
if (!std::strcmp(key, "root_index")) {
root_index_.resize(num);
@@ -139,9 +139,17 @@ void MetaInfo::SetInfo(const char* key, const void* dptr, DataType dtype, size_t
for (size_t i = 1; i < group_ptr_.size(); ++i) {
group_ptr_[i] = group_ptr_[i - 1] + group_ptr_[i];
}
} else {
LOG(FATAL) << "Unknown metainfo: " << key;
}
}
#if !defined(XGBOOST_USE_CUDA)
void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
LOG(FATAL) << "XGBoost version is not compiled with GPU support";
}
#endif // !defined(XGBOOST_USE_CUDA)
DMatrix* DMatrix::Load(const std::string& uri,
bool silent,
bool load_row_split,

86
src/data/data.cu Normal file
View File

@@ -0,0 +1,86 @@
/*!
* Copyright 2019 by XGBoost Contributors
*
* \file data.cu
*/
#include "xgboost/data.h"
#include "xgboost/logging.h"
#include "xgboost/json.h"
#include "columnar.h"
#include "../common/device_helpers.cuh"
namespace xgboost {
void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
Json j_arr = Json::Load({interface_str.c_str(), interface_str.size()});
auto const& j_arr_obj = get<Object>(j_arr);
std::string key {c_key};
auto version = get<Integer const>(j_arr_obj.at("version"));
CHECK_EQ(version, 1) << ColumnarErrors::Version();
if (j_arr_obj.find("mask") != j_arr_obj.cend()) {
LOG(FATAL) << "Meta info " << key << " should be dense, found validity mask";
}
auto typestr = get<String const>(j_arr_obj.at("typestr"));
CHECK_EQ(typestr.size(), 3) << ColumnarErrors::TypestrFormat();
CHECK_NE(typestr.front(), '>') << ColumnarErrors::BigEndian();
auto j_shape = get<Array const>(j_arr_obj.at("shape"));
CHECK_EQ(j_shape.size(), 1) << ColumnarErrors::Dimension(1);
auto length = get<Integer const>(j_shape.at(0));
CHECK_GT(length, 0) << "Label set cannot be empty.";
if (j_arr_obj.find("strides") != j_arr_obj.cend()) {
auto strides = get<Array const>(j_arr_obj.at("strides"));
CHECK_EQ(get<Integer>(strides.at(0)), 4) << ColumnarErrors::Contigious();
}
float* p_data = GetPtrFromArrayData<float*>(j_arr_obj);
cudaPointerAttributes attr;
dh::safe_cuda(cudaPointerGetAttributes(&attr, p_data));
int32_t ptr_device = attr.device;
dh::safe_cuda(cudaSetDevice(ptr_device));
thrust::device_ptr<float> p_src {p_data};
HostDeviceVector<float>* dst;
if (key == "root_index") {
LOG(FATAL) << "root index for columnar data is not supported.";
} else if (key == "label") {
dst = &labels_;
CHECK_EQ(typestr.at(1), 'f') << "Label"
<< ColumnarErrors::ofType("floating point");
CHECK_EQ(typestr.at(2), '4') << ColumnarErrors::toFloat();
} else if (key == "weight") {
dst = &weights_;
CHECK_EQ(typestr.at(1), 'f') << "Weight"
<< ColumnarErrors::ofType("floating point");;
CHECK_EQ(typestr.at(2), '4') << ColumnarErrors::toFloat();
} else if (key == "base_margin") {
dst = &base_margin_;
CHECK_EQ(typestr.at(1), 'f') << "Base Margin"
<< ColumnarErrors::ofType("floating point");
CHECK_EQ(typestr.at(2), '4') << ColumnarErrors::toFloat();
} else if (key == "group") {
CHECK_EQ(typestr.at(1), 'u') << "Group"
<< ColumnarErrors::ofType("unsigned 32 bit integers");
CHECK_EQ(typestr.at(2), '4') << ColumnarErrors::toUInt();
group_ptr_.resize(length + 1);
group_ptr_[0] = 0;
// Ranking is not performed on device.
thrust::copy(p_src, p_src + length, group_ptr_.begin() + 1);
for (size_t i = 1; i < group_ptr_.size(); ++i) {
group_ptr_[i] = group_ptr_[i - 1] + group_ptr_[i];
}
return;
} else {
LOG(FATAL) << "Unknown metainfo: " << key;
}
dst->Reshard(GPUDistribution(GPUSet::Range(ptr_device, 1)));
dst->Resize(length);
auto p_dst = thrust::device_pointer_cast(dst->DevicePointer(0));
thrust::copy(p_src, p_src + length, p_dst);
}
} // namespace xgboost

View File

@@ -1,11 +1,14 @@
/*!
* Copyright 2015 by Contributors
* Copyright 2015-2019 by Contributors
* \file simple_csr_source.cc
*/
#include <dmlc/base.h>
#include <xgboost/logging.h>
#include <xgboost/json.h>
#include <limits>
#include "./simple_csr_source.h"
#include "simple_csr_source.h"
#include "columnar.h"
namespace xgboost {
namespace data {
@@ -117,5 +120,143 @@ const SparsePage& SimpleCSRSource::Value() const {
return page_;
}
/*!
* Please be careful that, in official specification, the only three required fields are
* `shape', `version' and `typestr'. Any other is optional, including `data'. But here
* we have two additional requirements for input data:
*
* - `data' field is required, passing in an empty dataset is not accepted, as most (if
* not all) of our algorithms don't have test for empty dataset. An error is better
* than a crash.
*
* - `null_count' is required when `mask' is presented. We can compute `null_count'
* ourselves and copy the result back to host for memory allocation. But it's in the
* specification of Apache Arrow hence it should be readily available,
*
* Sample input:
* [
* {
* "shape": [
* 10
* ],
* "strides": [
* 4
* ],
* "data": [
* 30074864128,
* false
* ],
* "typestr": "<f4",
* "version": 1,
* "mask": {
* "shape": [
* 64
* ],
* "strides": [
* 1
* ],
* "data": [
* 30074864640,
* false
* ],
* "typestr": "|i1",
* "version": 1,
* "null_count": 1
* }
* }
* ]
*/
void SimpleCSRSource::CopyFrom(std::string const& cuda_interfaces_str) {
Json interfaces = Json::Load({cuda_interfaces_str.c_str(),
cuda_interfaces_str.size()});
std::vector<Json> const& columns = get<Array>(interfaces);
size_t n_columns = columns.size();
CHECK_GT(n_columns, 0);
std::vector<Columnar> foreign_cols(n_columns);
for (size_t i = 0; i < columns.size(); ++i) {
CHECK(IsA<Object>(columns[i]));
auto const& column = get<Object const>(columns[i]);
auto version = get<Integer const>(column.at("version"));
CHECK_EQ(version, 1) << ColumnarErrors::Version();
// Find null mask (validity mask) field
// Mask object is also an array interface, but with different requirements.
// TODO(trivialfis): Abstract this into a class that accept a json
// object and turn it into an array (for cupy and numba).
common::Span<RBitField8::value_type> s_mask;
int32_t null_count {0};
if (column.find("mask") != column.cend()) {
auto const& j_mask = get<Object const>(column.at("mask"));
auto p_mask = GetPtrFromArrayData<RBitField8::value_type*>(j_mask);
auto j_shape = get<Array const>(j_mask.at("shape"));
CHECK_EQ(j_shape.size(), 1) << ColumnarErrors::Dimension(1);
CHECK_EQ(get<Integer>(j_shape.front()) % 8, 0) <<
"Length of validity map must be a multiple of 8 bytes.";
int64_t size = get<Integer>(j_shape.at(0)) *
sizeof(unsigned char) / sizeof(RBitField8::value_type);
s_mask = {p_mask, size};
auto typestr = get<String const>(j_mask.at("typestr"));
CHECK_EQ(typestr.size(), 3) << ColumnarErrors::TypestrFormat();
CHECK_NE(typestr.front(), '>') << ColumnarErrors::BigEndian();
CHECK_EQ(typestr.at(1), 'i') << "mask" << ColumnarErrors::ofType("unsigned char");
CHECK_EQ(typestr.at(2), '1') << "mask" << ColumnarErrors::toUInt();
CHECK(j_mask.find("null_count") != j_mask.cend()) <<
"Column with null mask must include null_count as "
"part of mask object for XGBoost.";
null_count = get<Integer const>(j_mask.at("null_count"));
}
// Find data field
if (column.find("data") == column.cend()) {
LOG(FATAL) << "Empty dataset passed in.";
}
auto typestr = get<String const>(column.at("typestr"));
CHECK_EQ(typestr.size(), 3) << ColumnarErrors::TypestrFormat();
CHECK_NE(typestr.front(), '>') << ColumnarErrors::BigEndian();
CHECK_EQ(typestr.at(1), 'f') << "data" << ColumnarErrors::ofType("floating point");
CHECK_EQ(typestr.at(2), '4') << ColumnarErrors::toFloat();
auto j_shape = get<Array const>(column.at("shape"));
CHECK_EQ(j_shape.size(), 1) << ColumnarErrors::Dimension(1);
if (column.find("strides") != column.cend()) {
auto strides = get<Array const>(column.at("strides"));
CHECK_EQ(strides.size(), 1) << ColumnarErrors::Dimension(1);
CHECK_EQ(get<Integer>(strides.at(0)), 4) << ColumnarErrors::Contigious();
}
auto length = get<Integer const>(j_shape.at(0));
float* p_data = GetPtrFromArrayData<float*>(column);
common::Span<float> s_data {p_data, length};
foreign_cols[i].data = s_data;
foreign_cols[i].valid = RBitField8(s_mask);
foreign_cols[i].size = s_data.size();
foreign_cols[i].null_count = null_count;
}
info.num_col_ = n_columns;
info.num_row_ = foreign_cols[0].size;
for (size_t i = 0; i < n_columns; ++i) {
CHECK_EQ(foreign_cols[0].size, foreign_cols[i].size);
info.num_nonzero_ += foreign_cols[i].data.size() - foreign_cols[i].null_count;
}
this->FromDeviceColumnar(foreign_cols);
}
#if !defined(XGBOOST_USE_CUDA)
void SimpleCSRSource::FromDeviceColumnar(std::vector<Columnar> cols) {
LOG(FATAL) << "XGBoost version is not compiled with GPU support";
}
#endif // !defined(XGBOOST_USE_CUDA)
} // namespace data
} // namespace xgboost

View File

@@ -0,0 +1,117 @@
/*!
* Copyright 2019 by XGBoost Contributors
*
* \file simple_csr_source.cuh
* \brief An extension for the simple CSR source in-memory data structure to accept
* foreign columnar.
*/
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/scan.h>
#include <xgboost/base.h>
#include <xgboost/data.h>
#include <vector>
#include <algorithm>
#include "simple_csr_source.h"
#include "columnar.h"
#include "../common/bitfield.h"
#include "../common/device_helpers.cuh"
namespace xgboost {
namespace data {
template <size_t kBlockThreads>
__global__ void CountValidKernel(common::Span<Columnar const> columns,
int32_t const n_rows,
common::Span<size_t> offsets) {
// One block for a column
auto const bid = blockIdx.x;
auto const tid = threadIdx.x;
if (bid >= columns.size()) {
return;
}
RBitField8 const mask = columns[bid].valid;
for (auto r = tid; r < n_rows; r += kBlockThreads) {
if (mask.Data() == nullptr || mask.Check(r)) {
atomicAdd(reinterpret_cast<BitFieldAtomicType*>(&offsets[r+1]),
static_cast<BitFieldAtomicType>(1));
}
}
}
__global__ void CreateCSRKernel(Columnar const column,
int32_t colid,
common::Span<size_t> offsets,
common::Span<Entry> out_data) {
auto tid = threadIdx.x + blockDim.x * blockIdx.x;
if (column.size <= tid) {
return;
}
if (column.valid.Data() == nullptr || column.valid.Check(tid)) {
int32_t oid = offsets[tid];
out_data[oid].fvalue = column.data[tid];
out_data[oid].index = colid;
offsets[tid] += 1;
}
}
void SimpleCSRSource::FromDeviceColumnar(std::vector<Columnar> cols) {
uint64_t const n_cols = cols.size();
uint64_t const n_rows = cols[0].size;
auto ptr = cols[0].data.data();
int32_t device = dh::CudaGetPointerDevice(ptr);
CHECK_NE(device, -1);
for (int32_t i = 1; i < n_cols; ++i) {
auto ptr = cols[i].data.data();
int32_t ptr_device = dh::CudaGetPointerDevice(ptr);
CHECK_EQ(device, ptr_device)
<< "GPU ID at 0^th column: " << device << ", "
<< "GPU ID at column " << i << ": " << ptr_device;
}
dh::safe_cuda(cudaSetDevice(device));
GPUSet devices = GPUSet::Range(device, 1);
page_.offset.Reshard(GPUDistribution(devices));
page_.offset.Resize(info.num_row_ + 1);
page_.data.Reshard(GPUDistribution(devices));
page_.data.Resize(info.num_nonzero_);
auto s_data = page_.data.DeviceSpan(device);
auto s_offsets = page_.offset.DeviceSpan(device);
CHECK_EQ(s_offsets.size(), n_rows + 1);
int32_t constexpr kThreads = 256;
dh::device_vector<Columnar> d_cols(cols);
auto s_d_cols = dh::ToSpan(d_cols);
dh::safe_cuda(cudaMemset(s_offsets.data(), 0, sizeof(int32_t) * (n_rows + 1)));
CountValidKernel<kThreads><<<n_cols, kThreads>>>(s_d_cols, n_rows, s_offsets);
thrust::device_ptr<size_t> p_offsets(s_offsets.data());
CHECK_GE(s_offsets.size(), n_rows + 1);
thrust::inclusive_scan(p_offsets, p_offsets + n_rows + 1, p_offsets);
// Created for building csr matrix, where we need to change index
// after processing each column.
dh::device_vector<size_t> tmp_offset(page_.offset.Size());
thrust::copy(p_offsets, p_offsets + n_rows + 1, tmp_offset.begin());
int32_t kBlocks = common::DivRoundUp(n_rows, kThreads);
for (size_t col = 0; col < n_cols; ++col) {
CreateCSRKernel<<<kBlocks, kThreads>>>(d_cols[col], col, dh::ToSpan(tmp_offset), s_data);
}
}
} // namespace data
} // namespace xgboost

View File

@@ -10,9 +10,12 @@
#include <xgboost/base.h>
#include <xgboost/data.h>
#include <vector>
#include <algorithm>
#include <algorithm>
#include <string>
#include <vector>
#include "columnar.h"
namespace xgboost {
namespace data {
@@ -27,7 +30,6 @@ namespace data {
*/
class SimpleCSRSource : public DataSource<SparsePage> {
public:
// public data members
// MetaInfo info; // inheritated from DataSource
SparsePage page_;
/*! \brief default constructor */
@@ -47,6 +49,11 @@ class SimpleCSRSource : public DataSource<SparsePage> {
* \param info The additional information reflected in the parser.
*/
void CopyFrom(dmlc::Parser<uint32_t>* src);
/*!
* \brief copy content of data from foreign **GPU** columnar buffer.
* \param interfaces_str JSON representation of cuda array interfaces.
*/
void CopyFrom(std::string const& cuda_interfaces_str);
/*!
* \brief Load data from binary stream.
* \param fi the pointer to load data from.
@@ -67,6 +74,11 @@ class SimpleCSRSource : public DataSource<SparsePage> {
static const int kMagic = 0xffffab01;
private:
/*!
* \brief copy content of data from foreign GPU columnar buffer.
* \param cols foreign columns data buffer.
*/
void FromDeviceColumnar(std::vector<Columnar> cols);
/*! \brief internal variable, used to support iterator interface */
bool at_first_{true};
};