Cudf support. (#4745)

* Initial support for cudf integration.

* Add two C APIs for consuming data and metainfo.

* Add CopyFrom for SimpleCSRSource as a generic function to consume the data.

* Add FromDeviceColumnar for consuming device data.

* Add new MetaInfo::SetInfo for consuming label, weight etc.
This commit is contained in:
Jiaming Yuan
2019-08-19 00:51:40 -04:00
committed by Rory Mitchell
parent ab357dd41c
commit 9700776597
26 changed files with 1385 additions and 287 deletions

View File

@@ -0,0 +1,92 @@
/*!
* Copyright 2019 XGBoost contributors
*/
#include <gtest/gtest.h>
#include "../../../src/common/bitfield.h"
namespace xgboost {
// Pre-populates the raw storage with a single set bit and verifies that
// Check() reports that bit, and only that bit, for both LBitField64 and
// RBitField8.
TEST(BitField, Check) {
  {
    using WordT = LBitField64::value_type;
    std::vector<WordT> words(4, 0);
    words[2] = 2;
    auto const n_words = static_cast<common::Span<WordT>::index_type>(words.size());
    auto field = LBitField64({words.data(), n_words});

    size_t const expected_pos = 190;
    // Everything after the expected position must be unset.
    size_t pos = expected_pos + 1;
    while (pos < field.Size()) {
      ASSERT_FALSE(field.Check(pos));
      ++pos;
    }
    ASSERT_TRUE(field.Check(expected_pos));
    // Everything before the expected position must be unset as well.
    for (pos = 0; pos < expected_pos; ++pos) {
      ASSERT_FALSE(field.Check(pos));
    }
  }

  {
    using WordT = RBitField8::value_type;
    std::vector<WordT> words(4, 0);
    words[2] = 1 << 3;
    auto const n_words = static_cast<common::Span<WordT>::index_type>(words.size());
    auto field = RBitField8({words.data(), n_words});

    size_t const expected_pos = 19;
    size_t pos = 0;
    for (; pos < expected_pos; ++pos) {
      ASSERT_FALSE(field.Check(pos));
    }
    ASSERT_TRUE(field.Check(expected_pos));
    for (pos = expected_pos + 1; pos < field.Size(); ++pos) {
      ASSERT_FALSE(field.Check(pos));
    }
  }
}
template <typename BitFieldT, typename VT = typename BitFieldT::value_type>
void TestBitFieldSet(typename BitFieldT::value_type res, size_t index, size_t true_bit) {
using IndexT = typename common::Span<VT>::index_type;
std::vector<VT> storage(4, 0);
auto bits = BitFieldT({storage.data(), static_cast<IndexT>(storage.size())});
bits.Set(true_bit);
for (size_t i = 0; i < true_bit; ++i) {
ASSERT_FALSE(bits.Check(i));
}
ASSERT_TRUE(bits.Check(true_bit));
for (size_t i = true_bit + 1; i < storage.size() * BitFieldT::kValueSize; ++i) {
ASSERT_FALSE(bits.Check(i));
}
ASSERT_EQ(storage[index], res);
}
// Runs the Set() check once per bit field flavour.  The arguments are
// (expected storage word, storage word index, bit position).
TEST(BitField, Set) {
  TestBitFieldSet<LBitField64>(2, 2, 190);
  TestBitFieldSet<RBitField8>(1 << 3, 2, 19);
}
template <typename BitFieldT, typename VT = typename BitFieldT::value_type>
void TestBitFieldClear(size_t clear_bit) {
using IndexT = typename common::Span<VT>::index_type;
std::vector<VT> storage(4, 0);
auto bits = BitFieldT({storage.data(), static_cast<IndexT>(storage.size())});
bits.Set(clear_bit);
bits.Clear(clear_bit);
ASSERT_FALSE(bits.Check(clear_bit));
}
// Runs the Set()/Clear() round trip once per bit field flavour.
TEST(BitField, Clear) {
  TestBitFieldClear<LBitField64>(190);
  TestBitFieldClear<RBitField8>(19);
}
} // namespace xgboost

View File

@@ -5,56 +5,55 @@
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <vector>
#include "../../../src/common/bitfield.cuh"
#include "../../../src/common/bitfield.h"
#include "../../../src/common/device_helpers.cuh"
namespace xgboost {
// Kernel used by the Set test: every thread computes its global thread id and,
// when that id is below bits.Size(), sets the bit at that position.  Threads
// whose id is out of range do nothing.
// NOTE(review): the two signature lines below look like the removed
// (BitField) and added (LBitField64) sides of a diff hunk -- confirm which one
// belongs in the file.
__global__ void TestSetKernel(BitField bits) {
__global__ void TestSetKernel(LBitField64 bits) {
auto tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < bits.Size()) {
bits.Set(tid);
}
}
// Launches TestSetKernel with one block of kBits threads on device storage,
// copies the storage back to the host and checks that the first kBits bits
// are all set.
// NOTE(review): old (BitField) and new (LBitField64) lines of a diff hunk
// appear interleaved in this block -- confirm which side is current.
TEST(BitField, Set) {
dh::device_vector<BitField::value_type> storage;
TEST(BitField, GPU_Set) {
dh::device_vector<LBitField64::value_type> storage;
uint32_t constexpr kBits = 128;
// NOTE(review): the storage holds 128 values, not 128 bits; only the first
// kBits bits are set by the launch below and checked afterwards.
storage.resize(128);
auto bits = BitField(dh::ToSpan(storage));
auto bits = LBitField64(dh::ToSpan(storage));
// One block with one thread per bit under test.
TestSetKernel<<<1, kBits>>>(bits);
// Copy the device storage to the host so Check() can run on the CPU.
std::vector<BitField::value_type> h_storage(storage.size());
std::vector<LBitField64::value_type> h_storage(storage.size());
thrust::copy(storage.begin(), storage.end(), h_storage.begin());
BitField outputs {
common::Span<BitField::value_type>{h_storage.data(),
LBitField64 outputs {
common::Span<LBitField64::value_type>{h_storage.data(),
h_storage.data() + h_storage.size()}};
for (size_t i = 0; i < kBits; ++i) {
ASSERT_TRUE(outputs.Check(i));
}
}
// Kernel that applies the bit field's operator|= on the device: lhs |= rhs.
// NOTE(review): the two signature lines below look like the removed
// (BitField) and added (LBitField64) sides of a diff hunk -- confirm which one
// belongs in the file.
__global__ void TestOrKernel(BitField lhs, BitField rhs) {
__global__ void TestOrKernel(LBitField64 lhs, LBitField64 rhs) {
lhs |= rhs;
}
// Fills lhs with zeros and rhs with all-ones words, runs TestOrKernel on the
// device, copies lhs back to the host and checks that the first kBits bits
// are set.
// NOTE(review): the test is named And / GPU_And but the kernel it launches
// performs `lhs |= rhs` -- confirm whether the name or the kernel is intended.
// NOTE(review): old (BitField) and new (LBitField64) lines of a diff hunk
// appear interleaved in this block -- confirm which side is current.
TEST(BitField, And) {
TEST(BitField, GPU_And) {
uint32_t constexpr kBits = 128;
dh::device_vector<BitField::value_type> lhs_storage(kBits);
dh::device_vector<BitField::value_type> rhs_storage(kBits);
auto lhs = BitField(dh::ToSpan(lhs_storage));
auto rhs = BitField(dh::ToSpan(rhs_storage));
dh::device_vector<LBitField64::value_type> lhs_storage(kBits);
dh::device_vector<LBitField64::value_type> rhs_storage(kBits);
auto lhs = LBitField64(dh::ToSpan(lhs_storage));
auto rhs = LBitField64(dh::ToSpan(rhs_storage));
// lhs starts with every bit cleared, rhs with every bit set.
thrust::fill(lhs_storage.begin(), lhs_storage.end(), 0UL);
thrust::fill(rhs_storage.begin(), rhs_storage.end(), ~static_cast<BitField::value_type>(0UL));
thrust::fill(rhs_storage.begin(), rhs_storage.end(), ~static_cast<LBitField64::value_type>(0UL));
TestOrKernel<<<1, kBits>>>(lhs, rhs);
// Copy the result to the host so Check() can run on the CPU.
std::vector<BitField::value_type> h_storage(lhs_storage.size());
std::vector<LBitField64::value_type> h_storage(lhs_storage.size());
thrust::copy(lhs_storage.begin(), lhs_storage.end(), h_storage.begin());
BitField outputs {{h_storage.data(), h_storage.data() + h_storage.size()}};
LBitField64 outputs {{h_storage.data(), h_storage.data() + h_storage.size()}};
for (size_t i = 0; i < kBits; ++i) {
ASSERT_TRUE(outputs.Check(i));
}
}
} // namespace xgboost

View File

@@ -0,0 +1,57 @@
/*! Copyright 2019 by Contributors */
#include <gtest/gtest.h>
#include <xgboost/data.h>
#include <xgboost/json.h>
#include <thrust/device_vector.h>
#include "../../../src/common/device_helpers.cuh"
namespace xgboost {
// Describes a device-resident float column with an array-interface style JSON
// document, passes it to MetaInfo::SetInfo for "label", "weight" and
// "base_margin", and checks that each field ends up holding the column values.
TEST(MetaInfo, FromInterface) {
  // Fail loudly if the device cannot be selected instead of continuing with a
  // sticky CUDA error that would surface somewhere unrelated.
  dh::safe_cuda(cudaSetDevice(0));
  constexpr size_t kRows = 16;

  // Build the reference values on the host and upload them with a single copy.
  // The host copy is also what the results are compared against, which avoids
  // one implicit device-to-host transfer per compared element.
  std::vector<float> h_data(kRows);
  for (size_t i = 0; i < h_data.size(); ++i) {
    h_data[i] = static_cast<float>(i) * 2.0f;
  }
  thrust::device_vector<float> d_data(h_data.begin(), h_data.end());

  Json column { Object() };
  std::vector<Json> j_shape {Json(Integer(static_cast<Integer::Int>(kRows)))};
  column["shape"] = Array(j_shape);
  // Stride in bytes between consecutive elements of the float column.
  column["strides"] = Array(std::vector<Json>{
      Json(Integer(static_cast<Integer::Int>(sizeof(float))))});
  column["version"] = Integer(static_cast<Integer::Int>(1));
  column["typestr"] = String("<f4");

  // The device pointer travels through the JSON document as an integer.
  auto p_d_data = dh::Raw(d_data);
  std::vector<Json> j_data {
      Json(Integer(reinterpret_cast<Integer::Int>(p_d_data))),
      Json(Boolean(false))};
  column["data"] = j_data;

  std::stringstream ss;
  Json::Dump(column, &ss);
  std::string str = ss.str();

  MetaInfo info;

  info.SetInfo("label", str.c_str());
  auto const& h_label = info.labels_.HostVector();
  // Guard the element-wise comparison against a short result vector.
  ASSERT_EQ(h_label.size(), kRows);
  for (size_t i = 0; i < kRows; ++i) {
    ASSERT_EQ(h_label[i], h_data[i]);
  }

  info.SetInfo("weight", str.c_str());
  auto const& h_weight = info.weights_.HostVector();
  ASSERT_EQ(h_weight.size(), kRows);
  for (size_t i = 0; i < kRows; ++i) {
    ASSERT_EQ(h_weight[i], h_data[i]);
  }

  info.SetInfo("base_margin", str.c_str());
  auto const& h_base_margin = info.base_margin_.HostVector();
  ASSERT_EQ(h_base_margin.size(), kRows);
  for (size_t i = 0; i < kRows; ++i) {
    ASSERT_EQ(h_base_margin[i], h_data[i]);
  }
}
} // namespace xgboost

View File

@@ -1,10 +1,15 @@
// Copyright by Contributors
#include <xgboost/data.h>
#include <gtest/gtest.h>
#include <dmlc/filesystem.h>
#include <xgboost/data.h>
#include <xgboost/json.h>
#include "../../../src/data/simple_csr_source.h"
#include "../helpers.h"
namespace xgboost {
TEST(SimpleCSRSource, SaveLoadBinary) {
dmlc::TemporaryDirectory tempdir;
const std::string tmp_file = tempdir.path + "/simple.libsvm";
@@ -33,3 +38,4 @@ TEST(SimpleCSRSource, SaveLoadBinary) {
delete dmat;
delete dmat_read;
}
} // namespace xgboost

View File

@@ -0,0 +1,216 @@
// Copyright (c) 2019 by Contributors
#include <gtest/gtest.h>
#include <xgboost/data.h>
#include <xgboost/json.h>
#include <thrust/device_vector.h>
#include <memory>
#include "../../../src/common/bitfield.h"
#include "../../../src/common/device_helpers.cuh"
#include "../../../src/data/simple_csr_source.h"
namespace xgboost {
// Builds a single dense float column on the device, describes it with an
// array-interface style JSON document and checks the CSR page produced by
// SimpleCSRSource::CopyFrom: one entry per row, all belonging to feature 0.
TEST(SimpleCSRSource, FromColumnarDense) {
  constexpr size_t kRows = 16;

  // Stage the values on the host and upload them with a single copy rather
  // than writing the device vector one element at a time.
  std::vector<float> h_data(kRows);
  for (size_t i = 0; i < h_data.size(); ++i) {
    h_data[i] = static_cast<float>(i) * 2.0f;
  }
  thrust::device_vector<float> d_data(h_data.begin(), h_data.end());

  Json column { Object() };
  std::vector<Json> j_shape {Json(Integer(static_cast<Integer::Int>(kRows)))};
  column["shape"] = Array(j_shape);
  // Stride in bytes between consecutive elements of the float column.
  column["strides"] = Array(std::vector<Json>{
      Json(Integer(static_cast<Integer::Int>(sizeof(float))))});

  // The device pointer travels through the JSON document as an integer.
  auto p_d_data = dh::Raw(d_data);
  std::vector<Json> j_data {
      Json(Integer(reinterpret_cast<Integer::Int>(p_d_data))),
      Json(Boolean(false))};
  column["data"] = j_data;
  column["version"] = Integer(static_cast<Integer::Int>(1));
  column["typestr"] = String("<f4");

  Json column_arr {Array{std::vector<Json>{column}}};
  std::stringstream ss;
  Json::Dump(column_arr, &ss);
  std::string str = ss.str();

  std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
  source->CopyFrom(str.c_str());

  auto const& data = source->page_.data.HostVector();
  auto const& offset = source->page_.offset.HostVector();

  // Guard the indexed accesses below against short result vectors.
  ASSERT_EQ(offset.size(), kRows + 1);
  ASSERT_GE(data.size(), kRows);

  for (size_t i = 0; i < kRows; ++i) {
    auto e = data[i];
    ASSERT_NEAR(e.fvalue, i * 2.0, kRtEps);
    ASSERT_EQ(e.index, 0u);  // feature 0
  }

  // With one entry per row, the row pointer of row i is simply i.
  ASSERT_EQ(offset.back(), kRows);
  for (size_t i = 0; i < kRows + 1; ++i) {
    ASSERT_EQ(offset[i], i);
  }
}
// Builds kCols device columns that all share the same validity mask, so the
// masked rows are missing in every column and must come out as empty rows in
// the CSR page produced by SimpleCSRSource::CopyFrom.
TEST(SimpleCSRSource, FromColumnarWithEmptyRows) {
  // In this test we construct a data storage similar to cudf
  constexpr size_t kRows = 102;
  constexpr size_t kCols = 24;
  constexpr size_t kMissingRows = 3;
  constexpr size_t kMaskBytes = 16;    // 128 mask bits, enough for kRows rows
  constexpr size_t kBitsPerByte = 8;
  unsigned char constexpr kUCOne = 1;

  // Rows flagged as missing in every column.  Row r is described by bit
  // (r % 8) of mask byte (r / 8): byte 0 bit 2, byte 1 bit 3 and byte 12 bit 5.
  std::set<size_t> const missing_rows {2, 11, 101};
  ASSERT_EQ(missing_rows.size(), kMissingRows);

  // Host copy of the validity mask: every row valid except the missing ones.
  std::vector<unsigned char> h_mask(kMaskBytes, static_cast<unsigned char>(~0));
  for (size_t row : missing_rows) {
    h_mask[row / kBitsPerByte] &=
        static_cast<unsigned char>(~(kUCOne << (row % kBitsPerByte)));
  }

  std::vector<Json> v_columns (kCols);
  std::vector<dh::device_vector<float>> columns_data(kCols);
  std::vector<dh::device_vector<unsigned char>> column_bitfields(kCols);

  for (size_t c = 0; c < kCols; ++c) {
    auto& col = v_columns[c];
    col = Object();

    // Column values are 0, 1, ..., kRows - 1, i.e. each value equals its row.
    auto& d_column = columns_data[c];
    d_column.resize(kRows);
    thrust::sequence(d_column.begin(), d_column.end(), 0);
    dh::safe_cuda(cudaDeviceSynchronize());
    dh::safe_cuda(cudaGetLastError());
    ASSERT_EQ(d_column.size(), kRows);

    auto p_d_data = raw_pointer_cast(d_column.data());
    std::vector<Json> j_data {
        Json(Integer(reinterpret_cast<Integer::Int>(p_d_data))),
        Json(Boolean(false))};
    col["data"] = j_data;
    std::vector<Json> j_shape {Json(Integer(static_cast<Integer::Int>(kRows)))};
    col["shape"] = Array(j_shape);
    col["version"] = Integer(static_cast<Integer::Int>(1));
    col["typestr"] = String("<f4");

    // Construct the mask object.
    col["mask"] = Object();
    auto& j_mask = col["mask"];
    auto& mask_storage = column_bitfields[c];
    // Upload the shared mask with a single host-to-device copy.
    mask_storage.assign(h_mask.begin(), h_mask.end());
    ASSERT_EQ(mask_storage.size(), kMaskBytes);

    j_mask["data"] = std::vector<Json>{
        Json(Integer(reinterpret_cast<Integer::Int>(mask_storage.data().get()))),
        Json(Boolean(false))};
    j_mask["shape"] = Array(std::vector<Json>{
        Json(Integer(static_cast<Integer::Int>(kMaskBytes)))});
    j_mask["typestr"] = String("|i1");
    j_mask["null_count"] = Json(Integer(static_cast<Integer::Int>(kMissingRows)));
  }

  Json column_arr {Array(v_columns)};
  std::stringstream ss;
  Json::Dump(column_arr, &ss);
  std::string str = ss.str();

  std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
  source->CopyFrom(str.c_str());

  auto const& data = source->page_.data.HostVector();
  auto const& offset = source->page_.offset.HostVector();

  ASSERT_EQ(offset.size(), kRows + 1);
  // Guard the indexed accesses into data below.
  ASSERT_GE(data.size(), offset.back());

  for (size_t i = 1; i < offset.size(); ++i) {
    size_t const row = i - 1;
    size_t const n_entries = offset[i] - offset[i - 1];
    // A masked row must be empty and every other row must be fully populated;
    // without this check the test would also pass if the mask were ignored.
    if (missing_rows.count(row) != 0) {
      ASSERT_EQ(n_entries, static_cast<size_t>(0));
    } else {
      ASSERT_EQ(n_entries, kCols);
    }
    for (size_t j = offset[i - 1]; j < offset[i]; ++j) {
      ASSERT_EQ(data[j].index, j % kCols);
      ASSERT_NEAR(data[j].fvalue, row, kRtEps);
    }
  }
}
// Builds two device columns with different validity masks (one missing value
// each) and checks that SimpleCSRSource::CopyFrom drops exactly the masked
// entries when assembling the CSR page.
TEST(SimpleCSRSource, FromColumnarSparse) {
  constexpr size_t kRows = 32;
  constexpr size_t kCols = 2;
  constexpr size_t kMaskBytes = 8;
  constexpr size_t kBitsPerByte = 8;
  unsigned char constexpr kUCOne = 1;

  // Row whose value is masked out, per column.  Row r is described by bit
  // (r % 8) of mask byte (r / 8): column 0 -> byte 0 bit 2, column 1 -> byte 2
  // bit 3.
  size_t const missing_row[kCols] = {2, 19};

  std::vector<dh::device_vector<float>> columns_data(kCols);
  std::vector<dh::device_vector<unsigned char>> column_bitfields(kCols);

  for (size_t c = 0; c < kCols; ++c) {
    // Start from an all-valid mask on the host, clear the one missing bit and
    // upload the result with a single copy.
    std::vector<unsigned char> h_mask(kMaskBytes, static_cast<unsigned char>(~0));
    h_mask[missing_row[c] / kBitsPerByte] =
        static_cast<unsigned char>(~(kUCOne << (missing_row[c] % kBitsPerByte)));
    column_bitfields[c].assign(h_mask.begin(), h_mask.end());

    // Column values are 0, 1, ..., kRows - 1, i.e. each value equals its row.
    columns_data[c].resize(kRows);
    thrust::sequence(columns_data[c].begin(), columns_data[c].end(), 0);
  }

  std::vector<Json> j_columns(kCols);
  for (size_t c = 0; c < kCols; ++c) {
    auto& column = j_columns[c];
    column = Object();
    column["version"] = Integer(static_cast<Integer::Int>(1));
    column["typestr"] = String("<f4");

    auto p_d_data = raw_pointer_cast(columns_data[c].data());
    std::vector<Json> j_data {
        Json(Integer(reinterpret_cast<Integer::Int>(p_d_data))),
        Json(Boolean(false))};
    column["data"] = j_data;
    std::vector<Json> j_shape {Json(Integer(static_cast<Integer::Int>(kRows)))};
    column["shape"] = Array(j_shape);

    column["mask"] = Object();
    auto& j_mask = column["mask"];
    j_mask["data"] = std::vector<Json>{
        Json(Integer(reinterpret_cast<Integer::Int>(column_bitfields[c].data().get()))),
        Json(Boolean(false))};
    j_mask["shape"] = Array(std::vector<Json>{
        Json(Integer(static_cast<Integer::Int>(kMaskBytes)))});
    j_mask["typestr"] = String("|i1");
    j_mask["null_count"] = Json(Integer(static_cast<Integer::Int>(1)));
  }

  Json column_arr {Array(j_columns)};
  std::stringstream ss;
  Json::Dump(column_arr, &ss);
  std::string str = ss.str();

  std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
  source->CopyFrom(str.c_str());

  auto const& data = source->page_.data.HostVector();
  auto const& offset = source->page_.offset.HostVector();

  ASSERT_EQ(offset.size(), kRows + 1);
  // One value is masked out in each column.
  ASSERT_EQ(offset.back(), kRows * kCols - kCols);
  // Guard the fixed-position accesses below.
  ASSERT_GT(data.size(), static_cast<size_t>(37));

  // Rows 0 and 1 contribute two entries each, so entry 4 is the first entry of
  // row 2, where column 0 is masked out and only column 1 remains.
  ASSERT_EQ(data[4].index, 1);
  ASSERT_EQ(data[4].fvalue, 2);
  // Rows 0..18 contribute 37 entries in total, so entry 37 is the first entry
  // of row 19, where column 1 is masked out and only column 0 remains.
  ASSERT_EQ(data[37].index, 0);
  ASSERT_EQ(data[37].fvalue, 19);
}
} // namespace xgboost

View File

@@ -16,7 +16,7 @@ namespace xgboost {
namespace {
struct FConstraintWrapper : public FeatureInteractionConstraint {
common::Span<BitField> GetNodeConstraints() {
common::Span<LBitField64> GetNodeConstraints() {
return FeatureInteractionConstraint::s_node_constraints_;
}
FConstraintWrapper(tree::TrainParam param, int32_t n_features) :
@@ -44,13 +44,13 @@ tree::TrainParam GetParameter() {
return param;
}
void CompareBitField(BitField d_field, std::set<uint32_t> positions) {
std::vector<BitField::value_type> h_field_storage(d_field.bits_.size());
thrust::copy(thrust::device_ptr<BitField::value_type>(d_field.bits_.data()),
thrust::device_ptr<BitField::value_type>(
void CompareBitField(LBitField64 d_field, std::set<uint32_t> positions) {
std::vector<LBitField64::value_type> h_field_storage(d_field.bits_.size());
thrust::copy(thrust::device_ptr<LBitField64::value_type>(d_field.bits_.data()),
thrust::device_ptr<LBitField64::value_type>(
d_field.bits_.data() + d_field.bits_.size()),
h_field_storage.data());
BitField h_field;
LBitField64 h_field;
h_field.bits_ = {h_field_storage.data(), h_field_storage.data() + h_field_storage.size()};
for (size_t i = 0; i < h_field.Size(); ++i) {
@@ -71,14 +71,14 @@ TEST(FeatureInteractionConstraint, Init) {
tree::TrainParam param = GetParameter();
FConstraintWrapper constraints(param, kFeatures);
ASSERT_EQ(constraints.Features(), kFeatures);
common::Span<BitField> s_nodes_constraints = constraints.GetNodeConstraints();
for (BitField const& d_node : s_nodes_constraints) {
std::vector<BitField::value_type> h_node_storage(d_node.bits_.size());
thrust::copy(thrust::device_ptr<BitField::value_type>(d_node.bits_.data()),
thrust::device_ptr<BitField::value_type>(
common::Span<LBitField64> s_nodes_constraints = constraints.GetNodeConstraints();
for (LBitField64 const& d_node : s_nodes_constraints) {
std::vector<LBitField64::value_type> h_node_storage(d_node.bits_.size());
thrust::copy(thrust::device_ptr<LBitField64::value_type>(d_node.bits_.data()),
thrust::device_ptr<LBitField64::value_type>(
d_node.bits_.data() + d_node.bits_.size()),
h_node_storage.data());
BitField h_node;
LBitField64 h_node;
h_node.bits_ = {h_node_storage.data(), h_node_storage.data() + h_node_storage.size()};
// no feature is attached to node.
for (size_t i = 0; i < h_node.Size(); ++i) {
@@ -108,7 +108,7 @@ TEST(FeatureInteractionConstraint, Init) {
}
{
// Test having more than 1 BitField::value_type
// Test having more than 1 LBitField64::value_type
int32_t constexpr kFeatures = 129;
tree::TrainParam param = GetParameter();
param.interaction_constraints = R"([[0, 1, 3], [3, 5, 128], [127, 128]])";
@@ -129,7 +129,7 @@ TEST(FeatureInteractionConstraint, Split) {
FConstraintWrapper constraints(param, kFeatures);
{
BitField d_node[3];
LBitField64 d_node[3];
constraints.Split(0, /*feature_id=*/1, 1, 2);
for (size_t nid = 0; nid < 3; ++nid) {
d_node[nid] = constraints.GetNodeConstraints()[nid];
@@ -139,7 +139,7 @@ TEST(FeatureInteractionConstraint, Split) {
}
{
BitField d_node[5];
LBitField64 d_node[5];
constraints.Split(1, /*feature_id=*/0, /*left_id=*/3, /*right_id=*/4);
for (auto nid : {1, 3, 4}) {
d_node[nid] = constraints.GetNodeConstraints()[nid];

View File

@@ -0,0 +1,44 @@
import numpy as np
import xgboost as xgb
import sys
import pytest
sys.path.append("tests/python")
import testing as tm
pytestmark = pytest.mark.skipif(**tm.no_cudf())
class TestFromColumnar:
    '''Tests for constructing DMatrix from data structures conforming to the
    Apache Arrow specification.'''

    @pytest.mark.skipif(**tm.no_cudf())
    def test_from_cudf(self):
        '''Test constructing DMatrix from cudf'''
        # Imported lazily so that collecting this module does not require
        # cudf; the skipif marker keeps the test from running without it.
        import cudf
        import pandas as pd

        kRows = 80
        kCols = 2

        # Random float32 features with one missing value in each column.
        na = np.random.randn(kRows, kCols).astype(np.float32)
        na[3, 1] = np.nan
        na[5, 0] = np.nan

        pa = pd.DataFrame(na)

        np_label = np.random.randn(kRows).astype(np.float32)
        pa_label = pd.DataFrame(np_label)

        # Column labels are set to the strings "0", "1", ...
        pa.columns = [str(i) for i in range(kCols)]

        cd: cudf.DataFrame = cudf.from_pandas(pa)
        cd_label: cudf.DataFrame = cudf.from_pandas(pa_label)

        dtrain = xgb.DMatrix(cd, label=cd_label)
        assert dtrain.num_col() == kCols
        assert dtrain.num_row() == kRows

View File

@@ -1,5 +1,6 @@
# coding: utf-8
from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED, DT_INSTALLED, DASK_INSTALLED
from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED, DT_INSTALLED
from xgboost.compat import CUDF_INSTALLED, DASK_INSTALLED
def no_sklearn():
@@ -31,3 +32,8 @@ def no_matplotlib():
except ImportError:
return {'condition': True,
'reason': reason}
def no_cudf():
    '''Return a ``condition``/``reason`` dict that is truthy in ``condition``
    when cudf is not installed.'''
    cudf_missing = not CUDF_INSTALLED
    return dict(condition=cudf_missing, reason='CUDF is not installed')