Implement cudf construction with adapters. (#5189)
This commit is contained in:
parent
9559f81377
commit
87ebfc1315
@ -148,7 +148,7 @@ struct Entry {
|
||||
* \param index The feature or row index.
|
||||
* \param fvalue The feature value.
|
||||
*/
|
||||
Entry(bst_feature_t index, bst_float fvalue) : index(index), fvalue(fvalue) {}
|
||||
XGBOOST_DEVICE Entry(bst_feature_t index, bst_float fvalue) : index(index), fvalue(fvalue) {}
|
||||
/*! \brief reversely compare feature values */
|
||||
inline static bool CmpValue(const Entry& a, const Entry& b) {
|
||||
return a.fvalue < b.fvalue;
|
||||
|
||||
@ -204,15 +204,14 @@ int XGDMatrixCreateFromDataIter(
|
||||
API_END();
|
||||
}
|
||||
|
||||
#ifndef XGBOOST_USE_CUDA
|
||||
XGB_DLL int XGDMatrixCreateFromArrayInterfaces(
|
||||
char const* c_json_strs, bst_int has_missing, bst_float missing, DMatrixHandle* out) {
|
||||
char const* c_json_strs, bst_int has_missing, bst_float missing, DMatrixHandle* out) {
|
||||
API_BEGIN();
|
||||
std::string json_str {c_json_strs};
|
||||
std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
|
||||
source->CopyFrom(json_str, has_missing, missing);
|
||||
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(std::move(source)));
|
||||
LOG(FATAL) << "Xgboost not compiled with cuda";
|
||||
API_END();
|
||||
}
|
||||
#endif
|
||||
|
||||
XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr,
|
||||
const unsigned* indices,
|
||||
|
||||
20
src/c_api/c_api.cu
Normal file
20
src/c_api/c_api.cu
Normal file
@ -0,0 +1,20 @@
|
||||
// Copyright (c) 2014-2019 by Contributors
|
||||
|
||||
#include "xgboost/data.h"
|
||||
#include "xgboost/c_api.h"
|
||||
#include "c_api_error.h"
|
||||
#include "../data/simple_csr_source.h"
|
||||
#include "../data/device_adapter.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
XGB_DLL int XGDMatrixCreateFromArrayInterfaces(char const* c_json_strs,
|
||||
bst_int has_missing,
|
||||
bst_float missing,
|
||||
DMatrixHandle* out) {
|
||||
API_BEGIN();
|
||||
std::string json_str{c_json_strs};
|
||||
data::CudfAdapter adapter(json_str);
|
||||
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, 1));
|
||||
API_END();
|
||||
}
|
||||
} // namespace xgboost
|
||||
@ -7,6 +7,7 @@
|
||||
#include <thrust/device_malloc_allocator.h>
|
||||
#include <thrust/system/cuda/error.h>
|
||||
#include <thrust/system_error.h>
|
||||
#include <thrust/logical.h>
|
||||
|
||||
#include <omp.h>
|
||||
#include <rabit/rabit.h>
|
||||
@ -372,6 +373,10 @@ public:
|
||||
safe_cuda(cudaGetDevice(¤t_device));
|
||||
stats_.RegisterDeallocation(ptr, n, current_device);
|
||||
}
|
||||
size_t PeakMemory()
|
||||
{
|
||||
return stats_.peak_allocated_bytes;
|
||||
}
|
||||
void Log() {
|
||||
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug))
|
||||
return;
|
||||
|
||||
@ -4,9 +4,11 @@
|
||||
*/
|
||||
#ifndef XGBOOST_DATA_ADAPTER_H_
|
||||
#define XGBOOST_DATA_ADAPTER_H_
|
||||
#include <dmlc/data.h>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
@ -56,7 +58,7 @@ namespace data {
|
||||
constexpr size_t kAdapterUnknownSize = std::numeric_limits<size_t >::max();
|
||||
|
||||
struct COOTuple {
|
||||
COOTuple(size_t row_idx, size_t column_idx, float value)
|
||||
XGBOOST_DEVICE COOTuple(size_t row_idx, size_t column_idx, float value)
|
||||
: row_idx(row_idx), column_idx(column_idx), value(value) {}
|
||||
|
||||
size_t row_idx{0};
|
||||
|
||||
@ -225,6 +225,7 @@ class Columnar {
|
||||
using index_type = int32_t;
|
||||
|
||||
public:
|
||||
Columnar() = default;
|
||||
explicit Columnar(std::map<std::string, Json> const& column) {
|
||||
ArrayInterfaceHandler::Validate(column);
|
||||
data = ArrayInterfaceHandler::GetPtrFromArrayData<void*>(column);
|
||||
|
||||
@ -9,6 +9,8 @@
|
||||
#include "xgboost/json.h"
|
||||
#include "columnar.h"
|
||||
#include "../common/device_helpers.cuh"
|
||||
#include "device_adapter.cuh"
|
||||
#include "simple_dmatrix.h"
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
@ -67,4 +69,17 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
|
||||
LOG(FATAL) << "Unknown metainfo: " << key;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename AdapterT>
|
||||
DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread,
|
||||
const std::string& cache_prefix, size_t page_size) {
|
||||
CHECK_EQ(cache_prefix.size(), 0)
|
||||
<< "Device memory construction is not currently supported with external "
|
||||
"memory.";
|
||||
return new data::SimpleDMatrix(adapter, missing, nthread);
|
||||
}
|
||||
|
||||
template DMatrix* DMatrix::Create<data::CudfAdapter>(
|
||||
data::CudfAdapter* adapter, float missing, int nthread,
|
||||
const std::string& cache_prefix, size_t page_size);
|
||||
} // namespace xgboost
|
||||
|
||||
95
src/data/device_adapter.cuh
Normal file
95
src/data/device_adapter.cuh
Normal file
@ -0,0 +1,95 @@
|
||||
/*!
|
||||
* Copyright (c) 2019 by Contributors
|
||||
* \file device_adapter.cuh
|
||||
*/
|
||||
#ifndef XGBOOST_DATA_DEVICE_ADAPTER_H_
|
||||
#define XGBOOST_DATA_DEVICE_ADAPTER_H_
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include "columnar.h"
|
||||
#include "adapter.h"
|
||||
#include "../common/device_helpers.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
class CudfAdapterBatch : public detail::NoMetaInfo {
|
||||
public:
|
||||
CudfAdapterBatch() = default;
|
||||
CudfAdapterBatch(common::Span<Columnar> columns,
|
||||
common::Span<size_t> column_ptr, size_t num_elements)
|
||||
: columns_(columns),column_ptr_(column_ptr), num_elements(num_elements) {}
|
||||
size_t Size()const { return num_elements; }
|
||||
__device__ COOTuple GetElement(size_t idx)const
|
||||
{
|
||||
size_t column_idx =
|
||||
dh::UpperBound(column_ptr_.data(), column_ptr_.size(), idx) - 1;
|
||||
auto& column = columns_[column_idx];
|
||||
size_t row_idx = idx - column_ptr_[column_idx];
|
||||
float value = column.valid.Data() == nullptr || column.valid.Check(row_idx)
|
||||
? column.GetElement(row_idx)
|
||||
: std::numeric_limits<float>::quiet_NaN();
|
||||
return COOTuple(row_idx, column_idx, value);
|
||||
}
|
||||
|
||||
private:
|
||||
common::Span<Columnar> columns_;
|
||||
common::Span<size_t> column_ptr_;
|
||||
size_t num_elements;
|
||||
};
|
||||
|
||||
class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
|
||||
public:
|
||||
explicit CudfAdapter(std::string cuda_interfaces_str) {
|
||||
Json interfaces =
|
||||
Json::Load({cuda_interfaces_str.c_str(), cuda_interfaces_str.size()});
|
||||
std::vector<Json> const& json_columns = get<Array>(interfaces);
|
||||
size_t n_columns = json_columns.size();
|
||||
CHECK_GT(n_columns, 0) << "Number of columns must not equal to 0.";
|
||||
|
||||
auto const& typestr = get<String const>(json_columns[0]["typestr"]);
|
||||
CHECK_EQ(typestr.size(), 3) << ColumnarErrors::TypestrFormat();
|
||||
CHECK_NE(typestr.front(), '>') << ColumnarErrors::BigEndian();
|
||||
std::vector<Columnar> columns;
|
||||
std::vector<size_t> column_ptr({0});
|
||||
auto first_column = Columnar(get<Object const>(json_columns[0]));
|
||||
device_idx_ = dh::CudaGetPointerDevice(first_column.data);
|
||||
CHECK_NE(device_idx_, -1);
|
||||
dh::safe_cuda(cudaSetDevice(device_idx_));
|
||||
num_rows_ = first_column.size;
|
||||
for (auto& json_col : json_columns) {
|
||||
auto column = Columnar(get<Object const>(json_col));
|
||||
columns.push_back(column);
|
||||
column_ptr.emplace_back(column_ptr.back() + column.size);
|
||||
num_rows_ = std::max(num_rows_, size_t(column.size));
|
||||
CHECK_EQ(device_idx_, dh::CudaGetPointerDevice(column.data))
|
||||
<< "All columns should use the same device.";
|
||||
CHECK_EQ(num_rows_, column.size)
|
||||
<< "All columns should have same number of rows.";
|
||||
}
|
||||
columns_ = columns;
|
||||
column_ptr_ = column_ptr;
|
||||
batch = CudfAdapterBatch(dh::ToSpan(columns_), dh::ToSpan(column_ptr_),
|
||||
column_ptr.back());
|
||||
}
|
||||
const CudfAdapterBatch& Value() const override { return batch; }
|
||||
|
||||
size_t NumRows() const { return num_rows_; }
|
||||
size_t NumColumns() const { return columns_.size(); }
|
||||
size_t DeviceIdx()const {
|
||||
return device_idx_;
|
||||
}
|
||||
|
||||
// Cudf is column major
|
||||
bool IsRowMajor() { return false; }
|
||||
private:
|
||||
CudfAdapterBatch batch;
|
||||
dh::device_vector<Columnar> columns_;
|
||||
dh::device_vector<size_t> column_ptr_; // Exclusive scan of column sizes
|
||||
size_t num_rows_{0};
|
||||
int device_idx_;
|
||||
};
|
||||
}; // namespace data
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_DATA_DEVICE_ADAPTER_H_
|
||||
120
src/data/simple_dmatrix.cu
Normal file
120
src/data/simple_dmatrix.cu
Normal file
@ -0,0 +1,120 @@
|
||||
/*!
|
||||
* Copyright 2019 by Contributors
|
||||
* \file simple_dmatrix.cu
|
||||
*/
|
||||
#include <thrust/copy.h>
|
||||
#include <thrust/execution_policy.h>
|
||||
#include <thrust/sort.h>
|
||||
#include <xgboost/data.h>
|
||||
#include "../common/random.h"
|
||||
#include "./simple_dmatrix.h"
|
||||
#include "device_adapter.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
XGBOOST_DEVICE bool IsValid(float value, float missing) {
|
||||
if (common::CheckNAN(value) || value == missing) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename AdapterBatchT>
|
||||
void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
|
||||
int device_idx, float missing) {
|
||||
// Count elements per row
|
||||
dh::LaunchN(device_idx, batch.Size(), [=] __device__(size_t idx) {
|
||||
auto element = batch.GetElement(idx);
|
||||
if (IsValid(element.value, missing)) {
|
||||
atomicAdd(reinterpret_cast<unsigned long long*>( // NOLINT
|
||||
&offset[element.row_idx]),
|
||||
static_cast<unsigned long long>(1)); // NOLINT
|
||||
}
|
||||
});
|
||||
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
thrust::exclusive_scan(thrust::cuda::par(alloc),
|
||||
thrust::device_pointer_cast(offset.data()),
|
||||
thrust::device_pointer_cast(offset.data() + offset.size()),
|
||||
thrust::device_pointer_cast(offset.data()));
|
||||
}
|
||||
|
||||
template <typename AdapterT>
|
||||
void CopyDataColumnMajor(AdapterT* adapter, common::Span<Entry> data,
|
||||
int device_idx, float missing,
|
||||
common::Span<size_t> row_ptr) {
|
||||
// Step 1: Get the sizes of the input columns
|
||||
dh::device_vector<size_t> column_sizes(adapter->NumColumns());
|
||||
auto d_column_sizes = column_sizes.data().get();
|
||||
auto& batch = adapter->Value();
|
||||
// Populate column sizes
|
||||
dh::LaunchN(device_idx, batch.Size(), [=] __device__(size_t idx) {
|
||||
const auto& e = batch.GetElement(idx);
|
||||
atomicAdd(reinterpret_cast<unsigned long long*>( // NOLINT
|
||||
&d_column_sizes[e.column_idx]),
|
||||
static_cast<unsigned long long>(1)); // NOLINT
|
||||
});
|
||||
|
||||
thrust::host_vector<size_t> host_column_sizes = column_sizes;
|
||||
|
||||
// Step 2: Iterate over columns, place elements in correct row, increment
|
||||
// temporary row pointers
|
||||
dh::device_vector<size_t> temp_row_ptr(
|
||||
thrust::device_pointer_cast(row_ptr.data()),
|
||||
thrust::device_pointer_cast(row_ptr.data() + row_ptr.size()));
|
||||
auto d_temp_row_ptr = temp_row_ptr.data().get();
|
||||
size_t begin = 0;
|
||||
for (auto size : host_column_sizes) {
|
||||
size_t end = begin + size;
|
||||
dh::LaunchN(device_idx, end - begin, [=] __device__(size_t idx) {
|
||||
const auto& e = batch.GetElement(idx + begin);
|
||||
if (!IsValid(e.value, missing)) return;
|
||||
data[d_temp_row_ptr[e.row_idx]] = Entry(e.column_idx, e.value);
|
||||
d_temp_row_ptr[e.row_idx] += 1;
|
||||
});
|
||||
|
||||
begin = end;
|
||||
}
|
||||
}
|
||||
|
||||
// Does not currently support metainfo as no on-device data source contains this
|
||||
// Current implementation assumes a single batch. More batches can
|
||||
// be supported in future. Does not currently support inferring row/column size
|
||||
template <typename AdapterT>
|
||||
SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
|
||||
source_.reset(new SimpleCSRSource());
|
||||
SimpleCSRSource& mat = *reinterpret_cast<SimpleCSRSource*>(source_.get());
|
||||
CHECK(adapter->NumRows() != kAdapterUnknownSize);
|
||||
CHECK(adapter->NumColumns() != kAdapterUnknownSize);
|
||||
|
||||
adapter->BeforeFirst();
|
||||
adapter->Next();
|
||||
auto& batch = adapter->Value();
|
||||
mat.page_.offset.SetDevice(adapter->DeviceIdx());
|
||||
mat.page_.data.SetDevice(adapter->DeviceIdx());
|
||||
|
||||
// Enforce single batch
|
||||
CHECK(!adapter->Next());
|
||||
mat.page_.offset.Resize(adapter->NumRows() + 1);
|
||||
auto s_offset = mat.page_.offset.DeviceSpan();
|
||||
CountRowOffsets(batch, s_offset, adapter->DeviceIdx(), missing);
|
||||
mat.info.num_nonzero_ = mat.page_.offset.HostVector().back();
|
||||
mat.page_.data.Resize(mat.info.num_nonzero_);
|
||||
if (adapter->IsRowMajor()) {
|
||||
LOG(FATAL) << "Not implemented.";
|
||||
} else {
|
||||
CopyDataColumnMajor(adapter, mat.page_.data.DeviceSpan(),
|
||||
adapter->DeviceIdx(), missing, s_offset);
|
||||
}
|
||||
|
||||
mat.info.num_col_ = adapter->NumColumns();
|
||||
mat.info.num_row_ = adapter->NumRows();
|
||||
// Synchronise worker columns
|
||||
rabit::Allreduce<rabit::op::Max>(&mat.info.num_col_, 1);
|
||||
}
|
||||
|
||||
template SimpleDMatrix::SimpleDMatrix(CudfAdapter* adapter, float missing,
|
||||
int nthread);
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
@ -6,7 +6,7 @@
|
||||
#include "../../../src/common/timer.h"
|
||||
#include "../helpers.h"
|
||||
using namespace xgboost; // NOLINT
|
||||
TEST(c_api, CSRAdapter) {
|
||||
TEST(adapter, CSRAdapter) {
|
||||
int m = 3;
|
||||
int n = 2;
|
||||
std::vector<float> data = {1, 2, 3, 4, 5};
|
||||
@ -29,7 +29,7 @@ TEST(c_api, CSRAdapter) {
|
||||
EXPECT_EQ(line2 .GetElement(0).column_idx, 1);
|
||||
}
|
||||
|
||||
TEST(c_api, CSCAdapterColsMoreThanRows) {
|
||||
TEST(adapter, CSCAdapterColsMoreThanRows) {
|
||||
std::vector<float> data = {1, 2, 3, 4, 5, 6, 7, 8};
|
||||
std::vector<unsigned> row_idx = {0, 1, 0, 1, 0, 1, 0, 1};
|
||||
std::vector<size_t> col_ptr = {0, 2, 4, 6, 8};
|
||||
|
||||
65
tests/cpp/data/test_columnar.h
Normal file
65
tests/cpp/data/test_columnar.h
Normal file
@ -0,0 +1,65 @@
|
||||
// Copyright (c) 2019 by Contributors
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/data.h>
|
||||
#include <xgboost/json.h>
|
||||
#include <thrust/device_vector.h>
|
||||
|
||||
#include <memory>
|
||||
#include "../../../src/common/bitfield.h"
|
||||
#include "../../../src/common/device_helpers.cuh"
|
||||
#include "../../../src/data/simple_csr_source.h"
|
||||
#include "../../../src/data/columnar.h"
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
template <typename T>
|
||||
Json GenerateDenseColumn(std::string const& typestr, size_t kRows,
|
||||
thrust::device_vector<T>* out_d_data) {
|
||||
auto& d_data = *out_d_data;
|
||||
d_data.resize(kRows);
|
||||
Json column { Object() };
|
||||
std::vector<Json> j_shape {Json(Integer(static_cast<Integer::Int>(kRows)))};
|
||||
column["shape"] = Array(j_shape);
|
||||
column["strides"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(sizeof(T))))});
|
||||
|
||||
d_data.resize(kRows);
|
||||
thrust::sequence(thrust::device, d_data.begin(), d_data.end(), 0.0f, 2.0f);
|
||||
|
||||
auto p_d_data = dh::Raw(d_data);
|
||||
|
||||
std::vector<Json> j_data {
|
||||
Json(Integer(reinterpret_cast<Integer::Int>(p_d_data))),
|
||||
Json(Boolean(false))};
|
||||
column["data"] = j_data;
|
||||
|
||||
column["version"] = Integer(static_cast<Integer::Int>(1));
|
||||
column["typestr"] = String(typestr);
|
||||
return column;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Json GenerateSparseColumn(std::string const& typestr, size_t kRows,
|
||||
thrust::device_vector<T>* out_d_data) {
|
||||
auto& d_data = *out_d_data;
|
||||
Json column { Object() };
|
||||
std::vector<Json> j_shape {Json(Integer(static_cast<Integer::Int>(kRows)))};
|
||||
column["shape"] = Array(j_shape);
|
||||
column["strides"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(sizeof(T))))});
|
||||
|
||||
d_data.resize(kRows);
|
||||
for (size_t i = 0; i < d_data.size(); ++i) {
|
||||
d_data[i] = i * 2.0;
|
||||
}
|
||||
|
||||
auto p_d_data = dh::Raw(d_data);
|
||||
|
||||
std::vector<Json> j_data {
|
||||
Json(Integer(reinterpret_cast<Integer::Int>(p_d_data))),
|
||||
Json(Boolean(false))};
|
||||
column["data"] = j_data;
|
||||
|
||||
column["version"] = Integer(static_cast<Integer::Int>(1));
|
||||
column["typestr"] = String(typestr);
|
||||
return column;
|
||||
}
|
||||
} // namespace xgboost
|
||||
55
tests/cpp/data/test_device_adapter.cu
Normal file
55
tests/cpp/data/test_device_adapter.cu
Normal file
@ -0,0 +1,55 @@
|
||||
// Copyright (c) 2019 by Contributors
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/data.h>
|
||||
#include "../../../src/data/adapter.h"
|
||||
#include "../../../src/data/simple_dmatrix.h"
|
||||
#include "../../../src/common/timer.h"
|
||||
#include "../helpers.h"
|
||||
#include <thrust/device_vector.h>
|
||||
#include "../../../src/data/device_adapter.cuh"
|
||||
#include "test_columnar.h"
|
||||
using namespace xgboost; // NOLINT
|
||||
|
||||
void TestCudfAdapter()
|
||||
{
|
||||
constexpr size_t kRowsA {16};
|
||||
constexpr size_t kRowsB {16};
|
||||
std::vector<Json> columns;
|
||||
thrust::device_vector<double> d_data_0(kRowsA);
|
||||
thrust::device_vector<uint32_t> d_data_1(kRowsB);
|
||||
|
||||
columns.emplace_back(GenerateDenseColumn<double>("<f8", kRowsA, &d_data_0));
|
||||
columns.emplace_back(GenerateDenseColumn<uint32_t>("<u4", kRowsB, &d_data_1));
|
||||
|
||||
Json column_arr {columns};
|
||||
|
||||
std::stringstream ss;
|
||||
Json::Dump(column_arr, &ss);
|
||||
std::string str = ss.str();
|
||||
|
||||
data::CudfAdapter adapter(str);
|
||||
|
||||
adapter.Next();
|
||||
auto & batch = adapter.Value();
|
||||
EXPECT_EQ(batch.Size(), kRowsA + kRowsB);
|
||||
|
||||
EXPECT_NO_THROW({
|
||||
dh::LaunchN(0, batch.Size(), [=] __device__(size_t idx) {
|
||||
auto element = batch.GetElement(idx);
|
||||
if (idx < kRowsA) {
|
||||
KERNEL_CHECK(element.column_idx == 0);
|
||||
KERNEL_CHECK(element.row_idx == idx);
|
||||
KERNEL_CHECK(element.value == element.row_idx * 2.0f);
|
||||
} else {
|
||||
KERNEL_CHECK(element.column_idx == 1);
|
||||
KERNEL_CHECK(element.row_idx == idx - kRowsA);
|
||||
KERNEL_CHECK(element.value == element.row_idx * 2.0f);
|
||||
}
|
||||
});
|
||||
dh::safe_cuda(cudaDeviceSynchronize());
|
||||
});
|
||||
}
|
||||
|
||||
TEST(device_adapter, CudfAdapter) {
|
||||
TestCudfAdapter();
|
||||
}
|
||||
@ -9,6 +9,7 @@
|
||||
#include "../../../src/common/device_helpers.cuh"
|
||||
#include "../../../src/data/simple_csr_source.h"
|
||||
#include "../../../src/data/columnar.h"
|
||||
#include "test_columnar.h"
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
@ -49,31 +50,6 @@ TEST(ArrayInterfaceHandler, Error) {
|
||||
EXPECT_THROW(Columnar c(column_obj), dmlc::Error);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Json GenerateDenseColumn(std::string const& typestr, size_t kRows,
|
||||
thrust::device_vector<T>* out_d_data) {
|
||||
auto& d_data = *out_d_data;
|
||||
Json column { Object() };
|
||||
std::vector<Json> j_shape {Json(Integer(static_cast<Integer::Int>(kRows)))};
|
||||
column["shape"] = Array(j_shape);
|
||||
column["strides"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(sizeof(T))))});
|
||||
|
||||
d_data.resize(kRows);
|
||||
for (size_t i = 0; i < d_data.size(); ++i) {
|
||||
d_data[i] = i * 2.0;
|
||||
}
|
||||
|
||||
auto p_d_data = dh::Raw(d_data);
|
||||
|
||||
std::vector<Json> j_data {
|
||||
Json(Integer(reinterpret_cast<Integer::Int>(p_d_data))),
|
||||
Json(Boolean(false))};
|
||||
column["data"] = j_data;
|
||||
|
||||
column["version"] = Integer(static_cast<Integer::Int>(1));
|
||||
column["typestr"] = String(typestr);
|
||||
return column;
|
||||
}
|
||||
|
||||
void TestGetElement() {
|
||||
thrust::device_vector<float> data;
|
||||
|
||||
318
tests/cpp/data/test_simple_dmatrix.cu
Normal file
318
tests/cpp/data/test_simple_dmatrix.cu
Normal file
@ -0,0 +1,318 @@
|
||||
// Copyright by Contributors
|
||||
#include <dmlc/filesystem.h>
|
||||
#include <xgboost/data.h>
|
||||
#include "../../../src/data/simple_dmatrix.h"
|
||||
|
||||
#include <thrust/sequence.h>
|
||||
#include "../../../src/data/device_adapter.cuh"
|
||||
#include "../helpers.h"
|
||||
#include "test_columnar.h"
|
||||
|
||||
using namespace xgboost; // NOLINT
|
||||
|
||||
TEST(SimpleDMatrix, FromColumnarDenseBasic) {
|
||||
constexpr size_t kRows{16};
|
||||
std::vector<Json> columns;
|
||||
thrust::device_vector<double> d_data_0(kRows);
|
||||
thrust::device_vector<uint32_t> d_data_1(kRows);
|
||||
|
||||
columns.emplace_back(GenerateDenseColumn<double>("<f8", kRows, &d_data_0));
|
||||
columns.emplace_back(GenerateDenseColumn<uint32_t>("<u4", kRows, &d_data_1));
|
||||
|
||||
Json column_arr{columns};
|
||||
|
||||
std::stringstream ss;
|
||||
Json::Dump(column_arr, &ss);
|
||||
std::string str = ss.str();
|
||||
|
||||
data::CudfAdapter adapter(str);
|
||||
data::SimpleDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(),
|
||||
-1);
|
||||
EXPECT_EQ(dmat.Info().num_col_, 2);
|
||||
EXPECT_EQ(dmat.Info().num_row_, 16);
|
||||
EXPECT_EQ(dmat.Info().num_nonzero_, 32);
|
||||
}
|
||||
|
||||
void TestDenseColumn(DMatrix* dmat, size_t n_rows, size_t n_cols) {
|
||||
for (auto& batch : dmat->GetBatches<SparsePage>()) {
|
||||
for (auto i = 0ull; i < batch.Size(); i++) {
|
||||
auto inst = batch[i];
|
||||
for (auto j = 0ull; j < inst.size(); j++) {
|
||||
EXPECT_EQ(inst[j].fvalue, i * 2);
|
||||
EXPECT_EQ(inst[j].index, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
ASSERT_EQ(dmat->Info().num_row_, n_rows);
|
||||
ASSERT_EQ(dmat->Info().num_col_, n_cols);
|
||||
}
|
||||
|
||||
TEST(SimpleDMatrix, FromColumnarDense) {
|
||||
constexpr size_t kRows{16};
|
||||
constexpr size_t kCols{2};
|
||||
std::vector<Json> columns;
|
||||
thrust::device_vector<float> d_data_0(kRows);
|
||||
thrust::device_vector<int32_t> d_data_1(kRows);
|
||||
columns.emplace_back(GenerateDenseColumn<float>("<f4", kRows, &d_data_0));
|
||||
columns.emplace_back(GenerateDenseColumn<int32_t>("<i4", kRows, &d_data_1));
|
||||
|
||||
Json column_arr{columns};
|
||||
|
||||
std::stringstream ss;
|
||||
Json::Dump(column_arr, &ss);
|
||||
std::string str = ss.str();
|
||||
|
||||
// no missing value
|
||||
{
|
||||
data::CudfAdapter adapter(str);
|
||||
data::SimpleDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(),
|
||||
-1);
|
||||
TestDenseColumn(&dmat, kRows, kCols);
|
||||
}
|
||||
|
||||
// with missing value specified
|
||||
{
|
||||
data::CudfAdapter adapter(str);
|
||||
data::SimpleDMatrix dmat(&adapter, 4.0, -1);
|
||||
|
||||
ASSERT_EQ(dmat.Info().num_row_, kRows);
|
||||
ASSERT_EQ(dmat.Info().num_col_, kCols);
|
||||
ASSERT_EQ(dmat.Info().num_nonzero_, kCols * kRows - 2);
|
||||
}
|
||||
|
||||
{
|
||||
// no missing value, but has NaN
|
||||
d_data_0[3] = std::numeric_limits<float>::quiet_NaN();
|
||||
ASSERT_TRUE(std::isnan(d_data_0[3])); // removes 6.0
|
||||
data::CudfAdapter adapter(str);
|
||||
data::SimpleDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(),
|
||||
-1);
|
||||
ASSERT_EQ(dmat.Info().num_nonzero_, kRows * kCols - 1);
|
||||
ASSERT_EQ(dmat.Info().num_row_, kRows);
|
||||
ASSERT_EQ(dmat.Info().num_col_, kCols);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(SimpleDMatrix, FromColumnarWithEmptyRows) {
|
||||
constexpr size_t kRows = 102;
|
||||
constexpr size_t kCols = 24;
|
||||
|
||||
std::vector<Json> v_columns(kCols);
|
||||
std::vector<dh::device_vector<float>> columns_data(kCols);
|
||||
std::vector<dh::device_vector<RBitField8::value_type>> column_bitfields(
|
||||
kCols);
|
||||
|
||||
RBitField8::value_type constexpr kUCOne = 1;
|
||||
|
||||
for (size_t i = 0; i < kCols; ++i) {
|
||||
auto& col = v_columns[i];
|
||||
col = Object();
|
||||
auto& data = columns_data[i];
|
||||
data.resize(kRows);
|
||||
thrust::sequence(data.begin(), data.end(), 0);
|
||||
dh::safe_cuda(cudaDeviceSynchronize());
|
||||
dh::safe_cuda(cudaGetLastError());
|
||||
|
||||
ASSERT_EQ(data.size(), kRows);
|
||||
|
||||
auto p_d_data = raw_pointer_cast(data.data());
|
||||
std::vector<Json> j_data{
|
||||
Json(Integer(reinterpret_cast<Integer::Int>(p_d_data))),
|
||||
Json(Boolean(false))};
|
||||
col["data"] = j_data;
|
||||
std::vector<Json> j_shape{Json(Integer(static_cast<Integer::Int>(kRows)))};
|
||||
col["shape"] = Array(j_shape);
|
||||
col["version"] = Integer(static_cast<Integer::Int>(1));
|
||||
col["typestr"] = String("<f4");
|
||||
|
||||
// Construct the mask object.
|
||||
col["mask"] = Object();
|
||||
auto& j_mask = col["mask"];
|
||||
j_mask["version"] = Integer(static_cast<Integer::Int>(1));
|
||||
auto& mask_storage = column_bitfields[i];
|
||||
mask_storage.resize(16); // 16 bytes
|
||||
|
||||
mask_storage[0] = ~(kUCOne << 2); // 3^th row is missing
|
||||
mask_storage[1] = ~(kUCOne << 3); // 12^th row is missing
|
||||
size_t last_ind = 12;
|
||||
mask_storage[last_ind] = ~(kUCOne << 5);
|
||||
std::set<size_t> missing_row_index{0, 1, last_ind};
|
||||
|
||||
for (size_t j = 0; j < mask_storage.size(); ++j) {
|
||||
if (missing_row_index.find(j) == missing_row_index.cend()) {
|
||||
// all other rows are valid
|
||||
mask_storage[j] = ~0;
|
||||
}
|
||||
}
|
||||
|
||||
j_mask["data"] = std::vector<Json>{
|
||||
Json(
|
||||
Integer(reinterpret_cast<Integer::Int>(mask_storage.data().get()))),
|
||||
Json(Boolean(false))};
|
||||
j_mask["shape"] = Array(
|
||||
std::vector<Json>{Json(Integer(static_cast<Integer::Int>(kRows)))});
|
||||
j_mask["typestr"] = String("|i1");
|
||||
}
|
||||
|
||||
Json column_arr{Array(v_columns)};
|
||||
std::stringstream ss;
|
||||
Json::Dump(column_arr, &ss);
|
||||
std::string str = ss.str();
|
||||
data::CudfAdapter adapter(str);
|
||||
data::SimpleDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(),
|
||||
-1);
|
||||
|
||||
for (auto& batch : dmat.GetBatches<SparsePage>()) {
|
||||
for (auto i = 0ull; i < batch.Size(); i++) {
|
||||
auto inst = batch[i];
|
||||
for (auto j = 0ull; j < inst.size(); j++) {
|
||||
EXPECT_EQ(inst[j].fvalue, i);
|
||||
EXPECT_EQ(inst[j].index, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
ASSERT_EQ(dmat.Info().num_nonzero_, (kRows - 3) * kCols);
|
||||
ASSERT_EQ(dmat.Info().num_row_, kRows);
|
||||
ASSERT_EQ(dmat.Info().num_col_, kCols);
|
||||
}
|
||||
|
||||
TEST(SimpleCSRSource, FromColumnarSparse) {
|
||||
constexpr size_t kRows = 32;
|
||||
constexpr size_t kCols = 2;
|
||||
RBitField8::value_type constexpr kUCOne = 1;
|
||||
|
||||
std::vector<dh::device_vector<float>> columns_data(kCols);
|
||||
std::vector<dh::device_vector<RBitField8::value_type>> column_bitfields(kCols);
|
||||
|
||||
{
|
||||
// column 0
|
||||
auto& mask = column_bitfields[0];
|
||||
mask.resize(8);
|
||||
|
||||
for (size_t j = 0; j < mask.size(); ++j) {
|
||||
mask[j] = ~0;
|
||||
}
|
||||
// the 2^th entry of first column is invalid
|
||||
// [0 0 0 0 0 1 0 0]
|
||||
mask[0] = ~(kUCOne << 2);
|
||||
}
|
||||
{
|
||||
// column 1
|
||||
auto& mask = column_bitfields[1];
|
||||
mask.resize(8);
|
||||
|
||||
for (size_t j = 0; j < mask.size(); ++j) {
|
||||
mask[j] = ~0;
|
||||
}
|
||||
// the 19^th entry of second column is invalid
|
||||
// [~0~], [~0~], [0 0 0 0 1 0 0 0]
|
||||
mask[2] = ~(kUCOne << 3);
|
||||
}
|
||||
|
||||
for (size_t c = 0; c < kCols; ++c) {
|
||||
columns_data[c].resize(kRows);
|
||||
thrust::sequence(columns_data[c].begin(), columns_data[c].end(), 0);
|
||||
}
|
||||
|
||||
std::vector<Json> j_columns(kCols);
|
||||
|
||||
for (size_t c = 0; c < kCols; ++c) {
|
||||
auto& column = j_columns[c];
|
||||
column = Object();
|
||||
column["version"] = Integer(static_cast<Integer::Int>(1));
|
||||
column["typestr"] = String("<f4");
|
||||
auto p_d_data = raw_pointer_cast(columns_data[c].data());
|
||||
std::vector<Json> j_data {
|
||||
Json(Integer(reinterpret_cast<Integer::Int>(p_d_data))),
|
||||
Json(Boolean(false))};
|
||||
column["data"] = j_data;
|
||||
std::vector<Json> j_shape {Json(Integer(static_cast<Integer::Int>(kRows)))};
|
||||
column["shape"] = Array(j_shape);
|
||||
column["version"] = Integer(static_cast<Integer::Int>(1));
|
||||
column["typestr"] = String("<f4");
|
||||
|
||||
column["mask"] = Object();
|
||||
auto& j_mask = column["mask"];
|
||||
j_mask["version"] = Integer(static_cast<Integer::Int>(1));
|
||||
j_mask["data"] = std::vector<Json>{
|
||||
Json(Integer(reinterpret_cast<Integer::Int>(column_bitfields[c].data().get()))),
|
||||
Json(Boolean(false))};
|
||||
j_mask["shape"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(kRows)))});
|
||||
j_mask["typestr"] = String("|i1");
|
||||
}
|
||||
|
||||
Json column_arr {Array(j_columns)};
|
||||
|
||||
std::stringstream ss;
|
||||
Json::Dump(column_arr, &ss);
|
||||
std::string str = ss.str();
|
||||
|
||||
{
|
||||
data::CudfAdapter adapter(str);
|
||||
data::SimpleDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(), -1);
|
||||
|
||||
ASSERT_EQ(dmat.Info().num_row_, kRows);
|
||||
ASSERT_EQ(dmat.Info().num_nonzero_, (kRows*kCols)-2);
|
||||
}
|
||||
|
||||
{
|
||||
data::CudfAdapter adapter(str);
|
||||
data::SimpleDMatrix dmat(&adapter, 2.0, -1);
|
||||
for (auto& batch : dmat.GetBatches<SparsePage>()) {
|
||||
for (auto i = 0ull; i < batch.Size(); i++) {
|
||||
auto inst = batch[i];
|
||||
for (auto e : inst) {
|
||||
ASSERT_NE(e.fvalue, 2.0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
// no missing value, but has NaN
|
||||
data::CudfAdapter adapter(str);
|
||||
columns_data[0][4] = std::numeric_limits<float>::quiet_NaN(); // 0^th column 4^th row
|
||||
data::SimpleDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(),
|
||||
-1);
|
||||
ASSERT_TRUE(std::isnan(columns_data[0][4]));
|
||||
|
||||
// Two invalid entries and one NaN, in CSC
|
||||
// 0^th column: 0, 1, 4, 5, 6, ..., kRows
|
||||
// 1^th column: 0, 1, 2, 3, ..., 19, 21, ..., kRows
|
||||
ASSERT_EQ(dmat.Info().num_nonzero_, kRows * kCols - 3);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
TEST(SimpleDMatrix, FromColumnarSparseBasic) {
|
||||
constexpr size_t kRows{16};
|
||||
std::vector<Json> columns;
|
||||
thrust::device_vector<double> d_data_0(kRows);
|
||||
thrust::device_vector<uint32_t> d_data_1(kRows);
|
||||
|
||||
columns.emplace_back(GenerateSparseColumn<double>("<f8", kRows, &d_data_0));
|
||||
columns.emplace_back(GenerateSparseColumn<uint32_t>("<u4", kRows, &d_data_1));
|
||||
|
||||
Json column_arr{columns};
|
||||
|
||||
std::stringstream ss;
|
||||
Json::Dump(column_arr, &ss);
|
||||
std::string str = ss.str();
|
||||
|
||||
data::CudfAdapter adapter(str);
|
||||
data::SimpleDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(),
|
||||
-1);
|
||||
EXPECT_EQ(dmat.Info().num_col_, 2);
|
||||
EXPECT_EQ(dmat.Info().num_row_, 16);
|
||||
EXPECT_EQ(dmat.Info().num_nonzero_, 32);
|
||||
|
||||
for (auto& batch : dmat.GetBatches<SparsePage>()) {
|
||||
for (auto i = 0ull; i < batch.Size(); i++) {
|
||||
auto inst = batch[i];
|
||||
for (auto j = 0ull; j < inst.size(); j++) {
|
||||
EXPECT_EQ(inst[j].fvalue, i * 2);
|
||||
EXPECT_EQ(inst[j].index, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user