Remove old cudf constructor code (#5194)
This commit is contained in:
parent
87ebfc1315
commit
8cbcc53ccb
@ -39,6 +39,53 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
|
||||
size_t num_elements;
|
||||
};
|
||||
|
||||
/*!
|
||||
* Please be careful that, in official specification, the only three required fields are
|
||||
* `shape', `version' and `typestr'. Any other is optional, including `data'. But here
|
||||
* we have one additional requirement for input data:
|
||||
*
|
||||
* - `data' field is required, passing in an empty dataset is not accepted, as most (if
|
||||
* not all) of our algorithms don't have test for empty dataset. An error is better
|
||||
* than a crash.
|
||||
*
|
||||
* What if invalid value from dataframe is 0 but I specify missing=NaN in XGBoost? Since
|
||||
* validity mask is ignored, all 0s are preserved in XGBoost.
|
||||
*
|
||||
* FIXME(trivialfis): Put above into document after we have a consistent way for
|
||||
* processing input data.
|
||||
*
|
||||
* Sample input:
|
||||
* [
|
||||
* {
|
||||
* "shape": [
|
||||
* 10
|
||||
* ],
|
||||
* "strides": [
|
||||
* 4
|
||||
* ],
|
||||
* "data": [
|
||||
* 30074864128,
|
||||
* false
|
||||
* ],
|
||||
* "typestr": "<f4",
|
||||
* "version": 1,
|
||||
* "mask": {
|
||||
* "shape": [
|
||||
* 64
|
||||
* ],
|
||||
* "strides": [
|
||||
* 1
|
||||
* ],
|
||||
* "data": [
|
||||
* 30074864640,
|
||||
* false
|
||||
* ],
|
||||
* "typestr": "|i1",
|
||||
* "version": 1
|
||||
* }
|
||||
* }
|
||||
* ]
|
||||
*/
|
||||
class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
|
||||
public:
|
||||
explicit CudfAdapter(std::string cuda_interfaces_str) {
|
||||
|
||||
@ -56,86 +56,5 @@ const SparsePage& SimpleCSRSource::Value() const {
|
||||
return page_;
|
||||
}
|
||||
|
||||
/*!
|
||||
* Please be careful that, in official specification, the only three required fields are
|
||||
* `shape', `version' and `typestr'. Any other is optional, including `data'. But here
|
||||
* we have one additional requirement for input data:
|
||||
*
|
||||
* - `data' field is required, passing in an empty dataset is not accepted, as most (if
|
||||
* not all) of our algorithms don't have test for empty dataset. An error is better
|
||||
* than a crash.
|
||||
*
|
||||
* Missing value handling:
|
||||
* Missing value is specified:
|
||||
* - Ignore the validity mask from columnar format.
|
||||
* - Remove entries that are equal to the missing value.
|
||||
* - missing = NaN:
|
||||
* - Remove entries that are NaN.
|
||||
* - missing != NaN:
|
||||
* - Check for NaN entries, throw an error if found.
|
||||
* Missing value is not specified:
|
||||
* - Remove entries that are marked as invalid by the validity mask.
|
||||
* - Remove NaN entries.
|
||||
*
|
||||
* What if invalid value from dataframe is 0 but I specify missing=NaN in XGBoost? Since
|
||||
* validity mask is ignored, all 0s are preserved in XGBoost.
|
||||
*
|
||||
* FIXME(trivialfis): Put above into document after we have a consistent way for
|
||||
* processing input data.
|
||||
*
|
||||
* Sample input:
|
||||
* [
|
||||
* {
|
||||
* "shape": [
|
||||
* 10
|
||||
* ],
|
||||
* "strides": [
|
||||
* 4
|
||||
* ],
|
||||
* "data": [
|
||||
* 30074864128,
|
||||
* false
|
||||
* ],
|
||||
* "typestr": "<f4",
|
||||
* "version": 1,
|
||||
* "mask": {
|
||||
* "shape": [
|
||||
* 64
|
||||
* ],
|
||||
* "strides": [
|
||||
* 1
|
||||
* ],
|
||||
* "data": [
|
||||
* 30074864640,
|
||||
* false
|
||||
* ],
|
||||
* "typestr": "|i1",
|
||||
* "version": 1
|
||||
* }
|
||||
* }
|
||||
* ]
|
||||
*/
|
||||
void SimpleCSRSource::CopyFrom(std::string const& cuda_interfaces_str,
|
||||
bool has_missing, float missing) {
|
||||
Json interfaces = Json::Load({cuda_interfaces_str.c_str(),
|
||||
cuda_interfaces_str.size()});
|
||||
std::vector<Json> const& columns = get<Array>(interfaces);
|
||||
size_t n_columns = columns.size();
|
||||
CHECK_GT(n_columns, 0) << "Number of columns must not eqaul to 0.";
|
||||
|
||||
auto const& typestr = get<String const>(columns[0]["typestr"]);
|
||||
CHECK_EQ(typestr.size(), 3) << ColumnarErrors::TypestrFormat();
|
||||
CHECK_NE(typestr.front(), '>') << ColumnarErrors::BigEndian();
|
||||
|
||||
this->FromDeviceColumnar(columns, has_missing, missing);
|
||||
}
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
void SimpleCSRSource::FromDeviceColumnar(std::vector<Json> const& columns,
|
||||
bool has_missing, float missing) {
|
||||
LOG(FATAL) << "XGBoost version is not compiled with GPU support";
|
||||
}
|
||||
#endif // !defined(XGBOOST_USE_CUDA)
|
||||
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
|
||||
@ -1,200 +0,0 @@
|
||||
/*!
|
||||
* Copyright 2019 by XGBoost Contributors
|
||||
*
|
||||
* \file simple_csr_source.cuh
|
||||
* \brief An extension for the simple CSR source in-memory data structure to accept
|
||||
* foreign columnar.
|
||||
*/
|
||||
#include <thrust/device_ptr.h>
|
||||
#include <thrust/device_vector.h>
|
||||
#include <thrust/execution_policy.h>
|
||||
#include <thrust/scan.h>
|
||||
|
||||
#include <xgboost/base.h>
|
||||
#include <xgboost/data.h>
|
||||
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
#include "simple_csr_source.h"
|
||||
#include "columnar.h"
|
||||
#include "../common/math.h"
|
||||
#include "../common/bitfield.h"
|
||||
#include "../common/device_helpers.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
// One thread per row of `column`: decides whether the row's element is kept and, if
// so, adds one to offsets[row + 1].
//
// - has_missing == false: kept when the validity mask is absent or marks the row as
//   valid, and the element is not NaN.
// - has_missing == true, missing is NaN: kept when the element is not NaN.
// - has_missing == true, missing is not NaN: kept when the element is not close to
//   `missing`; additionally *flag is set to 1 if the element is NaN.
__global__ void CountValidKernel(Columnar const column,
                                 bool has_missing, float missing,
                                 int32_t* flag, common::Span<bst_row_t> offsets) {
  auto const row = threadIdx.x + blockDim.x * blockIdx.x;
  if (row >= column.size) {
    return;
  }

  bool keep = false;
  if (!has_missing) {
    // The element is only read when the mask does not exclude the row.
    bool const masked_out = column.valid.Data() != nullptr && !column.valid.Check(row);
    keep = !masked_out && !common::CheckNAN(column.GetElement(row));
  } else if (common::CheckNAN(missing)) {
    keep = !common::CheckNAN(column.GetElement(row));
  } else {
    auto const value = column.GetElement(row);
    keep = !common::CloseTo(value, missing);
    if (common::CheckNAN(value)) {
      *flag = 1;
    }
  }

  if (keep) {
    offsets[row + 1] += 1;
  }
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void AssignValue(T fvalue, int32_t colid,
|
||||
common::Span<bst_row_t> out_offsets, common::Span<Entry> out_data) {
|
||||
auto const tid = threadIdx.x + blockDim.x * blockIdx.x;
|
||||
int32_t oid = out_offsets[tid];
|
||||
out_data[oid].fvalue = fvalue;
|
||||
out_data[oid].index = colid;
|
||||
out_offsets[tid] += 1;
|
||||
}
|
||||
|
||||
// One thread per row of `column`: applies the same keep/drop rules as the counting
// kernel and, for kept elements, appends (colid, value) through AssignValue.
__global__ void CreateCSRKernel(Columnar const column,
                                int32_t colid, bool has_missing, float missing,
                                common::Span<bst_row_t> offsets, common::Span<Entry> out_data) {
  auto const row = threadIdx.x + blockDim.x * blockIdx.x;
  if (row >= column.size) {
    return;
  }

  bool keep = false;
  if (!has_missing) {
    // No user-specified missing value: honour the validity mask (when present)
    // and drop NaN.
    bool const masked_out = column.valid.Data() != nullptr && !column.valid.Check(row);
    keep = !masked_out && !common::CheckNAN(column.GetElement(row));
  } else if (common::CheckNAN(missing)) {
    // The user-specified missing value is NaN: drop NaN only.
    keep = !common::CheckNAN(column.GetElement(row));
  } else {
    // The user-specified missing value is a regular number: drop elements close to it.
    keep = !common::CloseTo(column.GetElement(row), missing);
  }

  if (keep) {
    AssignValue(column.GetElement(row), colid, offsets, out_data);
  }
}
|
||||
|
||||
void CountValid(std::vector<Json> const& j_columns, uint32_t column_id,
|
||||
bool has_missing, float missing,
|
||||
HostDeviceVector<bst_row_t>* out_offset,
|
||||
dh::caching_device_vector<int32_t>* out_d_flag,
|
||||
uint32_t* out_n_rows) {
|
||||
uint32_t constexpr kThreads = 256;
|
||||
auto const& j_column = j_columns[column_id];
|
||||
auto const& column_obj = get<Object const>(j_column);
|
||||
Columnar foreign_column(column_obj);
|
||||
uint32_t const n_rows = foreign_column.size;
|
||||
|
||||
auto ptr = foreign_column.data;
|
||||
int32_t device = dh::CudaGetPointerDevice(ptr);
|
||||
CHECK_NE(device, -1);
|
||||
dh::safe_cuda(cudaSetDevice(device));
|
||||
|
||||
if (column_id == 0) {
|
||||
out_offset->SetDevice(device);
|
||||
out_offset->Resize(n_rows + 1);
|
||||
}
|
||||
CHECK_EQ(out_offset->DeviceIdx(), device)
|
||||
<< "All columns should use the same device.";
|
||||
CHECK_EQ(out_offset->Size(), n_rows + 1)
|
||||
<< "All columns should have same number of rows.";
|
||||
|
||||
common::Span<bst_row_t> s_offsets = out_offset->DeviceSpan();
|
||||
|
||||
uint32_t const kBlocks = common::DivRoundUp(n_rows, kThreads);
|
||||
dh::LaunchKernel {kBlocks, kThreads} (
|
||||
CountValidKernel,
|
||||
foreign_column,
|
||||
has_missing, missing,
|
||||
out_d_flag->data().get(), s_offsets);
|
||||
*out_n_rows = n_rows;
|
||||
}
|
||||
|
||||
void CreateCSR(std::vector<Json> const& j_columns, uint32_t column_id, uint32_t n_rows,
|
||||
bool has_missing, float missing,
|
||||
dh::device_vector<bst_row_t>* tmp_offset, common::Span<Entry> s_data) {
|
||||
uint32_t constexpr kThreads = 256;
|
||||
auto const& j_column = j_columns[column_id];
|
||||
auto const& column_obj = get<Object const>(j_column);
|
||||
Columnar foreign_column(column_obj);
|
||||
uint32_t kBlocks = common::DivRoundUp(n_rows, kThreads);
|
||||
dh::LaunchKernel {kBlocks, kThreads} (
|
||||
CreateCSRKernel,
|
||||
foreign_column, column_id, has_missing, missing,
|
||||
dh::ToSpan(*tmp_offset), s_data);
|
||||
}
|
||||
|
||||
void SimpleCSRSource::FromDeviceColumnar(std::vector<Json> const& columns,
|
||||
bool has_missing, float missing) {
|
||||
auto const n_cols = columns.size();
|
||||
int32_t constexpr kThreads = 256;
|
||||
|
||||
dh::caching_device_vector<int32_t> d_flag;
|
||||
if (!common::CheckNAN(missing)) {
|
||||
d_flag.resize(1);
|
||||
thrust::fill(d_flag.begin(), d_flag.end(), 0);
|
||||
}
|
||||
uint32_t n_rows {0};
|
||||
for (size_t i = 0; i < n_cols; ++i) {
|
||||
CountValid(columns, i, has_missing, missing, &(this->page_.offset), &d_flag,
|
||||
&n_rows);
|
||||
}
|
||||
// don't pay for what you don't use.
|
||||
if (!common::CheckNAN(missing)) {
|
||||
int32_t flag {0};
|
||||
dh::safe_cuda(cudaMemcpy(&flag, d_flag.data().get(), sizeof(int32_t), cudaMemcpyDeviceToHost));
|
||||
CHECK_EQ(flag, 0) << "missing value is specifed but input data contains NaN.";
|
||||
}
|
||||
|
||||
info.num_col_ = n_cols;
|
||||
info.num_row_ = n_rows;
|
||||
|
||||
auto s_offsets = this->page_.offset.DeviceSpan();
|
||||
thrust::device_ptr<bst_row_t> p_offsets(s_offsets.data());
|
||||
CHECK_GE(s_offsets.size(), n_rows + 1);
|
||||
|
||||
thrust::inclusive_scan(p_offsets, p_offsets + n_rows + 1, p_offsets);
|
||||
// Created for building csr matrix, where we need to change index after processing each
|
||||
// column.
|
||||
dh::device_vector<bst_row_t> tmp_offset(this->page_.offset.Size());
|
||||
dh::safe_cuda(cudaMemcpy(tmp_offset.data().get(), s_offsets.data(),
|
||||
s_offsets.size_bytes(), cudaMemcpyDeviceToDevice));
|
||||
|
||||
// We can use null_count from columnar data format, but that will add a non-standard
|
||||
// entry in the array interface, also involves accumulating from all columns. Invoking
|
||||
// one copy seems easier.
|
||||
this->info.num_nonzero_ = tmp_offset.back();
|
||||
|
||||
// Device is obtained and set in `CountValid'
|
||||
int32_t const device = this->page_.offset.DeviceIdx();
|
||||
this->page_.data.SetDevice(device);
|
||||
this->page_.data.Resize(this->info.num_nonzero_);
|
||||
auto s_data = this->page_.data.DeviceSpan();
|
||||
|
||||
int32_t kBlocks = common::DivRoundUp(n_rows, kThreads);
|
||||
for (size_t i = 0; i < n_cols; ++i) {
|
||||
CreateCSR(columns, i, n_rows, has_missing, missing, &tmp_offset, s_data);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
@ -46,14 +46,6 @@ class SimpleCSRSource : public DataSource<SparsePage> {
|
||||
*/
|
||||
void CopyFrom(DMatrix* src);
|
||||
|
||||
/*!
|
||||
* \brief copy content of data from foreign **GPU** columnar buffer.
|
||||
* \param interfaces_str JSON representation of cuda array interfaces.
|
||||
* \param has_missing Whether users supplied their own missing value.
|
||||
* \param missing The missing value set by users.
|
||||
*/
|
||||
void CopyFrom(std::string const& cuda_interfaces_str, bool has_missing,
|
||||
bst_float missing = std::numeric_limits<float>::quiet_NaN());
|
||||
/*!
|
||||
* \brief Load data from binary stream.
|
||||
* \param fi the pointer to load data from.
|
||||
@ -74,14 +66,6 @@ class SimpleCSRSource : public DataSource<SparsePage> {
|
||||
static const int kMagic = 0xffffab01;
|
||||
|
||||
private:
|
||||
/*!
|
||||
* \brief copy content of data from foreign GPU columnar buffer.
|
||||
* \param columns JSON representation of array interfaces.
|
||||
* \param missing specified missing value
|
||||
*/
|
||||
void FromDeviceColumnar(std::vector<Json> const& columns,
|
||||
bool has_missing = false,
|
||||
float missing = std::numeric_limits<float>::quiet_NaN());
|
||||
/*! \brief internal variable, used to support iterator interface */
|
||||
bool at_first_{true};
|
||||
};
|
||||
|
||||
@ -1,380 +0,0 @@
|
||||
// Copyright (c) 2019 by Contributors
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/data.h>
|
||||
#include <xgboost/json.h>
|
||||
#include <thrust/device_vector.h>
|
||||
|
||||
#include <memory>
|
||||
#include "../../../src/common/bitfield.h"
|
||||
#include "../../../src/common/device_helpers.cuh"
|
||||
#include "../../../src/data/simple_csr_source.h"
|
||||
#include "../../../src/data/columnar.h"
|
||||
#include "test_columnar.h"
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
// Builds an array-interface object field by field and checks that constructing a
// Columnar from it throws dmlc::Error until the description is complete, and again
// once a mask with a mismatching shape is attached.
TEST(ArrayInterfaceHandler, Error) {
  constexpr size_t kRows {16};
  Json iface { Object() };
  std::vector<Json> shape_json {Json(Integer(static_cast<Integer::Int>(kRows)))};
  iface["shape"] = Array(shape_json);
  std::vector<Json> data_json {
      Json(Integer(reinterpret_cast<Integer::Int>(nullptr))),
      Json(Boolean(false))};

  // The same reference is reused while `iface` gains more fields below.
  auto const& fields = get<Object>(iface);

  // "version" is absent.
  EXPECT_THROW(Columnar parsed(fields), dmlc::Error);
  iface["version"] = Integer(static_cast<Integer::Int>(1));

  // "data" is absent.
  EXPECT_THROW(Columnar parsed(fields), dmlc::Error);
  iface["data"] = data_json;

  // "typestr" is absent.
  EXPECT_THROW(Columnar parsed(fields), dmlc::Error);
  iface["typestr"] = String("<f4");

  // The data pointer is still null.
  EXPECT_THROW(Columnar parsed(fields), dmlc::Error);

  thrust::device_vector<float> device_values(kRows);
  data_json = {Json(Integer(reinterpret_cast<Integer::Int>(device_values.data().get()))),
               Json(Boolean(false))};
  iface["data"] = data_json;
  EXPECT_NO_THROW(Columnar parsed(fields));

  // Attach a mask whose shape is one element shorter than the data's shape.
  std::vector<Json> mask_shape_json {Json(Integer(static_cast<Integer::Int>(kRows - 1)))};
  iface["mask"] = Object();
  iface["mask"]["shape"] = mask_shape_json;
  iface["mask"]["data"] = data_json;
  iface["mask"]["typestr"] = String("<i1");
  iface["mask"]["version"] = Integer(static_cast<Integer::Int>(1));
  EXPECT_THROW(Columnar parsed(fields), dmlc::Error);
}
|
||||
|
||||
|
||||
void TestGetElement() {
|
||||
thrust::device_vector<float> data;
|
||||
auto j_column = GenerateDenseColumn("<f4", 3, &data);
|
||||
auto const& column_obj = get<Object const>(j_column);
|
||||
Columnar foreign_column(column_obj);
|
||||
|
||||
EXPECT_NO_THROW({
|
||||
dh::LaunchN(0, 1, [=] __device__(size_t idx) {
|
||||
KERNEL_CHECK(foreign_column.GetElement(0) == 0.0f);
|
||||
KERNEL_CHECK(foreign_column.GetElement(1) == 2.0f);
|
||||
KERNEL_CHECK(foreign_column.GetElement(2) == 4.0f);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
TEST(Columnar, GetElement) { TestGetElement(); }
|
||||
|
||||
// Checks a CSR page built from dense columns in which every column holds the value
// 2 * row at each row.
//
// Verifies, for every row, that each of the n_cols entries carries the expected value
// and feature index, that the row offsets advance by n_cols per row, and that the
// recorded row/column counts match.
//
// \param source Source whose page and info are inspected.
// \param n_rows Expected number of rows.
// \param n_cols Expected number of columns; every column is checked, not only the
//               first two.
void TestDenseColumn(std::unique_ptr<data::SimpleCSRSource> const& source,
                     size_t n_rows, size_t n_cols) {
  auto const& data = source->page_.data.HostVector();
  auto const& offset = source->page_.offset.HostVector();

  for (size_t row = 0; row < n_rows; ++row) {
    for (size_t col = 0; col < n_cols; ++col) {
      auto const idx = row * n_cols + col;
      // at() keeps an out-of-range access from going unnoticed.
      auto const& entry = data.at(idx);
      ASSERT_NEAR(entry.fvalue, row * 2.0, kRtEps) << "idx: " << idx;
      ASSERT_EQ(entry.index, col) << "idx: " << idx;
    }
  }

  ASSERT_EQ(offset.back(), n_rows * n_cols);
  for (size_t row = 0; row < n_rows + 1; ++row) {
    ASSERT_EQ(offset[row], row * n_cols);
  }
  ASSERT_EQ(source->info.num_row_, n_rows);
  ASSERT_EQ(source->info.num_col_, n_cols);
}
|
||||
|
||||
// Loads two dense columns ("<f4" and "<i4") and checks the resulting CSR page without
// a missing value, with missing == 4.0, and with a NaN written into the first column.
TEST(SimpleCSRSource, FromColumnarDense) {
  constexpr size_t kRows {16};
  constexpr size_t kCols {2};

  thrust::device_vector<float> d_data_0(kRows);
  thrust::device_vector<int32_t> d_data_1(kRows);
  std::vector<Json> columns;
  columns.emplace_back(GenerateDenseColumn<float>("<f4", kRows, &d_data_0));
  columns.emplace_back(GenerateDenseColumn<int32_t>("<i4", kRows, &d_data_1));

  Json column_arr {columns};
  std::stringstream dumped;
  Json::Dump(column_arr, &dumped);
  std::string interfaces_str = dumped.str();

  {
    // No missing value specified: the page is fully dense.
    std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
    source->CopyFrom(interfaces_str.c_str(), false);
    TestDenseColumn(source, kRows, kCols);
  }

  {
    // missing == 4.0: two entries are expected to be dropped.
    std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
    source->CopyFrom(interfaces_str.c_str(), true, 4.0);

    auto const& entries = source->page_.data.HostVector();
    auto const& row_ptr = source->page_.offset.HostVector();
    ASSERT_EQ(entries.size(), kRows * kCols - 2);
    ASSERT_NEAR(entries[4].fvalue, 6.0, kRtEps);  // kCols * 2
    ASSERT_EQ(row_ptr.back(), 30);
    for (size_t row_end = 3; row_end < kRows + 1; ++row_end) {
      ASSERT_EQ(row_ptr[row_end], (row_end - 1) * 2);
    }
    ASSERT_EQ(source->info.num_row_, kRows);
    ASSERT_EQ(source->info.num_col_, kCols);
  }

  {
    // No missing value specified, but one element of the first column is NaN
    // (it replaces 6.0): exactly one entry is expected to be dropped.
    std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
    d_data_0[3] = std::numeric_limits<float>::quiet_NaN();
    ASSERT_TRUE(std::isnan(d_data_0[3]));
    source->CopyFrom(interfaces_str.c_str(), false);

    auto const& entries = source->page_.data.HostVector();
    auto const& row_ptr = source->page_.offset.HostVector();
    ASSERT_EQ(entries.size(), kRows * kCols - 1);
    ASSERT_NEAR(entries[7].fvalue, 8.0, kRtEps);
    ASSERT_EQ(source->info.num_row_, kRows);
    ASSERT_EQ(source->info.num_col_, kCols);
  }
}
|
||||
|
||||
// Every column shares the same validity mask, so a masked-out row is empty in the
// resulting CSR page. Checks that the remaining rows hold one entry per column with
// the row number as value.
TEST(SimpleCSRSource, FromColumnarWithEmptyRows) {
  constexpr size_t kRows = 102;
  constexpr size_t kCols = 24;
  RBitField8::value_type constexpr kUCOne = 1;

  std::vector<Json> v_columns (kCols);
  std::vector<dh::device_vector<float>> columns_data(kCols);
  std::vector<dh::device_vector<RBitField8::value_type>> column_bitfields(kCols);

  for (size_t c = 0; c < kCols; ++c) {
    auto& col = v_columns[c];
    col = Object();

    // Column values are 0, 1, ..., kRows - 1.
    auto& values = columns_data[c];
    values.resize(kRows);
    thrust::sequence(values.begin(), values.end(), 0);
    dh::safe_cuda(cudaDeviceSynchronize());
    dh::safe_cuda(cudaGetLastError());
    ASSERT_EQ(values.size(), kRows);

    auto p_d_data = raw_pointer_cast(values.data());
    std::vector<Json> j_data {
        Json(Integer(reinterpret_cast<Integer::Int>(p_d_data))),
        Json(Boolean(false))};
    col["data"] = j_data;
    std::vector<Json> j_shape {Json(Integer(static_cast<Integer::Int>(kRows)))};
    col["shape"] = Array(j_shape);
    col["version"] = Integer(static_cast<Integer::Int>(1));
    col["typestr"] = String("<f4");

    // Construct the mask object.
    col["mask"] = Object();
    auto& j_mask = col["mask"];
    j_mask["version"] = Integer(static_cast<Integer::Int>(1));

    auto& mask_storage = column_bitfields[c];
    mask_storage.resize(16);  // 16 bytes

    // Start with every bit set, then clear a single bit in bytes 0, 1 and 12.
    for (size_t byte = 0; byte < mask_storage.size(); ++byte) {
      mask_storage[byte] = ~0;
    }
    size_t const last_ind = 12;
    mask_storage[0] = ~(kUCOne << 2);
    mask_storage[1] = ~(kUCOne << 3);
    mask_storage[last_ind] = ~(kUCOne << 5);

    j_mask["data"] = std::vector<Json>{
        Json(Integer(reinterpret_cast<Integer::Int>(mask_storage.data().get()))),
        Json(Boolean(false))};
    j_mask["shape"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(kRows)))});
    j_mask["typestr"] = String("|i1");
  }

  Json column_arr {Array(v_columns)};
  std::stringstream dumped;
  Json::Dump(column_arr, &dumped);
  std::string interfaces_str = dumped.str();

  std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
  source->CopyFrom(interfaces_str.c_str(), false);

  auto const& entries = source->page_.data.HostVector();
  auto const& row_ptr = source->page_.offset.HostVector();

  ASSERT_EQ(row_ptr.size(), kRows + 1);
  for (size_t row = 0; row + 1 < row_ptr.size(); ++row) {
    for (size_t j = row_ptr[row]; j < row_ptr[row + 1]; ++j) {
      ASSERT_EQ(entries[j].index, j % kCols);
      ASSERT_NEAR(entries[j].fvalue, row, kRtEps);
    }
  }
  ASSERT_EQ(source->info.num_row_, kRows);
}
|
||||
|
||||
// Two "<f4" columns holding 0 .. kRows - 1, each with its own validity mask that marks
// exactly one row as invalid. Exercises loading without a missing value, with
// missing == 2.0, with a NaN in the data, and with NaN as the missing value.
TEST(SimpleCSRSource, FromColumnarSparse) {
  constexpr size_t kRows = 32;
  constexpr size_t kCols = 2;
  RBitField8::value_type constexpr kUCOne = 1;

  std::vector<dh::device_vector<float>> columns_data(kCols);
  std::vector<dh::device_vector<RBitField8::value_type>> column_bitfields(kCols);

  // Both masks start with all bits set.
  for (auto& mask : column_bitfields) {
    mask.resize(8);
    for (size_t byte = 0; byte < mask.size(); ++byte) {
      mask[byte] = ~0;
    }
  }
  // Column 0: clear bit 2 of byte 0 (the assertions below expect row 2 to lack a
  // column-0 entry).
  column_bitfields[0][0] = ~(kUCOne << 2);
  // Column 1: clear bit 3 of byte 2 (the assertions below expect row 19 to lack a
  // column-1 entry).
  column_bitfields[1][2] = ~(kUCOne << 3);

  std::vector<Json> j_columns(kCols);
  for (size_t c = 0; c < kCols; ++c) {
    columns_data[c].resize(kRows);
    thrust::sequence(columns_data[c].begin(), columns_data[c].end(), 0);
  }

  for (size_t c = 0; c < kCols; ++c) {
    auto& column = j_columns[c];
    column = Object();
    column["version"] = Integer(static_cast<Integer::Int>(1));
    column["typestr"] = String("<f4");

    auto p_d_data = raw_pointer_cast(columns_data[c].data());
    std::vector<Json> j_data {
        Json(Integer(reinterpret_cast<Integer::Int>(p_d_data))),
        Json(Boolean(false))};
    column["data"] = j_data;
    std::vector<Json> j_shape {Json(Integer(static_cast<Integer::Int>(kRows)))};
    column["shape"] = Array(j_shape);

    column["mask"] = Object();
    auto& j_mask = column["mask"];
    j_mask["version"] = Integer(static_cast<Integer::Int>(1));
    j_mask["data"] = std::vector<Json>{
        Json(Integer(reinterpret_cast<Integer::Int>(column_bitfields[c].data().get()))),
        Json(Boolean(false))};
    j_mask["shape"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(kRows)))});
    j_mask["typestr"] = String("|i1");
  }

  Json column_arr {Array(j_columns)};
  std::stringstream dumped;
  Json::Dump(column_arr, &dumped);
  std::string interfaces_str = dumped.str();

  {
    // No missing value specified: the masks decide which entries are dropped.
    std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
    source->CopyFrom(interfaces_str.c_str(), false);

    auto const& entries = source->page_.data.HostVector();
    auto const& row_ptr = source->page_.offset.HostVector();

    ASSERT_EQ(row_ptr.size(), kRows + 1);
    ASSERT_EQ(entries[4].index, 1);
    ASSERT_EQ(entries[4].fvalue, 2);
    ASSERT_EQ(entries[37].index, 0);
    ASSERT_EQ(entries[37].fvalue, 19);
  }

  {
    // missing == 2.0: the entry at position 4 must no longer hold 2.0.
    std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
    source->CopyFrom(interfaces_str.c_str(), true, /*missing=*/2.0);

    auto const& entries = source->page_.data.HostVector();
    ASSERT_NE(entries[4].fvalue, 2.0);
  }

  {
    // No missing value specified, and row 4 of column 0 is NaN.
    std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
    columns_data[0][4] = std::numeric_limits<float>::quiet_NaN();
    ASSERT_TRUE(std::isnan(columns_data[0][4]));
    source->CopyFrom(interfaces_str.c_str(), false);

    auto const& entries = source->page_.data.HostVector();
    auto const& row_ptr = source->page_.offset.HostVector();
    // Three entries are dropped in total: the two masked ones and the NaN.
    // Expected start of the CSR layout (values per row):
    //   | 0, 0 | 1, 1 | 2 | 3, 3 | 4 | ...
    ASSERT_EQ(entries.size(), kRows * kCols - 3);
    ASSERT_EQ(entries[4].index, 1);  // from column 1
    ASSERT_EQ(entries[5].fvalue, 3.0);
    ASSERT_EQ(entries[7].index, 1);  // from column 1
    ASSERT_EQ(entries[7].fvalue, 4.0);

    ASSERT_EQ(entries[row_ptr[2]].fvalue, 2.0);
    ASSERT_EQ(entries[row_ptr[4]].fvalue, 4.0);
  }

  {
    // NaN as the user-specified missing value; the NaN written by the previous
    // section is still present in column 0.
    std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
    source->CopyFrom(interfaces_str.c_str(), true,
                     /*missing=*/std::numeric_limits<float>::quiet_NaN());

    auto const& entries = source->page_.data.HostVector();
    ASSERT_EQ(entries.size(), kRows * kCols - 1);
    ASSERT_EQ(entries[8].fvalue, 4.0);
  }
}
|
||||
|
||||
// Loads one "<f8" column and one "<u4" column and expects the same dense page as in
// the float/int32 case.
TEST(SimpleCSRSource, Types) {
  constexpr size_t kRows {16};
  constexpr size_t kCols {2};

  thrust::device_vector<double> d_doubles(kRows);
  thrust::device_vector<uint32_t> d_uints(kRows);

  std::vector<Json> columns;
  columns.emplace_back(GenerateDenseColumn<double>("<f8", kRows, &d_doubles));
  columns.emplace_back(GenerateDenseColumn<uint32_t>("<u4", kRows, &d_uints));

  Json column_arr {columns};
  std::stringstream dumped;
  Json::Dump(column_arr, &dumped);
  std::string interfaces_str = dumped.str();

  std::unique_ptr<data::SimpleCSRSource> source (new data::SimpleCSRSource());
  source->CopyFrom(interfaces_str.c_str(), false);
  TestDenseColumn(source, kRows, kCols);
}
|
||||
|
||||
} // namespace xgboost
|
||||
Loading…
x
Reference in New Issue
Block a user