External data adapters (#5044)
* Use external data adapters as lightweight intermediate layer between external data and DMatrix
This commit is contained in:
parent
f2277e7106
commit
e3c34c79be
@ -465,6 +465,20 @@ class DMatrix {
|
||||
*/
|
||||
static DMatrix* Create(std::unique_ptr<DataSource<SparsePage>>&& source,
|
||||
const std::string& cache_prefix = "");
|
||||
|
||||
/**
|
||||
* \brief Creates a new DMatrix from an external data adapter.
|
||||
*
|
||||
* \tparam AdapterT Type of the adapter.
|
||||
* \param adapter View onto external data.
|
||||
* \param missing Values to count as missing.
|
||||
* \param nthread Number of threads for construction.
|
||||
*
|
||||
* \return The created DMatrix.
|
||||
*/
|
||||
template <typename AdapterT>
|
||||
static DMatrix* Create(AdapterT* adapter, float missing, int nthread);
|
||||
|
||||
/*!
|
||||
* \brief Create a DMatrix by loading data from parser.
|
||||
* Parser can later be deleted after the DMatrix is created.
|
||||
|
||||
@ -18,9 +18,8 @@
|
||||
|
||||
#include "c_api_error.h"
|
||||
#include "../data/simple_csr_source.h"
|
||||
#include "../common/math.h"
|
||||
#include "../common/io.h"
|
||||
#include "../common/group_data.h"
|
||||
#include "../data/adapter.h"
|
||||
|
||||
|
||||
namespace xgboost {
|
||||
@ -218,37 +217,9 @@ XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr,
|
||||
size_t nelem,
|
||||
size_t num_col,
|
||||
DMatrixHandle* out) {
|
||||
std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
|
||||
|
||||
API_BEGIN();
|
||||
data::SimpleCSRSource& mat = *source;
|
||||
auto& offset_vec = mat.page_.offset.HostVector();
|
||||
auto& data_vec = mat.page_.data.HostVector();
|
||||
offset_vec.reserve(nindptr);
|
||||
data_vec.reserve(nelem);
|
||||
offset_vec.resize(1);
|
||||
offset_vec[0] = 0;
|
||||
size_t num_column = 0;
|
||||
for (size_t i = 1; i < nindptr; ++i) {
|
||||
for (size_t j = indptr[i - 1]; j < indptr[i]; ++j) {
|
||||
if (!common::CheckNAN(data[j])) {
|
||||
// automatically skip nan.
|
||||
data_vec.emplace_back(Entry(indices[j], data[j]));
|
||||
num_column = std::max(num_column, static_cast<size_t>(indices[j] + 1));
|
||||
}
|
||||
}
|
||||
offset_vec.push_back(mat.page_.data.Size());
|
||||
}
|
||||
|
||||
mat.info.num_col_ = num_column;
|
||||
if (num_col > 0) {
|
||||
CHECK_LE(mat.info.num_col_, num_col)
|
||||
<< "num_col=" << num_col << " vs " << mat.info.num_col_;
|
||||
mat.info.num_col_ = num_col;
|
||||
}
|
||||
mat.info.num_row_ = nindptr - 1;
|
||||
mat.info.num_nonzero_ = mat.page_.data.Size();
|
||||
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(std::move(source)));
|
||||
data::CSRAdapter adapter(indptr, indices, data, nindptr - 1, nelem, num_col);
|
||||
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, std::nan(""), 1));
|
||||
API_END();
|
||||
}
|
||||
|
||||
@ -259,361 +230,41 @@ XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr,
|
||||
size_t nelem,
|
||||
size_t num_row,
|
||||
DMatrixHandle* out) {
|
||||
std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
|
||||
|
||||
API_BEGIN();
|
||||
// FIXME: User should be able to control number of threads
|
||||
const int nthread = omp_get_max_threads();
|
||||
data::SimpleCSRSource& mat = *source;
|
||||
auto& offset_vec = mat.page_.offset.HostVector();
|
||||
auto& data_vec = mat.page_.data.HostVector();
|
||||
common::ParallelGroupBuilder<
|
||||
Entry, std::remove_reference<decltype(offset_vec)>::type::value_type>
|
||||
builder(&offset_vec, &data_vec);
|
||||
builder.InitBudget(0, nthread);
|
||||
size_t ncol = nindptr - 1; // NOLINT(*)
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (omp_ulong i = 0; i < static_cast<omp_ulong>(ncol); ++i) { // NOLINT(*)
|
||||
int tid = omp_get_thread_num();
|
||||
for (size_t j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
|
||||
if (!common::CheckNAN(data[j])) {
|
||||
builder.AddBudget(indices[j], tid);
|
||||
}
|
||||
}
|
||||
}
|
||||
builder.InitStorage();
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (omp_ulong i = 0; i < static_cast<omp_ulong>(ncol); ++i) { // NOLINT(*)
|
||||
int tid = omp_get_thread_num();
|
||||
for (size_t j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
|
||||
if (!common::CheckNAN(data[j])) {
|
||||
builder.Push(indices[j],
|
||||
Entry(static_cast<bst_uint>(i), data[j]),
|
||||
tid);
|
||||
}
|
||||
}
|
||||
}
|
||||
mat.info.num_row_ = mat.page_.offset.Size() - 1;
|
||||
if (num_row > 0) {
|
||||
CHECK_LE(mat.info.num_row_, num_row);
|
||||
// provision for empty rows at the bottom of matrix
|
||||
auto& offset_vec = mat.page_.offset.HostVector();
|
||||
for (uint64_t i = mat.info.num_row_; i < static_cast<uint64_t>(num_row); ++i) {
|
||||
offset_vec.push_back(offset_vec.back());
|
||||
}
|
||||
mat.info.num_row_ = num_row;
|
||||
CHECK_EQ(mat.info.num_row_, offset_vec.size() - 1); // sanity check
|
||||
}
|
||||
mat.info.num_col_ = ncol;
|
||||
mat.info.num_nonzero_ = nelem;
|
||||
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(std::move(source)));
|
||||
data::CSCAdapter adapter(col_ptr, indices, data, nindptr - 1, num_row);
|
||||
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, std::nan(""), 1));
|
||||
API_END();
|
||||
}
|
||||
|
||||
XGB_DLL int XGDMatrixCreateFromMat(const bst_float* data,
|
||||
xgboost::bst_ulong nrow,
|
||||
xgboost::bst_ulong ncol,
|
||||
bst_float missing,
|
||||
xgboost::bst_ulong ncol, bst_float missing,
|
||||
DMatrixHandle* out) {
|
||||
std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
|
||||
|
||||
API_BEGIN();
|
||||
data::SimpleCSRSource& mat = *source;
|
||||
auto& offset_vec = mat.page_.offset.HostVector();
|
||||
auto& data_vec = mat.page_.data.HostVector();
|
||||
offset_vec.resize(1+nrow);
|
||||
bool nan_missing = common::CheckNAN(missing);
|
||||
mat.info.num_row_ = nrow;
|
||||
mat.info.num_col_ = ncol;
|
||||
const bst_float* data0 = data;
|
||||
|
||||
// count elements for sizing data
|
||||
data = data0;
|
||||
for (xgboost::bst_ulong i = 0; i < nrow; ++i, data += ncol) {
|
||||
xgboost::bst_ulong nelem = 0;
|
||||
for (xgboost::bst_ulong j = 0; j < ncol; ++j) {
|
||||
if (common::CheckNAN(data[j])) {
|
||||
CHECK(nan_missing)
|
||||
<< "There are NAN in the matrix, however, you did not set missing=NAN";
|
||||
} else {
|
||||
if (nan_missing || data[j] != missing) {
|
||||
++nelem;
|
||||
}
|
||||
}
|
||||
}
|
||||
offset_vec[i+1] = offset_vec[i] + nelem;
|
||||
}
|
||||
data_vec.resize(mat.page_.data.Size() + offset_vec.back());
|
||||
|
||||
data = data0;
|
||||
for (xgboost::bst_ulong i = 0; i < nrow; ++i, data += ncol) {
|
||||
xgboost::bst_ulong matj = 0;
|
||||
for (xgboost::bst_ulong j = 0; j < ncol; ++j) {
|
||||
if (common::CheckNAN(data[j])) {
|
||||
} else {
|
||||
if (nan_missing || data[j] != missing) {
|
||||
data_vec[offset_vec[i] + matj] = Entry(j, data[j]);
|
||||
++matj;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mat.info.num_nonzero_ = mat.page_.data.Size();
|
||||
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(std::move(source)));
|
||||
data::DenseAdapter adapter(data, nrow, nrow * ncol, ncol);
|
||||
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, 1));
|
||||
API_END();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void PrefixSum(T *x, size_t N) {
|
||||
std::vector<T> suma;
|
||||
#pragma omp parallel
|
||||
{
|
||||
const int ithread = omp_get_thread_num();
|
||||
const int nthreads = omp_get_num_threads();
|
||||
#pragma omp single
|
||||
{
|
||||
suma.resize(nthreads+1);
|
||||
suma[0] = 0;
|
||||
}
|
||||
T sum = 0;
|
||||
T offset = 0;
|
||||
#pragma omp for schedule(static)
|
||||
for (omp_ulong i = 0; i < N; i++) {
|
||||
sum += x[i];
|
||||
x[i] = sum;
|
||||
}
|
||||
suma[ithread+1] = sum;
|
||||
#pragma omp barrier
|
||||
for (omp_ulong i = 0; i < static_cast<omp_ulong>(ithread+1); i++) {
|
||||
offset += suma[i];
|
||||
}
|
||||
#pragma omp for schedule(static)
|
||||
for (omp_ulong i = 0; i < N; i++) {
|
||||
x[i] += offset;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
XGB_DLL int XGDMatrixCreateFromMat_omp(const bst_float* data, // NOLINT
|
||||
xgboost::bst_ulong nrow,
|
||||
xgboost::bst_ulong ncol,
|
||||
bst_float missing, DMatrixHandle* out,
|
||||
int nthread) {
|
||||
// avoid openmp unless enough data to be worth it to avoid overhead costs
|
||||
if (nrow*ncol <= 10000*50) {
|
||||
return(XGDMatrixCreateFromMat(data, nrow, ncol, missing, out));
|
||||
}
|
||||
|
||||
API_BEGIN();
|
||||
const int nthreadmax = std::max(omp_get_num_procs() / 2 - 1, 1);
|
||||
// const int nthreadmax = omp_get_max_threads();
|
||||
if (nthread <= 0) nthread=nthreadmax;
|
||||
int nthread_orig = omp_get_max_threads();
|
||||
omp_set_num_threads(nthread);
|
||||
|
||||
std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
|
||||
data::SimpleCSRSource& mat = *source;
|
||||
auto& offset_vec = mat.page_.offset.HostVector();
|
||||
auto& data_vec = mat.page_.data.HostVector();
|
||||
offset_vec.resize(1+nrow);
|
||||
mat.info.num_row_ = nrow;
|
||||
mat.info.num_col_ = ncol;
|
||||
|
||||
// Check for errors in missing elements
|
||||
// Count elements per row (to avoid otherwise need to copy)
|
||||
bool nan_missing = common::CheckNAN(missing);
|
||||
std::vector<int> badnan;
|
||||
badnan.resize(nthread, 0);
|
||||
|
||||
#pragma omp parallel num_threads(nthread)
|
||||
{
|
||||
int ithread = omp_get_thread_num();
|
||||
|
||||
// Count elements per row
|
||||
#pragma omp for schedule(static)
|
||||
for (omp_ulong i = 0; i < nrow; ++i) {
|
||||
xgboost::bst_ulong nelem = 0;
|
||||
for (xgboost::bst_ulong j = 0; j < ncol; ++j) {
|
||||
if (common::CheckNAN(data[ncol*i + j]) && !nan_missing) {
|
||||
badnan[ithread] = 1;
|
||||
} else if (common::CheckNAN(data[ncol * i + j])) {
|
||||
} else if (nan_missing || data[ncol * i + j] != missing) {
|
||||
++nelem;
|
||||
}
|
||||
}
|
||||
offset_vec[i+1] = nelem;
|
||||
}
|
||||
}
|
||||
// Inform about any NaNs and resize data matrix
|
||||
for (int i = 0; i < nthread; i++) {
|
||||
CHECK(!badnan[i]) << "There are NAN in the matrix, however, you did not set missing=NAN";
|
||||
}
|
||||
|
||||
// do cumulative sum (to avoid otherwise need to copy)
|
||||
PrefixSum(&offset_vec[0], offset_vec.size());
|
||||
data_vec.resize(mat.page_.data.Size() + offset_vec.back());
|
||||
|
||||
// Fill data matrix (now that know size, no need for slow push_back())
|
||||
#pragma omp parallel num_threads(nthread)
|
||||
{
|
||||
#pragma omp for schedule(static)
|
||||
for (omp_ulong i = 0; i < nrow; ++i) {
|
||||
xgboost::bst_ulong matj = 0;
|
||||
for (xgboost::bst_ulong j = 0; j < ncol; ++j) {
|
||||
if (common::CheckNAN(data[ncol * i + j])) {
|
||||
} else if (nan_missing || data[ncol * i + j] != missing) {
|
||||
data_vec[offset_vec[i] + matj] =
|
||||
Entry(j, data[ncol * i + j]);
|
||||
++matj;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// restore omp state
|
||||
omp_set_num_threads(nthread_orig);
|
||||
|
||||
mat.info.num_nonzero_ = mat.page_.data.Size();
|
||||
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(std::move(source)));
|
||||
data::DenseAdapter adapter(data, nrow, nrow * ncol, ncol);
|
||||
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, nthread));
|
||||
API_END();
|
||||
}
|
||||
|
||||
enum class DTType : uint8_t {
|
||||
kFloat32 = 0,
|
||||
kFloat64 = 1,
|
||||
kBool8 = 2,
|
||||
kInt32 = 3,
|
||||
kInt8 = 4,
|
||||
kInt16 = 5,
|
||||
kInt64 = 6,
|
||||
kUnknown = 7
|
||||
};
|
||||
|
||||
DTType DTGetType(std::string type_string) {
|
||||
if (type_string == "float32") {
|
||||
return DTType::kFloat32;
|
||||
} else if (type_string == "float64") {
|
||||
return DTType::kFloat64;
|
||||
} else if (type_string == "bool8") {
|
||||
return DTType::kBool8;
|
||||
} else if (type_string == "int32") {
|
||||
return DTType::kInt32;
|
||||
} else if (type_string == "int8") {
|
||||
return DTType::kInt8;
|
||||
} else if (type_string == "int16") {
|
||||
return DTType::kInt16;
|
||||
} else if (type_string == "int64") {
|
||||
return DTType::kInt64;
|
||||
} else {
|
||||
LOG(FATAL) << "Unknown data table type.";
|
||||
return DTType::kUnknown;
|
||||
}
|
||||
}
|
||||
|
||||
float DTGetValue(void* column, DTType dt_type, size_t ridx) {
|
||||
float missing = std::numeric_limits<float>::quiet_NaN();
|
||||
switch (dt_type) {
|
||||
case DTType::kFloat32: {
|
||||
float val = reinterpret_cast<float*>(column)[ridx];
|
||||
return std::isfinite(val) ? val : missing;
|
||||
}
|
||||
case DTType::kFloat64: {
|
||||
double val = reinterpret_cast<double*>(column)[ridx];
|
||||
return std::isfinite(val) ? static_cast<float>(val) : missing;
|
||||
}
|
||||
case DTType::kBool8: {
|
||||
bool val = reinterpret_cast<bool*>(column)[ridx];
|
||||
return static_cast<float>(val);
|
||||
}
|
||||
case DTType::kInt32: {
|
||||
int32_t val = reinterpret_cast<int32_t*>(column)[ridx];
|
||||
return val != (-2147483647 - 1) ? static_cast<float>(val) : missing;
|
||||
}
|
||||
case DTType::kInt8: {
|
||||
int8_t val = reinterpret_cast<int8_t*>(column)[ridx];
|
||||
return val != -128 ? static_cast<float>(val) : missing;
|
||||
}
|
||||
case DTType::kInt16: {
|
||||
int16_t val = reinterpret_cast<int16_t*>(column)[ridx];
|
||||
return val != -32768 ? static_cast<float>(val) : missing;
|
||||
}
|
||||
case DTType::kInt64: {
|
||||
int64_t val = reinterpret_cast<int64_t*>(column)[ridx];
|
||||
return val != -9223372036854775807 - 1 ? static_cast<float>(val)
|
||||
: missing;
|
||||
}
|
||||
default: {
|
||||
LOG(FATAL) << "Unknown data table type.";
|
||||
return 0.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
XGB_DLL int XGDMatrixCreateFromDT(void** data, const char** feature_stypes,
|
||||
xgboost::bst_ulong nrow,
|
||||
xgboost::bst_ulong ncol, DMatrixHandle* out,
|
||||
int nthread) {
|
||||
// avoid openmp unless enough data to be worth it to avoid overhead costs
|
||||
if (nrow * ncol <= 10000 * 50) {
|
||||
nthread = 1;
|
||||
}
|
||||
|
||||
API_BEGIN();
|
||||
const int nthreadmax = std::max(omp_get_num_procs() / 2 - 1, 1);
|
||||
if (nthread <= 0) nthread = nthreadmax;
|
||||
int nthread_orig = omp_get_max_threads();
|
||||
omp_set_num_threads(nthread);
|
||||
|
||||
std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
|
||||
data::SimpleCSRSource& mat = *source;
|
||||
mat.page_.offset.Resize(1 + nrow);
|
||||
mat.info.num_row_ = nrow;
|
||||
mat.info.num_col_ = ncol;
|
||||
|
||||
auto& page_offset = mat.page_.offset.HostVector();
|
||||
#pragma omp parallel num_threads(nthread)
|
||||
{
|
||||
// Count elements per row, column by column
|
||||
for (auto j = 0u; j < ncol; ++j) {
|
||||
DTType dtype = DTGetType(feature_stypes[j]);
|
||||
#pragma omp for schedule(static)
|
||||
for (omp_ulong i = 0; i < nrow; ++i) {
|
||||
float val = DTGetValue(data[j], dtype, i);
|
||||
if (!std::isnan(val)) {
|
||||
page_offset[i + 1]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// do cumulative sum (to avoid otherwise need to copy)
|
||||
PrefixSum(&page_offset[0], page_offset.size());
|
||||
|
||||
mat.page_.data.Resize(mat.page_.data.Size() + page_offset.back());
|
||||
|
||||
auto& page_data = mat.page_.data.HostVector();
|
||||
|
||||
// Fill data matrix (now that know size, no need for slow push_back())
|
||||
std::vector<size_t> position(nrow);
|
||||
#pragma omp parallel num_threads(nthread)
|
||||
{
|
||||
for (xgboost::bst_ulong j = 0; j < ncol; ++j) {
|
||||
DTType dtype = DTGetType(feature_stypes[j]);
|
||||
#pragma omp for schedule(static)
|
||||
for (omp_ulong i = 0; i < nrow; ++i) {
|
||||
float val = DTGetValue(data[j], dtype, i);
|
||||
if (!std::isnan(val)) {
|
||||
page_data[page_offset[i] + position[i]] = Entry(j, val);
|
||||
position[i]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// restore omp state
|
||||
omp_set_num_threads(nthread_orig);
|
||||
|
||||
mat.info.num_nonzero_ = mat.page_.data.Size();
|
||||
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(std::move(source)));
|
||||
data::DataTableAdapter adapter(data, feature_stypes, nrow, ncol);
|
||||
*out = new std::shared_ptr<DMatrix>(
|
||||
DMatrix::Create(&adapter, std::nan(""), nthread));
|
||||
API_END();
|
||||
}
|
||||
|
||||
|
||||
@ -69,25 +69,26 @@ struct ParallelGroupBuilder {
|
||||
/*! \brief step 3: initialize the necessary storage */
|
||||
inline void InitStorage() {
|
||||
// set rptr to correct size
|
||||
SizeType rptr_fill_value = rptr_.empty() ? 0 : rptr_.back();
|
||||
for (std::size_t tid = 0; tid < thread_rptr_.size(); ++tid) {
|
||||
if (rptr_.size() <= thread_rptr_[tid].size()) {
|
||||
rptr_.resize(thread_rptr_[tid].size() + 1); // key + 1
|
||||
rptr_.resize(thread_rptr_[tid].size() + 1, rptr_fill_value); // key + 1
|
||||
}
|
||||
}
|
||||
// initialize rptr to be beginning of each segment
|
||||
std::size_t start = 0;
|
||||
std::size_t count = 0;
|
||||
for (std::size_t i = 0; i + 1 < rptr_.size(); ++i) {
|
||||
for (std::size_t tid = 0; tid < thread_rptr_.size(); ++tid) {
|
||||
std::vector<SizeType> &trptr = thread_rptr_[tid];
|
||||
if (i < trptr.size()) { // i^th row is assigned for this thread
|
||||
std::size_t ncnt = trptr[i]; // how many entries in this row
|
||||
trptr[i] = start;
|
||||
start += ncnt;
|
||||
std::size_t thread_count = trptr[i]; // how many entries in this row
|
||||
trptr[i] = count + rptr_.back();
|
||||
count += thread_count;
|
||||
}
|
||||
}
|
||||
rptr_[i + 1] = start; // pointer accumulated from all thread
|
||||
rptr_[i + 1] += count; // pointer accumulated from all thread
|
||||
}
|
||||
data_.resize(start);
|
||||
data_.resize(rptr_.back());
|
||||
}
|
||||
/*!
|
||||
* \brief step 4: add data to the allocated space,
|
||||
|
||||
488
src/data/adapter.h
Normal file
488
src/data/adapter.h
Normal file
@ -0,0 +1,488 @@
|
||||
/*!
|
||||
* Copyright (c) 2019 by Contributors
|
||||
* \file adapter.h
|
||||
*/
|
||||
#ifndef XGBOOST_DATA_ADAPTER_H_
|
||||
#define XGBOOST_DATA_ADAPTER_H_
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
/** External data formats should implement an adapter as below. The
|
||||
* adapter provides a uniform access to data outside xgboost, allowing
|
||||
* construction of DMatrix objects from a range of sources without duplicating
|
||||
* code.
|
||||
*
|
||||
* The adapter object is an iterator that returns batches of data. Each batch
|
||||
* contains a number of "lines". A line represents a set of elements from a
|
||||
* sparse input matrix, normally a row in the case of a CSR matrix or a column
|
||||
* for a CSC matrix. Typically in sparse matrix formats we can efficiently
|
||||
* access subsets of elements at a time, but cannot efficiently look up elements
|
||||
* by random access, hence the "line" abstraction, allowing the sparse matrix to
|
||||
* return subsets of elements efficiently. Individual elements are described by
|
||||
* a COO tuple (row index, column index, value).
|
||||
*
|
||||
* This abstraction allows us to read through different sparse matrix formats
|
||||
* using the same interface. In particular we can write a DMatrix constructor
|
||||
* that uses the same code to construct itself from a CSR matrix, CSC matrix,
|
||||
* dense matrix, csv, libsvm file, or potentially other formats. To see why this
|
||||
* is necessary, imagine we have 5 external matrix formats and 5 internal
|
||||
* DMatrix types where each DMatrix needs a custom constructor for each possible
|
||||
* input. The number of constructors is 5*5=25. Using an abstraction over the
|
||||
* input data types the number of constructors is reduced to 5, as each DMatrix
|
||||
* is oblivious to the external data format. Adding a new input source is simply
|
||||
* a case of implementing an adapter.
|
||||
*
|
||||
* Most of the below adapters do not need more than one batch as the data
|
||||
* originates from an in memory source. The file adapter does require batches to
|
||||
* avoid loading the entire file in memory.
|
||||
*
|
||||
* An important detail is empty row/column handling. Files loaded from disk do
|
||||
* not provide meta information about the number of rows/columns to expect, this
|
||||
* needs to be inferred during construction. Other sparse formats may specify a
|
||||
* number of rows/columns, but we can encounter entirely sparse rows or columns,
|
||||
* leading to disagreement between the inferred number and the meta-info
|
||||
* provided. To resolve this, adapters have methods specifying the number of
|
||||
* rows/columns expected, these methods may return zero where these values must
|
||||
* be inferred from data. A constructed DMatrix should agree with the input
|
||||
* source on numbers of rows/columns, appending empty rows if necessary.
|
||||
* */
|
||||
|
||||
/** \brief An adapter can return this value for number of rows or columns
|
||||
* indicating that this value is currently unknown and should be inferred while
|
||||
* passing over the data. */
|
||||
constexpr size_t kAdapterUnknownSize = std::numeric_limits<size_t >::max();
|
||||
|
||||
/** \brief A single matrix element in coordinate form: (row, column, value). */
struct COOTuple {
  COOTuple(size_t row, size_t column, float element_value)
      : row_idx{row}, column_idx{column}, value{element_value} {}

  size_t row_idx{0};     // row of the element
  size_t column_idx{0};  // column (feature index) of the element
  float value{0};        // stored value
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
/**
|
||||
* \brief Simplifies the use of DataIter when there is only one batch.
|
||||
*/
|
||||
template <typename DType>
|
||||
class SingleBatchDataIter : dmlc::DataIter<DType> {
|
||||
public:
|
||||
void BeforeFirst() override { counter = 0; }
|
||||
bool Next() override {
|
||||
if (counter == 0) {
|
||||
counter++;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private:
|
||||
int counter{0};
|
||||
};
|
||||
|
||||
/** \brief Indicates this data source cannot contain meta-info such as labels,
|
||||
* weights or qid. */
|
||||
class NoMetaInfo {
|
||||
public:
|
||||
const float* Labels() const { return nullptr; }
|
||||
const float* Weights() const { return nullptr; }
|
||||
const uint64_t* Qid() const { return nullptr; }
|
||||
};
|
||||
|
||||
}; // namespace detail
|
||||
|
||||
class CSRAdapterBatch : public detail::NoMetaInfo {
|
||||
public:
|
||||
class Line {
|
||||
public:
|
||||
Line(size_t row_idx, size_t size, const unsigned* feature_idx,
|
||||
const float* values)
|
||||
: row_idx(row_idx),
|
||||
size(size),
|
||||
feature_idx(feature_idx),
|
||||
values(values) {}
|
||||
|
||||
size_t Size() const { return size; }
|
||||
COOTuple GetElement(size_t idx) const {
|
||||
return COOTuple(row_idx, feature_idx[idx], values[idx]);
|
||||
}
|
||||
|
||||
private:
|
||||
size_t row_idx;
|
||||
size_t size;
|
||||
const unsigned* feature_idx;
|
||||
const float* values;
|
||||
};
|
||||
CSRAdapterBatch(const size_t* row_ptr, const unsigned* feature_idx,
|
||||
const float* values, size_t num_rows, size_t num_elements,
|
||||
size_t num_features)
|
||||
: row_ptr(row_ptr),
|
||||
feature_idx(feature_idx),
|
||||
values(values),
|
||||
num_rows(num_rows),
|
||||
num_elements(num_elements),
|
||||
num_features(num_features) {}
|
||||
const Line GetLine(size_t idx) const {
|
||||
size_t begin_offset = row_ptr[idx];
|
||||
size_t end_offset = row_ptr[idx + 1];
|
||||
return Line(idx, end_offset - begin_offset, &feature_idx[begin_offset],
|
||||
&values[begin_offset]);
|
||||
}
|
||||
size_t Size() const { return num_rows; }
|
||||
|
||||
private:
|
||||
const size_t* row_ptr;
|
||||
const unsigned* feature_idx;
|
||||
const float* values;
|
||||
size_t num_elements;
|
||||
size_t num_rows;
|
||||
size_t num_features;
|
||||
};
|
||||
|
||||
/** \brief Single-batch adapter over an external CSR matrix. */
class CSRAdapter : public detail::SingleBatchDataIter<CSRAdapterBatch> {
 public:
  CSRAdapter(const size_t* row_ptr, const unsigned* feature_idx,
             const float* values, size_t num_rows, size_t num_elements,
             size_t num_features)
      : batch_(row_ptr, feature_idx, values, num_rows, num_elements,
               num_features),
        n_rows_(num_rows),
        n_columns_(num_features) {}

  /** \brief The one batch that wraps the whole matrix. */
  const CSRAdapterBatch& Value() const override { return batch_; }
  /** \brief Row count as passed to the constructor. */
  size_t NumRows() const { return n_rows_; }
  /** \brief Column count as passed to the constructor. */
  size_t NumColumns() const { return n_columns_; }

 private:
  CSRAdapterBatch batch_;
  size_t n_rows_;
  size_t n_columns_;
};
|
||||
|
||||
class DenseAdapterBatch : public detail::NoMetaInfo {
|
||||
public:
|
||||
DenseAdapterBatch(const float* values, size_t num_rows, size_t num_elements,
|
||||
size_t num_features)
|
||||
: num_features(num_features),
|
||||
num_rows(num_rows),
|
||||
num_elements(num_elements),
|
||||
values(values) {}
|
||||
|
||||
private:
|
||||
class Line {
|
||||
public:
|
||||
Line(const float* values, size_t size, size_t row_idx)
|
||||
: row_idx(row_idx), size(size), values(values) {}
|
||||
|
||||
size_t Size() const { return size; }
|
||||
COOTuple GetElement(size_t idx) const {
|
||||
return COOTuple(row_idx, idx, values[idx]);
|
||||
}
|
||||
|
||||
private:
|
||||
size_t row_idx;
|
||||
size_t size;
|
||||
const float* values;
|
||||
};
|
||||
|
||||
public:
|
||||
size_t Size() const { return num_rows; }
|
||||
const Line GetLine(size_t idx) const {
|
||||
return Line(values + idx * num_features, num_features, idx);
|
||||
}
|
||||
|
||||
private:
|
||||
const float* values;
|
||||
size_t num_elements;
|
||||
size_t num_rows;
|
||||
size_t num_features;
|
||||
};
|
||||
|
||||
/** \brief Single-batch adapter over an external dense matrix. */
class DenseAdapter : public detail::SingleBatchDataIter<DenseAdapterBatch> {
 public:
  DenseAdapter(const float* values, size_t num_rows, size_t num_elements,
               size_t num_features)
      : batch_(values, num_rows, num_elements, num_features),
        n_rows_(num_rows),
        n_columns_(num_features) {}

  /** \brief The one batch that wraps the whole matrix. */
  const DenseAdapterBatch& Value() const override { return batch_; }

  /** \brief Row count as passed to the constructor. */
  size_t NumRows() const { return n_rows_; }
  /** \brief Column count as passed to the constructor. */
  size_t NumColumns() const { return n_columns_; }

 private:
  DenseAdapterBatch batch_;
  size_t n_rows_;
  size_t n_columns_;
};
|
||||
|
||||
class CSCAdapterBatch : public detail::NoMetaInfo {
|
||||
public:
|
||||
CSCAdapterBatch(const size_t* col_ptr, const unsigned* row_idx,
|
||||
const float* values, size_t num_features)
|
||||
: col_ptr(col_ptr),
|
||||
row_idx(row_idx),
|
||||
values(values),
|
||||
num_features(num_features) {}
|
||||
|
||||
private:
|
||||
class Line {
|
||||
public:
|
||||
Line(size_t col_idx, size_t size, const unsigned* row_idx,
|
||||
const float* values)
|
||||
: col_idx(col_idx), size(size), row_idx(row_idx), values(values) {}
|
||||
|
||||
size_t Size() const { return size; }
|
||||
COOTuple GetElement(size_t idx) const {
|
||||
return COOTuple(row_idx[idx], col_idx, values[idx]);
|
||||
}
|
||||
|
||||
private:
|
||||
size_t col_idx;
|
||||
size_t size;
|
||||
const unsigned* row_idx;
|
||||
const float* values;
|
||||
};
|
||||
|
||||
public:
|
||||
size_t Size() const { return num_features; }
|
||||
const Line GetLine(size_t idx) const {
|
||||
size_t begin_offset = col_ptr[idx];
|
||||
size_t end_offset = col_ptr[idx + 1];
|
||||
return Line(idx, end_offset - begin_offset, &row_idx[begin_offset],
|
||||
&values[begin_offset]);
|
||||
}
|
||||
|
||||
private:
|
||||
const size_t* col_ptr;
|
||||
const unsigned* row_idx;
|
||||
const float* values;
|
||||
size_t num_features;
|
||||
};
|
||||
|
||||
/** \brief Single-batch adapter over an external CSC matrix. */
class CSCAdapter : public detail::SingleBatchDataIter<CSCAdapterBatch> {
 public:
  CSCAdapter(const size_t* col_ptr, const unsigned* row_idx,
             const float* values, size_t num_features, size_t num_rows)
      : batch_(col_ptr, row_idx, values, num_features),
        n_rows_(num_rows),
        n_columns_(num_features) {}

  /** \brief The one batch that wraps the whole matrix. */
  const CSCAdapterBatch& Value() const override { return batch_; }

  /**
   * \brief Row count, or kAdapterUnknownSize when it was given as 0.
   *
   * JVM package sends 0 as unknown.
   */
  size_t NumRows() const {
    if (n_rows_ == 0) {
      return kAdapterUnknownSize;
    }
    return n_rows_;
  }
  /** \brief Column count as passed to the constructor. */
  size_t NumColumns() const { return n_columns_; }

 private:
  CSCAdapterBatch batch_;
  size_t n_rows_;
  size_t n_columns_;
};
|
||||
|
||||
class DataTableAdapterBatch : public detail::NoMetaInfo {
|
||||
public:
|
||||
DataTableAdapterBatch(void** data, const char** feature_stypes,
|
||||
size_t num_rows, size_t num_features)
|
||||
: data(data),
|
||||
feature_stypes(feature_stypes),
|
||||
num_features(num_features),
|
||||
num_rows(num_rows) {}
|
||||
|
||||
private:
|
||||
enum class DTType : uint8_t {
|
||||
kFloat32 = 0,
|
||||
kFloat64 = 1,
|
||||
kBool8 = 2,
|
||||
kInt32 = 3,
|
||||
kInt8 = 4,
|
||||
kInt16 = 5,
|
||||
kInt64 = 6,
|
||||
kUnknown = 7
|
||||
};
|
||||
|
||||
DTType DTGetType(std::string type_string) const {
|
||||
if (type_string == "float32") {
|
||||
return DTType::kFloat32;
|
||||
} else if (type_string == "float64") {
|
||||
return DTType::kFloat64;
|
||||
} else if (type_string == "bool8") {
|
||||
return DTType::kBool8;
|
||||
} else if (type_string == "int32") {
|
||||
return DTType::kInt32;
|
||||
} else if (type_string == "int8") {
|
||||
return DTType::kInt8;
|
||||
} else if (type_string == "int16") {
|
||||
return DTType::kInt16;
|
||||
} else if (type_string == "int64") {
|
||||
return DTType::kInt64;
|
||||
} else {
|
||||
LOG(FATAL) << "Unknown data table type.";
|
||||
return DTType::kUnknown;
|
||||
}
|
||||
}
|
||||
|
||||
class Line {
|
||||
float DTGetValue(const void* column, DTType dt_type, size_t ridx) const {
|
||||
float missing = std::numeric_limits<float>::quiet_NaN();
|
||||
switch (dt_type) {
|
||||
case DTType::kFloat32: {
|
||||
float val = reinterpret_cast<const float*>(column)[ridx];
|
||||
return std::isfinite(val) ? val : missing;
|
||||
}
|
||||
case DTType::kFloat64: {
|
||||
double val = reinterpret_cast<const double*>(column)[ridx];
|
||||
return std::isfinite(val) ? static_cast<float>(val) : missing;
|
||||
}
|
||||
case DTType::kBool8: {
|
||||
bool val = reinterpret_cast<const bool*>(column)[ridx];
|
||||
return static_cast<float>(val);
|
||||
}
|
||||
case DTType::kInt32: {
|
||||
int32_t val = reinterpret_cast<const int32_t*>(column)[ridx];
|
||||
return val != (-2147483647 - 1) ? static_cast<float>(val) : missing;
|
||||
}
|
||||
case DTType::kInt8: {
|
||||
int8_t val = reinterpret_cast<const int8_t*>(column)[ridx];
|
||||
return val != -128 ? static_cast<float>(val) : missing;
|
||||
}
|
||||
case DTType::kInt16: {
|
||||
int16_t val = reinterpret_cast<const int16_t*>(column)[ridx];
|
||||
return val != -32768 ? static_cast<float>(val) : missing;
|
||||
}
|
||||
case DTType::kInt64: {
|
||||
int64_t val = reinterpret_cast<const int64_t*>(column)[ridx];
|
||||
return val != -9223372036854775807 - 1 ? static_cast<float>(val)
|
||||
: missing;
|
||||
}
|
||||
default: {
|
||||
LOG(FATAL) << "Unknown data table type.";
|
||||
return 0.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
Line(DTType type, size_t size, size_t column_idx, const void* column)
|
||||
: type(type), size(size), column_idx(column_idx), column(column) {}
|
||||
|
||||
size_t Size() const { return size; }
|
||||
COOTuple GetElement(size_t idx) const {
|
||||
return COOTuple(idx, column_idx, DTGetValue(column, type, idx));
|
||||
}
|
||||
|
||||
private:
|
||||
DTType type;
|
||||
size_t size;
|
||||
size_t column_idx;
|
||||
const void* column;
|
||||
};
|
||||
|
||||
public:
|
||||
size_t Size() const { return num_features; }
|
||||
const Line GetLine(size_t idx) const {
|
||||
return Line(DTGetType(feature_stypes[idx]), num_rows, idx, data[idx]);
|
||||
}
|
||||
|
||||
private:
|
||||
void** data;
|
||||
const char** feature_stypes;
|
||||
size_t num_features;
|
||||
size_t num_rows;
|
||||
};
|
||||
|
||||
class DataTableAdapter
|
||||
: public detail::SingleBatchDataIter<DataTableAdapterBatch> {
|
||||
public:
|
||||
DataTableAdapter(void** data, const char** feature_stypes, size_t num_rows,
|
||||
size_t num_features)
|
||||
: batch(data, feature_stypes, num_rows, num_features),
|
||||
num_rows(num_rows),
|
||||
num_columns(num_features) {}
|
||||
const DataTableAdapterBatch& Value() const override { return batch; }
|
||||
size_t NumRows() const { return num_rows; }
|
||||
size_t NumColumns() const { return num_columns; }
|
||||
|
||||
private:
|
||||
DataTableAdapterBatch batch;
|
||||
size_t num_rows;
|
||||
size_t num_columns;
|
||||
};
|
||||
|
||||
class FileAdapterBatch {
|
||||
public:
|
||||
class Line {
|
||||
public:
|
||||
Line(size_t row_idx, const uint32_t* feature_idx, const float* value,
|
||||
size_t size)
|
||||
: row_idx(row_idx),
|
||||
feature_idx(feature_idx),
|
||||
value(value),
|
||||
size(size) {}
|
||||
|
||||
size_t Size() { return size; }
|
||||
COOTuple GetElement(size_t idx) {
|
||||
float fvalue = value == nullptr ? 1.0f : value[idx];
|
||||
return COOTuple(row_idx, feature_idx[idx], fvalue);
|
||||
}
|
||||
|
||||
private:
|
||||
size_t row_idx;
|
||||
const uint32_t* feature_idx;
|
||||
const float* value;
|
||||
size_t size;
|
||||
};
|
||||
FileAdapterBatch(const dmlc::RowBlock<uint32_t>* block, size_t row_offset)
|
||||
: block(block), row_offset(row_offset) {}
|
||||
Line GetLine(size_t idx) const {
|
||||
auto begin = block->offset[idx];
|
||||
auto end = block->offset[idx + 1];
|
||||
return Line(idx + row_offset, &block->index[begin], &block->value[begin],
|
||||
end - begin);
|
||||
}
|
||||
const float* Labels() const { return block->label; }
|
||||
const float* Weights() const { return block->weight; }
|
||||
const uint64_t* Qid() const { return block->qid; }
|
||||
|
||||
size_t Size() const { return block->size; }
|
||||
|
||||
private:
|
||||
const dmlc::RowBlock<uint32_t>* block;
|
||||
size_t row_offset;
|
||||
};
|
||||
|
||||
/** \brief FileAdapter wraps dmlc::parser to read files and provide access in a
|
||||
* common interface. */
|
||||
class FileAdapter : dmlc::DataIter<FileAdapterBatch> {
|
||||
public:
|
||||
explicit FileAdapter(dmlc::Parser<uint32_t>* parser) : parser(parser) {}
|
||||
|
||||
const FileAdapterBatch& Value() const override { return *batch.get(); }
|
||||
void BeforeFirst() override {
|
||||
batch.reset();
|
||||
parser->BeforeFirst();
|
||||
row_offset = 0;
|
||||
}
|
||||
bool Next() override {
|
||||
bool next = parser->Next();
|
||||
batch.reset(new FileAdapterBatch(&parser->Value(), row_offset));
|
||||
row_offset += parser->Value().size;
|
||||
return next;
|
||||
}
|
||||
// Indicates a number of rows/columns must be inferred
|
||||
size_t NumRows() const { return kAdapterUnknownSize; }
|
||||
size_t NumColumns() const { return kAdapterUnknownSize; }
|
||||
|
||||
private:
|
||||
size_t row_offset{0};
|
||||
std::unique_ptr<FileAdapterBatch> batch;
|
||||
dmlc::Parser<uint32_t>* parser;
|
||||
};
|
||||
}; // namespace data
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_DATA_ADAPTER_H_
|
||||
@ -15,6 +15,7 @@
|
||||
#include "../common/io.h"
|
||||
#include "../common/version.h"
|
||||
#include "../common/group_data.h"
|
||||
#include "../data/adapter.h"
|
||||
|
||||
#if DMLC_ENABLE_STD_THREAD
|
||||
#include "./sparse_page_source.h"
|
||||
@ -207,6 +208,7 @@ DMatrix* DMatrix::Load(const std::string& uri,
|
||||
LOG(CONSOLE) << "Load part of data " << partid
|
||||
<< " of " << npart << " parts";
|
||||
}
|
||||
|
||||
// legacy handling of binary data loading
|
||||
if (file_format == "auto" && npart == 1) {
|
||||
int magic;
|
||||
@ -214,13 +216,13 @@ DMatrix* DMatrix::Load(const std::string& uri,
|
||||
if (fi != nullptr) {
|
||||
common::PeekableInStream is(fi.get());
|
||||
if (is.PeekRead(&magic, sizeof(magic)) == sizeof(magic) &&
|
||||
magic == data::SimpleCSRSource::kMagic) {
|
||||
magic == data::SimpleCSRSource::kMagic) {
|
||||
std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
|
||||
source->LoadBinary(&is);
|
||||
DMatrix* dmat = DMatrix::Create(std::move(source), cache_file);
|
||||
if (!silent) {
|
||||
LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with "
|
||||
<< dmat->Info().num_nonzero_ << " entries loaded from " << uri;
|
||||
<< dmat->Info().num_nonzero_ << " entries loaded from " << uri;
|
||||
}
|
||||
return dmat;
|
||||
}
|
||||
@ -291,9 +293,9 @@ DMatrix* DMatrix::Create(dmlc::Parser<uint32_t>* parser,
|
||||
const std::string& cache_prefix,
|
||||
const size_t page_size) {
|
||||
if (cache_prefix.length() == 0) {
|
||||
std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
|
||||
source->CopyFrom(parser);
|
||||
return DMatrix::Create(std::move(source), cache_prefix);
|
||||
data::FileAdapter adapter(parser);
|
||||
return DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(),
|
||||
1);
|
||||
} else {
|
||||
#if DMLC_ENABLE_STD_THREAD
|
||||
if (!data::SparsePageSource<SparsePage>::CacheExist(cache_prefix, ".row.page")) {
|
||||
@ -355,9 +357,23 @@ DMatrix* DMatrix::Create(std::unique_ptr<DataSource<SparsePage>>&& source,
|
||||
#endif // DMLC_ENABLE_STD_THREAD
|
||||
}
|
||||
}
|
||||
} // namespace xgboost
|
||||
|
||||
namespace xgboost {
|
||||
template <typename AdapterT>
|
||||
DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread) {
|
||||
return new data::SimpleDMatrix(adapter, missing, nthread);
|
||||
}
|
||||
|
||||
template DMatrix* DMatrix::Create<data::DenseAdapter>(data::DenseAdapter* adapter,
|
||||
float missing, int nthread);
|
||||
template DMatrix* DMatrix::Create<data::CSRAdapter>(data::CSRAdapter* adapter,
|
||||
float missing, int nthread);
|
||||
template DMatrix* DMatrix::Create<data::CSCAdapter>(data::CSCAdapter* adapter,
|
||||
float missing, int nthread);
|
||||
template DMatrix* DMatrix::Create<data::DataTableAdapter>(
|
||||
data::DataTableAdapter* adapter, float missing, int nthread);
|
||||
template DMatrix* DMatrix::Create<data::FileAdapter>(data::FileAdapter* adapter,
|
||||
float missing, int nthread);
|
||||
|
||||
SparsePage SparsePage::GetTranspose(int num_columns) const {
|
||||
SparsePage transpose;
|
||||
common::ParallelGroupBuilder<Entry, bst_row_t> builder(&transpose.offset.HostVector(),
|
||||
|
||||
@ -6,7 +6,6 @@
|
||||
#include <xgboost/logging.h>
|
||||
#include <xgboost/json.h>
|
||||
|
||||
#include <limits>
|
||||
#include "simple_csr_source.h"
|
||||
#include "columnar.h"
|
||||
|
||||
@ -26,69 +25,6 @@ void SimpleCSRSource::CopyFrom(DMatrix* src) {
|
||||
}
|
||||
}
|
||||
|
||||
void SimpleCSRSource::CopyFrom(dmlc::Parser<uint32_t>* parser) {
|
||||
// use qid to get group info
|
||||
const uint64_t default_max = std::numeric_limits<uint64_t>::max();
|
||||
uint64_t last_group_id = default_max;
|
||||
bst_uint group_size = 0;
|
||||
std::vector<uint64_t> qids;
|
||||
this->Clear();
|
||||
while (parser->Next()) {
|
||||
const dmlc::RowBlock<uint32_t>& batch = parser->Value();
|
||||
if (batch.label != nullptr) {
|
||||
auto& labels = info.labels_.HostVector();
|
||||
labels.insert(labels.end(), batch.label, batch.label + batch.size);
|
||||
}
|
||||
if (batch.weight != nullptr) {
|
||||
auto& weights = info.weights_.HostVector();
|
||||
weights.insert(weights.end(), batch.weight, batch.weight + batch.size);
|
||||
}
|
||||
if (batch.qid != nullptr) {
|
||||
qids.insert(qids.end(), batch.qid, batch.qid + batch.size);
|
||||
// get group
|
||||
for (size_t i = 0; i < batch.size; ++i) {
|
||||
const uint64_t cur_group_id = batch.qid[i];
|
||||
if (last_group_id == default_max || last_group_id != cur_group_id) {
|
||||
info.group_ptr_.push_back(group_size);
|
||||
}
|
||||
last_group_id = cur_group_id;
|
||||
++group_size;
|
||||
}
|
||||
}
|
||||
|
||||
// Remove the assertion on batch.index, which can be null in the case that the data in this
|
||||
// batch is entirely sparse. Although it's true that this indicates a likely issue with the
|
||||
// user's data workflows, passing XGBoost entirely sparse data should not cause it to fail.
|
||||
// See https://github.com/dmlc/xgboost/issues/1827 for complete detail.
|
||||
// CHECK(batch.index != nullptr);
|
||||
|
||||
// update information
|
||||
this->info.num_row_ += batch.size;
|
||||
// copy the data over
|
||||
auto& data_vec = page_.data.HostVector();
|
||||
auto& offset_vec = page_.offset.HostVector();
|
||||
for (size_t i = batch.offset[0]; i < batch.offset[batch.size]; ++i) {
|
||||
uint32_t index = batch.index[i];
|
||||
bst_float fvalue = batch.value == nullptr ? 1.0f : batch.value[i];
|
||||
data_vec.emplace_back(index, fvalue);
|
||||
this->info.num_col_ = std::max(this->info.num_col_,
|
||||
static_cast<uint64_t>(index + 1));
|
||||
}
|
||||
size_t top = page_.offset.Size();
|
||||
for (size_t i = 0; i < batch.size; ++i) {
|
||||
offset_vec.push_back(offset_vec[top - 1] + batch.offset[i + 1] - batch.offset[0]);
|
||||
}
|
||||
}
|
||||
if (last_group_id != default_max) {
|
||||
if (group_size > info.group_ptr_.back()) {
|
||||
info.group_ptr_.push_back(group_size);
|
||||
}
|
||||
}
|
||||
this->info.num_nonzero_ = static_cast<uint64_t>(page_.data.Size());
|
||||
// Either every row has query ID or none at all
|
||||
CHECK(qids.empty() || qids.size() == info.num_row_);
|
||||
}
|
||||
|
||||
void SimpleCSRSource::LoadBinary(dmlc::Stream* fi) {
|
||||
int tmagic;
|
||||
CHECK(fi->Read(&tmagic, sizeof(tmagic)) == sizeof(tmagic)) << "invalid input file format";
|
||||
|
||||
@ -45,12 +45,7 @@ class SimpleCSRSource : public DataSource<SparsePage> {
|
||||
* \param src source data iter.
|
||||
*/
|
||||
void CopyFrom(DMatrix* src);
|
||||
/*!
|
||||
* \brief copy content of data from parser, also set the additional information.
|
||||
* \param src source data iter.
|
||||
* \param info The additional information reflected in the parser.
|
||||
*/
|
||||
void CopyFrom(dmlc::Parser<uint32_t>* src);
|
||||
|
||||
/*!
|
||||
* \brief copy content of data from foreign **GPU** columnar buffer.
|
||||
* \param interfaces_str JSON representation of cuda array interfaces.
|
||||
|
||||
@ -11,12 +11,15 @@
|
||||
#include <xgboost/data.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include <limits>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "simple_csr_source.h"
|
||||
#include "../common/group_data.h"
|
||||
#include "../common/math.h"
|
||||
#include "adapter.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
@ -26,6 +29,121 @@ class SimpleDMatrix : public DMatrix {
|
||||
/*!
 * \brief Wrap an already populated data source.
 * \param source Data source; ownership is moved into this matrix.
 */
explicit SimpleDMatrix(std::unique_ptr<DataSource<SparsePage>>&& source)
    : source_(std::move(source)) {
}
|
||||
|
||||
template <typename AdapterT>
|
||||
explicit SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
|
||||
// Set number of threads but keep old value so we can reset it after
|
||||
const int nthreadmax = omp_get_max_threads();
|
||||
if (nthread <= 0) nthread = nthreadmax;
|
||||
int nthread_original = omp_get_max_threads();
|
||||
omp_set_num_threads(nthread);
|
||||
|
||||
source_.reset(new SimpleCSRSource());
|
||||
SimpleCSRSource& mat = *reinterpret_cast<SimpleCSRSource*>(source_.get());
|
||||
std::vector<uint64_t> qids;
|
||||
uint64_t default_max = std::numeric_limits<uint64_t>::max();
|
||||
uint64_t last_group_id = default_max;
|
||||
bst_uint group_size = 0;
|
||||
auto& offset_vec = mat.page_.offset.HostVector();
|
||||
auto& data_vec = mat.page_.data.HostVector();
|
||||
uint64_t inferred_num_columns = 0;
|
||||
|
||||
adapter->BeforeFirst();
|
||||
// Iterate over batches of input data
|
||||
while (adapter->Next()) {
|
||||
auto &batch = adapter->Value();
|
||||
common::ParallelGroupBuilder<
|
||||
Entry, std::remove_reference<decltype(offset_vec)>::type::value_type>
|
||||
builder(&offset_vec, &data_vec);
|
||||
builder.InitBudget(0, nthread);
|
||||
|
||||
// First-pass over the batch counting valid elements
|
||||
size_t num_lines = batch.Size();
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (omp_ulong i = 0; i < static_cast<omp_ulong>(num_lines);
|
||||
++i) { // NOLINT(*)
|
||||
int tid = omp_get_thread_num();
|
||||
auto line = batch.GetLine(i);
|
||||
for (auto j = 0ull; j < line.Size(); j++) {
|
||||
auto element = line.GetElement(j);
|
||||
inferred_num_columns =
|
||||
std::max(inferred_num_columns,
|
||||
static_cast<uint64_t>(element.column_idx + 1));
|
||||
if (!common::CheckNAN(element.value) && element.value != missing) {
|
||||
builder.AddBudget(element.row_idx, tid);
|
||||
}
|
||||
}
|
||||
}
|
||||
builder.InitStorage();
|
||||
|
||||
// Second pass over batch, placing elements in correct position
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (omp_ulong i = 0; i < static_cast<omp_ulong>(num_lines);
|
||||
++i) { // NOLINT(*)
|
||||
int tid = omp_get_thread_num();
|
||||
auto line = batch.GetLine(i);
|
||||
for (auto j = 0ull; j < line.Size(); j++) {
|
||||
auto element = line.GetElement(j);
|
||||
if (!common::CheckNAN(element.value) && element.value != missing) {
|
||||
builder.Push(element.row_idx, Entry(element.column_idx, element.value),
|
||||
tid);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Append meta information if available
|
||||
if (batch.Labels() != nullptr) {
|
||||
auto& labels = mat.info.labels_.HostVector();
|
||||
labels.insert(labels.end(), batch.Labels(), batch.Labels() + batch.Size());
|
||||
}
|
||||
if (batch.Weights() != nullptr) {
|
||||
auto& weights = mat.info.weights_.HostVector();
|
||||
weights.insert(weights.end(), batch.Weights(), batch.Weights() + batch.Size());
|
||||
}
|
||||
if (batch.Qid() != nullptr) {
|
||||
qids.insert(qids.end(), batch.Qid(), batch.Qid() + batch.Size());
|
||||
// get group
|
||||
for (size_t i = 0; i < batch.Size(); ++i) {
|
||||
const uint64_t cur_group_id = batch.Qid()[i];
|
||||
if (last_group_id == default_max || last_group_id != cur_group_id) {
|
||||
mat.info.group_ptr_.push_back(group_size);
|
||||
}
|
||||
last_group_id = cur_group_id;
|
||||
++group_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (last_group_id != default_max) {
|
||||
if (group_size > mat.info.group_ptr_.back()) {
|
||||
mat.info.group_ptr_.push_back(group_size);
|
||||
}
|
||||
}
|
||||
|
||||
// Deal with empty rows/columns if necessary
|
||||
if (adapter->NumColumns() == kAdapterUnknownSize) {
|
||||
mat.info.num_col_ = inferred_num_columns;
|
||||
} else {
|
||||
mat.info.num_col_ = adapter->NumColumns();
|
||||
}
|
||||
// Synchronise worker columns
|
||||
rabit::Allreduce<rabit::op::Max>(&mat.info.num_col_, 1);
|
||||
|
||||
if (adapter->NumRows() == kAdapterUnknownSize) {
|
||||
mat.info.num_row_ = offset_vec.size() - 1;
|
||||
} else {
|
||||
if (offset_vec.empty()) {
|
||||
offset_vec.emplace_back(0);
|
||||
}
|
||||
|
||||
while (offset_vec.size() - 1 < adapter->NumRows()) {
|
||||
offset_vec.emplace_back(offset_vec.back());
|
||||
}
|
||||
mat.info.num_row_ = adapter->NumRows();
|
||||
}
|
||||
mat.info.num_nonzero_ = data_vec.size();
|
||||
omp_set_num_threads(nthread_original);
|
||||
}
|
||||
|
||||
MetaInfo& Info() override;
|
||||
|
||||
const MetaInfo& Info() const override;
|
||||
|
||||
54
tests/cpp/common/test_group_data.cc
Normal file
54
tests/cpp/common/test_group_data.cc
Normal file
@ -0,0 +1,54 @@
|
||||
/*!
|
||||
* Copyright 2019 by Contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/data.h>
|
||||
#include "../../../src/common/group_data.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
// Checks that ParallelGroupBuilder lays out rows contiguously, and that a
// second builder can append a row to offsets/data that are already populated.
TEST(group_data, ParallelGroupBuilder) {
  std::vector<size_t> offsets;
  std::vector<Entry> data;

  ParallelGroupBuilder<Entry, size_t> builder(&offsets, &data);
  builder.InitBudget(0, 1);
  // Budget two rows with two elements each.
  for (size_t row = 0; row < 2; ++row) {
    builder.AddBudget(row, 0, 2);
  }
  builder.InitStorage();

  // Row r receives the entries (0, 2r) and (1, 2r + 1).
  builder.Push(0, Entry(0, 0), 0);
  builder.Push(0, Entry(1, 1), 0);
  builder.Push(1, Entry(0, 2), 0);
  builder.Push(1, Entry(1, 3), 0);

  std::vector<Entry> expected_data{
      Entry(0, 0), Entry(1, 1), Entry(0, 2), Entry(1, 3),
  };
  std::vector<size_t> expected_offsets{0, 2, 4};

  EXPECT_EQ(data, expected_data);
  EXPECT_EQ(offsets, expected_offsets);

  // A fresh builder on the same storage appends one more two-element row.
  ParallelGroupBuilder<Entry, size_t> builder2(&offsets, &data);
  builder2.InitBudget(0, 1);
  builder2.AddBudget(2, 0, 2);
  builder2.InitStorage();
  builder2.Push(2, Entry(0, 4), 0);
  builder2.Push(2, Entry(1, 5), 0);

  expected_data.push_back(Entry(0, 4));
  expected_data.push_back(Entry(1, 5));
  expected_offsets.push_back(6);

  EXPECT_EQ(data, expected_data);
  EXPECT_EQ(offsets, expected_offsets);
}
|
||||
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
104
tests/cpp/data/test_adapter.cc
Normal file
104
tests/cpp/data/test_adapter.cc
Normal file
@ -0,0 +1,104 @@
|
||||
// Copyright (c) 2019 by Contributors
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/c_api.h>
|
||||
#include <xgboost/data.h>
|
||||
#include <xgboost/version_config.h>
|
||||
#include "../../../src/data/adapter.h"
|
||||
#include "../../../src/data/simple_dmatrix.h"
|
||||
#include "../../../src/common/timer.h"
|
||||
#include "../helpers.h"
|
||||
using namespace xgboost; // NOLINT
|
||||
// Reads a 3x2 CSR matrix through CSRAdapter, checks individual elements, then
// checks the SimpleDMatrix built from the same adapter.
TEST(c_api, CSRAdapter) {
  int n = 2;
  std::vector<float> data = {1, 2, 3, 4, 5};
  std::vector<unsigned> feature_idx = {0, 1, 0, 1, 1};
  std::vector<size_t> row_ptr = {0, 2, 4, 5};
  data::CSRAdapter adapter(row_ptr.data(), feature_idx.data(), data.data(),
                           row_ptr.size() - 1, data.size(), n);
  adapter.Next();
  auto & batch = adapter.Value();
  auto line0 = batch.GetLine(0);
  EXPECT_EQ(line0.GetElement(0).value, 1);
  EXPECT_EQ(line0.GetElement(1).value, 2);

  auto line1 = batch.GetLine(1);
  EXPECT_EQ(line1 .GetElement(0).value, 3);
  EXPECT_EQ(line1 .GetElement(1).value, 4);
  auto line2 = batch.GetLine(2);
  EXPECT_EQ(line2 .GetElement(0).value, 5);
  EXPECT_EQ(line2 .GetElement(0).row_idx, 2);
  EXPECT_EQ(line2 .GetElement(0).column_idx, 1);

  // Argument order is (adapter, missing, nthread): NaN marks missing values
  // and -1 requests the default thread count. Passing them the other way
  // round converts NaN to int, which is undefined behaviour.
  data::SimpleDMatrix dmat(&adapter, std::nan(""), -1);
  EXPECT_EQ(dmat.Info().num_col_, 2);
  EXPECT_EQ(dmat.Info().num_row_, 3);
  EXPECT_EQ(dmat.Info().num_nonzero_, 5);

  for (auto &batch : dmat.GetBatches<SparsePage>()) {
    for (auto i = 0ull; i < batch.Size(); i++) {
      auto inst = batch[i];
      for (auto j = 0ull; j < inst.size(); j++) {
        EXPECT_EQ(inst[j].fvalue, data[row_ptr[i] + j]);
        EXPECT_EQ(inst[j].index, feature_idx[row_ptr[i] + j]);
      }
    }
  }
}
|
||||
// Builds a SimpleDMatrix from a dense 3x2 array and checks every entry.
TEST(c_api, DenseAdapter) {
  int m = 3;
  int n = 2;
  std::vector<float> data = {1, 2, 3, 4, 5, 6};
  data::DenseAdapter adapter(data.data(), m, m*n, n);
  // Argument order is (adapter, missing, nthread): NaN marks missing values
  // and -1 requests the default thread count.
  data::SimpleDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(), -1);
  EXPECT_EQ(dmat.Info().num_col_, 2);
  EXPECT_EQ(dmat.Info().num_row_, 3);
  EXPECT_EQ(dmat.Info().num_nonzero_, 6);

  for (auto &batch : dmat.GetBatches<SparsePage>()) {
    for (auto i = 0ull; i < batch.Size(); i++) {
      auto inst = batch[i];
      for (auto j = 0ull; j < inst.size(); j++) {
        EXPECT_EQ(inst[j].fvalue, data[i*n+j]);
        EXPECT_EQ(inst[j].index, j);
      }
    }
  }
}
|
||||
|
||||
// Builds a SimpleDMatrix from a 3x2 CSC matrix and checks that the entries
// come back grouped by row.
TEST(c_api, CSCAdapter) {
  std::vector<float> data = {1, 3, 2, 4, 5};
  std::vector<unsigned> row_idx = {0, 1, 0, 1, 2};
  std::vector<size_t> col_ptr = {0, 2, 5};
  data::CSCAdapter adapter(col_ptr.data(), row_idx.data(), data.data(), 2, 3);
  // Argument order is (adapter, missing, nthread): NaN marks missing values
  // and -1 requests the default thread count.
  data::SimpleDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(), -1);
  EXPECT_EQ(dmat.Info().num_col_, 2);
  EXPECT_EQ(dmat.Info().num_row_, 3);
  EXPECT_EQ(dmat.Info().num_nonzero_, 5);

  auto &batch = *dmat.GetBatches<SparsePage>().begin();
  auto inst = batch[0];
  EXPECT_EQ(inst[0].fvalue, 1);
  EXPECT_EQ(inst[0].index, 0);
  EXPECT_EQ(inst[1].fvalue, 2);
  EXPECT_EQ(inst[1].index, 1);

  inst = batch[1];
  EXPECT_EQ(inst[0].fvalue, 3);
  EXPECT_EQ(inst[0].index, 0);
  EXPECT_EQ(inst[1].fvalue, 4);
  EXPECT_EQ(inst[1].index, 1);

  inst = batch[2];
  EXPECT_EQ(inst[0].fvalue, 5);
  EXPECT_EQ(inst[0].index, 1);
}
|
||||
|
||||
// Smoke test: a FileAdapter can be constructed on top of a parser created for
// a freshly written test file. Nothing is read through the adapter.
TEST(c_api, FileAdapter) {
  std::string filename = "test.libsvm";
  CreateBigTestData(filename, 10);

  dmlc::Parser<uint32_t>* raw_parser =
      dmlc::Parser<uint32_t>::Create(filename.c_str(), 0, 1,"auto");
  std::unique_ptr<dmlc::Parser<uint32_t>> parser(raw_parser);

  data::FileAdapter adapter(parser.get());
}
|
||||
@ -101,7 +101,6 @@ TEST(DMatrix, Uri) {
|
||||
std::string path = tmpdir.path + "/small.csv";
|
||||
|
||||
std::ofstream fout(path);
|
||||
ASSERT_TRUE(fout);
|
||||
size_t i = 0;
|
||||
for (size_t r = 0; r < kRows; ++r) {
|
||||
for (size_t c = 0; c < kCols; ++c) {
|
||||
|
||||
@ -4,6 +4,9 @@
|
||||
#include "../../../src/data/simple_dmatrix.h"
|
||||
|
||||
#include "../helpers.h"
|
||||
#include "../../../src/data/adapter.h"
|
||||
|
||||
using namespace xgboost; // NOLINT
|
||||
|
||||
TEST(SimpleDMatrix, MetaInfo) {
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
@ -63,3 +66,63 @@ TEST(SimpleDMatrix, ColAccessWithoutBatches) {
|
||||
EXPECT_EQ(num_col_batch, 1) << "Expected number of batches to be 1";
|
||||
delete dmat;
|
||||
}
|
||||
|
||||
// An empty CSR, dense or CSC adapter must each give a matrix with no rows, no
// columns, no entries and only empty batches.
TEST(SimpleDMatrix, Empty) {
  std::vector<float> data{};
  std::vector<unsigned> feature_idx = {};
  std::vector<size_t> row_ptr = {};

  // Expectations shared by the three adapter kinds.
  auto expect_all_empty = [](data::SimpleDMatrix& dmat) {
    CHECK_EQ(dmat.Info().num_nonzero_, 0);
    CHECK_EQ(dmat.Info().num_row_, 0);
    CHECK_EQ(dmat.Info().num_col_, 0);
    for (auto &batch : dmat.GetBatches<SparsePage>()) {
      CHECK_EQ(batch.Size(), 0);
    }
  };

  data::CSRAdapter csr_adapter(row_ptr.data(), feature_idx.data(), data.data(), 0, 0, 0);
  data::SimpleDMatrix dmat(&csr_adapter,
                           std::numeric_limits<float>::quiet_NaN(), 1);
  expect_all_empty(dmat);

  data::DenseAdapter dense_adapter(nullptr, 0, 0, 0);
  dmat = data::SimpleDMatrix(&dense_adapter,
                             std::numeric_limits<float>::quiet_NaN(), 1);
  expect_all_empty(dmat);

  data::CSCAdapter csc_adapter(nullptr, nullptr, nullptr, 0, 0);
  dmat = data::SimpleDMatrix(&csc_adapter,
                             std::numeric_limits<float>::quiet_NaN(), 1);
  expect_all_empty(dmat);
}
|
||||
|
||||
// NaN entries are always dropped; entries equal to the chosen `missing` value
// are dropped as well.
TEST(SimpleDMatrix, MissingData) {
  std::vector<float> data{0.0, std::nanf(""), 1.0};
  std::vector<unsigned> feature_idx = {0, 1, 0};
  std::vector<size_t> row_ptr = {0, 2, 3};
  data::CSRAdapter adapter(row_ptr.data(), feature_idx.data(), data.data(), 2, 3, 2);

  // missing = NaN: only the NaN entry disappears.
  const float nan_missing = std::numeric_limits<float>::quiet_NaN();
  data::SimpleDMatrix dmat(&adapter, nan_missing, 1);
  CHECK_EQ(dmat.Info().num_nonzero_, 2);

  // missing = 1.0: both the NaN entry and the 1.0 entry disappear.
  dmat = data::SimpleDMatrix(&adapter, 1.0, 1);
  CHECK_EQ(dmat.Info().num_nonzero_, 1);
}
|
||||
|
||||
// A trailing row without entries must still be counted as a row.
TEST(SimpleDMatrix, EmptyRow) {
  std::vector<float> data{0.0, 1.0};
  std::vector<unsigned> feature_idx = {0, 1};
  // Row 0 holds both entries; row 1 is empty.
  std::vector<size_t> row_ptr = {0, 2, 2};

  data::CSRAdapter adapter(row_ptr.data(), feature_idx.data(), data.data(), 2, 2, 2);
  const float nan_missing = std::numeric_limits<float>::quiet_NaN();
  data::SimpleDMatrix dmat(&adapter, nan_missing, 1);

  CHECK_EQ(dmat.Info().num_nonzero_, 2);
  CHECK_EQ(dmat.Info().num_row_, 2);
  CHECK_EQ(dmat.Info().num_col_, 2);
}
|
||||
|
||||
@ -64,25 +64,6 @@ class TestBasic(unittest.TestCase):
|
||||
# assert they are the same
|
||||
assert np.sum(np.abs(preds2 - preds)) == 0
|
||||
|
||||
def test_np_view(self):
|
||||
# Sliced Float32 array
|
||||
y = np.array([12, 34, 56], np.float32)[::2]
|
||||
from_view = xgb.DMatrix(np.array([[]]), label=y).get_label()
|
||||
from_array = xgb.DMatrix(np.array([[]]), label=y + 0).get_label()
|
||||
assert (from_view.shape == from_array.shape)
|
||||
assert (from_view == from_array).all()
|
||||
|
||||
# Sliced UInt array
|
||||
z = np.array([12, 34, 56], np.uint32)[::2]
|
||||
dmat = xgb.DMatrix(np.array([[]]))
|
||||
dmat.set_uint_info('root_index', z)
|
||||
from_view = dmat.get_uint_info('root_index')
|
||||
dmat = xgb.DMatrix(np.array([[]]))
|
||||
dmat.set_uint_info('root_index', z + 0)
|
||||
from_array = dmat.get_uint_info('root_index')
|
||||
assert (from_view.shape == from_array.shape)
|
||||
assert (from_view == from_array).all()
|
||||
|
||||
def test_record_results(self):
|
||||
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
|
||||
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
|
||||
@ -127,72 +108,6 @@ class TestBasic(unittest.TestCase):
|
||||
# assert they are the same
|
||||
assert np.sum(np.abs(preds2 - preds)) == 0
|
||||
|
||||
def test_dmatrix_init(self):
|
||||
data = np.random.randn(5, 5)
|
||||
|
||||
# different length
|
||||
self.assertRaises(ValueError, xgb.DMatrix, data,
|
||||
feature_names=list('abcdef'))
|
||||
# contains duplicates
|
||||
self.assertRaises(ValueError, xgb.DMatrix, data,
|
||||
feature_names=['a', 'b', 'c', 'd', 'd'])
|
||||
# contains symbol
|
||||
self.assertRaises(ValueError, xgb.DMatrix, data,
|
||||
feature_names=['a', 'b', 'c', 'd', 'e<1'])
|
||||
|
||||
dm = xgb.DMatrix(data)
|
||||
dm.feature_names = list('abcde')
|
||||
assert dm.feature_names == list('abcde')
|
||||
|
||||
assert dm.slice([0, 1]).feature_names == dm.feature_names
|
||||
|
||||
dm.feature_types = 'q'
|
||||
assert dm.feature_types == list('qqqqq')
|
||||
|
||||
dm.feature_types = list('qiqiq')
|
||||
assert dm.feature_types == list('qiqiq')
|
||||
|
||||
def incorrect_type_set():
|
||||
dm.feature_types = list('abcde')
|
||||
|
||||
self.assertRaises(ValueError, incorrect_type_set)
|
||||
|
||||
# reset
|
||||
dm.feature_names = None
|
||||
self.assertEqual(dm.feature_names, ['f0', 'f1', 'f2', 'f3', 'f4'])
|
||||
assert dm.feature_types is None
|
||||
|
||||
def test_feature_names(self):
|
||||
data = np.random.randn(100, 5)
|
||||
target = np.array([0, 1] * 50)
|
||||
|
||||
cases = [['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5'],
|
||||
[u'要因1', u'要因2', u'要因3', u'要因4', u'要因5']]
|
||||
|
||||
for features in cases:
|
||||
dm = xgb.DMatrix(data, label=target,
|
||||
feature_names=features)
|
||||
assert dm.feature_names == features
|
||||
assert dm.num_row() == 100
|
||||
assert dm.num_col() == 5
|
||||
|
||||
params = {'objective': 'multi:softprob',
|
||||
'eval_metric': 'mlogloss',
|
||||
'eta': 0.3,
|
||||
'num_class': 3}
|
||||
|
||||
bst = xgb.train(params, dm, num_boost_round=10)
|
||||
scores = bst.get_fscore()
|
||||
assert list(sorted(k for k in scores)) == features
|
||||
|
||||
dummy = np.random.randn(5, 5)
|
||||
dm = xgb.DMatrix(dummy, feature_names=features)
|
||||
bst.predict(dm)
|
||||
|
||||
# different feature name must raises error
|
||||
dm = xgb.DMatrix(dummy, feature_names=list('abcde'))
|
||||
self.assertRaises(ValueError, bst.predict, dm)
|
||||
|
||||
def test_dump(self):
|
||||
data = np.random.randn(100, 2)
|
||||
target = np.array([0, 1] * 50)
|
||||
@ -250,27 +165,6 @@ class TestBasic(unittest.TestCase):
|
||||
assert dm.num_row() == row
|
||||
assert dm.num_col() == cols
|
||||
|
||||
def test_dmatrix_numpy_init(self):
|
||||
data = np.random.randn(5, 5)
|
||||
dm = xgb.DMatrix(data)
|
||||
assert dm.num_row() == 5
|
||||
assert dm.num_col() == 5
|
||||
|
||||
data = np.array([[1, 2], [3, 4]])
|
||||
dm = xgb.DMatrix(data)
|
||||
assert dm.num_row() == 2
|
||||
assert dm.num_col() == 2
|
||||
|
||||
# 0d array
|
||||
self.assertRaises(ValueError, xgb.DMatrix, np.array(1))
|
||||
# 1d array
|
||||
self.assertRaises(ValueError, xgb.DMatrix, np.array([1, 2, 3]))
|
||||
# 3d array
|
||||
data = np.random.randn(5, 5, 5)
|
||||
self.assertRaises(ValueError, xgb.DMatrix, data)
|
||||
# object dtype
|
||||
data = np.array([['a', 'b'], ['c', 'd']])
|
||||
self.assertRaises(ValueError, xgb.DMatrix, data)
|
||||
|
||||
def test_cv(self):
|
||||
dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
|
||||
@ -336,12 +230,6 @@ class TestBasic(unittest.TestCase):
|
||||
' dtype=float32)]')
|
||||
assert output == solution
|
||||
|
||||
def test_get_info(self):
|
||||
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
|
||||
dtrain.get_float_info('label')
|
||||
dtrain.get_float_info('weight')
|
||||
dtrain.get_float_info('base_margin')
|
||||
dtrain.get_uint_info('root_index')
|
||||
|
||||
|
||||
class TestBasicPathLike(unittest.TestCase):
|
||||
|
||||
171
tests/python/test_dmatrix.py
Normal file
171
tests/python/test_dmatrix.py
Normal file
@ -0,0 +1,171 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
import unittest
|
||||
import scipy.sparse
|
||||
from scipy.sparse import rand
|
||||
|
||||
rng = np.random.RandomState(1)
|
||||
|
||||
dpath = 'demo/data/'
|
||||
rng = np.random.RandomState(1994)
|
||||
|
||||
|
||||
class TestDMatrix(unittest.TestCase):
|
||||
def test_dmatrix_numpy_init(self):
    """DMatrix accepts 2-d numpy arrays and rejects other array inputs."""
    # 5x5 float array
    dense = np.random.randn(5, 5)
    dm = xgb.DMatrix(dense)
    assert dm.num_row() == 5
    assert dm.num_col() == 5

    # 2x2 integer array
    small = np.array([[1, 2], [3, 4]])
    dm = xgb.DMatrix(small)
    assert dm.num_row() == 2
    assert dm.num_col() == 2

    # Inputs that must raise ValueError: 0-d, 1-d, 3-d and string arrays.
    rejected = [
        np.array(1),
        np.array([1, 2, 3]),
        np.random.randn(5, 5, 5),
        np.array([['a', 'b'], ['c', 'd']]),
    ]
    for bad_input in rejected:
        self.assertRaises(ValueError, xgb.DMatrix, bad_input)
|
||||
|
||||
def test_csr(self):
    """A 3x3 scipy CSR matrix keeps its shape when wrapped in a DMatrix."""
    indptr = np.array([0, 2, 3, 6])
    indices = np.array([0, 2, 2, 0, 1, 2])
    data = np.array([1, 2, 3, 4, 5, 6])
    shape = (3, 3)
    X = scipy.sparse.csr_matrix((data, indices, indptr), shape=shape)

    dtrain = xgb.DMatrix(X)
    n_rows = dtrain.num_row()
    n_cols = dtrain.num_col()
    assert n_rows == 3
    assert n_cols == 3
|
||||
|
||||
def test_csc(self):
    """A 3x3 scipy CSC matrix keeps its shape when wrapped in a DMatrix."""
    row = np.array([0, 2, 2, 0, 1, 2])
    col = np.array([0, 0, 1, 2, 2, 2])
    data = np.array([1, 2, 3, 4, 5, 6])
    coordinates = (row, col)
    X = scipy.sparse.csc_matrix((data, coordinates), shape=(3, 3))

    dtrain = xgb.DMatrix(X)
    n_rows = dtrain.num_row()
    n_cols = dtrain.num_col()
    assert n_rows == 3
    assert n_cols == 3
|
||||
|
||||
def test_np_view(self):
    """Sliced numpy views must give the same info as their contiguous copies."""
    # Sliced Float32 array: `y` is a strided view, `y + 0` a fresh array.
    y = np.array([12, 34, 56], np.float32)[::2]
    labels = [xgb.DMatrix(np.array([[]]), label=source).get_label()
              for source in (y, y + 0)]
    from_view, from_array = labels
    assert (from_view.shape == from_array.shape)
    assert (from_view == from_array).all()

    # Sliced UInt array, set through set_uint_info on a fresh DMatrix each time.
    z = np.array([12, 34, 56], np.uint32)[::2]
    root_indices = []
    for source in (z, z + 0):
        dmat = xgb.DMatrix(np.array([[]]))
        dmat.set_uint_info('root_index', source)
        root_indices.append(dmat.get_uint_info('root_index'))
    from_view, from_array = root_indices
    assert (from_view.shape == from_array.shape)
    assert (from_view == from_array).all()
|
||||
|
||||
def test_feature_names_slice(self):
    """Validate feature name/type handling on a DMatrix, including slicing.

    This test was previously also called ``test_feature_names``. A later
    definition with the same name in the same scope rebinds that name, so
    this body was silently never collected or executed. It now carries a
    distinct name so both tests run.

    Covers:
      * rejection of invalid ``feature_names`` (wrong length, duplicates,
        forbidden symbol) with ``ValueError``;
      * assigning feature names and seeing them preserved by ``slice``;
      * assigning feature types (single code broadcast to every column,
        or one code per column) and rejection of unknown codes;
      * resetting names to ``None``, after which default names
        ``f0``..``f4`` are reported and the feature types are ``None``.
    """
    data = np.random.randn(5, 5)

    # different length
    self.assertRaises(ValueError, xgb.DMatrix, data,
                      feature_names=list('abcdef'))
    # contains duplicates
    self.assertRaises(ValueError, xgb.DMatrix, data,
                      feature_names=['a', 'b', 'c', 'd', 'd'])
    # contains symbol
    self.assertRaises(ValueError, xgb.DMatrix, data,
                      feature_names=['a', 'b', 'c', 'd', 'e<1'])

    dm = xgb.DMatrix(data)
    dm.feature_names = list('abcde')
    assert dm.feature_names == list('abcde')

    # A row slice must carry the feature names over unchanged.
    assert dm.slice([0, 1]).feature_names == dm.feature_names

    # A single type code is expanded to one entry per column.
    dm.feature_types = 'q'
    assert dm.feature_types == list('qqqqq')

    dm.feature_types = list('qiqiq')
    assert dm.feature_types == list('qiqiq')

    def incorrect_type_set():
        # Attribute assignment is a statement, so it is wrapped in a
        # function to make it usable with assertRaises.
        dm.feature_types = list('abcde')

    self.assertRaises(ValueError, incorrect_type_set)

    # reset
    dm.feature_names = None
    self.assertEqual(dm.feature_names, ['f0', 'f1', 'f2', 'f3', 'f4'])
    assert dm.feature_types is None
|
||||
|
||||
def test_feature_names(self):
    """Train and predict with named features, for ASCII and non-ASCII names.

    For each set of feature names the test checks that the DMatrix keeps
    the names and shape, that the feature scores of a trained booster are
    keyed by exactly those names, that prediction works on a matrix with
    the same names, and that prediction on a matrix with different names
    raises ``ValueError``.
    """
    data = np.random.randn(100, 5)
    target = np.array([0, 1] * 50)

    ascii_names = ['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5']
    unicode_names = [u'要因1', u'要因2', u'要因3', u'要因4', u'要因5']

    for features in (ascii_names, unicode_names):
        train_matrix = xgb.DMatrix(data, label=target,
                                   feature_names=features)
        assert train_matrix.feature_names == features
        assert train_matrix.num_row() == 100
        assert train_matrix.num_col() == 5

        params = dict(objective='multi:softprob',
                      eval_metric='mlogloss',
                      eta=0.3,
                      num_class=3)

        bst = xgb.train(params, train_matrix, num_boost_round=10)
        scores = bst.get_fscore()
        # Iterating the score mapping yields its keys (the feature names).
        assert sorted(scores) == features

        dummy = np.random.randn(5, 5)
        same_names = xgb.DMatrix(dummy, feature_names=features)
        bst.predict(same_names)

        # Predicting with mismatching feature names must raise an error.
        other_names = xgb.DMatrix(dummy, feature_names=list('abcde'))
        self.assertRaises(ValueError, bst.predict, other_names)
|
||||
|
||||
def test_get_info(self):
    """Smoke-test the meta-info getters on the agaricus training set.

    The returned values are not inspected; the test only requires that
    each getter call completes without raising.
    """
    dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
    for float_field in ('label', 'weight', 'base_margin'):
        dtrain.get_float_info(float_field)
    dtrain.get_uint_info('root_index')
|
||||
|
||||
def test_sparse_dmatrix_csr(self):
    """Train on a very sparse CSR matrix and check the DMatrix keeps its shape.

    The test first asserts that no stored entry lands in the last column,
    then requires the DMatrix to still report the full (nrow, ncol) shape
    before running a short training and prediction round.
    """
    n_rows, n_cols = 100, 1000
    sparse = rand(n_rows, n_cols, density=0.0005, format='csr',
                  random_state=rng)
    # For CSR, ``indices`` holds column indices.
    assert sparse.indices.max() < n_cols - 1
    sparse.data[:] = 1

    labels = np.random.binomial(1, 0.3, n_rows)
    dtrain = xgb.DMatrix(sparse, label=labels)
    reported_shape = (dtrain.num_row(), dtrain.num_col())
    assert reported_shape == (n_rows, n_cols)

    watchlist = [(dtrain, 'train')]
    param = dict(max_depth=3, objective='binary:logistic', verbosity=0)
    bst = xgb.train(param, dtrain, 5, watchlist)
    bst.predict(dtrain)
|
||||
|
||||
def test_sparse_dmatrix_csc(self):
    """Train on a very sparse CSC matrix and check the DMatrix keeps its shape.

    The test first asserts that no stored entry lands in the last row,
    then requires the DMatrix to still report the full (nrow, ncol) shape
    before running a short training and prediction round.
    """
    n_rows, n_cols = 1000, 100
    sparse = rand(n_rows, n_cols, density=0.0005, format='csc',
                  random_state=rng)
    # For CSC, ``indices`` holds row indices.
    assert sparse.indices.max() < n_rows - 1
    sparse.data[:] = 1

    labels = np.random.binomial(1, 0.3, n_rows)
    dtrain = xgb.DMatrix(sparse, label=labels)
    reported_shape = (dtrain.num_row(), dtrain.num_col())
    assert reported_shape == (n_rows, n_cols)

    watchlist = [(dtrain, 'train')]
    param = dict(max_depth=3, objective='binary:logistic', verbosity=0)
    bst = xgb.train(param, dtrain, 5, watchlist)
    bst.predict(dtrain)
|
||||
@ -1,33 +0,0 @@
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
from scipy.sparse import rand
|
||||
|
||||
# Seeded generator shared by the sparse-matrix tests in this module.
rng = np.random.RandomState(1)

# Training parameters shared by the sparse-matrix tests in this module.
param = dict(max_depth=3, objective='binary:logistic', verbosity=0)
|
||||
|
||||
|
||||
def test_sparse_dmatrix_csr():
    """Train on a very sparse CSR matrix and check the DMatrix keeps its shape.

    The test first asserts that no stored entry lands in the last column,
    then requires the DMatrix to still report the full (nrow, ncol) shape
    before running a short training and prediction round with the
    module-level ``param``.
    """
    n_rows, n_cols = 100, 1000
    sparse = rand(n_rows, n_cols, density=0.0005, format='csr',
                  random_state=rng)
    # For CSR, ``indices`` holds column indices.
    assert sparse.indices.max() < n_cols - 1
    sparse.data[:] = 1

    labels = np.random.binomial(1, 0.3, n_rows)
    dtrain = xgb.DMatrix(sparse, label=labels)
    reported_shape = (dtrain.num_row(), dtrain.num_col())
    assert reported_shape == (n_rows, n_cols)

    watchlist = [(dtrain, 'train')]
    bst = xgb.train(param, dtrain, 5, watchlist)
    bst.predict(dtrain)
|
||||
|
||||
|
||||
def test_sparse_dmatrix_csc():
    """Train on a very sparse CSC matrix and check the DMatrix keeps its shape.

    The test first asserts that no stored entry lands in the last row,
    then requires the DMatrix to still report the full (nrow, ncol) shape
    before running a short training and prediction round with the
    module-level ``param``.
    """
    n_rows, n_cols = 1000, 100
    sparse = rand(n_rows, n_cols, density=0.0005, format='csc',
                  random_state=rng)
    # For CSC, ``indices`` holds row indices.
    assert sparse.indices.max() < n_rows - 1
    sparse.data[:] = 1

    labels = np.random.binomial(1, 0.3, n_rows)
    dtrain = xgb.DMatrix(sparse, label=labels)
    reported_shape = (dtrain.num_row(), dtrain.num_col())
    assert reported_shape == (n_rows, n_cols)

    watchlist = [(dtrain, 'train')]
    bst = xgb.train(param, dtrain, 5, watchlist)
    bst.predict(dtrain)
|
||||
Loading…
x
Reference in New Issue
Block a user