External data adapters (#5044)

* Use external data adapters as lightweight intermediate layer between external data and DMatrix
This commit is contained in:
Rory Mitchell
2019-12-04 10:56:17 +13:00
committed by GitHub
parent f2277e7106
commit e3c34c79be
15 changed files with 1058 additions and 593 deletions

488
src/data/adapter.h Normal file
View File

@@ -0,0 +1,488 @@
/*!
* Copyright (c) 2019 by Contributors
* \file adapter.h
*/
#ifndef XGBOOST_DATA_ADAPTER_H_
#define XGBOOST_DATA_ADAPTER_H_
#include <limits>
#include <memory>
#include <string>
namespace xgboost {
namespace data {
/** External data formats should implement an adapter as below. The
* adapter provides a uniform access to data outside xgboost, allowing
* construction of DMatrix objects from a range of sources without duplicating
* code.
*
* The adapter object is an iterator that returns batches of data. Each batch
* contains a number of "lines". A line represents a set of elements from a
* sparse input matrix, normally a row in the case of a CSR matrix or a column
* for a CSC matrix. Typically in sparse matrix formats we can efficiently
* access subsets of elements at a time, but cannot efficiently look up elements
* by random access, hence the "line" abstraction, allowing the sparse matrix to
* return subsets of elements efficiently. Individual elements are described by
* a COO tuple (row index, column index, value).
*
* This abstraction allows us to read through different sparse matrix formats
* using the same interface. In particular we can write a DMatrix constructor
* that uses the same code to construct itself from a CSR matrix, CSC matrix,
* dense matrix, csv, libsvm file, or potentially other formats. To see why this
* is necessary, imagine we have 5 external matrix formats and 5 internal
* DMatrix types where each DMatrix needs a custom constructor for each possible
* input. The number of constructors is 5*5=25. Using an abstraction over the
* input data types the number of constructors is reduced to 5, as each DMatrix
* is oblivious to the external data format. Adding a new input source is simply
* a case of implementing an adapter.
*
* Most of the below adapters do not need more than one batch as the data
* originates from an in memory source. The file adapter does require batches to
* avoid loading the entire file in memory.
*
* An important detail is empty row/column handling. Files loaded from disk do
* not provide meta information about the number of rows/columns to expect, this
* needs to be inferred during construction. Other sparse formats may specify a
* number of rows/columns, but we can encounter entirely sparse rows or columns,
* leading to disagreement between the inferred number and the meta-info
* provided. To resolve this, adapters have methods specifying the number of
* rows/columns expected; these methods return kAdapterUnknownSize where these
* values must be inferred from data. A constructed DMatrix should agree with the input
* source on numbers of rows/columns, appending empty rows if necessary.
* */
/** \brief An adapter can return this value for number of rows or columns
* indicating that this value is currently unknown and should be inferred while
* passing over the data. It is the largest value representable by size_t. */
constexpr size_t kAdapterUnknownSize = std::numeric_limits<size_t >::max();
/** \brief One matrix element in coordinate form: (row index, column index,
 * value). */
struct COOTuple {
  size_t row_idx{0};
  size_t column_idx{0};
  float value{0};

  /** \brief Build an element from its row, its column and its value. */
  COOTuple(size_t row_idx, size_t column_idx, float value)
      : row_idx{row_idx}, column_idx{column_idx}, value{value} {}
};
namespace detail {
/**
 * \brief Helper base for DataIter implementations that hold exactly one batch.
 *
 * Next() reports a batch on its first call after construction or after
 * BeforeFirst(), and reports exhaustion on every call after that.
 */
template <typename DType>
class SingleBatchDataIter : dmlc::DataIter<DType> {
 public:
  /** \brief Rewind so the single batch is offered again by Next(). */
  void BeforeFirst() override { counter = 0; }

  /** \brief Advance; true only for the first call since the last rewind. */
  bool Next() override {
    const bool first_call = (counter == 0);
    if (first_call) {
      ++counter;
    }
    return first_call;
  }

 private:
  int counter{0};
};
/** \brief Mix-in for batches whose data source carries no meta-info: every
 * accessor (labels, weights, qid) yields a null pointer. */
class NoMetaInfo {
 public:
  /** \brief Always null: no labels are available. */
  const float* Labels() const {
    return nullptr;
  }
  /** \brief Always null: no weights are available. */
  const float* Weights() const {
    return nullptr;
  }
  /** \brief Always null: no query ids are available. */
  const uint64_t* Qid() const {
    return nullptr;
  }
};
}; // namespace detail
/** \brief Batch viewing caller-owned CSR arrays; each line is one row. */
class CSRAdapterBatch : public detail::NoMetaInfo {
 public:
  /** \brief View over the stored elements of a single CSR row. */
  class Line {
   public:
    Line(size_t row_idx, size_t size, const unsigned* feature_idx,
         const float* values)
        : row_idx(row_idx), size(size), feature_idx(feature_idx), values(values) {}

    /** \brief Number of stored elements in this row. */
    size_t Size() const { return size; }

    /** \brief The idx-th stored element of the row as a COO tuple. */
    COOTuple GetElement(size_t idx) const {
      const size_t column = feature_idx[idx];
      const float entry = values[idx];
      return COOTuple(row_idx, column, entry);
    }

   private:
    size_t row_idx;
    size_t size;
    const unsigned* feature_idx;
    const float* values;
  };

  // The initializer list follows the member declaration order below.
  CSRAdapterBatch(const size_t* row_ptr, const unsigned* feature_idx,
                  const float* values, size_t num_rows, size_t num_elements,
                  size_t num_features)
      : row_ptr(row_ptr),
        feature_idx(feature_idx),
        values(values),
        num_elements(num_elements),
        num_rows(num_rows),
        num_features(num_features) {}

  /** \brief Row idx, delimited by row_ptr[idx] and row_ptr[idx + 1]. */
  const Line GetLine(size_t idx) const {
    const size_t first = row_ptr[idx];
    const size_t length = row_ptr[idx + 1] - first;
    return Line(idx, length, feature_idx + first, values + first);
  }

  /** \brief Number of lines (rows) in the batch. */
  size_t Size() const { return num_rows; }

 private:
  const size_t* row_ptr;
  const unsigned* feature_idx;
  const float* values;
  size_t num_elements;
  size_t num_rows;
  size_t num_features;
};
/** \brief Single-batch adapter over caller-owned CSR arrays. */
class CSRAdapter : public detail::SingleBatchDataIter<CSRAdapterBatch> {
 public:
  CSRAdapter(const size_t* row_ptr, const unsigned* feature_idx,
             const float* values, size_t num_rows, size_t num_elements,
             size_t num_features)
      : batch{row_ptr, feature_idx, values, num_rows, num_elements, num_features},
        num_rows{num_rows},
        num_columns{num_features} {}

  /** \brief The one and only batch. */
  const CSRAdapterBatch& Value() const override {
    return batch;
  }
  /** \brief Row count as supplied at construction. */
  size_t NumRows() const {
    return num_rows;
  }
  /** \brief Column count as supplied at construction (num_features). */
  size_t NumColumns() const {
    return num_columns;
  }

 private:
  CSRAdapterBatch batch;
  size_t num_rows;
  size_t num_columns;
};
/** \brief Batch viewing a caller-owned dense float buffer laid out row by row
 * (row idx starts at values + idx * num_features); each line is one row. */
class DenseAdapterBatch : public detail::NoMetaInfo {
 public:
  // The initializer list follows the member declaration order below.
  DenseAdapterBatch(const float* values, size_t num_rows, size_t num_elements,
                    size_t num_features)
      : values(values),
        num_elements(num_elements),
        num_rows(num_rows),
        num_features(num_features) {}

 private:
  /** \brief View over one dense row. */
  class Line {
   public:
    Line(const float* values, size_t size, size_t row_idx)
        : row_idx(row_idx), size(size), values(values) {}

    /** \brief Number of elements in the row. */
    size_t Size() const { return size; }

    /** \brief Element idx of the row; the column index equals idx. */
    COOTuple GetElement(size_t idx) const {
      const float entry = values[idx];
      return COOTuple(row_idx, idx, entry);
    }

   private:
    size_t row_idx;
    size_t size;
    const float* values;
  };

 public:
  /** \brief Number of lines (rows) in the batch. */
  size_t Size() const { return num_rows; }

  /** \brief Row idx of the dense buffer. */
  const Line GetLine(size_t idx) const {
    const float* row_begin = values + idx * num_features;
    return Line(row_begin, num_features, idx);
  }

 private:
  const float* values;
  size_t num_elements;
  size_t num_rows;
  size_t num_features;
};
/** \brief Single-batch adapter over a caller-owned dense float buffer. */
class DenseAdapter : public detail::SingleBatchDataIter<DenseAdapterBatch> {
 public:
  DenseAdapter(const float* values, size_t num_rows, size_t num_elements,
               size_t num_features)
      : batch{values, num_rows, num_elements, num_features},
        num_rows{num_rows},
        num_columns{num_features} {}

  /** \brief The one and only batch. */
  const DenseAdapterBatch& Value() const override {
    return batch;
  }
  /** \brief Row count as supplied at construction. */
  size_t NumRows() const {
    return num_rows;
  }
  /** \brief Column count as supplied at construction (num_features). */
  size_t NumColumns() const {
    return num_columns;
  }

 private:
  DenseAdapterBatch batch;
  size_t num_rows;
  size_t num_columns;
};
/** \brief Batch viewing caller-owned CSC arrays; each line is one column. */
class CSCAdapterBatch : public detail::NoMetaInfo {
 public:
  CSCAdapterBatch(const size_t* col_ptr, const unsigned* row_idx,
                  const float* values, size_t num_features)
      : col_ptr(col_ptr),
        row_idx(row_idx),
        values(values),
        num_features(num_features) {}

 private:
  /** \brief View over the stored elements of a single CSC column. */
  class Line {
   public:
    Line(size_t col_idx, size_t size, const unsigned* row_idx,
         const float* values)
        : col_idx(col_idx), size(size), row_idx(row_idx), values(values) {}

    /** \brief Number of stored elements in this column. */
    size_t Size() const { return size; }

    /** \brief The idx-th stored element of the column as a COO tuple. */
    COOTuple GetElement(size_t idx) const {
      const size_t row = row_idx[idx];
      const float entry = values[idx];
      return COOTuple(row, col_idx, entry);
    }

   private:
    size_t col_idx;
    size_t size;
    const unsigned* row_idx;
    const float* values;
  };

 public:
  /** \brief Number of lines (columns) in the batch. */
  size_t Size() const { return num_features; }

  /** \brief Column idx, delimited by col_ptr[idx] and col_ptr[idx + 1]. */
  const Line GetLine(size_t idx) const {
    const size_t first = col_ptr[idx];
    const size_t length = col_ptr[idx + 1] - first;
    return Line(idx, length, row_idx + first, values + first);
  }

 private:
  const size_t* col_ptr;
  const unsigned* row_idx;
  const float* values;
  size_t num_features;
};
/** \brief Single-batch adapter over caller-owned CSC arrays. */
class CSCAdapter : public detail::SingleBatchDataIter<CSCAdapterBatch> {
 public:
  CSCAdapter(const size_t* col_ptr, const unsigned* row_idx,
             const float* values, size_t num_features, size_t num_rows)
      : batch{col_ptr, row_idx, values, num_features},
        num_rows{num_rows},
        num_columns{num_features} {}

  /** \brief The one and only batch. */
  const CSCAdapterBatch& Value() const override {
    return batch;
  }

  /** \brief Row count, or kAdapterUnknownSize when constructed with 0 rows. */
  size_t NumRows() const {
    // JVM package sends 0 as unknown
    if (num_rows == 0) {
      return kAdapterUnknownSize;
    }
    return num_rows;
  }

  /** \brief Column count as supplied at construction (num_features). */
  size_t NumColumns() const {
    return num_columns;
  }

 private:
  CSCAdapterBatch batch;
  size_t num_rows;
  size_t num_columns;
};
/** \brief Batch over an array of typed column buffers; each line is one
 * column. The element type of column i is named by feature_stypes[i]. */
class DataTableAdapterBatch : public detail::NoMetaInfo {
 public:
  /**
   * \param data           One pointer per column, to that column's values.
   * \param feature_stypes One type name per column ("float32", "int8", ...).
   * \param num_rows       Number of values in every column.
   * \param num_features   Number of columns.
   */
  DataTableAdapterBatch(void** data, const char** feature_stypes,
                        size_t num_rows, size_t num_features)
      : data(data),
        feature_stypes(feature_stypes),
        num_features(num_features),
        num_rows(num_rows) {}

 private:
  enum class DTType : uint8_t {
    kFloat32 = 0,
    kFloat64 = 1,
    kBool8 = 2,
    kInt32 = 3,
    kInt8 = 4,
    kInt16 = 5,
    kInt64 = 6,
    kUnknown = 7
  };

  /** \brief Map a column type name to DTType; unknown names are fatal.
   * Takes the name by const reference so no extra string copy is made. */
  DTType DTGetType(const std::string& type_string) const {
    if (type_string == "float32") {
      return DTType::kFloat32;
    } else if (type_string == "float64") {
      return DTType::kFloat64;
    } else if (type_string == "bool8") {
      return DTType::kBool8;
    } else if (type_string == "int32") {
      return DTType::kInt32;
    } else if (type_string == "int8") {
      return DTType::kInt8;
    } else if (type_string == "int16") {
      return DTType::kInt16;
    } else if (type_string == "int64") {
      return DTType::kInt64;
    } else {
      LOG(FATAL) << "Unknown data table type.";
      return DTType::kUnknown;
    }
  }

  /** \brief View over one typed column. */
  class Line {
    /** \brief Read row ridx of the column as float. Non-finite floats and the
     * minimum value of each signed integer type are mapped to NaN. */
    float DTGetValue(const void* column, DTType dt_type, size_t ridx) const {
      const float missing = std::numeric_limits<float>::quiet_NaN();
      switch (dt_type) {
        case DTType::kFloat32: {
          float val = reinterpret_cast<const float*>(column)[ridx];
          return std::isfinite(val) ? val : missing;
        }
        case DTType::kFloat64: {
          double val = reinterpret_cast<const double*>(column)[ridx];
          return std::isfinite(val) ? static_cast<float>(val) : missing;
        }
        case DTType::kBool8: {
          // NOTE(review): no missing-value sentinel is checked for bool8, and
          // the byte is read through a bool pointer -- confirm against the
          // producer's encoding.
          bool val = reinterpret_cast<const bool*>(column)[ridx];
          return static_cast<float>(val);
        }
        case DTType::kInt32: {
          int32_t val = reinterpret_cast<const int32_t*>(column)[ridx];
          return val != std::numeric_limits<int32_t>::min()
                     ? static_cast<float>(val)
                     : missing;
        }
        case DTType::kInt8: {
          int8_t val = reinterpret_cast<const int8_t*>(column)[ridx];
          return val != std::numeric_limits<int8_t>::min()
                     ? static_cast<float>(val)
                     : missing;
        }
        case DTType::kInt16: {
          int16_t val = reinterpret_cast<const int16_t*>(column)[ridx];
          return val != std::numeric_limits<int16_t>::min()
                     ? static_cast<float>(val)
                     : missing;
        }
        case DTType::kInt64: {
          int64_t val = reinterpret_cast<const int64_t*>(column)[ridx];
          return val != std::numeric_limits<int64_t>::min()
                     ? static_cast<float>(val)
                     : missing;
        }
        default: {
          LOG(FATAL) << "Unknown data table type.";
          return 0.0f;
        }
      }
    }

   public:
    Line(DTType type, size_t size, size_t column_idx, const void* column)
        : type(type), size(size), column_idx(column_idx), column(column) {}

    /** \brief Number of values in the column. */
    size_t Size() const { return size; }

    /** \brief Value at row idx of this column as a COO tuple. */
    COOTuple GetElement(size_t idx) const {
      return COOTuple(idx, column_idx, DTGetValue(column, type, idx));
    }

   private:
    DTType type;
    size_t size;
    size_t column_idx;
    const void* column;
  };

 public:
  /** \brief Number of lines (columns) in the batch. */
  size_t Size() const { return num_features; }

  /** \brief Column idx, typed according to feature_stypes[idx]. */
  const Line GetLine(size_t idx) const {
    return Line(DTGetType(feature_stypes[idx]), num_rows, idx, data[idx]);
  }

 private:
  void** data;
  const char** feature_stypes;
  size_t num_features;
  size_t num_rows;
};
class DataTableAdapter
: public detail::SingleBatchDataIter<DataTableAdapterBatch> {
public:
DataTableAdapter(void** data, const char** feature_stypes, size_t num_rows,
size_t num_features)
: batch(data, feature_stypes, num_rows, num_features),
num_rows(num_rows),
num_columns(num_features) {}
const DataTableAdapterBatch& Value() const override { return batch; }
size_t NumRows() const { return num_rows; }
size_t NumColumns() const { return num_columns; }
private:
DataTableAdapterBatch batch;
size_t num_rows;
size_t num_columns;
};
/** \brief Batch wrapping one dmlc::RowBlock; each line is one row of the
 * block, and the block's labels, weights and qid arrays are exposed as
 * meta-info. */
class FileAdapterBatch {
 public:
  /** \brief View over the stored elements of a single row of the block. */
  class Line {
   public:
    /**
     * \param row_idx     Row index reported for every element of this line.
     * \param feature_idx Column indices of the row's elements.
     * \param value       Element values, or nullptr to report 1.0f for all.
     * \param size        Number of elements in the row.
     */
    Line(size_t row_idx, const uint32_t* feature_idx, const float* value,
         size_t size)
        : row_idx(row_idx),
          feature_idx(feature_idx),
          value(value),
          size(size) {}

    size_t Size() const { return size; }

    COOTuple GetElement(size_t idx) const {
      // A null value array means every stored element has value 1.
      float fvalue = value == nullptr ? 1.0f : value[idx];
      return COOTuple(row_idx, feature_idx[idx], fvalue);
    }

   private:
    size_t row_idx;
    const uint32_t* feature_idx;
    const float* value;
    size_t size;
  };

  /**
   * \param block      Parsed block to view; must outlive this batch.
   * \param row_offset Added to block-local row numbers to form row indices.
   */
  FileAdapterBatch(const dmlc::RowBlock<uint32_t>* block, size_t row_offset)
      : block(block), row_offset(row_offset) {}

  /** \brief Row idx of the block, delimited by offset[idx], offset[idx + 1]. */
  Line GetLine(size_t idx) const {
    auto begin = block->offset[idx];
    auto end = block->offset[idx + 1];
    // Keep null arrays null. Offsetting a null value pointer by begin > 0
    // would give a non-null bogus pointer, defeating the nullptr check in
    // Line::GetElement so that invalid memory is read.
    const uint32_t* line_index =
        block->index == nullptr ? nullptr : block->index + begin;
    const float* line_value =
        block->value == nullptr ? nullptr : block->value + begin;
    return Line(idx + row_offset, line_index, line_value, end - begin);
  }

  const float* Labels() const { return block->label; }
  const float* Weights() const { return block->weight; }
  const uint64_t* Qid() const { return block->qid; }

  /** \brief Number of lines (rows) in the block. */
  size_t Size() const { return block->size; }

 private:
  const dmlc::RowBlock<uint32_t>* block;
  size_t row_offset;
};
/** \brief FileAdapter wraps dmlc::parser to read files and provide access in a
 * common interface. Each parser block becomes one batch; row indices continue
 * across batches through a running row offset. The parser is not owned. */
class FileAdapter : dmlc::DataIter<FileAdapterBatch> {
 public:
  explicit FileAdapter(dmlc::Parser<uint32_t>* parser) : parser(parser) {}

  /** \brief Current batch; only meaningful after Next() has returned true. */
  const FileAdapterBatch& Value() const override { return *batch.get(); }

  /** \brief Rewind the parser and restart row numbering from zero. */
  void BeforeFirst() override {
    batch.reset();
    parser->BeforeFirst();
    row_offset = 0;
  }

  /** \brief Advance to the next parser block.
   * \return false when the parser is exhausted. In that case the parser's
   *         value is not read and the row offset is left unchanged. */
  bool Next() override {
    if (!parser->Next()) {
      return false;
    }
    const dmlc::RowBlock<uint32_t>& block = parser->Value();
    batch.reset(new FileAdapterBatch(&block, row_offset));
    row_offset += block.size;
    return true;
  }

  // Indicates a number of rows/columns must be inferred
  size_t NumRows() const { return kAdapterUnknownSize; }
  size_t NumColumns() const { return kAdapterUnknownSize; }

 private:
  size_t row_offset{0};
  std::unique_ptr<FileAdapterBatch> batch;
  dmlc::Parser<uint32_t>* parser;
};
}; // namespace data
} // namespace xgboost
#endif // XGBOOST_DATA_ADAPTER_H_

View File

@@ -15,6 +15,7 @@
#include "../common/io.h"
#include "../common/version.h"
#include "../common/group_data.h"
#include "../data/adapter.h"
#if DMLC_ENABLE_STD_THREAD
#include "./sparse_page_source.h"
@@ -207,6 +208,7 @@ DMatrix* DMatrix::Load(const std::string& uri,
LOG(CONSOLE) << "Load part of data " << partid
<< " of " << npart << " parts";
}
// legacy handling of binary data loading
if (file_format == "auto" && npart == 1) {
int magic;
@@ -214,13 +216,13 @@ DMatrix* DMatrix::Load(const std::string& uri,
if (fi != nullptr) {
common::PeekableInStream is(fi.get());
if (is.PeekRead(&magic, sizeof(magic)) == sizeof(magic) &&
magic == data::SimpleCSRSource::kMagic) {
magic == data::SimpleCSRSource::kMagic) {
std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
source->LoadBinary(&is);
DMatrix* dmat = DMatrix::Create(std::move(source), cache_file);
if (!silent) {
LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with "
<< dmat->Info().num_nonzero_ << " entries loaded from " << uri;
<< dmat->Info().num_nonzero_ << " entries loaded from " << uri;
}
return dmat;
}
@@ -291,9 +293,9 @@ DMatrix* DMatrix::Create(dmlc::Parser<uint32_t>* parser,
const std::string& cache_prefix,
const size_t page_size) {
if (cache_prefix.length() == 0) {
std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
source->CopyFrom(parser);
return DMatrix::Create(std::move(source), cache_prefix);
data::FileAdapter adapter(parser);
return DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(),
1);
} else {
#if DMLC_ENABLE_STD_THREAD
if (!data::SparsePageSource<SparsePage>::CacheExist(cache_prefix, ".row.page")) {
@@ -355,9 +357,23 @@ DMatrix* DMatrix::Create(std::unique_ptr<DataSource<SparsePage>>&& source,
#endif // DMLC_ENABLE_STD_THREAD
}
}
} // namespace xgboost
namespace xgboost {
// Build an in-memory DMatrix from an external data adapter. The matrix is
// allocated with new and returned as a raw pointer.
template <typename AdapterT>
DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread) {
  DMatrix* created = new data::SimpleDMatrix(adapter, missing, nthread);
  return created;
}
// Explicit instantiations of DMatrix::Create for each adapter type defined in
// data/adapter.h (dense, CSR, CSC, data table and file adapters).
template DMatrix* DMatrix::Create<data::DenseAdapter>(data::DenseAdapter* adapter,
float missing, int nthread);
template DMatrix* DMatrix::Create<data::CSRAdapter>(data::CSRAdapter* adapter,
float missing, int nthread);
template DMatrix* DMatrix::Create<data::CSCAdapter>(data::CSCAdapter* adapter,
float missing, int nthread);
template DMatrix* DMatrix::Create<data::DataTableAdapter>(
data::DataTableAdapter* adapter, float missing, int nthread);
template DMatrix* DMatrix::Create<data::FileAdapter>(data::FileAdapter* adapter,
float missing, int nthread);
SparsePage SparsePage::GetTranspose(int num_columns) const {
SparsePage transpose;
common::ParallelGroupBuilder<Entry, bst_row_t> builder(&transpose.offset.HostVector(),

View File

@@ -6,7 +6,6 @@
#include <xgboost/logging.h>
#include <xgboost/json.h>
#include <limits>
#include "simple_csr_source.h"
#include "columnar.h"
@@ -26,69 +25,6 @@ void SimpleCSRSource::CopyFrom(DMatrix* src) {
}
}
// Rebuild this source from a dmlc parser: clear the current content, then for
// every parsed row block append its labels, weights, qid-derived group
// boundaries and sparse entries, updating row/column/non-zero counts.
void SimpleCSRSource::CopyFrom(dmlc::Parser<uint32_t>* parser) {
// use qid to get group info
// default_max doubles as the "no query id seen yet" marker for last_group_id
const uint64_t default_max = std::numeric_limits<uint64_t>::max();
uint64_t last_group_id = default_max;
bst_uint group_size = 0;
std::vector<uint64_t> qids;
this->Clear();
while (parser->Next()) {
const dmlc::RowBlock<uint32_t>& batch = parser->Value();
// labels, weights and qid are optional per block; null means not provided
if (batch.label != nullptr) {
auto& labels = info.labels_.HostVector();
labels.insert(labels.end(), batch.label, batch.label + batch.size);
}
if (batch.weight != nullptr) {
auto& weights = info.weights_.HostVector();
weights.insert(weights.end(), batch.weight, batch.weight + batch.size);
}
if (batch.qid != nullptr) {
qids.insert(qids.end(), batch.qid, batch.qid + batch.size);
// get group
for (size_t i = 0; i < batch.size; ++i) {
const uint64_t cur_group_id = batch.qid[i];
// a new group starts at the first row and whenever the query id changes
if (last_group_id == default_max || last_group_id != cur_group_id) {
info.group_ptr_.push_back(group_size);
}
last_group_id = cur_group_id;
++group_size;
}
}
// Remove the assertion on batch.index, which can be null in the case that the data in this
// batch is entirely sparse. Although it's true that this indicates a likely issue with the
// user's data workflows, passing XGBoost entirely sparse data should not cause it to fail.
// See https://github.com/dmlc/xgboost/issues/1827 for complete detail.
// CHECK(batch.index != nullptr);
// update information
this->info.num_row_ += batch.size;
// copy the data over
auto& data_vec = page_.data.HostVector();
auto& offset_vec = page_.offset.HostVector();
for (size_t i = batch.offset[0]; i < batch.offset[batch.size]; ++i) {
uint32_t index = batch.index[i];
// a null value array means every stored entry has value 1
bst_float fvalue = batch.value == nullptr ? 1.0f : batch.value[i];
data_vec.emplace_back(index, fvalue);
// the column count is inferred as the largest feature index seen plus one
this->info.num_col_ = std::max(this->info.num_col_,
static_cast<uint64_t>(index + 1));
}
// top is the number of offsets stored before this block; each block row end
// is rebased onto the last offset that was already present
size_t top = page_.offset.Size();
for (size_t i = 0; i < batch.size; ++i) {
offset_vec.push_back(offset_vec[top - 1] + batch.offset[i + 1] - batch.offset[0]);
}
}
// close the final group if any query id was seen
if (last_group_id != default_max) {
if (group_size > info.group_ptr_.back()) {
info.group_ptr_.push_back(group_size);
}
}
this->info.num_nonzero_ = static_cast<uint64_t>(page_.data.Size());
// Either every row has query ID or none at all
CHECK(qids.empty() || qids.size() == info.num_row_);
}
void SimpleCSRSource::LoadBinary(dmlc::Stream* fi) {
int tmagic;
CHECK(fi->Read(&tmagic, sizeof(tmagic)) == sizeof(tmagic)) << "invalid input file format";

View File

@@ -45,12 +45,7 @@ class SimpleCSRSource : public DataSource<SparsePage> {
* \param src source data iter.
*/
void CopyFrom(DMatrix* src);
/*!
* \brief copy content of data from parser, also set the additional information.
* \param src parser to read the data from.
*/
void CopyFrom(dmlc::Parser<uint32_t>* src);
/*!
* \brief copy content of data from foreign **GPU** columnar buffer.
* \param interfaces_str JSON representation of cuda array interfaces.

View File

@@ -11,12 +11,15 @@
#include <xgboost/data.h>
#include <algorithm>
#include <cstring>
#include <memory>
#include <limits>
#include <utility>
#include <vector>
#include "simple_csr_source.h"
#include "../common/group_data.h"
#include "../common/math.h"
#include "adapter.h"
namespace xgboost {
namespace data {
@@ -26,6 +29,121 @@ class SimpleDMatrix : public DMatrix {
explicit SimpleDMatrix(std::unique_ptr<DataSource<SparsePage>>&& source)
: source_(std::move(source)) {}
template <typename AdapterT>
explicit SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
// Set number of threads but keep old value so we can reset it after
const int nthreadmax = omp_get_max_threads();
if (nthread <= 0) nthread = nthreadmax;
int nthread_original = omp_get_max_threads();
omp_set_num_threads(nthread);
source_.reset(new SimpleCSRSource());
SimpleCSRSource& mat = *reinterpret_cast<SimpleCSRSource*>(source_.get());
std::vector<uint64_t> qids;
uint64_t default_max = std::numeric_limits<uint64_t>::max();
uint64_t last_group_id = default_max;
bst_uint group_size = 0;
auto& offset_vec = mat.page_.offset.HostVector();
auto& data_vec = mat.page_.data.HostVector();
uint64_t inferred_num_columns = 0;
adapter->BeforeFirst();
// Iterate over batches of input data
while (adapter->Next()) {
auto &batch = adapter->Value();
common::ParallelGroupBuilder<
Entry, std::remove_reference<decltype(offset_vec)>::type::value_type>
builder(&offset_vec, &data_vec);
builder.InitBudget(0, nthread);
// First-pass over the batch counting valid elements
size_t num_lines = batch.Size();
#pragma omp parallel for schedule(static)
for (omp_ulong i = 0; i < static_cast<omp_ulong>(num_lines);
++i) { // NOLINT(*)
int tid = omp_get_thread_num();
auto line = batch.GetLine(i);
for (auto j = 0ull; j < line.Size(); j++) {
auto element = line.GetElement(j);
inferred_num_columns =
std::max(inferred_num_columns,
static_cast<uint64_t>(element.column_idx + 1));
if (!common::CheckNAN(element.value) && element.value != missing) {
builder.AddBudget(element.row_idx, tid);
}
}
}
builder.InitStorage();
// Second pass over batch, placing elements in correct position
#pragma omp parallel for schedule(static)
for (omp_ulong i = 0; i < static_cast<omp_ulong>(num_lines);
++i) { // NOLINT(*)
int tid = omp_get_thread_num();
auto line = batch.GetLine(i);
for (auto j = 0ull; j < line.Size(); j++) {
auto element = line.GetElement(j);
if (!common::CheckNAN(element.value) && element.value != missing) {
builder.Push(element.row_idx, Entry(element.column_idx, element.value),
tid);
}
}
}
// Append meta information if available
if (batch.Labels() != nullptr) {
auto& labels = mat.info.labels_.HostVector();
labels.insert(labels.end(), batch.Labels(), batch.Labels() + batch.Size());
}
if (batch.Weights() != nullptr) {
auto& weights = mat.info.weights_.HostVector();
weights.insert(weights.end(), batch.Weights(), batch.Weights() + batch.Size());
}
if (batch.Qid() != nullptr) {
qids.insert(qids.end(), batch.Qid(), batch.Qid() + batch.Size());
// get group
for (size_t i = 0; i < batch.Size(); ++i) {
const uint64_t cur_group_id = batch.Qid()[i];
if (last_group_id == default_max || last_group_id != cur_group_id) {
mat.info.group_ptr_.push_back(group_size);
}
last_group_id = cur_group_id;
++group_size;
}
}
}
if (last_group_id != default_max) {
if (group_size > mat.info.group_ptr_.back()) {
mat.info.group_ptr_.push_back(group_size);
}
}
// Deal with empty rows/columns if necessary
if (adapter->NumColumns() == kAdapterUnknownSize) {
mat.info.num_col_ = inferred_num_columns;
} else {
mat.info.num_col_ = adapter->NumColumns();
}
// Synchronise worker columns
rabit::Allreduce<rabit::op::Max>(&mat.info.num_col_, 1);
if (adapter->NumRows() == kAdapterUnknownSize) {
mat.info.num_row_ = offset_vec.size() - 1;
} else {
if (offset_vec.empty()) {
offset_vec.emplace_back(0);
}
while (offset_vec.size() - 1 < adapter->NumRows()) {
offset_vec.emplace_back(offset_vec.back());
}
mat.info.num_row_ = adapter->NumRows();
}
mat.info.num_nonzero_ = data_vec.size();
omp_set_num_threads(nthread_original);
}
// Mutable and read-only accessors for the matrix's MetaInfo; declared here,
// defined elsewhere.
MetaInfo& Info() override;
const MetaInfo& Info() const override;