xgboost/src/data/data.cc
Jiaming Yuan 55ee272ea8
Extend array interface to handle ndarray. (#7434)
* Extend array interface to handle ndarray.

The `ArrayInterface` class is extended to support multi-dim array inputs. Previously this
class handles only 2-dim (vector is also matrix).  This PR specifies the expected
dimension at compile-time and the array interface can perform various checks automatically
for input data. Also, adapters like CSR are more rigorous about their input.  Lastly, row
vector and column vector are handled without intervention from the caller.
2021-11-16 09:52:15 +08:00

1099 lines
42 KiB
C++

/*!
* Copyright 2015-2021 by Contributors
* \file data.cc
*/
#include <dmlc/registry.h>
#include <cstring>
#include "dmlc/io.h"
#include "xgboost/data.h"
#include "xgboost/c_api.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/logging.h"
#include "xgboost/version_config.h"
#include "xgboost/learner.h"
#include "sparse_page_writer.h"
#include "simple_dmatrix.h"
#include "../common/io.h"
#include "../common/math.h"
#include "../common/version.h"
#include "../common/group_data.h"
#include "../common/threading_utils.h"
#include "../data/adapter.h"
#include "../data/iterative_device_dmatrix.h"
#include "file_iterator.h"
#include "validation.h"
#include "./sparse_page_source.h"
#include "./sparse_page_dmatrix.h"
namespace dmlc {
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::SparsePage>);
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::CSCPage>);
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::SortedCSCPage>);
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::EllpackPage>);
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::GHistIndexMatrix>);
} // namespace dmlc
namespace {
template <typename T>
void SaveScalarField(dmlc::Stream *strm, const std::string &name,
xgboost::DataType type, const T &field) {
strm->Write(name);
strm->Write(static_cast<uint8_t>(type));
strm->Write(true); // is_scalar=True
strm->Write(field);
}
template <typename T>
void SaveVectorField(dmlc::Stream *strm, const std::string &name,
xgboost::DataType type, std::pair<uint64_t, uint64_t> shape,
const std::vector<T>& field) {
strm->Write(name);
strm->Write(static_cast<uint8_t>(type));
strm->Write(false); // is_scalar=False
strm->Write(shape.first);
strm->Write(shape.second);
strm->Write(field);
}
template <typename T>
void SaveVectorField(dmlc::Stream* strm, const std::string& name,
xgboost::DataType type, std::pair<uint64_t, uint64_t> shape,
const xgboost::HostDeviceVector<T>& field) {
SaveVectorField(strm, name, type, shape, field.ConstHostVector());
}
template <typename T>
void LoadScalarField(dmlc::Stream* strm, const std::string& expected_name,
xgboost::DataType expected_type, T* field) {
const std::string invalid {"MetaInfo: Invalid format. "};
std::string name;
xgboost::DataType type;
bool is_scalar;
CHECK(strm->Read(&name)) << invalid;
CHECK_EQ(name, expected_name)
<< invalid << " Expected field: " << expected_name << ", got: " << name;
uint8_t type_val;
CHECK(strm->Read(&type_val)) << invalid;
type = static_cast<xgboost::DataType>(type_val);
CHECK(type == expected_type)
<< invalid << "Expected field of type: " << static_cast<int>(expected_type) << ", "
<< "got field type: " << static_cast<int>(type);
CHECK(strm->Read(&is_scalar)) << invalid;
CHECK(is_scalar)
<< invalid << "Expected field " << expected_name << " to be a scalar; got a vector";
CHECK(strm->Read(field)) << invalid;
}
template <typename T>
void LoadVectorField(dmlc::Stream* strm, const std::string& expected_name,
xgboost::DataType expected_type, std::vector<T>* field) {
const std::string invalid {"MetaInfo: Invalid format. "};
std::string name;
xgboost::DataType type;
bool is_scalar;
CHECK(strm->Read(&name)) << invalid;
CHECK_EQ(name, expected_name)
<< invalid << " Expected field: " << expected_name << ", got: " << name;
uint8_t type_val;
CHECK(strm->Read(&type_val)) << invalid;
type = static_cast<xgboost::DataType>(type_val);
CHECK(type == expected_type)
<< invalid << "Expected field of type: " << static_cast<int>(expected_type) << ", "
<< "got field type: " << static_cast<int>(type);
CHECK(strm->Read(&is_scalar)) << invalid;
CHECK(!is_scalar)
<< invalid << "Expected field " << expected_name << " to be a vector; got a scalar";
std::pair<uint64_t, uint64_t> shape;
CHECK(strm->Read(&shape.first));
CHECK(strm->Read(&shape.second));
// TODO(hcho3): this restriction may be lifted, once we add a field with more than 1 column.
CHECK_EQ(shape.second, 1) << invalid << "Number of columns is expected to be 1.";
CHECK(strm->Read(field)) << invalid;
}
template <typename T>
void LoadVectorField(dmlc::Stream* strm, const std::string& expected_name,
xgboost::DataType expected_type,
xgboost::HostDeviceVector<T>* field) {
LoadVectorField(strm, expected_name, expected_type, &field->HostVector());
}
} // anonymous namespace
namespace xgboost {
uint64_t constexpr MetaInfo::kNumField;
// implementation of inline functions
void MetaInfo::Clear() {
num_row_ = num_col_ = num_nonzero_ = 0;
labels_.HostVector().clear();
group_ptr_.clear();
weights_.HostVector().clear();
base_margin_.HostVector().clear();
}
/*
* Binary serialization format for MetaInfo:
*
* | name | type | is_scalar | num_row | num_col | value |
* |--------------------+----------+-----------+---------+---------+-------------------------|
* | num_row | kUInt64 | True | NA | NA | ${num_row_} |
* | num_col | kUInt64 | True | NA | NA | ${num_col_} |
* | num_nonzero | kUInt64 | True | NA | NA | ${num_nonzero_} |
* | labels | kFloat32 | False | ${size} | 1 | ${labels_} |
* | group_ptr | kUInt32 | False | ${size} | 1 | ${group_ptr_} |
* | weights | kFloat32 | False | ${size} | 1 | ${weights_} |
* | base_margin | kFloat32 | False | ${size} | 1 | ${base_margin_} |
* | labels_lower_bound | kFloat32 | False | ${size} | 1 | ${labels_lower_bound_} |
* | labels_upper_bound | kFloat32 | False | ${size} | 1 | ${labels_upper_bound_} |
* | feature_names | kStr | False | ${size} | 1 | ${feature_names} |
* | feature_types | kStr | False | ${size} | 1 | ${feature_types} |
*
* Note that the scalar fields (is_scalar=True) will have num_row and num_col missing.
* Also notice the difference between the saved name and the name used in `SetInfo':
* the former uses the plural form.
*/
void MetaInfo::SaveBinary(dmlc::Stream *fo) const {
Version::Save(fo);
fo->Write(kNumField);
int field_cnt = 0; // make sure we are actually writing kNumField fields
SaveScalarField(fo, u8"num_row", DataType::kUInt64, num_row_); ++field_cnt;
SaveScalarField(fo, u8"num_col", DataType::kUInt64, num_col_); ++field_cnt;
SaveScalarField(fo, u8"num_nonzero", DataType::kUInt64, num_nonzero_); ++field_cnt;
SaveVectorField(fo, u8"labels", DataType::kFloat32,
{labels_.Size(), 1}, labels_); ++field_cnt;
SaveVectorField(fo, u8"group_ptr", DataType::kUInt32,
{group_ptr_.size(), 1}, group_ptr_); ++field_cnt;
SaveVectorField(fo, u8"weights", DataType::kFloat32,
{weights_.Size(), 1}, weights_); ++field_cnt;
SaveVectorField(fo, u8"base_margin", DataType::kFloat32,
{base_margin_.Size(), 1}, base_margin_); ++field_cnt;
SaveVectorField(fo, u8"labels_lower_bound", DataType::kFloat32,
{labels_lower_bound_.Size(), 1}, labels_lower_bound_); ++field_cnt;
SaveVectorField(fo, u8"labels_upper_bound", DataType::kFloat32,
{labels_upper_bound_.Size(), 1}, labels_upper_bound_); ++field_cnt;
SaveVectorField(fo, u8"feature_names", DataType::kStr,
{feature_names.size(), 1}, feature_names); ++field_cnt;
SaveVectorField(fo, u8"feature_types", DataType::kStr,
{feature_type_names.size(), 1}, feature_type_names); ++field_cnt;
CHECK_EQ(field_cnt, kNumField) << "Wrong number of fields";
}
void LoadFeatureType(std::vector<std::string>const& type_names, std::vector<FeatureType>* types) {
types->clear();
for (auto const &elem : type_names) {
if (elem == "int") {
types->emplace_back(FeatureType::kNumerical);
} else if (elem == "float") {
types->emplace_back(FeatureType::kNumerical);
} else if (elem == "i") {
types->emplace_back(FeatureType::kNumerical);
} else if (elem == "q") {
types->emplace_back(FeatureType::kNumerical);
} else if (elem == "c") {
types->emplace_back(FeatureType::kCategorical);
} else {
LOG(FATAL) << "All feature_types must be one of {int, float, i, q, c}.";
}
}
}
void MetaInfo::LoadBinary(dmlc::Stream *fi) {
auto version = Version::Load(fi);
auto major = std::get<0>(version);
// MetaInfo is saved in `SparsePageSource'. So the version in MetaInfo represents the
// version of DMatrix.
CHECK_EQ(major, 1) << "Binary DMatrix generated by XGBoost: "
<< Version::String(version) << " is no longer supported. "
<< "Please process and save your data in current version: "
<< Version::String(Version::Self()) << " again.";
const uint64_t expected_num_field = kNumField;
uint64_t num_field { 0 };
CHECK(fi->Read(&num_field)) << "MetaInfo: invalid format";
size_t expected = 0;
if (major == 1 && std::get<1>(version) < 2) {
// feature names and types are added in 1.2
expected = expected_num_field - 2;
} else {
expected = expected_num_field;
}
CHECK_GE(num_field, expected)
<< "MetaInfo: insufficient number of fields (expected at least "
<< expected << " fields, but the binary file only contains " << num_field
<< "fields.)";
if (num_field > expected_num_field) {
LOG(WARNING) << "MetaInfo: the given binary file contains extra fields "
"which will be ignored.";
}
LoadScalarField(fi, u8"num_row", DataType::kUInt64, &num_row_);
LoadScalarField(fi, u8"num_col", DataType::kUInt64, &num_col_);
LoadScalarField(fi, u8"num_nonzero", DataType::kUInt64, &num_nonzero_);
LoadVectorField(fi, u8"labels", DataType::kFloat32, &labels_);
LoadVectorField(fi, u8"group_ptr", DataType::kUInt32, &group_ptr_);
LoadVectorField(fi, u8"weights", DataType::kFloat32, &weights_);
LoadVectorField(fi, u8"base_margin", DataType::kFloat32, &base_margin_);
LoadVectorField(fi, u8"labels_lower_bound", DataType::kFloat32, &labels_lower_bound_);
LoadVectorField(fi, u8"labels_upper_bound", DataType::kFloat32, &labels_upper_bound_);
LoadVectorField(fi, u8"feature_names", DataType::kStr, &feature_names);
LoadVectorField(fi, u8"feature_types", DataType::kStr, &feature_type_names);
LoadFeatureType(feature_type_names, &feature_types.HostVector());
}
template <typename T>
std::vector<T> Gather(const std::vector<T> &in, common::Span<int const> ridxs, size_t stride = 1) {
if (in.empty()) {
return {};
}
auto size = ridxs.size();
std::vector<T> out(size * stride);
for (auto i = 0ull; i < size; i++) {
auto ridx = ridxs[i];
for (size_t j = 0; j < stride; ++j) {
out[i * stride +j] = in[ridx * stride + j];
}
}
return out;
}
MetaInfo MetaInfo::Slice(common::Span<int32_t const> ridxs) const {
MetaInfo out;
out.num_row_ = ridxs.size();
out.num_col_ = this->num_col_;
// Groups is maintained by a higher level Python function. We should aim at deprecating
// the slice function.
out.labels_.HostVector() = Gather(this->labels_.HostVector(), ridxs);
out.labels_upper_bound_.HostVector() =
Gather(this->labels_upper_bound_.HostVector(), ridxs);
out.labels_lower_bound_.HostVector() =
Gather(this->labels_lower_bound_.HostVector(), ridxs);
// weights
if (this->weights_.Size() + 1 == this->group_ptr_.size()) {
auto& h_weights = out.weights_.HostVector();
// Assuming all groups are available.
out.weights_.HostVector() = h_weights;
} else {
out.weights_.HostVector() = Gather(this->weights_.HostVector(), ridxs);
}
if (this->base_margin_.Size() != this->num_row_) {
CHECK_EQ(this->base_margin_.Size() % this->num_row_, 0)
<< "Incorrect size of base margin vector.";
size_t stride = this->base_margin_.Size() / this->num_row_;
out.base_margin_.HostVector() = Gather(this->base_margin_.HostVector(), ridxs, stride);
} else {
out.base_margin_.HostVector() = Gather(this->base_margin_.HostVector(), ridxs);
}
out.feature_weights.Resize(this->feature_weights.Size());
out.feature_weights.Copy(this->feature_weights);
out.feature_names = this->feature_names;
out.feature_types.Resize(this->feature_types.Size());
out.feature_types.Copy(this->feature_types);
out.feature_type_names = this->feature_type_names;
return out;
}
// try to load group information from file, if exists
inline bool MetaTryLoadGroup(const std::string& fname,
std::vector<unsigned>* group) {
std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r", true));
if (fi == nullptr) return false;
dmlc::istream is(fi.get());
group->clear();
group->push_back(0);
unsigned nline = 0;
while (is >> nline) {
group->push_back(group->back() + nline);
}
return true;
}
// try to load weight information from file, if exists
inline bool MetaTryLoadFloatInfo(const std::string& fname,
std::vector<bst_float>* data) {
std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r", true));
if (fi == nullptr) return false;
dmlc::istream is(fi.get());
data->clear();
bst_float value;
while (is >> value) {
data->push_back(value);
}
return true;
}
// macro to dispatch according to specified pointer types
#define DISPATCH_CONST_PTR(dtype, old_ptr, cast_ptr, proc) \
switch (dtype) { \
case xgboost::DataType::kFloat32: { \
auto cast_ptr = reinterpret_cast<const float*>(old_ptr); proc; break; \
} \
case xgboost::DataType::kDouble: { \
auto cast_ptr = reinterpret_cast<const double*>(old_ptr); proc; break; \
} \
case xgboost::DataType::kUInt32: { \
auto cast_ptr = reinterpret_cast<const uint32_t*>(old_ptr); proc; break; \
} \
case xgboost::DataType::kUInt64: { \
auto cast_ptr = reinterpret_cast<const uint64_t*>(old_ptr); proc; break; \
} \
default: LOG(FATAL) << "Unknown data type" << static_cast<uint8_t>(dtype); \
} \
void MetaInfo::SetInfo(const char* key, const void* dptr, DataType dtype, size_t num) {
if (!std::strcmp(key, "label")) {
auto& labels = labels_.HostVector();
labels.resize(num);
DISPATCH_CONST_PTR(dtype, dptr, cast_dptr,
std::copy(cast_dptr, cast_dptr + num, labels.begin()));
auto valid = std::none_of(labels.cbegin(), labels.cend(), [](auto y) {
return std::isnan(y) || std::isinf(y);
});
CHECK(valid) << "Label contains NaN, infinity or a value too large.";
} else if (!std::strcmp(key, "weight")) {
auto& weights = weights_.HostVector();
weights.resize(num);
DISPATCH_CONST_PTR(dtype, dptr, cast_dptr,
std::copy(cast_dptr, cast_dptr + num, weights.begin()));
auto valid = std::none_of(weights.cbegin(), weights.cend(), [](float w) {
return w < 0 || std::isinf(w) || std::isnan(w);
});
CHECK(valid) << "Weights must be positive values.";
} else if (!std::strcmp(key, "base_margin")) {
auto& base_margin = base_margin_.HostVector();
base_margin.resize(num);
DISPATCH_CONST_PTR(dtype, dptr, cast_dptr,
std::copy(cast_dptr, cast_dptr + num, base_margin.begin()));
} else if (!std::strcmp(key, "group")) {
group_ptr_.clear(); group_ptr_.resize(num + 1, 0);
DISPATCH_CONST_PTR(dtype, dptr, cast_dptr,
std::copy(cast_dptr, cast_dptr + num, group_ptr_.begin() + 1));
group_ptr_[0] = 0;
for (size_t i = 1; i < group_ptr_.size(); ++i) {
group_ptr_[i] = group_ptr_[i - 1] + group_ptr_[i];
}
data::ValidateQueryGroup(group_ptr_);
} else if (!std::strcmp(key, "qid")) {
std::vector<uint32_t> query_ids(num, 0);
DISPATCH_CONST_PTR(dtype, dptr, cast_dptr,
std::copy(cast_dptr, cast_dptr + num, query_ids.begin()));
bool non_dec = true;
for (size_t i = 1; i < query_ids.size(); ++i) {
if (query_ids[i] < query_ids[i-1]) {
non_dec = false;
break;
}
}
CHECK(non_dec) << "`qid` must be sorted in non-decreasing order along with data.";
group_ptr_.clear(); group_ptr_.push_back(0);
for (size_t i = 1; i < query_ids.size(); ++i) {
if (query_ids[i] != query_ids[i-1]) {
group_ptr_.push_back(i);
}
}
if (group_ptr_.back() != query_ids.size()) {
group_ptr_.push_back(query_ids.size());
}
} else if (!std::strcmp(key, "label_lower_bound")) {
auto& labels = labels_lower_bound_.HostVector();
labels.resize(num);
DISPATCH_CONST_PTR(dtype, dptr, cast_dptr,
std::copy(cast_dptr, cast_dptr + num, labels.begin()));
} else if (!std::strcmp(key, "label_upper_bound")) {
auto& labels = labels_upper_bound_.HostVector();
labels.resize(num);
DISPATCH_CONST_PTR(dtype, dptr, cast_dptr,
std::copy(cast_dptr, cast_dptr + num, labels.begin()));
} else if (!std::strcmp(key, "feature_weights")) {
auto &h_feature_weights = feature_weights.HostVector();
h_feature_weights.resize(num);
DISPATCH_CONST_PTR(
dtype, dptr, cast_dptr,
std::copy(cast_dptr, cast_dptr + num, h_feature_weights.begin()));
bool valid =
std::none_of(h_feature_weights.cbegin(), h_feature_weights.cend(),
[](float w) { return w < 0; });
CHECK(valid) << "Feature weight must be greater than 0.";
} else {
LOG(FATAL) << "Unknown key for MetaInfo: " << key;
}
}
void MetaInfo::GetInfo(char const *key, bst_ulong *out_len, DataType dtype,
const void **out_dptr) const {
if (dtype == DataType::kFloat32) {
const std::vector<bst_float>* vec = nullptr;
if (!std::strcmp(key, "label")) {
vec = &this->labels_.HostVector();
} else if (!std::strcmp(key, "weight")) {
vec = &this->weights_.HostVector();
} else if (!std::strcmp(key, "base_margin")) {
vec = &this->base_margin_.HostVector();
} else if (!std::strcmp(key, "label_lower_bound")) {
vec = &this->labels_lower_bound_.HostVector();
} else if (!std::strcmp(key, "label_upper_bound")) {
vec = &this->labels_upper_bound_.HostVector();
} else if (!std::strcmp(key, "feature_weights")) {
vec = &this->feature_weights.HostVector();
} else {
LOG(FATAL) << "Unknown float field name: " << key;
}
*out_len = static_cast<xgboost::bst_ulong>(vec->size()); // NOLINT
*reinterpret_cast<float const**>(out_dptr) = dmlc::BeginPtr(*vec);
} else if (dtype == DataType::kUInt32) {
const std::vector<unsigned> *vec = nullptr;
if (!std::strcmp(key, "group_ptr")) {
vec = &this->group_ptr_;
} else {
LOG(FATAL) << "Unknown uint32 field name: " << key;
}
*out_len = static_cast<xgboost::bst_ulong>(vec->size());
*reinterpret_cast<unsigned const**>(out_dptr) = dmlc::BeginPtr(*vec);
} else {
LOG(FATAL) << "Unknown data type for getting meta info.";
}
}
void MetaInfo::SetFeatureInfo(const char* key, const char **info, const bst_ulong size) {
if (size != 0) {
CHECK_EQ(size, this->num_col_)
<< "Length of " << key << " must be equal to number of columns.";
}
if (!std::strcmp(key, "feature_type")) {
feature_type_names.clear();
auto& h_feature_types = feature_types.HostVector();
for (size_t i = 0; i < size; ++i) {
auto elem = info[i];
feature_type_names.emplace_back(elem);
}
LoadFeatureType(feature_type_names, &h_feature_types);
} else if (!std::strcmp(key, "feature_name")) {
feature_names.clear();
for (size_t i = 0; i < size; ++i) {
feature_names.emplace_back(info[i]);
}
} else {
LOG(FATAL) << "Unknown feature info name: " << key;
}
}
void MetaInfo::GetFeatureInfo(const char *field,
std::vector<std::string> *out_str_vecs) const {
auto &str_vecs = *out_str_vecs;
if (!std::strcmp(field, "feature_type")) {
str_vecs.resize(feature_type_names.size());
std::copy(feature_type_names.cbegin(), feature_type_names.cend(), str_vecs.begin());
} else if (!strcmp(field, "feature_name")) {
str_vecs.resize(feature_names.size());
std::copy(feature_names.begin(), feature_names.end(), str_vecs.begin());
} else {
LOG(FATAL) << "Unknown feature info: " << field;
}
}
void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_column) {
if (accumulate_rows) {
this->num_row_ += that.num_row_;
}
if (this->num_col_ != 0) {
if (check_column) {
CHECK_EQ(this->num_col_, that.num_col_)
<< "Number of columns must be consistent across batches.";
} else {
this->num_col_ = std::max(this->num_col_, that.num_col_);
}
}
this->num_col_ = that.num_col_;
this->labels_.SetDevice(that.labels_.DeviceIdx());
this->labels_.Extend(that.labels_);
this->weights_.SetDevice(that.weights_.DeviceIdx());
this->weights_.Extend(that.weights_);
this->labels_lower_bound_.SetDevice(that.labels_lower_bound_.DeviceIdx());
this->labels_lower_bound_.Extend(that.labels_lower_bound_);
this->labels_upper_bound_.SetDevice(that.labels_upper_bound_.DeviceIdx());
this->labels_upper_bound_.Extend(that.labels_upper_bound_);
this->base_margin_.SetDevice(that.base_margin_.DeviceIdx());
this->base_margin_.Extend(that.base_margin_);
if (this->group_ptr_.size() == 0) {
this->group_ptr_ = that.group_ptr_;
} else {
CHECK_NE(that.group_ptr_.size(), 0);
auto group_ptr = that.group_ptr_;
for (size_t i = 1; i < group_ptr.size(); ++i) {
group_ptr[i] += this->group_ptr_.back();
}
this->group_ptr_.insert(this->group_ptr_.end(), group_ptr.begin() + 1,
group_ptr.end());
}
if (!that.feature_names.empty()) {
this->feature_names = that.feature_names;
}
if (!that.feature_type_names.empty()) {
this->feature_type_names = that.feature_type_names;
auto &h_feature_types = feature_types.HostVector();
LoadFeatureType(this->feature_type_names, &h_feature_types);
}
if (!that.feature_weights.Empty()) {
this->feature_weights.Resize(that.feature_weights.Size());
this->feature_weights.SetDevice(that.feature_weights.DeviceIdx());
this->feature_weights.Copy(that.feature_weights);
}
}
void MetaInfo::Validate(int32_t device) const {
if (group_ptr_.size() != 0 && weights_.Size() != 0) {
CHECK_EQ(group_ptr_.size(), weights_.Size() + 1)
<< "Size of weights must equal to number of groups when ranking "
"group is used.";
return;
}
if (group_ptr_.size() != 0) {
CHECK_EQ(group_ptr_.back(), num_row_)
<< "Invalid group structure. Number of rows obtained from groups "
"doesn't equal to actual number of rows given by data.";
}
auto check_device = [device](HostDeviceVector<float> const &v) {
CHECK(v.DeviceIdx() == GenericParameter::kCpuId ||
device == GenericParameter::kCpuId ||
v.DeviceIdx() == device)
<< "Data is resided on a different device than `gpu_id`. "
<< "Device that data is on: " << v.DeviceIdx() << ", "
<< "`gpu_id` for XGBoost: " << device;
};
if (weights_.Size() != 0) {
CHECK_EQ(weights_.Size(), num_row_)
<< "Size of weights must equal to number of rows.";
check_device(weights_);
return;
}
if (labels_.Size() != 0) {
CHECK_EQ(labels_.Size(), num_row_)
<< "Size of labels must equal to number of rows.";
check_device(labels_);
return;
}
if (labels_lower_bound_.Size() != 0) {
CHECK_EQ(labels_lower_bound_.Size(), num_row_)
<< "Size of label_lower_bound must equal to number of rows.";
check_device(labels_lower_bound_);
return;
}
if (feature_weights.Size() != 0) {
CHECK_EQ(feature_weights.Size(), num_col_)
<< "Size of feature_weights must equal to number of columns.";
check_device(feature_weights);
}
if (labels_upper_bound_.Size() != 0) {
CHECK_EQ(labels_upper_bound_.Size(), num_row_)
<< "Size of label_upper_bound must equal to number of rows.";
check_device(labels_upper_bound_);
return;
}
CHECK_LE(num_nonzero_, num_col_ * num_row_);
if (base_margin_.Size() != 0) {
CHECK_EQ(base_margin_.Size() % num_row_, 0)
<< "Size of base margin must be a multiple of number of rows.";
check_device(base_margin_);
}
}
#if !defined(XGBOOST_USE_CUDA)
void MetaInfo::SetInfo(StringView key, std::string const& interface_str) {
common::AssertGPUSupport();
}
#endif // !defined(XGBOOST_USE_CUDA)
using DMatrixThreadLocal =
dmlc::ThreadLocalStore<std::map<DMatrix const *, XGBAPIThreadLocalEntry>>;
XGBAPIThreadLocalEntry& DMatrix::GetThreadLocal() const {
return (*DMatrixThreadLocal::Get())[this];
}
DMatrix::~DMatrix() {
auto local_map = DMatrixThreadLocal::Get();
if (local_map->find(this) != local_map->cend()) {
local_map->erase(this);
}
}
DMatrix *TryLoadBinary(std::string fname, bool silent) {
int magic;
std::unique_ptr<dmlc::Stream> fi(
dmlc::Stream::Create(fname.c_str(), "r", true));
if (fi != nullptr) {
common::PeekableInStream is(fi.get());
if (is.PeekRead(&magic, sizeof(magic)) == sizeof(magic)) {
if (!DMLC_IO_NO_ENDIAN_SWAP) {
dmlc::ByteSwap(&magic, sizeof(magic), 1);
}
if (magic == data::SimpleDMatrix::kMagic) {
DMatrix *dmat = new data::SimpleDMatrix(&is);
if (!silent) {
LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_
<< " matrix with " << dmat->Info().num_nonzero_
<< " entries loaded from " << fname;
}
return dmat;
}
}
}
return nullptr;
}
DMatrix* DMatrix::Load(const std::string& uri,
bool silent,
bool load_row_split,
const std::string& file_format) {
std::string fname, cache_file;
size_t dlm_pos = uri.find('#');
if (dlm_pos != std::string::npos) {
cache_file = uri.substr(dlm_pos + 1, uri.length());
fname = uri.substr(0, dlm_pos);
CHECK_EQ(cache_file.find('#'), std::string::npos)
<< "Only one `#` is allowed in file path for cache file specification.";
if (load_row_split) {
std::ostringstream os;
std::vector<std::string> cache_shards = common::Split(cache_file, ':');
for (size_t i = 0; i < cache_shards.size(); ++i) {
size_t pos = cache_shards[i].rfind('.');
if (pos == std::string::npos) {
os << cache_shards[i]
<< ".r" << rabit::GetRank()
<< "-" << rabit::GetWorldSize();
} else {
os << cache_shards[i].substr(0, pos)
<< ".r" << rabit::GetRank()
<< "-" << rabit::GetWorldSize()
<< cache_shards[i].substr(pos, cache_shards[i].length());
}
if (i + 1 != cache_shards.size()) {
os << ':';
}
}
cache_file = os.str();
}
} else {
fname = uri;
}
int partid = 0, npart = 1;
if (load_row_split) {
partid = rabit::GetRank();
npart = rabit::GetWorldSize();
} else {
// test option to load in part
npart = dmlc::GetEnv("XGBOOST_TEST_NPART", 1);
}
if (npart != 1) {
LOG(CONSOLE) << "Load part of data " << partid
<< " of " << npart << " parts";
}
// legacy handling of binary data loading
if (file_format == "auto" && npart == 1) {
DMatrix *loaded = TryLoadBinary(fname, silent);
if (loaded) {
return loaded;
}
}
DMatrix* dmat {nullptr};
try {
if (cache_file.empty()) {
std::unique_ptr<dmlc::Parser<uint32_t>> parser(
dmlc::Parser<uint32_t>::Create(fname.c_str(), partid, npart,
file_format.c_str()));
data::FileAdapter adapter(parser.get());
dmat = DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(),
1, cache_file);
} else {
data::FileIterator iter{fname, uint32_t(partid), uint32_t(npart),
file_format};
dmat = new data::SparsePageDMatrix{
&iter,
iter.Proxy(),
data::fileiter::Reset,
data::fileiter::Next,
std::numeric_limits<float>::quiet_NaN(),
1,
cache_file};
}
} catch (dmlc::Error &e) {
std::vector<std::string> splited = common::Split(fname, '#');
std::vector<std::string> args = common::Split(splited.front(), '?');
std::string format {file_format};
if (args.size() == 1 && file_format == "auto") {
auto extension = common::Split(args.front(), '.').back();
if (extension == "csv" || extension == "libsvm") {
format = extension;
}
if (format == extension) {
LOG(WARNING)
<< "No format parameter is provided in input uri, but found file extension: "
<< format << " . "
<< "Consider providing a uri parameter: filename?format=" << format;
} else {
LOG(WARNING)
<< "No format parameter is provided in input uri. "
<< "Choosing default parser in dmlc-core. "
<< "Consider providing a uri parameter like: filename?format=csv";
}
}
LOG(FATAL) << "Encountered parser error:\n" << e.what();
}
/* sync up number of features after matrix loaded.
* partitioned data will fail the train/val validation check
* since partitioned data not knowing the real number of features. */
rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1);
// backward compatiblity code.
if (!load_row_split) {
MetaInfo& info = dmat->Info();
if (MetaTryLoadGroup(fname + ".group", &info.group_ptr_) && !silent) {
LOG(CONSOLE) << info.group_ptr_.size() - 1
<< " groups are loaded from " << fname << ".group";
}
if (MetaTryLoadFloatInfo
(fname + ".base_margin", &info.base_margin_.HostVector()) && !silent) {
LOG(CONSOLE) << info.base_margin_.Size()
<< " base_margin are loaded from " << fname << ".base_margin";
}
if (MetaTryLoadFloatInfo
(fname + ".weight", &info.weights_.HostVector()) && !silent) {
LOG(CONSOLE) << info.weights_.Size()
<< " weights are loaded from " << fname << ".weight";
}
}
return dmat;
}
template <typename DataIterHandle, typename DMatrixHandle,
typename DataIterResetCallback, typename XGDMatrixCallbackNext>
DMatrix *DMatrix::Create(DataIterHandle iter, DMatrixHandle proxy,
DataIterResetCallback *reset,
XGDMatrixCallbackNext *next, float missing,
int nthread,
int max_bin) {
return new data::IterativeDeviceDMatrix(iter, proxy, reset, next, missing,
nthread, max_bin);
}
template <typename DataIterHandle, typename DMatrixHandle,
typename DataIterResetCallback, typename XGDMatrixCallbackNext>
DMatrix *DMatrix::Create(DataIterHandle iter, DMatrixHandle proxy,
DataIterResetCallback *reset,
XGDMatrixCallbackNext *next, float missing,
int32_t n_threads,
std::string cache) {
return new data::SparsePageDMatrix(iter, proxy, reset, next, missing, n_threads,
cache);
}
template DMatrix *DMatrix::Create<DataIterHandle, DMatrixHandle,
DataIterResetCallback, XGDMatrixCallbackNext>(
DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
XGDMatrixCallbackNext *next, float missing, int nthread,
int max_bin);
template DMatrix *DMatrix::Create<DataIterHandle, DMatrixHandle,
DataIterResetCallback, XGDMatrixCallbackNext>(
DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
XGDMatrixCallbackNext *next, float missing, int32_t n_threads, std::string);
template <typename AdapterT>
DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread,
const std::string& cache_prefix) {
return new data::SimpleDMatrix(adapter, missing, nthread);
}
template DMatrix* DMatrix::Create<data::DenseAdapter>(
data::DenseAdapter* adapter, float missing, int nthread,
const std::string& cache_prefix);
template DMatrix* DMatrix::Create<data::ArrayAdapter>(
data::ArrayAdapter* adapter, float missing, int nthread,
const std::string& cache_prefix);
template DMatrix* DMatrix::Create<data::CSRAdapter>(
data::CSRAdapter* adapter, float missing, int nthread,
const std::string& cache_prefix);
template DMatrix* DMatrix::Create<data::CSCAdapter>(
data::CSCAdapter* adapter, float missing, int nthread,
const std::string& cache_prefix);
template DMatrix* DMatrix::Create<data::DataTableAdapter>(
data::DataTableAdapter* adapter, float missing, int nthread,
const std::string& cache_prefix);
template DMatrix* DMatrix::Create<data::FileAdapter>(
data::FileAdapter* adapter, float missing, int nthread,
const std::string& cache_prefix);
template DMatrix* DMatrix::Create<data::CSRArrayAdapter>(
data::CSRArrayAdapter* adapter, float missing, int nthread,
const std::string& cache_prefix);
template DMatrix *
DMatrix::Create(data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext,
XGBoostBatchCSR> *adapter,
float missing, int nthread, const std::string &cache_prefix);
SparsePage SparsePage::GetTranspose(int num_columns) const {
SparsePage transpose;
common::ParallelGroupBuilder<Entry, bst_row_t> builder(&transpose.offset.HostVector(),
&transpose.data.HostVector());
const int nthread = omp_get_max_threads();
builder.InitBudget(num_columns, nthread);
long batch_size = static_cast<long>(this->Size()); // NOLINT(*)
auto page = this->GetView();
common::ParallelFor(batch_size, [&](long i) { // NOLINT(*)
int tid = omp_get_thread_num();
auto inst = page[i];
for (const auto& entry : inst) {
builder.AddBudget(entry.index, tid);
}
});
builder.InitStorage();
common::ParallelFor(batch_size, [&](long i) { // NOLINT(*)
int tid = omp_get_thread_num();
auto inst = page[i];
for (const auto& entry : inst) {
builder.Push(
entry.index,
Entry(static_cast<bst_uint>(this->base_rowid + i), entry.fvalue),
tid);
}
});
if (this->data.Empty()) {
transpose.offset.Resize(num_columns + 1);
transpose.offset.Fill(0);
}
CHECK_EQ(transpose.offset.Size(), num_columns + 1);
return transpose;
}
void SparsePage::Push(const SparsePage &batch) {
auto& data_vec = data.HostVector();
auto& offset_vec = offset.HostVector();
const auto& batch_offset_vec = batch.offset.HostVector();
const auto& batch_data_vec = batch.data.HostVector();
size_t top = offset_vec.back();
data_vec.resize(top + batch.data.Size());
if (dmlc::BeginPtr(data_vec) && dmlc::BeginPtr(batch_data_vec)) {
std::memcpy(dmlc::BeginPtr(data_vec) + top, dmlc::BeginPtr(batch_data_vec),
sizeof(Entry) * batch.data.Size());
}
size_t begin = offset.Size();
offset_vec.resize(begin + batch.Size());
for (size_t i = 0; i < batch.Size(); ++i) {
offset_vec[i + begin] = top + batch_offset_vec[i + 1];
}
}
template <typename AdapterBatchT>
uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread) {
constexpr bool kIsRowMajor = AdapterBatchT::kIsRowMajor;
// Allow threading only for row-major case as column-major requires O(nthread*batch_size) memory
nthread = kIsRowMajor ? nthread : 1;
// Set number of threads but keep old value so we can reset it after
int nthread_original = common::OmpSetNumThreadsWithoutHT(&nthread);
if (!kIsRowMajor) {
CHECK_EQ(nthread, 1);
}
auto& offset_vec = offset.HostVector();
auto& data_vec = data.HostVector();
size_t builder_base_row_offset = this->Size();
common::ParallelGroupBuilder<
Entry, std::remove_reference<decltype(offset_vec)>::type::value_type, kIsRowMajor>
builder(&offset_vec, &data_vec, builder_base_row_offset);
// Estimate expected number of rows by using last element in batch
// This is not required to be exact but prevents unnecessary resizing
size_t expected_rows = 0;
if (batch.Size() > 0) {
auto last_line = batch.GetLine(batch.Size() - 1);
if (last_line.Size() > 0) {
expected_rows =
last_line.GetElement(last_line.Size() - 1).row_idx - base_rowid;
}
}
size_t batch_size = batch.Size();
expected_rows = kIsRowMajor ? batch_size : expected_rows;
uint64_t max_columns = 0;
if (batch_size == 0) {
omp_set_num_threads(nthread_original);
return max_columns;
}
const size_t thread_size = batch_size / nthread;
builder.InitBudget(expected_rows, nthread);
std::vector<std::vector<uint64_t>> max_columns_vector(nthread, std::vector<uint64_t>{0});
dmlc::OMPException exec;
std::atomic<bool> valid{true};
// First-pass over the batch counting valid elements
#pragma omp parallel num_threads(nthread)
{
exec.Run([&]() {
int tid = omp_get_thread_num();
size_t begin = tid*thread_size;
size_t end = tid != (nthread-1) ? (tid+1)*thread_size : batch_size;
uint64_t& max_columns_local = max_columns_vector[tid][0];
for (size_t i = begin; i < end; ++i) {
auto line = batch.GetLine(i);
for (auto j = 0ull; j < line.Size(); j++) {
data::COOTuple const& element = line.GetElement(j);
if (!std::isinf(missing) && std::isinf(element.value)) {
valid = false;
}
const size_t key = element.row_idx - base_rowid;
CHECK_GE(key, builder_base_row_offset);
max_columns_local =
std::max(max_columns_local, static_cast<uint64_t>(element.column_idx + 1));
if (!common::CheckNAN(element.value) && element.value != missing) {
// Adapter row index is absolute, here we want it relative to
// current page
builder.AddBudget(key, tid);
}
}
}
});
}
exec.Rethrow();
CHECK(valid) << "Input data contains `inf` or `nan`";
for (const auto & max : max_columns_vector) {
max_columns = std::max(max_columns, max[0]);
}
builder.InitStorage();
// Second pass over batch, placing elements in correct position
auto is_valid = data::IsValidFunctor{missing};
#pragma omp parallel num_threads(nthread)
{
exec.Run([&]() {
int tid = omp_get_thread_num();
size_t begin = tid * thread_size;
size_t end = tid != (nthread - 1) ? (tid + 1) * thread_size : batch_size;
for (size_t i = begin; i < end; ++i) {
auto line = batch.GetLine(i);
for (auto j = 0ull; j < line.Size(); j++) {
auto element = line.GetElement(j);
const size_t key = (element.row_idx - base_rowid);
if (is_valid(element)) {
builder.Push(key, Entry(element.column_idx, element.value), tid);
}
}
}
});
}
exec.Rethrow();
omp_set_num_threads(nthread_original);
return max_columns;
}
void SparsePage::PushCSC(const SparsePage &batch) {
std::vector<xgboost::Entry>& self_data = data.HostVector();
std::vector<bst_row_t>& self_offset = offset.HostVector();
auto const& other_data = batch.data.ConstHostVector();
auto const& other_offset = batch.offset.ConstHostVector();
if (other_data.empty()) {
self_offset = other_offset;
return;
}
if (!self_data.empty()) {
CHECK_EQ(self_offset.size(), other_offset.size())
<< "self_data.size(): " << this->data.Size() << ", "
<< "other_data.size(): " << other_data.size() << std::flush;
} else {
self_data = other_data;
self_offset = other_offset;
return;
}
std::vector<bst_row_t> offset(other_offset.size());
offset[0] = 0;
std::vector<xgboost::Entry> data(self_data.size() + other_data.size());
// n_cols in original csr data matrix, here in csc is n_rows
size_t const n_features = other_offset.size() - 1;
size_t beg = 0;
size_t ptr = 1;
for (size_t i = 0; i < n_features; ++i) {
size_t const self_beg = self_offset.at(i);
size_t const self_length = self_offset.at(i+1) - self_beg;
// It is possible that the current feature and further features aren't referenced
// in any rows accumulated thus far. It is also possible for this to happen
// in the current sparse page row batch as well.
// Hence, the incremental number of rows may stay constant thus equaling the data size
CHECK_LE(beg, data.size());
std::memcpy(dmlc::BeginPtr(data)+beg,
dmlc::BeginPtr(self_data) + self_beg,
sizeof(Entry) * self_length);
beg += self_length;
size_t const other_beg = other_offset.at(i);
size_t const other_length = other_offset.at(i+1) - other_beg;
CHECK_LE(beg, data.size());
std::memcpy(dmlc::BeginPtr(data)+beg,
dmlc::BeginPtr(other_data) + other_beg,
sizeof(Entry) * other_length);
beg += other_length;
CHECK_LT(ptr, offset.size());
offset.at(ptr) = beg;
ptr++;
}
self_data = std::move(data);
self_offset = std::move(offset);
}
template uint64_t
SparsePage::Push(const data::DenseAdapterBatch& batch, float missing, int nthread);
template uint64_t
SparsePage::Push(const data::ArrayAdapterBatch& batch, float missing, int nthread);
template uint64_t
SparsePage::Push(const data::CSRAdapterBatch& batch, float missing, int nthread);
template uint64_t
SparsePage::Push(const data::CSRArrayAdapterBatch& batch, float missing, int nthread);
template uint64_t
SparsePage::Push(const data::CSCAdapterBatch& batch, float missing, int nthread);
template uint64_t
SparsePage::Push(const data::DataTableAdapterBatch& batch, float missing, int nthread);
template uint64_t
SparsePage::Push(const data::FileAdapterBatch& batch, float missing, int nthread);
namespace data {
// List of files that will be force linked in static links.
DMLC_REGISTRY_LINK_TAG(sparse_page_raw_format);
DMLC_REGISTRY_LINK_TAG(gradient_index_format);
} // namespace data
} // namespace xgboost