/**
 * Copyright 2015-2023 by XGBoost Contributors
 * \file data.cc
 */
#include "xgboost/data.h"

#include <algorithm>
#include <atomic>
#include <cstring>
#include <limits>
#include <map>
#include <numeric>

#include "../collective/communicator-inl.h"
#include "../collective/communicator.h"
#include "../common/algorithm.h"  // for StableSort
#include "../common/api_entry.h"  // for XGBAPIThreadLocalEntry
#include "../common/common.h"
#include "../common/error_msg.h"  // for InfInData, GroupWeight, GroupSize
#include "../common/group_data.h"
#include "../common/io.h"
#include "../common/linalg_op.h"
#include "../common/math.h"
#include "../common/numeric.h"  // for Iota
#include "../common/threading_utils.h"
#include "../common/version.h"
#include "../data/adapter.h"
#include "../data/iterative_dmatrix.h"
#include "./sparse_page_dmatrix.h"
#include "./sparse_page_source.h"
#include "dmlc/io.h"
#include "file_iterator.h"
#include "simple_dmatrix.h"
#include "sparse_page_writer.h"
#include "validation.h"
#include "xgboost/c_api.h"
#include "xgboost/context.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/learner.h"
#include "xgboost/linalg.h"  // Vector
#include "xgboost/logging.h"
#include "xgboost/string_view.h"
#include "xgboost/version_config.h"

namespace dmlc {
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::SparsePage>);
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::CSCPage>);
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::SortedCSCPage>);
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::EllpackPage>);
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::GHistIndexMatrix>);
}  // namespace dmlc

namespace {

template <typename T>
void SaveScalarField(dmlc::Stream *strm, const std::string &name, xgboost::DataType type,
                     const T &field) {
  strm->Write(name);
  strm->Write(static_cast<uint8_t>(type));
  strm->Write(true);  // is_scalar=True
  strm->Write(field);
}

template <typename T>
void SaveVectorField(dmlc::Stream *strm, const std::string &name, xgboost::DataType type,
                     std::pair<uint64_t, uint64_t> shape, const std::vector<T> &field) {
  strm->Write(name);
  strm->Write(static_cast<uint8_t>(type));
  strm->Write(false);  // is_scalar=False
  strm->Write(shape.first);
  strm->Write(shape.second);
  strm->Write(field);
}

template <typename T>
void SaveVectorField(dmlc::Stream* strm, const std::string& name, xgboost::DataType type,
                     std::pair<uint64_t, uint64_t> shape,
                     const xgboost::HostDeviceVector<T>& field) {
  SaveVectorField(strm, name, type, shape, field.ConstHostVector());
}

template <typename T, int32_t D>
void SaveTensorField(dmlc::Stream* strm, const std::string& name, xgboost::DataType type,
                     const xgboost::linalg::Tensor<T, D>& field) {
  strm->Write(name);
  strm->Write(static_cast<uint8_t>(type));
  strm->Write(false);  // is_scalar=False
  for (size_t i = 0; i < D; ++i) {
    strm->Write(field.Shape(i));
  }
  strm->Write(field.Data()->HostVector());
}

template <typename T>
void LoadScalarField(dmlc::Stream* strm, const std::string& expected_name,
                     xgboost::DataType expected_type, T* field) {
  const std::string invalid{"MetaInfo: Invalid format for " + expected_name};
  std::string name;
  xgboost::DataType type;
  bool is_scalar;
  CHECK(strm->Read(&name)) << invalid;
  CHECK_EQ(name, expected_name)
      << invalid << " Expected field: " << expected_name << ", got: " << name;
  uint8_t type_val;
  CHECK(strm->Read(&type_val)) << invalid;
  type = static_cast<xgboost::DataType>(type_val);
  CHECK(type == expected_type)
      << invalid << "Expected field of type: " << static_cast<int>(expected_type) << ", "
      << "got field type: " << static_cast<int>(type);
  CHECK(strm->Read(&is_scalar)) << invalid;
  CHECK(is_scalar) << invalid << "Expected field " << expected_name
                   << " to be a scalar; got a vector";
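  // Header (name, dtype, is_scalar) has been validated above; read the scalar payload itself.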
  CHECK(strm->Read(field)) << invalid;
}

template <typename T>
void LoadVectorField(dmlc::Stream* strm, const std::string& expected_name,
                     xgboost::DataType expected_type, std::vector<T>* field) {
  const std::string invalid{"MetaInfo: Invalid format for " + expected_name};
  std::string name;
  xgboost::DataType type;
  bool is_scalar;
  CHECK(strm->Read(&name)) << invalid;
  CHECK_EQ(name, expected_name)
      << invalid << " Expected field: " << expected_name << ", got: " << name;
  uint8_t type_val;
  CHECK(strm->Read(&type_val)) << invalid;
  type = static_cast<xgboost::DataType>(type_val);
  CHECK(type == expected_type)
      << invalid << "Expected field of type: " << static_cast<int>(expected_type) << ", "
      << "got field type: " << static_cast<int>(type);
  CHECK(strm->Read(&is_scalar)) << invalid;
  CHECK(!is_scalar) << invalid << "Expected field " << expected_name
                    << " to be a vector; got a scalar";
  std::pair<uint64_t, uint64_t> shape;
  CHECK(strm->Read(&shape.first));
  CHECK(strm->Read(&shape.second));
  // TODO(hcho3): this restriction may be lifted, once we add a field with more than 1 column.
  CHECK_EQ(shape.second, 1) << invalid << "Number of columns is expected to be 1.";
  CHECK(strm->Read(field)) << invalid;
}

template <typename T>
void LoadVectorField(dmlc::Stream* strm, const std::string& expected_name,
                     xgboost::DataType expected_type, xgboost::HostDeviceVector<T>* field) {
  LoadVectorField(strm, expected_name, expected_type, &field->HostVector());
}

template <typename T, int32_t D>
void LoadTensorField(dmlc::Stream* strm, std::string const& expected_name,
                     xgboost::DataType expected_type, xgboost::linalg::Tensor<T, D>* p_out) {
  const std::string invalid{"MetaInfo: Invalid format for " + expected_name};
  std::string name;
  xgboost::DataType type;
  bool is_scalar;
  CHECK(strm->Read(&name)) << invalid;
  CHECK_EQ(name, expected_name)
      << invalid << " Expected field: " << expected_name << ", got: " << name;
  uint8_t type_val;
  CHECK(strm->Read(&type_val)) << invalid;
  type = static_cast<xgboost::DataType>(type_val);
  CHECK(type == expected_type)
      << invalid << "Expected field of type: " << static_cast<int>(expected_type) << ", "
      << "got field type: " << static_cast<int>(type);
  CHECK(strm->Read(&is_scalar)) << invalid;
  CHECK(!is_scalar) << invalid << "Expected field " << expected_name
                    << " to be a tensor; got a scalar";
  size_t shape[D];
  for (size_t i = 0; i < D; ++i) {
    CHECK(strm->Read(&(shape[i])));
  }
  p_out->Reshape(shape);
  auto& field = p_out->Data()->HostVector();
  CHECK(strm->Read(&field)) << invalid;
}
}  // anonymous namespace

namespace xgboost {

uint64_t constexpr MetaInfo::kNumField;

// implementation of inline functions
void MetaInfo::Clear() {
  num_row_ = num_col_ = num_nonzero_ = 0;
  labels = decltype(labels){};
  group_ptr_.clear();
  weights_.HostVector().clear();
  base_margin_ = decltype(base_margin_){};
}

/*
 * Binary serialization format for MetaInfo:
 *
 * | name               | type     | is_scalar | num_row     | num_col     | value                  |
 * |--------------------+----------+-----------+-------------+-------------+------------------------|
 * | num_row            | kUInt64  | True      | NA          | NA          | ${num_row_}            |
 * | num_col            | kUInt64  | True      | NA          | NA          | ${num_col_}            |
 * | num_nonzero        | kUInt64  | True      | NA          | NA          | ${num_nonzero_}        |
 * | labels             | kFloat32 | False     | ${size}     | 1           | ${labels_}             |
 * | group_ptr          | kUInt32  | False     | ${size}     | 1           | ${group_ptr_}          |
 * | weights            | kFloat32 | False     | ${size}     | 1           | ${weights_}            |
 * | base_margin        | kFloat32 | False     | ${Shape(0)} | ${Shape(1)} | ${base_margin_}        |
 * | labels_lower_bound | kFloat32 | False     | ${size}     | 1           | ${labels_lower_bound_} |
 * | labels_upper_bound | kFloat32 | False     | ${size}     | 1           | ${labels_upper_bound_} |
 * | feature_names      | kStr     | False     | ${size}     | 1           | ${feature_names}       |
 * | feature_types      | kStr     | False     | ${size}     | 1           | ${feature_types}       |
 * | feature_weights    | kFloat32 | False     | ${size}     | 1           | ${feature_weights}     |
 *
 * Note that the scalar fields (is_scalar=True) will have num_row and num_col missing.
 * Also notice the difference between the saved name and the name used in `SetInfo':
 * the former uses the plural form.
 */
void MetaInfo::SaveBinary(dmlc::Stream *fo) const {
  Version::Save(fo);
  fo->Write(kNumField);
  int field_cnt = 0;  // make sure we are actually writing kNumField fields

  SaveScalarField(fo, u8"num_row", DataType::kUInt64, num_row_); ++field_cnt;
  SaveScalarField(fo, u8"num_col", DataType::kUInt64, num_col_); ++field_cnt;
  SaveScalarField(fo, u8"num_nonzero", DataType::kUInt64, num_nonzero_); ++field_cnt;
  SaveTensorField(fo, u8"labels", DataType::kFloat32, labels); ++field_cnt;
  SaveVectorField(fo, u8"group_ptr", DataType::kUInt32, {group_ptr_.size(), 1}, group_ptr_);
  ++field_cnt;
  SaveVectorField(fo, u8"weights", DataType::kFloat32, {weights_.Size(), 1}, weights_);
  ++field_cnt;
  SaveTensorField(fo, u8"base_margin", DataType::kFloat32, base_margin_); ++field_cnt;
  SaveVectorField(fo, u8"labels_lower_bound", DataType::kFloat32, {labels_lower_bound_.Size(), 1},
                  labels_lower_bound_);
  ++field_cnt;
  SaveVectorField(fo, u8"labels_upper_bound", DataType::kFloat32, {labels_upper_bound_.Size(), 1},
                  labels_upper_bound_);
  ++field_cnt;
  SaveVectorField(fo, u8"feature_names", DataType::kStr, {feature_names.size(), 1}, feature_names);
  ++field_cnt;
  SaveVectorField(fo, u8"feature_types", DataType::kStr, {feature_type_names.size(), 1},
                  feature_type_names);
  ++field_cnt;
  SaveVectorField(fo, u8"feature_weights", DataType::kFloat32, {feature_weights.Size(), 1},
                  feature_weights);
  ++field_cnt;

  CHECK_EQ(field_cnt, kNumField) << "Wrong number of fields";
}

void LoadFeatureType(std::vector<std::string> const& type_names, std::vector<FeatureType>* types) {
  types->clear();
  for (auto const &elem : type_names) {
    if (elem == "int") {
      types->emplace_back(FeatureType::kNumerical);
    } else if (elem == "float") {
      types->emplace_back(FeatureType::kNumerical);
    } else if (elem == "i") {
      types->emplace_back(FeatureType::kNumerical);
    } else if (elem == "q") {
      types->emplace_back(FeatureType::kNumerical);
    } else if (elem == "c") {
      types->emplace_back(FeatureType::kCategorical);
    } else {
      LOG(FATAL) << "All feature_types must be one of {int, float, i, q, c}.";
    }
  }
}

const std::vector<size_t>& MetaInfo::LabelAbsSort(Context const* ctx) const {
  if (label_order_cache_.size() == labels.Size()) {
    return label_order_cache_;
  }
  label_order_cache_.resize(labels.Size());
  common::Iota(ctx, label_order_cache_.begin(), label_order_cache_.end(), 0);
  const auto& l = labels.Data()->HostVector();
  common::StableSort(ctx, label_order_cache_.begin(), label_order_cache_.end(),
                     [&l](size_t i1, size_t i2) { return std::abs(l[i1]) < std::abs(l[i2]); });
  return label_order_cache_;
}

void MetaInfo::LoadBinary(dmlc::Stream *fi) {
  auto version = Version::Load(fi);
  auto major = std::get<0>(version);
  // MetaInfo is saved in `SparsePageSource'.  So the version in MetaInfo represents the
  // version of DMatrix.
  std::stringstream msg;
  msg << "Binary DMatrix generated by XGBoost: " << Version::String(version)
      << " is no longer supported. "
" << "Please process and save your data in current version: " << Version::String(Version::Self()) << " again."; CHECK_GE(major, 1) << msg.str(); if (major == 1) { auto minor = std::get<1>(version); CHECK_GE(minor, 6) << msg.str(); } const uint64_t expected_num_field = kNumField; uint64_t num_field { 0 }; CHECK(fi->Read(&num_field)) << "MetaInfo: invalid format"; size_t expected = 0; if (major == 1 && std::get<1>(version) < 2) { // feature names and types are added in 1.2 expected = expected_num_field - 2; } else { expected = expected_num_field; } CHECK_GE(num_field, expected) << "MetaInfo: insufficient number of fields (expected at least " << expected << " fields, but the binary file only contains " << num_field << "fields.)"; if (num_field > expected_num_field) { LOG(WARNING) << "MetaInfo: the given binary file contains extra fields " "which will be ignored."; } LoadScalarField(fi, u8"num_row", DataType::kUInt64, &num_row_); LoadScalarField(fi, u8"num_col", DataType::kUInt64, &num_col_); LoadScalarField(fi, u8"num_nonzero", DataType::kUInt64, &num_nonzero_); LoadTensorField(fi, u8"labels", DataType::kFloat32, &labels); LoadVectorField(fi, u8"group_ptr", DataType::kUInt32, &group_ptr_); LoadVectorField(fi, u8"weights", DataType::kFloat32, &weights_); LoadTensorField(fi, u8"base_margin", DataType::kFloat32, &base_margin_); LoadVectorField(fi, u8"labels_lower_bound", DataType::kFloat32, &labels_lower_bound_); LoadVectorField(fi, u8"labels_upper_bound", DataType::kFloat32, &labels_upper_bound_); LoadVectorField(fi, u8"feature_names", DataType::kStr, &feature_names); LoadVectorField(fi, u8"feature_types", DataType::kStr, &feature_type_names); LoadVectorField(fi, u8"feature_weights", DataType::kFloat32, &feature_weights); LoadFeatureType(feature_type_names, &feature_types.HostVector()); } template std::vector Gather(const std::vector &in, common::Span ridxs, size_t stride = 1) { if (in.empty()) { return {}; } auto size = ridxs.size(); std::vector out(size * stride); for (auto i = 0ull; i < size; i++) { auto ridx = ridxs[i]; for (size_t j = 0; j < stride; ++j) { out[i * stride +j] = in[ridx * stride + j]; } } return out; } MetaInfo MetaInfo::Slice(common::Span ridxs) const { MetaInfo out; out.num_row_ = ridxs.size(); out.num_col_ = this->num_col_; // Groups is maintained by a higher level Python function. We should aim at deprecating // the slice function. if (this->labels.Size() != this->num_row_) { auto t_labels = this->labels.View(this->labels.Data()->DeviceIdx()); out.labels.Reshape(ridxs.size(), labels.Shape(1)); out.labels.Data()->HostVector() = Gather(this->labels.Data()->HostVector(), ridxs, t_labels.Stride(0)); } else { out.labels.ModifyInplace([&](auto* data, common::Span shape) { data->HostVector() = Gather(this->labels.Data()->HostVector(), ridxs); shape[0] = data->Size(); shape[1] = 1; }); } out.labels_upper_bound_.HostVector() = Gather(this->labels_upper_bound_.HostVector(), ridxs); out.labels_lower_bound_.HostVector() = Gather(this->labels_lower_bound_.HostVector(), ridxs); // weights if (this->weights_.Size() + 1 == this->group_ptr_.size()) { auto& h_weights = out.weights_.HostVector(); // Assuming all groups are available. 
    out.weights_.HostVector() = h_weights;
  } else {
    out.weights_.HostVector() = Gather(this->weights_.HostVector(), ridxs);
  }

  if (this->base_margin_.Size() != this->num_row_) {
    CHECK_EQ(this->base_margin_.Size() % this->num_row_, 0)
        << "Incorrect size of base margin vector.";
    auto t_margin = this->base_margin_.View(this->base_margin_.Data()->DeviceIdx());
    out.base_margin_.Reshape(ridxs.size(), t_margin.Shape(1));
    out.base_margin_.Data()->HostVector() =
        Gather(this->base_margin_.Data()->HostVector(), ridxs, t_margin.Stride(0));
  } else {
    out.base_margin_.ModifyInplace([&](auto* data, common::Span<size_t, 2> shape) {
      data->HostVector() = Gather(this->base_margin_.Data()->HostVector(), ridxs);
      shape[0] = data->Size();
      shape[1] = 1;
    });
  }

  out.feature_weights.Resize(this->feature_weights.Size());
  out.feature_weights.Copy(this->feature_weights);

  out.feature_names = this->feature_names;
  out.feature_types.Resize(this->feature_types.Size());
  out.feature_types.Copy(this->feature_types);
  out.feature_type_names = this->feature_type_names;

  return out;
}

MetaInfo MetaInfo::Copy() const {
  MetaInfo out;
  out.Extend(*this, /*accumulate_rows=*/true, /*check_column=*/false);
  return out;
}

namespace {
// Copy a host array-interface blob into a dense linalg::Tensor: memcpy when the input is
// contiguous and already has the target type, element-wise transform otherwise.
template <int32_t D, typename T>
void CopyTensorInfoImpl(Context const& ctx, Json arr_interface, linalg::Tensor<T, D>* p_out) {
  ArrayInterface<D> array{arr_interface};
  if (array.n == 0) {
    p_out->Reshape(array.shape);
    return;
  }
  CHECK_EQ(array.valid.Capacity(), 0)
      << "Meta info like label or weight can not have missing value.";
  if (array.is_contiguous && array.type == ToDType<T>::kType) {
    // Handle contiguous
    p_out->ModifyInplace([&](HostDeviceVector<T>* data, common::Span<size_t, D> shape) {
      // set shape
      std::copy(array.shape, array.shape + D, shape.data());
      // set data
      data->Resize(array.n);
      std::memcpy(data->HostPointer(), array.data, array.n * sizeof(T));
    });
    return;
  }
  p_out->Reshape(array.shape);
  auto t_out = p_out->View(Context::kCpuId);
  CHECK(t_out.CContiguous());
  auto const shape = t_out.Shape();
  DispatchDType(array, Context::kCpuId, [&](auto&& in) {
    linalg::ElementWiseTransformHost(t_out, ctx.Threads(), [&](auto i, auto) {
      return std::apply(in, linalg::UnravelIndex(i, shape));
    });
  });
}
}  // namespace

void MetaInfo::SetInfo(Context const& ctx, StringView key, StringView interface_str) {
  Json j_interface = Json::Load(interface_str);
  bool is_cuda{false};
  if (IsA<Array>(j_interface)) {
    auto const& array = get<Array const>(j_interface);
    CHECK_GE(array.size(), 1)
        << "Invalid " << key << ", must have at least 1 column even if it's empty.";
    auto const& first = get<Object const>(array.front());
    auto ptr = ArrayInterfaceHandler::GetPtrFromArrayData(first);
    is_cuda = ArrayInterfaceHandler::IsCudaPtr(ptr);
  } else {
    auto const& first = get<Object const>(j_interface);
    auto ptr = ArrayInterfaceHandler::GetPtrFromArrayData(first);
    is_cuda = ArrayInterfaceHandler::IsCudaPtr(ptr);
  }

  if (is_cuda) {
    this->SetInfoFromCUDA(ctx, key, j_interface);
  } else {
    this->SetInfoFromHost(ctx, key, j_interface);
  }
}

void MetaInfo::SetInfoFromHost(Context const& ctx, StringView key, Json arr) {
  // multi-dim float info
  if (key == "base_margin") {
    CopyTensorInfoImpl(ctx, arr, &this->base_margin_);
    // FIXME(jiamingy): Remove the deprecated API and let all language bindings aware of
    // input shape.  This issue is CPU only since CUDA uses array interface from day 1.
    //
    // Python binding always understand the shape, so this condition should not occur for
    // it.
    if (this->num_row_ != 0 && this->base_margin_.Shape(0) != this->num_row_) {
      // API functions that don't use array interface don't understand shape.
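      // Deprecated path: a flat margin was passed without a shape, so infer the number of
      // output groups from its length and reshape to (num_row, n_groups).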
      CHECK(this->base_margin_.Size() % this->num_row_ == 0) << "Incorrect size for base margin.";
      size_t n_groups = this->base_margin_.Size() / this->num_row_;
      this->base_margin_.Reshape(this->num_row_, n_groups);
    }
    return;
  } else if (key == "label") {
    CopyTensorInfoImpl(ctx, arr, &this->labels);
    if (this->num_row_ != 0 && this->labels.Shape(0) != this->num_row_) {
      CHECK_EQ(this->labels.Size() % this->num_row_, 0) << "Incorrect size for labels.";
      size_t n_targets = this->labels.Size() / this->num_row_;
      this->labels.Reshape(this->num_row_, n_targets);
    }
    auto const& h_labels = labels.Data()->ConstHostVector();
    auto valid = std::none_of(h_labels.cbegin(), h_labels.cend(), data::LabelsCheck{});
    CHECK(valid) << "Label contains NaN, infinity or a value too large.";
    return;
  }
  // uint info
  if (key == "group") {
    linalg::Vector<bst_group_t> t;
    CopyTensorInfoImpl(ctx, arr, &t);
    auto const& h_groups = t.Data()->HostVector();
    group_ptr_.clear();
    group_ptr_.resize(h_groups.size() + 1, 0);
    group_ptr_[0] = 0;
    std::partial_sum(h_groups.cbegin(), h_groups.cend(), group_ptr_.begin() + 1);
    data::ValidateQueryGroup(group_ptr_);
    return;
  } else if (key == "qid") {
    linalg::Tensor<uint64_t, 1> t;
    CopyTensorInfoImpl(ctx, arr, &t);
    bool non_dec = true;
    auto const& query_ids = t.Data()->HostVector();
    for (size_t i = 1; i < query_ids.size(); ++i) {
      if (query_ids[i] < query_ids[i - 1]) {
        non_dec = false;
        break;
      }
    }
    CHECK(non_dec) << "`qid` must be sorted in non-decreasing order along with data.";
    common::RunLengthEncode(query_ids.cbegin(), query_ids.cend(), &group_ptr_);
    data::ValidateQueryGroup(group_ptr_);
    return;
  }
  // float info
  linalg::Tensor<float, 1> t;
  CopyTensorInfoImpl<1>(ctx, arr, &t);
  if (key == "weight") {
    this->weights_ = std::move(*t.Data());
    auto const& h_weights = this->weights_.ConstHostVector();
    auto valid = std::none_of(h_weights.cbegin(), h_weights.cend(),
                              [](float w) { return w < 0 || std::isinf(w) || std::isnan(w); });
    CHECK(valid) << "Weights must be positive values.";
  } else if (key == "label_lower_bound") {
    this->labels_lower_bound_ = std::move(*t.Data());
  } else if (key == "label_upper_bound") {
    this->labels_upper_bound_ = std::move(*t.Data());
  } else if (key == "feature_weights") {
    this->feature_weights = std::move(*t.Data());
    auto const& h_feature_weights = feature_weights.ConstHostVector();
    bool valid =
        std::none_of(h_feature_weights.cbegin(), h_feature_weights.cend(), data::WeightsCheck{});
    CHECK(valid) << "Feature weight must be greater than 0.";
  } else {
    LOG(FATAL) << "Unknown key for MetaInfo: " << key;
  }
}

void MetaInfo::SetInfo(Context const& ctx, const char* key, const void* dptr, DataType dtype,
                       size_t num) {
  CHECK(key);
  auto proc = [&](auto cast_d_ptr) {
    using T = std::remove_pointer_t<decltype(cast_d_ptr)>;
    auto t = linalg::TensorView<T, 1>(common::Span{cast_d_ptr, num}, {num}, Context::kCpuId);
    CHECK(t.CContiguous());
    Json interface { linalg::ArrayInterface(t) };
    assert(ArrayInterface<1>{interface}.is_contiguous);
    return interface;
  };
  // Legacy code using XGBoost dtype, which is a small subset of array interface types.
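  // Dispatch on the legacy dtype enum: cast the raw buffer to the matching element type and
  // route it through the same array-interface path as SetInfoFromHost.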
  switch (dtype) {
    case xgboost::DataType::kFloat32: {
      auto cast_ptr = reinterpret_cast<const float*>(dptr);
      this->SetInfoFromHost(ctx, key, proc(cast_ptr));
      break;
    }
    case xgboost::DataType::kDouble: {
      auto cast_ptr = reinterpret_cast<const double*>(dptr);
      this->SetInfoFromHost(ctx, key, proc(cast_ptr));
      break;
    }
    case xgboost::DataType::kUInt32: {
      auto cast_ptr = reinterpret_cast<const uint32_t*>(dptr);
      this->SetInfoFromHost(ctx, key, proc(cast_ptr));
      break;
    }
    case xgboost::DataType::kUInt64: {
      auto cast_ptr = reinterpret_cast<const uint64_t*>(dptr);
      this->SetInfoFromHost(ctx, key, proc(cast_ptr));
      break;
    }
    default:
      LOG(FATAL) << "Unknown data type " << static_cast<uint8_t>(dtype);
  }
}

void MetaInfo::GetInfo(char const* key, bst_ulong* out_len, DataType dtype,
                       const void** out_dptr) const {
  if (dtype == DataType::kFloat32) {
    const std::vector<bst_float>* vec = nullptr;
    if (!std::strcmp(key, "label")) {
      vec = &this->labels.Data()->HostVector();
    } else if (!std::strcmp(key, "weight")) {
      vec = &this->weights_.HostVector();
    } else if (!std::strcmp(key, "base_margin")) {
      vec = &this->base_margin_.Data()->HostVector();
    } else if (!std::strcmp(key, "label_lower_bound")) {
      vec = &this->labels_lower_bound_.HostVector();
    } else if (!std::strcmp(key, "label_upper_bound")) {
      vec = &this->labels_upper_bound_.HostVector();
    } else if (!std::strcmp(key, "feature_weights")) {
      vec = &this->feature_weights.HostVector();
    } else {
      LOG(FATAL) << "Unknown float field name: " << key;
    }
    *out_len = static_cast<xgboost::bst_ulong>(vec->size());  // NOLINT
    *reinterpret_cast<const bst_float**>(out_dptr) = dmlc::BeginPtr(*vec);
  } else if (dtype == DataType::kUInt32) {
    const std::vector<bst_group_t>* vec = nullptr;
    if (!std::strcmp(key, "group_ptr")) {
      vec = &this->group_ptr_;
    } else {
      LOG(FATAL) << "Unknown uint32 field name: " << key;
    }
    *out_len = static_cast<xgboost::bst_ulong>(vec->size());
    *reinterpret_cast<const bst_group_t**>(out_dptr) = dmlc::BeginPtr(*vec);
  } else {
    LOG(FATAL) << "Unknown data type for getting meta info.";
  }
}

void MetaInfo::SetFeatureInfo(const char* key, const char** info, const bst_ulong size) {
  if (size != 0 && this->num_col_ != 0) {
    CHECK_EQ(size, this->num_col_) << "Length of " << key
                                   << " must be equal to number of columns.";
    CHECK(info);
  }
  if (!std::strcmp(key, "feature_type")) {
    feature_type_names.clear();
    auto& h_feature_types = feature_types.HostVector();
    for (size_t i = 0; i < size; ++i) {
      auto elem = info[i];
      feature_type_names.emplace_back(elem);
    }
    LoadFeatureType(feature_type_names, &h_feature_types);
  } else if (!std::strcmp(key, "feature_name")) {
    feature_names.clear();
    for (size_t i = 0; i < size; ++i) {
      feature_names.emplace_back(info[i]);
    }
  } else {
    LOG(FATAL) << "Unknown feature info name: " << key;
  }
}

void MetaInfo::GetFeatureInfo(const char* field, std::vector<std::string>* out_str_vecs) const {
  auto& str_vecs = *out_str_vecs;
  if (!std::strcmp(field, "feature_type")) {
    str_vecs.resize(feature_type_names.size());
    std::copy(feature_type_names.cbegin(), feature_type_names.cend(), str_vecs.begin());
  } else if (!strcmp(field, "feature_name")) {
    str_vecs.resize(feature_names.size());
    std::copy(feature_names.begin(), feature_names.end(), str_vecs.begin());
  } else {
    LOG(FATAL) << "Unknown feature info: " << field;
  }
}

void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_column) {
  if (accumulate_rows) {
    this->num_row_ += that.num_row_;
  }
  if (this->num_col_ != 0) {
    if (check_column) {
      CHECK_EQ(this->num_col_, that.num_col_)
          << "Number of columns must be consistent across batches.";
    } else {
      this->num_col_ = std::max(this->num_col_, that.num_col_);
    }
  }
  this->num_col_ = that.num_col_;

  linalg::Stack(&this->labels, that.labels);
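  // Append the per-row fields (weights, label bounds, base margin) of `that` behind the rows
  // already held by this MetaInfo.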
  this->weights_.SetDevice(that.weights_.DeviceIdx());
  this->weights_.Extend(that.weights_);

  this->labels_lower_bound_.SetDevice(that.labels_lower_bound_.DeviceIdx());
  this->labels_lower_bound_.Extend(that.labels_lower_bound_);

  this->labels_upper_bound_.SetDevice(that.labels_upper_bound_.DeviceIdx());
  this->labels_upper_bound_.Extend(that.labels_upper_bound_);

  linalg::Stack(&this->base_margin_, that.base_margin_);

  if (this->group_ptr_.size() == 0) {
    this->group_ptr_ = that.group_ptr_;
  } else {
    CHECK_NE(that.group_ptr_.size(), 0);
    auto group_ptr = that.group_ptr_;
    for (size_t i = 1; i < group_ptr.size(); ++i) {
      group_ptr[i] += this->group_ptr_.back();
    }
    this->group_ptr_.insert(this->group_ptr_.end(), group_ptr.begin() + 1, group_ptr.end());
  }

  if (!that.feature_names.empty()) {
    this->feature_names = that.feature_names;
  }
  if (!that.feature_type_names.empty()) {
    this->feature_type_names = that.feature_type_names;
    auto& h_feature_types = feature_types.HostVector();
    LoadFeatureType(this->feature_type_names, &h_feature_types);
  } else if (!that.feature_types.Empty()) {
    this->feature_types.Resize(that.feature_types.Size());
    this->feature_types.Copy(that.feature_types);
  }
  if (!that.feature_weights.Empty()) {
    this->feature_weights.Resize(that.feature_weights.Size());
    this->feature_weights.SetDevice(that.feature_weights.DeviceIdx());
    this->feature_weights.Copy(that.feature_weights);
  }
}

void MetaInfo::SynchronizeNumberOfColumns() {
  if (IsVerticalFederated()) {
    // Each worker holds a disjoint slice of the columns, so the global width is the sum.
    collective::Allreduce<collective::Operation::kSum>(&num_col_, 1);
  } else {
    collective::Allreduce<collective::Operation::kMax>(&num_col_, 1);
  }
}

namespace {
template <typename T>
void CheckDevice(std::int32_t device, HostDeviceVector<T> const& v) {
  bool valid = v.DeviceIdx() == Context::kCpuId || device == Context::kCpuId ||
               v.DeviceIdx() == device;
  if (!valid) {
    LOG(FATAL) << "Invalid device ordinal. Data is associated with a different device ordinal than "
                  "the booster. "
                  "The device ordinal of the data is: "
               << v.DeviceIdx() << "; the device ordinal of the Booster is: " << device;
  }
}

template <typename T, std::int32_t D>
void CheckDevice(std::int32_t device, linalg::Tensor<T, D> const& v) {
  CheckDevice(device, *v.Data());
}
}  // anonymous namespace

void MetaInfo::Validate(std::int32_t device) const {
  if (group_ptr_.size() != 0 && weights_.Size() != 0) {
    CHECK_EQ(group_ptr_.size(), weights_.Size() + 1) << error::GroupWeight();
    return;
  }
  if (group_ptr_.size() != 0) {
    CHECK_EQ(group_ptr_.back(), num_row_)
        << error::GroupSize() << "the actual number of rows given by data.";
  }
  if (weights_.Size() != 0) {
    CHECK_EQ(weights_.Size(), num_row_) << "Size of weights must equal to number of rows.";
    CheckDevice(device, weights_);
    return;
  }
  if (labels.Size() != 0) {
    CHECK_EQ(labels.Shape(0), num_row_) << "Size of labels must equal to number of rows.";
    CheckDevice(device, labels);
    return;
  }
  if (labels_lower_bound_.Size() != 0) {
    CHECK_EQ(labels_lower_bound_.Size(), num_row_)
        << "Size of label_lower_bound must equal to number of rows.";
    CheckDevice(device, labels_lower_bound_);
    return;
  }
  if (feature_weights.Size() != 0) {
    CHECK_EQ(feature_weights.Size(), num_col_)
        << "Size of feature_weights must equal to number of columns.";
    CheckDevice(device, feature_weights);
  }
  if (labels_upper_bound_.Size() != 0) {
    CHECK_EQ(labels_upper_bound_.Size(), num_row_)
        << "Size of label_upper_bound must equal to number of rows.";
    CheckDevice(device, labels_upper_bound_);
    return;
  }
  CHECK_LE(num_nonzero_, num_col_ * num_row_);
  if (base_margin_.Size() != 0) {
    CHECK_EQ(base_margin_.Size() % num_row_, 0)
        << "Size of base margin must be a multiple of number of rows.";
    CheckDevice(device, base_margin_);
  }
}

#if !defined(XGBOOST_USE_CUDA)
void MetaInfo::SetInfoFromCUDA(Context const&, StringView, Json) { common::AssertGPUSupport(); }
#endif  // !defined(XGBOOST_USE_CUDA)

bool MetaInfo::IsVerticalFederated() const {
  return collective::IsFederated() && IsColumnSplit();
}

bool MetaInfo::ShouldHaveLabels() const {
  return !IsVerticalFederated() || collective::GetRank() == 0;
}

using DMatrixThreadLocal =
    dmlc::ThreadLocalStore<std::map<DMatrix const*, XGBAPIThreadLocalEntry>>;

XGBAPIThreadLocalEntry& DMatrix::GetThreadLocal() const {
  return (*DMatrixThreadLocal::Get())[this];
}

DMatrix::~DMatrix() {
  auto local_map = DMatrixThreadLocal::Get();
  if (local_map->find(this) != local_map->cend()) {
    local_map->erase(this);
  }
}

// Try to load `fname` as an XGBoost binary DMatrix; returns nullptr when the file is not in the
// binary format so the caller can fall back to the text parsers.
DMatrix *TryLoadBinary(std::string fname, bool silent) {
  int magic;
  std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r", true));
  if (fi != nullptr) {
    common::PeekableInStream is(fi.get());
    if (is.PeekRead(&magic, sizeof(magic)) == sizeof(magic)) {
      if (!DMLC_IO_NO_ENDIAN_SWAP) {
        dmlc::ByteSwap(&magic, sizeof(magic), 1);
      }
      if (magic == data::SimpleDMatrix::kMagic) {
        DMatrix *dmat = new data::SimpleDMatrix(&is);
        if (!silent) {
          LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with "
                       << dmat->Info().num_nonzero_ << " entries loaded from " << fname;
        }
        return dmat;
      }
    }
  }
  return nullptr;
}

DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_split_mode) {
  auto need_split = false;
  if (collective::IsFederated()) {
    LOG(CONSOLE) << "XGBoost federated mode detected, not splitting data among workers";
  } else if (collective::IsDistributed()) {
    LOG(CONSOLE) << "XGBoost distributed mode detected, will split data among workers";
    need_split = true;
  }

  std::string fname, cache_file;
  size_t dlm_pos = uri.find('#');
  if (dlm_pos != std::string::npos) {
    cache_file = uri.substr(dlm_pos + 1, uri.length());
    fname = uri.substr(0, dlm_pos);
    CHECK_EQ(cache_file.find('#'), std::string::npos)
        << "Only one `#` is allowed in file path for cache file specification.";
    if (need_split && data_split_mode == DataSplitMode::kRow) {
      std::ostringstream os;
      std::vector<std::string> cache_shards = common::Split(cache_file, ':');
      for (size_t i = 0; i < cache_shards.size(); ++i) {
        size_t pos = cache_shards[i].rfind('.');
        if (pos == std::string::npos) {
          os << cache_shards[i] << ".r" << collective::GetRank() << "-"
             << collective::GetWorldSize();
        } else {
          os << cache_shards[i].substr(0, pos) << ".r" << collective::GetRank() << "-"
             << collective::GetWorldSize()
             << cache_shards[i].substr(pos, cache_shards[i].length());
        }
        if (i + 1 != cache_shards.size()) {
          os << ':';
        }
      }
      cache_file = os.str();
    }
  } else {
    fname = uri;
  }

  // legacy handling of binary data loading
  DMatrix* loaded = TryLoadBinary(fname, silent);
  if (loaded) {
    return loaded;
  }

  int partid = 0, npart = 1;
  if (need_split && data_split_mode == DataSplitMode::kRow) {
    partid = collective::GetRank();
    npart = collective::GetWorldSize();
  } else {
    // test option to load in part
    npart = 1;
  }

  if (npart != 1) {
    LOG(CONSOLE) << "Load part of data " << partid << " of " << npart << " parts";
  }

  data::ValidateFileFormat(fname);

  DMatrix* dmat{nullptr};
  if (cache_file.empty()) {
    std::unique_ptr<dmlc::Parser<uint32_t>> parser(
        dmlc::Parser<uint32_t>::Create(fname.c_str(), partid, npart, "auto"));
    data::FileAdapter adapter(parser.get());
    dmat = DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), Context{}.Threads(),
                           cache_file, data_split_mode);
  } else {
    data::FileIterator iter{fname, static_cast<uint32_t>(partid), static_cast<uint32_t>(npart)};
    dmat = new data::SparsePageDMatrix{&iter,
                                       iter.Proxy(),
                                       data::fileiter::Reset,
                                       data::fileiter::Next,
                                       std::numeric_limits<float>::quiet_NaN(),
                                       1,
                                       cache_file};
  }

  if (need_split && data_split_mode == DataSplitMode::kCol) {
    if (!cache_file.empty()) {
      LOG(FATAL) << "Column-wise data split is not support for external memory.";
    }
    LOG(CONSOLE) << "Splitting data by column";
    auto* sliced = dmat->SliceCol(npart, partid);
    delete dmat;
    return sliced;
  } else {
    return dmat;
  }
}

template <typename DataIterHandle, typename DMatrixHandle, typename DataIterResetCallback,
          typename XGDMatrixCallbackNext>
DMatrix* DMatrix::Create(DataIterHandle iter, DMatrixHandle proxy, std::shared_ptr<DMatrix> ref,
                         DataIterResetCallback* reset, XGDMatrixCallbackNext* next, float missing,
                         int nthread, bst_bin_t max_bin) {
  return new data::IterativeDMatrix(iter, proxy, ref, reset, next, missing, nthread, max_bin);
}

template <typename DataIterHandle, typename DMatrixHandle, typename DataIterResetCallback,
          typename XGDMatrixCallbackNext>
DMatrix *DMatrix::Create(DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
                         XGDMatrixCallbackNext *next, float missing, int32_t n_threads,
                         std::string cache) {
  return new data::SparsePageDMatrix(iter, proxy, reset, next, missing, n_threads, cache);
}

template DMatrix* DMatrix::Create<DataIterHandle, DMatrixHandle, DataIterResetCallback,
                                  XGDMatrixCallbackNext>(
    DataIterHandle iter, DMatrixHandle proxy, std::shared_ptr<DMatrix> ref,
    DataIterResetCallback* reset, XGDMatrixCallbackNext* next, float missing, int nthread,
    int max_bin);

template DMatrix *DMatrix::Create<DataIterHandle, DMatrixHandle, DataIterResetCallback,
                                  XGDMatrixCallbackNext>(
    DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
    XGDMatrixCallbackNext *next, float missing, int32_t n_threads, std::string);

template <typename AdapterT>
DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread, const std::string&,
                         DataSplitMode data_split_mode) {
  return new data::SimpleDMatrix(adapter, missing, nthread, data_split_mode);
}

template DMatrix* DMatrix::Create(data::DenseAdapter* adapter, float missing, std::int32_t nthread,
                                  const std::string& cache_prefix, DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create(data::ArrayAdapter* adapter, float missing, std::int32_t nthread,
                                  const std::string& cache_prefix, DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create(data::CSRAdapter* adapter, float missing, std::int32_t nthread,
                                  const std::string& cache_prefix, DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create(data::CSCAdapter* adapter, float missing, std::int32_t nthread,
                                  const std::string& cache_prefix, DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create(data::DataTableAdapter* adapter, float missing,
                                  std::int32_t nthread, const std::string& cache_prefix,
                                  DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create(data::FileAdapter* adapter, float missing, std::int32_t nthread,
                                  const std::string& cache_prefix, DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create(data::CSRArrayAdapter* adapter, float missing,
                                  std::int32_t nthread, const std::string& cache_prefix,
                                  DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create(data::CSCArrayAdapter* adapter, float missing,
                                  std::int32_t nthread, const std::string& cache_prefix,
                                  DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create(
    data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>* adapter,
    float missing, int nthread, const std::string& cache_prefix, DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create(data::RecordBatchesIterAdapter* adapter, float missing,
                                  int nthread, const std::string&, DataSplitMode data_split_mode);

SparsePage SparsePage::GetTranspose(int num_columns, int32_t n_threads) const {
  SparsePage transpose;
  common::ParallelGroupBuilder<Entry, bst_row_t> builder(&transpose.offset.HostVector(),
                                                         &transpose.data.HostVector());
  builder.InitBudget(num_columns, n_threads);
  long batch_size = static_cast<long>(this->Size());  // NOLINT(*)
  auto page = this->GetView();
  common::ParallelFor(batch_size, n_threads, [&](long i) {  // NOLINT(*)
    int tid = omp_get_thread_num();
    auto inst = page[i];
    for (const auto& entry : inst) {
      builder.AddBudget(entry.index, tid);
    }
  });
  builder.InitStorage();
  common::ParallelFor(batch_size, n_threads, [&](long i) {  // NOLINT(*)
    int tid = omp_get_thread_num();
    auto inst = page[i];
    for (const auto& entry : inst) {
      builder.Push(entry.index, Entry(static_cast<bst_uint>(this->base_rowid + i), entry.fvalue),
                   tid);
    }
  });

  if (this->data.Empty()) {
    transpose.offset.Resize(num_columns + 1);
    transpose.offset.Fill(0);
  }
  CHECK_EQ(transpose.offset.Size(), num_columns + 1);
  return transpose;
}

bool SparsePage::IsIndicesSorted(int32_t n_threads) const {
  auto& h_offset = this->offset.HostVector();
  auto& h_data = this->data.HostVector();
  n_threads = std::max(std::min(static_cast<std::size_t>(n_threads), this->Size()),
                       static_cast<std::size_t>(1));
  std::vector<int32_t> is_sorted_tloc(n_threads, 0);
  common::ParallelFor(this->Size(), n_threads, [&](auto i) {
    auto beg = h_offset[i];
    auto end = h_offset[i + 1];
    is_sorted_tloc[omp_get_thread_num()] +=
        !!std::is_sorted(h_data.begin() + beg, h_data.begin() + end, Entry::CmpIndex);
  });
  auto is_sorted = std::accumulate(is_sorted_tloc.cbegin(), is_sorted_tloc.cend(),
                                   static_cast<size_t>(0)) == this->Size();
  return is_sorted;
}

void SparsePage::SortIndices(int32_t n_threads) {
  auto& h_offset = this->offset.HostVector();
  auto& h_data = this->data.HostVector();
  common::ParallelFor(this->Size(), n_threads, [&](auto i) {
    auto beg = h_offset[i];
    auto end = h_offset[i + 1];
    std::sort(h_data.begin() + beg, h_data.begin() + end, Entry::CmpIndex);
  });
}

void SparsePage::Reindex(uint64_t feature_offset, int32_t n_threads) {
  auto& h_data = this->data.HostVector();
  common::ParallelFor(h_data.size(), n_threads, [&](auto i) {
    h_data[i].index += feature_offset;
  });
}

void SparsePage::SortRows(int32_t n_threads) {
  auto& h_offset = this->offset.HostVector();
  auto& h_data = this->data.HostVector();
  common::ParallelFor(this->Size(), n_threads, [&](auto i) {
    if (h_offset[i] < h_offset[i + 1]) {
      std::sort(h_data.begin() + h_offset[i], h_data.begin() + h_offset[i + 1], Entry::CmpValue);
    }
  });
}

void SparsePage::Push(const SparsePage &batch) {
  auto& data_vec = data.HostVector();
  auto& offset_vec = offset.HostVector();
  const auto& batch_offset_vec = batch.offset.HostVector();
  const auto& batch_data_vec = batch.data.HostVector();
  size_t top = offset_vec.back();
  data_vec.resize(top + batch.data.Size());
  if (dmlc::BeginPtr(data_vec) && dmlc::BeginPtr(batch_data_vec)) {
    std::memcpy(dmlc::BeginPtr(data_vec) + top, dmlc::BeginPtr(batch_data_vec),
                sizeof(Entry) * batch.data.Size());
  }
  size_t begin = offset.Size();
  offset_vec.resize(begin + batch.Size());
  for (size_t i = 0; i < batch.Size(); ++i) {
    offset_vec[i + begin] = top + batch_offset_vec[i + 1];
  }
}

template <typename AdapterBatchT>
uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread) {
  constexpr bool kIsRowMajor = AdapterBatchT::kIsRowMajor;
  // Allow threading only for row-major case as column-major requires O(nthread*batch_size) memory
  nthread = kIsRowMajor ? nthread : 1;
  if (!kIsRowMajor) {
    CHECK_EQ(nthread, 1);
  }
  auto& offset_vec = offset.HostVector();
  auto& data_vec = data.HostVector();

  size_t builder_base_row_offset = this->Size();
  common::ParallelGroupBuilder<Entry,
                               std::remove_reference<decltype(offset_vec)>::type::value_type,
                               kIsRowMajor>
      builder(&offset_vec, &data_vec, builder_base_row_offset);
  // Estimate expected number of rows by using last element in batch
  // This is not required to be exact but prevents unnecessary resizing
  size_t expected_rows = 0;
  if (batch.Size() > 0) {
    auto last_line = batch.GetLine(batch.Size() - 1);
    if (last_line.Size() > 0) {
      expected_rows = last_line.GetElement(last_line.Size() - 1).row_idx - base_rowid;
    }
  }
  size_t batch_size = batch.Size();
  expected_rows = kIsRowMajor ? batch_size : expected_rows;
  uint64_t max_columns = 0;
  if (batch_size == 0) {
    return max_columns;
  }
  const size_t thread_size = batch_size / nthread;

  builder.InitBudget(expected_rows, nthread);
  std::vector<std::vector<uint64_t>> max_columns_vector(nthread, std::vector<uint64_t>{0});
  dmlc::OMPException exec;
  std::atomic<bool> valid{true};
  // First-pass over the batch counting valid elements
#pragma omp parallel num_threads(nthread)
  {
    exec.Run([&]() {
      int tid = omp_get_thread_num();
      size_t begin = tid * thread_size;
      size_t end = tid != (nthread - 1) ? (tid + 1) * thread_size : batch_size;
      uint64_t& max_columns_local = max_columns_vector[tid][0];

      for (size_t i = begin; i < end; ++i) {
        auto line = batch.GetLine(i);
        for (auto j = 0ull; j < line.Size(); j++) {
          data::COOTuple const& element = line.GetElement(j);
          if (!std::isinf(missing) && std::isinf(element.value)) {
            valid = false;
          }
          const size_t key = element.row_idx - base_rowid;
          CHECK_GE(key, builder_base_row_offset);
          max_columns_local =
              std::max(max_columns_local, static_cast<uint64_t>(element.column_idx + 1));
          if (!common::CheckNAN(element.value) && element.value != missing) {
            // Adapter row index is absolute, here we want it relative to
            // current page
            builder.AddBudget(key, tid);
          }
        }
      }
    });
  }
  exec.Rethrow();
  CHECK(valid) << error::InfInData();
  for (const auto& max : max_columns_vector) {
    max_columns = std::max(max_columns, max[0]);
  }

  builder.InitStorage();

  // Second pass over batch, placing elements in correct position
  auto is_valid = data::IsValidFunctor{missing};
#pragma omp parallel num_threads(nthread)
  {
    exec.Run([&]() {
      int tid = omp_get_thread_num();
      size_t begin = tid * thread_size;
      size_t end = tid != (nthread - 1) ? (tid + 1) * thread_size : batch_size;
      for (size_t i = begin; i < end; ++i) {
        auto line = batch.GetLine(i);
        for (auto j = 0ull; j < line.Size(); j++) {
          auto element = line.GetElement(j);
          const size_t key = (element.row_idx - base_rowid);
          if (is_valid(element)) {
            builder.Push(key, Entry(element.column_idx, element.value), tid);
          }
        }
      }
    });
  }
  exec.Rethrow();
  return max_columns;
}

void SparsePage::PushCSC(const SparsePage &batch) {
  std::vector<Entry>& self_data = data.HostVector();
  std::vector<bst_row_t>& self_offset = offset.HostVector();
  auto const& other_data = batch.data.ConstHostVector();
  auto const& other_offset = batch.offset.ConstHostVector();

  if (other_data.empty()) {
    self_offset = other_offset;
    return;
  }
  if (!self_data.empty()) {
    CHECK_EQ(self_offset.size(), other_offset.size())
        << "self_data.size(): " << this->data.Size() << ", "
        << "other_data.size(): " << other_data.size() << std::flush;
  } else {
    self_data = other_data;
    self_offset = other_offset;
    return;
  }

  std::vector<bst_row_t> offset(other_offset.size());
  offset[0] = 0;

  std::vector<Entry> data(self_data.size() + other_data.size());

  // n_cols in original csr data matrix, here in csc is n_rows
  size_t const n_features = other_offset.size() - 1;

  size_t beg = 0;
  size_t ptr = 1;
  for (size_t i = 0; i < n_features; ++i) {
    size_t const self_beg = self_offset.at(i);
    size_t const self_length = self_offset.at(i + 1) - self_beg;
    // It is possible that the current feature and further features aren't referenced
    // in any rows accumulated thus far. It is also possible for this to happen
    // in the current sparse page row batch as well.
    // Hence, the incremental number of rows may stay constant thus equaling the data size
    CHECK_LE(beg, data.size());
    std::memcpy(dmlc::BeginPtr(data) + beg, dmlc::BeginPtr(self_data) + self_beg,
                sizeof(Entry) * self_length);
    beg += self_length;

    size_t const other_beg = other_offset.at(i);
    size_t const other_length = other_offset.at(i + 1) - other_beg;
    CHECK_LE(beg, data.size());
    std::memcpy(dmlc::BeginPtr(data) + beg, dmlc::BeginPtr(other_data) + other_beg,
                sizeof(Entry) * other_length);
    beg += other_length;

    CHECK_LT(ptr, offset.size());
    offset.at(ptr) = beg;
    ptr++;
  }

  self_data = std::move(data);
  self_offset = std::move(offset);
}

template uint64_t SparsePage::Push(const data::DenseAdapterBatch& batch, float missing,
                                   int nthread);
template uint64_t SparsePage::Push(const data::ArrayAdapterBatch& batch, float missing,
                                   int nthread);
template uint64_t SparsePage::Push(const data::CSRAdapterBatch& batch, float missing, int nthread);
template uint64_t SparsePage::Push(const data::CSRArrayAdapterBatch& batch, float missing,
                                   int nthread);
template uint64_t SparsePage::Push(const data::CSCArrayAdapterBatch& batch, float missing,
                                   int nthread);
template uint64_t SparsePage::Push(const data::CSCAdapterBatch& batch, float missing, int nthread);
template uint64_t SparsePage::Push(const data::DataTableAdapterBatch& batch, float missing,
                                   int nthread);
template uint64_t SparsePage::Push(const data::FileAdapterBatch& batch, float missing, int nthread);

namespace data {
// List of files that will be force linked in static links.
DMLC_REGISTRY_LINK_TAG(sparse_page_raw_format);
DMLC_REGISTRY_LINK_TAG(gradient_index_format);
}  // namespace data
}  // namespace xgboost