/*! * Copyright 2015-2020 by Contributors * \file data.cc */ #include #include #include "dmlc/io.h" #include "xgboost/data.h" #include "xgboost/c_api.h" #include "xgboost/host_device_vector.h" #include "xgboost/logging.h" #include "xgboost/version_config.h" #include "sparse_page_writer.h" #include "simple_dmatrix.h" #include "../common/io.h" #include "../common/math.h" #include "../common/version.h" #include "../common/group_data.h" #include "../data/adapter.h" #if DMLC_ENABLE_STD_THREAD #include "./sparse_page_source.h" #include "./sparse_page_dmatrix.h" #endif // DMLC_ENABLE_STD_THREAD namespace dmlc { DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::SparsePage>); DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::CSCPage>); DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::SortedCSCPage>); DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::EllpackPage>); } // namespace dmlc namespace { template void SaveScalarField(dmlc::Stream *strm, const std::string &name, xgboost::DataType type, const T &field) { strm->Write(name); strm->Write(static_cast(type)); strm->Write(true); // is_scalar=True strm->Write(field); } template void SaveVectorField(dmlc::Stream *strm, const std::string &name, xgboost::DataType type, std::pair shape, const std::vector& field) { strm->Write(name); strm->Write(static_cast(type)); strm->Write(false); // is_scalar=False strm->Write(shape.first); strm->Write(shape.second); strm->Write(field); } template void SaveVectorField(dmlc::Stream* strm, const std::string& name, xgboost::DataType type, std::pair shape, const xgboost::HostDeviceVector& field) { SaveVectorField(strm, name, type, shape, field.ConstHostVector()); } template void LoadScalarField(dmlc::Stream* strm, const std::string& expected_name, xgboost::DataType expected_type, T* field) { const std::string invalid {"MetaInfo: Invalid format. "}; std::string name; xgboost::DataType type; bool is_scalar; CHECK(strm->Read(&name)) << invalid; CHECK_EQ(name, expected_name) << invalid << " Expected field: " << expected_name << ", got: " << name; uint8_t type_val; CHECK(strm->Read(&type_val)) << invalid; type = static_cast(type_val); CHECK(type == expected_type) << invalid << "Expected field of type: " << static_cast(expected_type) << ", " << "got field type: " << static_cast(type); CHECK(strm->Read(&is_scalar)) << invalid; CHECK(is_scalar) << invalid << "Expected field " << expected_name << " to be a scalar; got a vector"; CHECK(strm->Read(field, sizeof(T))) << invalid; } template void LoadVectorField(dmlc::Stream* strm, const std::string& expected_name, xgboost::DataType expected_type, std::vector* field) { const std::string invalid {"MetaInfo: Invalid format. "}; std::string name; xgboost::DataType type; bool is_scalar; CHECK(strm->Read(&name)) << invalid; CHECK_EQ(name, expected_name) << invalid << " Expected field: " << expected_name << ", got: " << name; uint8_t type_val; CHECK(strm->Read(&type_val)) << invalid; type = static_cast(type_val); CHECK(type == expected_type) << invalid << "Expected field of type: " << static_cast(expected_type) << ", " << "got field type: " << static_cast(type); CHECK(strm->Read(&is_scalar)) << invalid; CHECK(!is_scalar) << invalid << "Expected field " << expected_name << " to be a vector; got a scalar"; std::pair shape; CHECK(strm->Read(&shape.first)); CHECK(strm->Read(&shape.second)); // TODO(hcho3): this restriction may be lifted, once we add a field with more than 1 column. CHECK_EQ(shape.second, 1) << invalid << "Number of columns is expected to be 1."; CHECK(strm->Read(field)) << invalid; } template void LoadVectorField(dmlc::Stream* strm, const std::string& expected_name, xgboost::DataType expected_type, xgboost::HostDeviceVector* field) { LoadVectorField(strm, expected_name, expected_type, &field->HostVector()); } } // anonymous namespace namespace xgboost { uint64_t constexpr MetaInfo::kNumField; // implementation of inline functions void MetaInfo::Clear() { num_row_ = num_col_ = num_nonzero_ = 0; labels_.HostVector().clear(); group_ptr_.clear(); weights_.HostVector().clear(); base_margin_.HostVector().clear(); } /* * Binary serialization format for MetaInfo: * * | name | type | is_scalar | num_row | num_col | value | * |--------------------+----------+-----------+---------+---------+-------------------------| * | num_row | kUInt64 | True | NA | NA | ${num_row_} | * | num_col | kUInt64 | True | NA | NA | ${num_col_} | * | num_nonzero | kUInt64 | True | NA | NA | ${num_nonzero_} | * | labels | kFloat32 | False | ${size} | 1 | ${labels_} | * | group_ptr | kUInt32 | False | ${size} | 1 | ${group_ptr_} | * | weights | kFloat32 | False | ${size} | 1 | ${weights_} | * | base_margin | kFloat32 | False | ${size} | 1 | ${base_margin_} | * | labels_lower_bound | kFloat32 | False | ${size} | 1 | ${labels_lower_bound__} | * | labels_upper_bound | kFloat32 | False | ${size} | 1 | ${labels_upper_bound__} | * * Note that the scalar fields (is_scalar=True) will have num_row and num_col missing. * Also notice the difference between the saved name and the name used in `SetInfo': * the former uses the plural form. */ void MetaInfo::SaveBinary(dmlc::Stream *fo) const { Version::Save(fo); fo->Write(kNumField); int field_cnt = 0; // make sure we are actually writing kNumField fields SaveScalarField(fo, u8"num_row", DataType::kUInt64, num_row_); ++field_cnt; SaveScalarField(fo, u8"num_col", DataType::kUInt64, num_col_); ++field_cnt; SaveScalarField(fo, u8"num_nonzero", DataType::kUInt64, num_nonzero_); ++field_cnt; SaveVectorField(fo, u8"labels", DataType::kFloat32, {labels_.Size(), 1}, labels_); ++field_cnt; SaveVectorField(fo, u8"group_ptr", DataType::kUInt32, {group_ptr_.size(), 1}, group_ptr_); ++field_cnt; SaveVectorField(fo, u8"weights", DataType::kFloat32, {weights_.Size(), 1}, weights_); ++field_cnt; SaveVectorField(fo, u8"base_margin", DataType::kFloat32, {base_margin_.Size(), 1}, base_margin_); ++field_cnt; SaveVectorField(fo, u8"labels_lower_bound", DataType::kFloat32, {labels_lower_bound_.Size(), 1}, labels_lower_bound_); ++field_cnt; SaveVectorField(fo, u8"labels_upper_bound", DataType::kFloat32, {labels_upper_bound_.Size(), 1}, labels_upper_bound_); ++field_cnt; CHECK_EQ(field_cnt, kNumField) << "Wrong number of fields"; } void MetaInfo::LoadBinary(dmlc::Stream *fi) { auto version = Version::Load(fi); auto major = std::get<0>(version); // MetaInfo is saved in `SparsePageSource'. So the version in MetaInfo represents the // version of DMatrix. CHECK_EQ(major, 1) << "Binary DMatrix generated by XGBoost: " << Version::String(version) << " is no longer supported. " << "Please process and save your data in current version: " << Version::String(Version::Self()) << " again."; const uint64_t expected_num_field = kNumField; uint64_t num_field { 0 }; CHECK(fi->Read(&num_field)) << "MetaInfo: invalid format"; CHECK_GE(num_field, expected_num_field) << "MetaInfo: insufficient number of fields (expected at least " << expected_num_field << " fields, but the binary file only contains " << num_field << "fields.)"; if (num_field > expected_num_field) { LOG(WARNING) << "MetaInfo: the given binary file contains extra fields which will be ignored."; } LoadScalarField(fi, u8"num_row", DataType::kUInt64, &num_row_); LoadScalarField(fi, u8"num_col", DataType::kUInt64, &num_col_); LoadScalarField(fi, u8"num_nonzero", DataType::kUInt64, &num_nonzero_); LoadVectorField(fi, u8"labels", DataType::kFloat32, &labels_); LoadVectorField(fi, u8"group_ptr", DataType::kUInt32, &group_ptr_); LoadVectorField(fi, u8"weights", DataType::kFloat32, &weights_); LoadVectorField(fi, u8"base_margin", DataType::kFloat32, &base_margin_); LoadVectorField(fi, u8"labels_lower_bound", DataType::kFloat32, &labels_lower_bound_); LoadVectorField(fi, u8"labels_upper_bound", DataType::kFloat32, &labels_upper_bound_); } template std::vector Gather(const std::vector &in, common::Span ridxs, size_t stride = 1) { if (in.empty()) { return {}; } auto size = ridxs.size(); std::vector out(size * stride); for (auto i = 0ull; i < size; i++) { auto ridx = ridxs[i]; for (size_t j = 0; j < stride; ++j) { out[i * stride +j] = in[ridx * stride + j]; } } return out; } MetaInfo MetaInfo::Slice(common::Span ridxs) const { MetaInfo out; out.num_row_ = ridxs.size(); out.num_col_ = this->num_col_; // Groups is maintained by a higher level Python function. We should aim at deprecating // the slice function. out.labels_.HostVector() = Gather(this->labels_.HostVector(), ridxs); out.labels_upper_bound_.HostVector() = Gather(this->labels_upper_bound_.HostVector(), ridxs); out.labels_lower_bound_.HostVector() = Gather(this->labels_lower_bound_.HostVector(), ridxs); // weights if (this->weights_.Size() + 1 == this->group_ptr_.size()) { auto& h_weights = out.weights_.HostVector(); // Assuming all groups are available. out.weights_.HostVector() = h_weights; } else { out.weights_.HostVector() = Gather(this->weights_.HostVector(), ridxs); } if (this->base_margin_.Size() != this->num_row_) { CHECK_EQ(this->base_margin_.Size() % this->num_row_, 0) << "Incorrect size of base margin vector."; size_t stride = this->base_margin_.Size() / this->num_row_; out.base_margin_.HostVector() = Gather(this->base_margin_.HostVector(), ridxs, stride); } else { out.base_margin_.HostVector() = Gather(this->base_margin_.HostVector(), ridxs); } return out; } // try to load group information from file, if exists inline bool MetaTryLoadGroup(const std::string& fname, std::vector* group) { std::unique_ptr fi(dmlc::Stream::Create(fname.c_str(), "r", true)); if (fi == nullptr) return false; dmlc::istream is(fi.get()); group->clear(); group->push_back(0); unsigned nline = 0; while (is >> nline) { group->push_back(group->back() + nline); } return true; } // try to load weight information from file, if exists inline bool MetaTryLoadFloatInfo(const std::string& fname, std::vector* data) { std::unique_ptr fi(dmlc::Stream::Create(fname.c_str(), "r", true)); if (fi == nullptr) return false; dmlc::istream is(fi.get()); data->clear(); bst_float value; while (is >> value) { data->push_back(value); } return true; } // macro to dispatch according to specified pointer types #define DISPATCH_CONST_PTR(dtype, old_ptr, cast_ptr, proc) \ switch (dtype) { \ case xgboost::DataType::kFloat32: { \ auto cast_ptr = reinterpret_cast(old_ptr); proc; break; \ } \ case xgboost::DataType::kDouble: { \ auto cast_ptr = reinterpret_cast(old_ptr); proc; break; \ } \ case xgboost::DataType::kUInt32: { \ auto cast_ptr = reinterpret_cast(old_ptr); proc; break; \ } \ case xgboost::DataType::kUInt64: { \ auto cast_ptr = reinterpret_cast(old_ptr); proc; break; \ } \ default: LOG(FATAL) << "Unknown data type" << static_cast(dtype); \ } \ void MetaInfo::SetInfo(const char* key, const void* dptr, DataType dtype, size_t num) { if (!std::strcmp(key, "label")) { auto& labels = labels_.HostVector(); labels.resize(num); DISPATCH_CONST_PTR(dtype, dptr, cast_dptr, std::copy(cast_dptr, cast_dptr + num, labels.begin())); } else if (!std::strcmp(key, "weight")) { auto& weights = weights_.HostVector(); weights.resize(num); DISPATCH_CONST_PTR(dtype, dptr, cast_dptr, std::copy(cast_dptr, cast_dptr + num, weights.begin())); } else if (!std::strcmp(key, "base_margin")) { auto& base_margin = base_margin_.HostVector(); base_margin.resize(num); DISPATCH_CONST_PTR(dtype, dptr, cast_dptr, std::copy(cast_dptr, cast_dptr + num, base_margin.begin())); } else if (!std::strcmp(key, "group")) { group_ptr_.resize(num + 1); DISPATCH_CONST_PTR(dtype, dptr, cast_dptr, std::copy(cast_dptr, cast_dptr + num, group_ptr_.begin() + 1)); group_ptr_[0] = 0; for (size_t i = 1; i < group_ptr_.size(); ++i) { group_ptr_[i] = group_ptr_[i - 1] + group_ptr_[i]; } } else if (!std::strcmp(key, "label_lower_bound")) { auto& labels = labels_lower_bound_.HostVector(); labels.resize(num); DISPATCH_CONST_PTR(dtype, dptr, cast_dptr, std::copy(cast_dptr, cast_dptr + num, labels.begin())); } else if (!std::strcmp(key, "label_upper_bound")) { auto& labels = labels_upper_bound_.HostVector(); labels.resize(num); DISPATCH_CONST_PTR(dtype, dptr, cast_dptr, std::copy(cast_dptr, cast_dptr + num, labels.begin())); } else { LOG(FATAL) << "Unknown key for MetaInfo: " << key; } } void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows) { if (accumulate_rows) { this->num_row_ += that.num_row_; } if (this->num_col_ != 0) { CHECK_EQ(this->num_col_, that.num_col_) << "Number of columns must be consistent across batches."; } this->num_col_ = that.num_col_; this->labels_.SetDevice(that.labels_.DeviceIdx()); this->labels_.Extend(that.labels_); this->weights_.SetDevice(that.weights_.DeviceIdx()); this->weights_.Extend(that.weights_); this->labels_lower_bound_.SetDevice(that.labels_lower_bound_.DeviceIdx()); this->labels_lower_bound_.Extend(that.labels_lower_bound_); this->labels_upper_bound_.SetDevice(that.labels_upper_bound_.DeviceIdx()); this->labels_upper_bound_.Extend(that.labels_upper_bound_); this->base_margin_.SetDevice(that.base_margin_.DeviceIdx()); this->base_margin_.Extend(that.base_margin_); if (this->group_ptr_.size() == 0) { this->group_ptr_ = that.group_ptr_; } else { CHECK_NE(that.group_ptr_.size(), 0); auto group_ptr = that.group_ptr_; for (size_t i = 1; i < group_ptr.size(); ++i) { group_ptr[i] += this->group_ptr_.back(); } this->group_ptr_.insert(this->group_ptr_.end(), group_ptr.begin() + 1, group_ptr.end()); } } void MetaInfo::Validate(int32_t device) const { if (group_ptr_.size() != 0 && weights_.Size() != 0) { CHECK_EQ(group_ptr_.size(), weights_.Size() + 1) << "Size of weights must equal to number of groups when ranking " "group is used."; return; } if (group_ptr_.size() != 0) { CHECK_EQ(group_ptr_.back(), num_row_) << "Invalid group structure. Number of rows obtained from groups " "doesn't equal to actual number of rows given by data."; } auto check_device = [device](HostDeviceVector const &v) { CHECK(v.DeviceIdx() == GenericParameter::kCpuId || device == GenericParameter::kCpuId || v.DeviceIdx() == device) << "Data is resided on a different device than `gpu_id`. " << "Device that data is on: " << v.DeviceIdx() << ", " << "`gpu_id` for XGBoost: " << device; }; if (weights_.Size() != 0) { CHECK_EQ(weights_.Size(), num_row_) << "Size of weights must equal to number of rows."; check_device(weights_); return; } if (labels_.Size() != 0) { CHECK_EQ(labels_.Size(), num_row_) << "Size of labels must equal to number of rows."; check_device(labels_); return; } if (labels_lower_bound_.Size() != 0) { CHECK_EQ(labels_lower_bound_.Size(), num_row_) << "Size of label_lower_bound must equal to number of rows."; check_device(labels_lower_bound_); return; } if (labels_upper_bound_.Size() != 0) { CHECK_EQ(labels_upper_bound_.Size(), num_row_) << "Size of label_upper_bound must equal to number of rows."; check_device(labels_upper_bound_); return; } CHECK_LE(num_nonzero_, num_col_ * num_row_); if (base_margin_.Size() != 0) { CHECK_EQ(base_margin_.Size() % num_row_, 0) << "Size of base margin must be a multiple of number of rows."; check_device(base_margin_); } } #if !defined(XGBOOST_USE_CUDA) void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) { common::AssertGPUSupport(); } #endif // !defined(XGBOOST_USE_CUDA) DMatrix* DMatrix::Load(const std::string& uri, bool silent, bool load_row_split, const std::string& file_format, const size_t page_size) { std::string fname, cache_file; size_t dlm_pos = uri.find('#'); if (dlm_pos != std::string::npos) { cache_file = uri.substr(dlm_pos + 1, uri.length()); fname = uri.substr(0, dlm_pos); CHECK_EQ(cache_file.find('#'), std::string::npos) << "Only one `#` is allowed in file path for cache file specification."; if (load_row_split) { std::ostringstream os; std::vector cache_shards = common::Split(cache_file, ':'); for (size_t i = 0; i < cache_shards.size(); ++i) { size_t pos = cache_shards[i].rfind('.'); if (pos == std::string::npos) { os << cache_shards[i] << ".r" << rabit::GetRank() << "-" << rabit::GetWorldSize(); } else { os << cache_shards[i].substr(0, pos) << ".r" << rabit::GetRank() << "-" << rabit::GetWorldSize() << cache_shards[i].substr(pos, cache_shards[i].length()); } if (i + 1 != cache_shards.size()) { os << ':'; } } cache_file = os.str(); } } else { fname = uri; } int partid = 0, npart = 1; if (load_row_split) { partid = rabit::GetRank(); npart = rabit::GetWorldSize(); } else { // test option to load in part npart = dmlc::GetEnv("XGBOOST_TEST_NPART", 1); } if (npart != 1) { LOG(CONSOLE) << "Load part of data " << partid << " of " << npart << " parts"; } // legacy handling of binary data loading if (file_format == "auto" && npart == 1) { int magic; std::unique_ptr fi(dmlc::Stream::Create(fname.c_str(), "r", true)); if (fi != nullptr) { common::PeekableInStream is(fi.get()); if (is.PeekRead(&magic, sizeof(magic)) == sizeof(magic) && magic == data::SimpleDMatrix::kMagic) { DMatrix* dmat = new data::SimpleDMatrix(&is); if (!silent) { LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with " << dmat->Info().num_nonzero_ << " entries loaded from " << uri; } return dmat; } } } std::unique_ptr > parser( dmlc::Parser::Create(fname.c_str(), partid, npart, file_format.c_str())); data::FileAdapter adapter(parser.get()); DMatrix* dmat {nullptr}; try { dmat = DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), 1, cache_file, page_size); } catch (dmlc::Error& e) { std::vector splited = common::Split(fname, '#'); std::vector args = common::Split(splited.front(), '?'); std::string format {file_format}; if (args.size() == 1 && file_format == "auto") { auto extension = common::Split(args.front(), '.').back(); if (extension == "csv" || extension == "libsvm") { format = extension; } if (format == extension) { LOG(WARNING) << "No format parameter is provided in input uri, but found file extension: " << format << " . " << "Consider providing a uri parameter: filename?format=" << format; } else { LOG(WARNING) << "No format parameter is provided in input uri. " << "Choosing default parser in dmlc-core. " << "Consider providing a uri parameter like: filename?format=csv"; } } LOG(FATAL) << "Encountered parser error:\n" << e.what(); } if (!silent) { LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with " << dmat->Info().num_nonzero_ << " entries loaded from " << uri; } /* sync up number of features after matrix loaded. * partitioned data will fail the train/val validation check * since partitioned data not knowing the real number of features. */ rabit::Allreduce(&dmat->Info().num_col_, 1, nullptr, nullptr, fname.c_str()); // backward compatiblity code. if (!load_row_split) { MetaInfo& info = dmat->Info(); if (MetaTryLoadGroup(fname + ".group", &info.group_ptr_) && !silent) { LOG(CONSOLE) << info.group_ptr_.size() - 1 << " groups are loaded from " << fname << ".group"; } if (MetaTryLoadFloatInfo (fname + ".base_margin", &info.base_margin_.HostVector()) && !silent) { LOG(CONSOLE) << info.base_margin_.Size() << " base_margin are loaded from " << fname << ".base_margin"; } if (MetaTryLoadFloatInfo (fname + ".weight", &info.weights_.HostVector()) && !silent) { LOG(CONSOLE) << info.weights_.Size() << " weights are loaded from " << fname << ".weight"; } } return dmat; } template DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread, const std::string& cache_prefix, size_t page_size) { if (cache_prefix.length() == 0) { // Data split mode is fixed to be row right now. return new data::SimpleDMatrix(adapter, missing, nthread); } else { #if DMLC_ENABLE_STD_THREAD return new data::SparsePageDMatrix(adapter, missing, nthread, cache_prefix, page_size); #else LOG(FATAL) << "External memory is not enabled in mingw"; return nullptr; #endif // DMLC_ENABLE_STD_THREAD } } template DMatrix* DMatrix::Create( data::DenseAdapter* adapter, float missing, int nthread, const std::string& cache_prefix, size_t page_size); template DMatrix* DMatrix::Create( data::CSRAdapter* adapter, float missing, int nthread, const std::string& cache_prefix, size_t page_size); template DMatrix* DMatrix::Create( data::CSCAdapter* adapter, float missing, int nthread, const std::string& cache_prefix, size_t page_size); template DMatrix* DMatrix::Create( data::DataTableAdapter* adapter, float missing, int nthread, const std::string& cache_prefix, size_t page_size); template DMatrix* DMatrix::Create( data::FileAdapter* adapter, float missing, int nthread, const std::string& cache_prefix, size_t page_size); template DMatrix * DMatrix::Create(data::IteratorAdapter *adapter, float missing, int nthread, const std::string &cache_prefix, size_t page_size); SparsePage SparsePage::GetTranspose(int num_columns) const { SparsePage transpose; common::ParallelGroupBuilder builder(&transpose.offset.HostVector(), &transpose.data.HostVector()); const int nthread = omp_get_max_threads(); builder.InitBudget(num_columns, nthread); long batch_size = static_cast(this->Size()); // NOLINT(*) #pragma omp parallel for default(none) shared(batch_size, builder) schedule(static) for (long i = 0; i < batch_size; ++i) { // NOLINT(*) int tid = omp_get_thread_num(); auto inst = (*this)[i]; for (const auto& entry : inst) { builder.AddBudget(entry.index, tid); } } builder.InitStorage(); #pragma omp parallel for default(none) shared(batch_size, builder) schedule(static) for (long i = 0; i < batch_size; ++i) { // NOLINT(*) int tid = omp_get_thread_num(); auto inst = (*this)[i]; for (const auto& entry : inst) { builder.Push( entry.index, Entry(static_cast(this->base_rowid + i), entry.fvalue), tid); } } return transpose; } void SparsePage::Push(const SparsePage &batch) { auto& data_vec = data.HostVector(); auto& offset_vec = offset.HostVector(); const auto& batch_offset_vec = batch.offset.HostVector(); const auto& batch_data_vec = batch.data.HostVector(); size_t top = offset_vec.back(); data_vec.resize(top + batch.data.Size()); std::memcpy(dmlc::BeginPtr(data_vec) + top, dmlc::BeginPtr(batch_data_vec), sizeof(Entry) * batch.data.Size()); size_t begin = offset.Size(); offset_vec.resize(begin + batch.Size()); for (size_t i = 0; i < batch.Size(); ++i) { offset_vec[i + begin] = top + batch_offset_vec[i + 1]; } } template uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread) { // Set number of threads but keep old value so we can reset it after const int nthreadmax = omp_get_max_threads(); if (nthread <= 0) nthread = nthreadmax; int nthread_original = omp_get_max_threads(); omp_set_num_threads(nthread); auto& offset_vec = offset.HostVector(); auto& data_vec = data.HostVector(); size_t builder_base_row_offset = this->Size(); common::ParallelGroupBuilder< Entry, std::remove_reference::type::value_type> builder(&offset_vec, &data_vec, builder_base_row_offset); // Estimate expected number of rows by using last element in batch // This is not required to be exact but prevents unnecessary resizing size_t expected_rows = 0; if (batch.Size() > 0) { auto last_line = batch.GetLine(batch.Size() - 1); if (last_line.Size() > 0) { expected_rows = last_line.GetElement(last_line.Size() - 1).row_idx - base_rowid; } } builder.InitBudget(expected_rows, nthread); uint64_t max_columns = 0; // First-pass over the batch counting valid elements size_t num_lines = batch.Size(); #pragma omp parallel for schedule(static) for (omp_ulong i = 0; i < static_cast(num_lines); ++i) { // NOLINT(*) int tid = omp_get_thread_num(); auto line = batch.GetLine(i); for (auto j = 0ull; j < line.Size(); j++) { data::COOTuple element = line.GetElement(j); max_columns = std::max(max_columns, static_cast(element.column_idx + 1)); if (!common::CheckNAN(element.value) && element.value != missing) { size_t key = element.row_idx - base_rowid; // Adapter row index is absolute, here we want it relative to // current page CHECK_GE(key, builder_base_row_offset); builder.AddBudget(key, tid); } } } builder.InitStorage(); // Second pass over batch, placing elements in correct position #pragma omp parallel for schedule(static) for (omp_ulong i = 0; i < static_cast(num_lines); ++i) { // NOLINT(*) int tid = omp_get_thread_num(); auto line = batch.GetLine(i); for (auto j = 0ull; j < line.Size(); j++) { auto element = line.GetElement(j); if (!common::CheckNAN(element.value) && element.value != missing) { size_t key = element.row_idx - base_rowid; // Adapter row index is absolute, here we want // it relative to current page builder.Push(key, Entry(element.column_idx, element.value), tid); } } } omp_set_num_threads(nthread_original); return max_columns; } void SparsePage::PushCSC(const SparsePage &batch) { std::vector& self_data = data.HostVector(); std::vector& self_offset = offset.HostVector(); auto const& other_data = batch.data.ConstHostVector(); auto const& other_offset = batch.offset.ConstHostVector(); if (other_data.empty()) { return; } if (!self_data.empty()) { CHECK_EQ(self_offset.size(), other_offset.size()) << "self_data.size(): " << this->data.Size() << ", " << "other_data.size(): " << other_data.size() << std::flush; } else { self_data = other_data; self_offset = other_offset; return; } std::vector offset(other_offset.size()); offset[0] = 0; std::vector data(self_data.size() + other_data.size()); // n_cols in original csr data matrix, here in csc is n_rows size_t const n_features = other_offset.size() - 1; size_t beg = 0; size_t ptr = 1; for (size_t i = 0; i < n_features; ++i) { size_t const self_beg = self_offset.at(i); size_t const self_length = self_offset.at(i+1) - self_beg; // It is possible that the current feature and further features aren't referenced // in any rows accumulated thus far. It is also possible for this to happen // in the current sparse page row batch as well. // Hence, the incremental number of rows may stay constant thus equaling the data size CHECK_LE(beg, data.size()); std::memcpy(dmlc::BeginPtr(data)+beg, dmlc::BeginPtr(self_data) + self_beg, sizeof(Entry) * self_length); beg += self_length; size_t const other_beg = other_offset.at(i); size_t const other_length = other_offset.at(i+1) - other_beg; CHECK_LE(beg, data.size()); std::memcpy(dmlc::BeginPtr(data)+beg, dmlc::BeginPtr(other_data) + other_beg, sizeof(Entry) * other_length); beg += other_length; CHECK_LT(ptr, offset.size()); offset.at(ptr) = beg; ptr++; } self_data = std::move(data); self_offset = std::move(offset); } namespace data { // List of files that will be force linked in static links. DMLC_REGISTRY_LINK_TAG(sparse_page_raw_format); } // namespace data } // namespace xgboost