merge latest, Jan 12 2024

This commit is contained in:
Hui Liu
2024-01-12 09:57:11 -08:00
251 changed files with 9023 additions and 5012 deletions

View File

@@ -25,9 +25,7 @@
#include "xgboost/span.h"
#include "xgboost/string_view.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
/** External data formats should implement an adapter as below. The
* adapter provides a uniform access to data outside xgboost, allowing
* construction of DMatrix objects from a range of sources without duplicating
@@ -279,9 +277,9 @@ class ArrayAdapterBatch : public detail::NoMetaInfo {
return Line{array_interface_, idx};
}
size_t NumRows() const { return array_interface_.Shape(0); }
size_t NumCols() const { return array_interface_.Shape(1); }
size_t Size() const { return this->NumRows(); }
[[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape(0); }
[[nodiscard]] std::size_t NumCols() const { return array_interface_.Shape(1); }
[[nodiscard]] std::size_t Size() const { return this->NumRows(); }
explicit ArrayAdapterBatch(ArrayInterface<2> array_interface)
: array_interface_{std::move(array_interface)} {}
@@ -326,11 +324,11 @@ class CSRArrayAdapterBatch : public detail::NoMetaInfo {
: indices_{std::move(indices)}, values_{std::move(values)}, ridx_{ridx},
offset_{offset} {}
COOTuple GetElement(std::size_t idx) const {
[[nodiscard]] COOTuple GetElement(std::size_t idx) const {
return {ridx_, TypedIndex<std::size_t, 1>{indices_}(offset_ + idx), values_(offset_ + idx)};
}
size_t Size() const {
[[nodiscard]] std::size_t Size() const {
return values_.Shape(0);
}
};
@@ -539,9 +537,11 @@ class CSCArrayAdapter : public detail::SingleBatchDataIter<CSCArrayAdapterBatch>
batch_{CSCArrayAdapterBatch{indptr_, indices_, values_}} {}
// JVM package sends 0 as unknown
size_t NumRows() const { return num_rows_ == 0 ? kAdapterUnknownSize : num_rows_; }
size_t NumColumns() const { return indptr_.n - 1; }
const CSCArrayAdapterBatch& Value() const override { return batch_; }
[[nodiscard]] std::size_t NumRows() const {
return num_rows_ == 0 ? kAdapterUnknownSize : num_rows_;
}
[[nodiscard]] std::size_t NumColumns() const { return indptr_.n - 1; }
[[nodiscard]] const CSCArrayAdapterBatch& Value() const override { return batch_; }
};
class DataTableAdapterBatch : public detail::NoMetaInfo {
@@ -634,15 +634,15 @@ class DataTableAdapterBatch : public detail::NoMetaInfo {
public:
Line(std::size_t ridx, void const* const* const data, std::vector<DTType> const& ft)
: row_idx_{ridx}, data_{data}, feature_types_{ft} {}
std::size_t Size() const { return feature_types_.size(); }
COOTuple GetElement(std::size_t idx) const {
[[nodiscard]] std::size_t Size() const { return feature_types_.size(); }
[[nodiscard]] COOTuple GetElement(std::size_t idx) const {
return COOTuple{row_idx_, idx, DTGetValue(data_[idx], feature_types_[idx], row_idx_)};
}
};
public:
size_t Size() const { return num_rows_; }
const Line GetLine(std::size_t ridx) const { return {ridx, data_, feature_types_}; }
[[nodiscard]] size_t Size() const { return num_rows_; }
[[nodiscard]] const Line GetLine(std::size_t ridx) const { return {ridx, data_, feature_types_}; }
static constexpr bool kIsRowMajor = true;
private:
@@ -659,9 +659,9 @@ class DataTableAdapter : public detail::SingleBatchDataIter<DataTableAdapterBatc
: batch_(data, feature_stypes, num_rows, num_features),
num_rows_(num_rows),
num_columns_(num_features) {}
const DataTableAdapterBatch& Value() const override { return batch_; }
std::size_t NumRows() const { return num_rows_; }
std::size_t NumColumns() const { return num_columns_; }
[[nodiscard]] const DataTableAdapterBatch& Value() const override { return batch_; }
[[nodiscard]] std::size_t NumRows() const { return num_rows_; }
[[nodiscard]] std::size_t NumColumns() const { return num_columns_; }
private:
DataTableAdapterBatch batch_;
@@ -669,6 +669,74 @@ class DataTableAdapter : public detail::SingleBatchDataIter<DataTableAdapterBatc
std::size_t num_columns_;
};
/**
 * @brief Row-major adapter batch over a set of columnar (one array-interface per
 *        column) buffers.  Carries no meta info.
 */
class ColumnarAdapterBatch : public detail::NoMetaInfo {
  common::Span<ArrayInterface<1, false>> columns_;

  /**
   * @brief View over a single row; elements are fetched column-by-column.
   */
  class Line {
    common::Span<ArrayInterface<1, false>> const& columns_;
    std::size_t ridx_;  // index of the row this line represents

   public:
    explicit Line(common::Span<ArrayInterface<1, false>> const& columns, std::size_t ridx)
        : columns_{columns}, ridx_{ridx} {}
    // One element per column; span's size() is already 0 when empty, so no
    // explicit empty check is needed.
    [[nodiscard]] std::size_t Size() const { return columns_.size(); }
    [[nodiscard]] COOTuple GetElement(std::size_t idx) const {
      return {ridx_, idx, columns_[idx](ridx_)};
    }
  };

 public:
  ColumnarAdapterBatch() = default;
  explicit ColumnarAdapterBatch(common::Span<ArrayInterface<1, false>> columns)
      : columns_{columns} {}
  [[nodiscard]] Line GetLine(std::size_t ridx) const { return Line{columns_, ridx}; }
  // Number of rows.  The empty check is required here: front() on an empty
  // span is undefined.
  [[nodiscard]] std::size_t Size() const {
    return columns_.empty() ? 0 : columns_.front().Shape(0);
  }
  [[nodiscard]] std::size_t NumCols() const { return columns_.size(); }
  [[nodiscard]] std::size_t NumRows() const { return this->Size(); }

  static constexpr bool kIsRowMajor = true;
};
/**
 * @brief Single-batch adapter constructed from a JSON list of
 *        `__array_interface__` objects, one per column.
 */
class ColumnarAdapter : public detail::SingleBatchDataIter<ColumnarAdapterBatch> {
  std::vector<ArrayInterface<1, false>> columns_;
  ColumnarAdapterBatch batch_;

 public:
  /**
   * @brief Parse the columns and validate that they all share the same length.
   *
   * @param columns JSON-encoded array of `__array_interface__` objects.
   */
  explicit ColumnarAdapter(StringView columns) {
    auto jarray = Json::Load(columns);
    CHECK(IsA<Array>(jarray));
    auto const& array = get<Array const>(jarray);
    columns_.reserve(array.size());
    // const-ref: avoid copying each Json element during iteration.
    for (auto const& col : array) {
      columns_.emplace_back(get<Object const>(col));
    }
    // front() is only touched when columns_ is non-empty thanks to the
    // short-circuit below.
    auto same_size = [this](ArrayInterface<1, false> const& column) {
      return column.Shape(0) == columns_.front().Shape(0);
    };
    bool consistent =
        columns_.empty() || std::all_of(columns_.cbegin(), columns_.cend(), same_size);
    CHECK(consistent) << "Size of columns should be the same.";
    batch_ = ColumnarAdapterBatch{columns_};
  }

  [[nodiscard]] ColumnarAdapterBatch const& Value() const override { return batch_; }
  [[nodiscard]] std::size_t NumRows() const {
    // Use the Shape() accessor for consistency with the rest of the adapters.
    return columns_.empty() ? 0 : columns_.front().Shape(0);
  }
  [[nodiscard]] std::size_t NumColumns() const { return columns_.size(); }
};
class FileAdapterBatch {
public:
class Line {
@@ -851,6 +919,5 @@ class SparsePageAdapterBatch {
Line GetLine(size_t ridx) const { return Line{page_[ridx].data(), page_[ridx].size(), ridx}; }
size_t Size() const { return page_.Size(); }
};
}; // namespace data
} // namespace xgboost
} // namespace xgboost::data
#endif // XGBOOST_DATA_ADAPTER_H_

View File

@@ -0,0 +1,13 @@
/**
* Copyright 2019-2024, XGBoost Contributors
*/
#include "array_interface.h"
#include "../common/common.h" // for AssertGPUSupport
namespace xgboost {
// CPU-only stubs: without GPU support, stream sync is an error and no pointer
// can be a device pointer.  Guard both CUDA and HIP so that a HIP build does
// not pick up these no-GPU stubs (the header previously guarded both macros).
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
void ArrayInterfaceHandler::SyncCudaStream(int64_t) { common::AssertGPUSupport(); }
bool ArrayInterfaceHandler::IsCudaPtr(void const *) { return false; }
#endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
}  // namespace xgboost

View File

@@ -377,11 +377,6 @@ struct ToDType<int64_t> {
static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kI8;
};
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
inline void ArrayInterfaceHandler::SyncCudaStream(int64_t) { common::AssertGPUSupport(); }
inline bool ArrayInterfaceHandler::IsCudaPtr(void const *) { return false; }
#endif // !defined(XGBOOST_USE_CUDA)
/**
* \brief A type erased view over __array_interface__ protocol defined by numpy
*

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2015-2023 by XGBoost Contributors
* Copyright 2015-2024, XGBoost Contributors
* \file data.cc
*/
#include "xgboost/data.h"
@@ -260,9 +260,14 @@ void MetaInfo::SaveBinary(dmlc::Stream *fo) const {
CHECK_EQ(field_cnt, kNumField) << "Wrong number of fields";
}
void LoadFeatureType(std::vector<std::string>const& type_names, std::vector<FeatureType>* types) {
/**
* @brief Load feature type info from names, returns whether there's categorical features.
*/
[[nodiscard]] bool LoadFeatureType(std::vector<std::string> const& type_names,
std::vector<FeatureType>* types) {
types->clear();
for (auto const &elem : type_names) {
bool has_cat{false};
for (auto const& elem : type_names) {
if (elem == "int") {
types->emplace_back(FeatureType::kNumerical);
} else if (elem == "float") {
@@ -273,10 +278,12 @@ void LoadFeatureType(std::vector<std::string>const& type_names, std::vector<Feat
types->emplace_back(FeatureType::kNumerical);
} else if (elem == "c") {
types->emplace_back(FeatureType::kCategorical);
has_cat = true;
} else {
LOG(FATAL) << "All feature_types must be one of {int, float, i, q, c}.";
}
}
return has_cat;
}
const std::vector<size_t>& MetaInfo::LabelAbsSort(Context const* ctx) const {
@@ -340,7 +347,8 @@ void MetaInfo::LoadBinary(dmlc::Stream *fi) {
LoadVectorField(fi, u8"feature_names", DataType::kStr, &feature_names);
LoadVectorField(fi, u8"feature_types", DataType::kStr, &feature_type_names);
LoadVectorField(fi, u8"feature_weights", DataType::kFloat32, &feature_weights);
LoadFeatureType(feature_type_names, &feature_types.HostVector());
this->has_categorical_ = LoadFeatureType(feature_type_names, &feature_types.HostVector());
}
template <typename T>
@@ -639,6 +647,7 @@ void MetaInfo::SetFeatureInfo(const char* key, const char **info, const bst_ulon
CHECK_EQ(size, this->num_col_) << "Length of " << key << " must be equal to number of columns.";
CHECK(info);
}
if (!std::strcmp(key, "feature_type")) {
feature_type_names.clear();
for (size_t i = 0; i < size; ++i) {
@@ -651,7 +660,7 @@ void MetaInfo::SetFeatureInfo(const char* key, const char **info, const bst_ulon
<< "Length of " << key << " must be equal to number of columns.";
}
auto& h_feature_types = feature_types.HostVector();
LoadFeatureType(feature_type_names, &h_feature_types);
this->has_categorical_ = LoadFeatureType(feature_type_names, &h_feature_types);
} else if (!std::strcmp(key, "feature_name")) {
if (IsColumnSplit()) {
std::vector<std::string> local_feature_names{};
@@ -674,9 +683,8 @@ void MetaInfo::SetFeatureInfo(const char* key, const char **info, const bst_ulon
}
}
void MetaInfo::GetFeatureInfo(const char *field,
std::vector<std::string> *out_str_vecs) const {
auto &str_vecs = *out_str_vecs;
void MetaInfo::GetFeatureInfo(const char* field, std::vector<std::string>* out_str_vecs) const {
auto& str_vecs = *out_str_vecs;
if (!std::strcmp(field, "feature_type")) {
str_vecs.resize(feature_type_names.size());
std::copy(feature_type_names.cbegin(), feature_type_names.cend(), str_vecs.begin());
@@ -689,6 +697,9 @@ void MetaInfo::GetFeatureInfo(const char *field,
}
void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_column) {
/**
* shape
*/
if (accumulate_rows) {
this->num_row_ += that.num_row_;
}
@@ -702,6 +713,9 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
}
this->num_col_ = that.num_col_;
/**
* info with n_samples
*/
linalg::Stack(&this->labels, that.labels);
this->weights_.SetDevice(that.weights_.Device());
@@ -715,6 +729,9 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
linalg::Stack(&this->base_margin_, that.base_margin_);
/**
* group
*/
if (this->group_ptr_.size() == 0) {
this->group_ptr_ = that.group_ptr_;
} else {
@@ -727,17 +744,25 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
group_ptr.end());
}
/**
* info with n_features
*/
if (!that.feature_names.empty()) {
this->feature_names = that.feature_names;
}
if (!that.feature_type_names.empty()) {
this->feature_type_names = that.feature_type_names;
auto &h_feature_types = feature_types.HostVector();
LoadFeatureType(this->feature_type_names, &h_feature_types);
auto& h_feature_types = feature_types.HostVector();
this->has_categorical_ = LoadFeatureType(this->feature_type_names, &h_feature_types);
} else if (!that.feature_types.Empty()) {
// FIXME(jiamingy): https://github.com/dmlc/xgboost/pull/9171/files#r1440188612
this->feature_types.Resize(that.feature_types.Size());
this->feature_types.Copy(that.feature_types);
auto const& ft = this->feature_types.ConstHostVector();
this->has_categorical_ = std::any_of(ft.cbegin(), ft.cend(), common::IsCatOp{});
}
if (!that.feature_weights.Empty()) {
this->feature_weights.Resize(that.feature_weights.Size());
this->feature_weights.SetDevice(that.feature_weights.Device());
@@ -947,38 +972,24 @@ DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread, const st
return new data::SimpleDMatrix(adapter, missing, nthread, data_split_mode);
}
template DMatrix* DMatrix::Create<data::DenseAdapter>(data::DenseAdapter* adapter, float missing,
std::int32_t nthread,
const std::string& cache_prefix,
DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::ArrayAdapter>(data::ArrayAdapter* adapter, float missing,
std::int32_t nthread,
const std::string& cache_prefix,
DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::CSRAdapter>(data::CSRAdapter* adapter, float missing,
std::int32_t nthread,
const std::string& cache_prefix,
DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::CSCAdapter>(data::CSCAdapter* adapter, float missing,
std::int32_t nthread,
const std::string& cache_prefix,
DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::DataTableAdapter>(data::DataTableAdapter* adapter,
float missing, std::int32_t nthread,
const std::string& cache_prefix,
DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::FileAdapter>(data::FileAdapter* adapter, float missing,
std::int32_t nthread,
const std::string& cache_prefix,
DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::CSRArrayAdapter>(data::CSRArrayAdapter* adapter,
float missing, std::int32_t nthread,
const std::string& cache_prefix,
DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::CSCArrayAdapter>(data::CSCArrayAdapter* adapter,
float missing, std::int32_t nthread,
const std::string& cache_prefix,
DataSplitMode data_split_mode);
// Instantiate the factory function for various adapters
// The macro keeps the long four-parameter Create signature in one place; each
// invocation below emits one explicit instantiation of DMatrix::Create.
#define INSTANTIATION_CREATE(_AdapterT) \
template DMatrix* DMatrix::Create<data::_AdapterT>( \
data::_AdapterT * adapter, float missing, std::int32_t nthread, \
const std::string& cache_prefix, DataSplitMode data_split_mode);
// One instantiation per supported adapter type.
INSTANTIATION_CREATE(DenseAdapter)
INSTANTIATION_CREATE(ArrayAdapter)
INSTANTIATION_CREATE(CSRAdapter)
INSTANTIATION_CREATE(CSCAdapter)
INSTANTIATION_CREATE(DataTableAdapter)
INSTANTIATION_CREATE(FileAdapter)
INSTANTIATION_CREATE(CSRArrayAdapter)
INSTANTIATION_CREATE(CSCArrayAdapter)
INSTANTIATION_CREATE(ColumnarAdapter)
#undef INSTANTIATION_CREATE
template DMatrix* DMatrix::Create(
data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>* adapter,
float missing, int nthread, const std::string& cache_prefix, DataSplitMode data_split_mode);
@@ -1156,7 +1167,6 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread
builder.InitStorage();
// Second pass over batch, placing elements in correct position
auto is_valid = data::IsValidFunctor{missing};
#pragma omp parallel num_threads(nthread)
{
@@ -1253,9 +1263,10 @@ template uint64_t SparsePage::Push(const data::CSCAdapterBatch& batch, float mis
template uint64_t SparsePage::Push(const data::DataTableAdapterBatch& batch, float missing,
int nthread);
template uint64_t SparsePage::Push(const data::FileAdapterBatch& batch, float missing, int nthread);
template uint64_t SparsePage::Push(const data::ColumnarAdapterBatch& batch, float missing,
std::int32_t nthread);
namespace data {
// List of files that will be force linked in static links.
DMLC_REGISTRY_LINK_TAG(sparse_page_raw_format);
DMLC_REGISTRY_LINK_TAG(gradient_index_format);

View File

@@ -120,7 +120,7 @@ void GHistIndexMatrix::PushAdapterBatchColumns(Context const *ctx, Batch const &
INSTANTIATION_PUSH(data::CSRArrayAdapterBatch)
INSTANTIATION_PUSH(data::ArrayAdapterBatch)
INSTANTIATION_PUSH(data::SparsePageAdapterBatch)
INSTANTIATION_PUSH(data::ColumnarAdapterBatch)
#undef INSTANTIATION_PUSH
void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) {

View File

@@ -93,7 +93,7 @@ class IterativeDMatrix : public DMatrix {
return nullptr;
}
BatchSet<SparsePage> GetRowBatches() override {
LOG(FATAL) << "Not implemented.";
LOG(FATAL) << "Not implemented for `QuantileDMatrix`.";
return BatchSet<SparsePage>(BatchIterator<SparsePage>(nullptr));
}
BatchSet<CSCPage> GetColumnBatches(Context const *) override {

View File

@@ -5,7 +5,22 @@
#include "proxy_dmatrix.h"
#include <memory> // for shared_ptr
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for DMatrix
#include "xgboost/logging.h"
#include "xgboost/string_view.h" // for StringView
namespace xgboost::data {
/**
 * @brief Set the proxy's data from a JSON list of `__array_interface__`
 *        columns, updating the cached shape and switching to a CPU context.
 */
void DMatrixProxy::SetColumnarData(StringView interface_str) {
  // make_shared: single allocation, no raw new.
  auto adapter = std::make_shared<ColumnarAdapter>(interface_str);
  this->batch_ = adapter;
  this->Info().num_col_ = adapter->NumColumns();
  this->Info().num_row_ = adapter->NumRows();
  this->ctx_.Init(Args{{"device", "cpu"}});
}
void DMatrixProxy::SetArrayData(StringView interface_str) {
std::shared_ptr<ArrayAdapter> adapter{new ArrayAdapter{interface_str}};
this->batch_ = adapter;

View File

@@ -62,6 +62,8 @@ class DMatrixProxy : public DMatrix {
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
}
void SetColumnarData(StringView interface_str);
void SetArrayData(StringView interface_str);
void SetCSRData(char const* c_indptr, char const* c_indices, char const* c_values,
bst_feature_t n_features, bool on_host);
@@ -151,6 +153,17 @@ decltype(auto) HostAdapterDispatch(DMatrixProxy const* proxy, Fn fn, bool* type_
if (type_error) {
*type_error = false;
}
} else if (proxy->Adapter().type() == typeid(std::shared_ptr<ColumnarAdapter>)) {
if constexpr (get_value) {
auto value = std::any_cast<std::shared_ptr<ColumnarAdapter>>(proxy->Adapter())->Value();
return fn(value);
} else {
auto value = std::any_cast<std::shared_ptr<ColumnarAdapter>>(proxy->Adapter());
return fn(value);
}
if (type_error) {
*type_error = false;
}
} else {
if (type_error) {
*type_error = true;

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2014~2023 by XGBoost Contributors
* Copyright 2014~2023, XGBoost Contributors
* \file simple_dmatrix.cc
* \brief the input data structure for gradient boosting
* \author Tianqi Chen
@@ -356,6 +356,8 @@ template SimpleDMatrix::SimpleDMatrix(DataTableAdapter* adapter, float missing,
DataSplitMode data_split_mode);
template SimpleDMatrix::SimpleDMatrix(FileAdapter* adapter, float missing, int nthread,
DataSplitMode data_split_mode);
template SimpleDMatrix::SimpleDMatrix(ColumnarAdapter* adapter, float missing, int nthread,
DataSplitMode data_split_mode);
template SimpleDMatrix::SimpleDMatrix(
IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>* adapter,
float missing, int nthread, DataSplitMode data_split_mode);