Cleanup set info. (#10139)

- Use the array interface internally.
- Deprecate `XGDMatrixSetDenseInfo`.
- Deprecate `XGDMatrixSetUIntInfo`.
- Move the handling of `DataType` into the deprecated C function.

---------

Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
This commit is contained in:
Jiaming Yuan
2024-03-26 23:26:24 +08:00
committed by GitHub
parent 6a7c6a8ae6
commit 230010d9a0
37 changed files with 246 additions and 268 deletions

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2014-2024 by XGBoost Contributors
* Copyright 2014-2024, XGBoost Contributors
*/
#include "xgboost/c_api.h"
@@ -614,8 +614,8 @@ XGB_DLL int XGDMatrixSetFloatInfo(DMatrixHandle handle, const char *field, const
API_BEGIN();
CHECK_HANDLE();
xgboost_CHECK_C_ARG_PTR(field);
auto const& p_fmat = *static_cast<std::shared_ptr<DMatrix> *>(handle);
p_fmat->SetInfo(field, info, xgboost::DataType::kFloat32, len);
auto const &p_fmat = *static_cast<std::shared_ptr<DMatrix> *>(handle);
p_fmat->SetInfo(field, linalg::Make1dInterface(info, len));
API_END();
}
@@ -634,8 +634,9 @@ XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle, const char *field, const
API_BEGIN();
CHECK_HANDLE();
xgboost_CHECK_C_ARG_PTR(field);
LOG(WARNING) << error::DeprecatedFunc(__func__, "2.1.0", "XGDMatrixSetInfoFromInterface");
auto const &p_fmat = *static_cast<std::shared_ptr<DMatrix> *>(handle);
p_fmat->SetInfo(field, info, xgboost::DataType::kUInt32, len);
p_fmat->SetInfo(field, linalg::Make1dInterface(info, len));
API_END();
}
@@ -679,19 +680,52 @@ XGB_DLL int XGDMatrixSetDenseInfo(DMatrixHandle handle, const char *field, void
xgboost::bst_ulong size, int type) {
API_BEGIN();
CHECK_HANDLE();
LOG(WARNING) << error::DeprecatedFunc(__func__, "2.1.0", "XGDMatrixSetInfoFromInterface");
auto const &p_fmat = *static_cast<std::shared_ptr<DMatrix> *>(handle);
CHECK(type >= 1 && type <= 4);
xgboost_CHECK_C_ARG_PTR(field);
p_fmat->SetInfo(field, data, static_cast<DataType>(type), size);
API_END();
}
XGB_DLL int XGDMatrixSetGroup(DMatrixHandle handle, const unsigned *group, xgboost::bst_ulong len) {
API_BEGIN();
CHECK_HANDLE();
LOG(WARNING) << "XGDMatrixSetGroup is deprecated, use `XGDMatrixSetUIntInfo` instead.";
auto const &p_fmat = *static_cast<std::shared_ptr<DMatrix> *>(handle);
p_fmat->SetInfo("group", group, xgboost::DataType::kUInt32, len);
Context ctx;
auto dtype = static_cast<DataType>(type);
std::string str;
auto proc = [&](auto cast_d_ptr) {
using T = std::remove_pointer_t<decltype(cast_d_ptr)>;
auto t = linalg::TensorView<T, 1>(
common::Span<T>{cast_d_ptr, static_cast<typename common::Span<T>::index_type>(size)},
{size}, DeviceOrd::CPU());
CHECK(t.CContiguous());
Json interface{linalg::ArrayInterface(t)};
CHECK(ArrayInterface<1>{interface}.is_contiguous);
str = Json::Dump(interface);
return str;
};
// Legacy code using XGBoost dtype, which is a small subset of array interface types.
switch (dtype) {
case xgboost::DataType::kFloat32: {
auto cast_ptr = reinterpret_cast<const float *>(data);
p_fmat->Info().SetInfo(ctx, field, proc(cast_ptr));
break;
}
case xgboost::DataType::kDouble: {
auto cast_ptr = reinterpret_cast<const double *>(data);
p_fmat->Info().SetInfo(ctx, field, proc(cast_ptr));
break;
}
case xgboost::DataType::kUInt32: {
auto cast_ptr = reinterpret_cast<const uint32_t *>(data);
p_fmat->Info().SetInfo(ctx, field, proc(cast_ptr));
break;
}
case xgboost::DataType::kUInt64: {
auto cast_ptr = reinterpret_cast<const uint64_t *>(data);
p_fmat->Info().SetInfo(ctx, field, proc(cast_ptr));
break;
}
default:
LOG(FATAL) << "Unknown data type" << static_cast<uint8_t>(dtype);
}
API_END();
}
@@ -987,7 +1021,7 @@ XGB_DLL int XGBoosterBoostOneIter(BoosterHandle handle, DMatrixHandle dtrain, bs
bst_float *hess, xgboost::bst_ulong len) {
API_BEGIN();
CHECK_HANDLE();
error::DeprecatedFunc(__func__, "2.1.0", "XGBoosterTrainOneIter");
LOG(WARNING) << error::DeprecatedFunc(__func__, "2.1.0", "XGBoosterTrainOneIter");
auto *learner = static_cast<Learner *>(handle);
auto ctx = learner->Ctx()->MakeCPU();

View File

@@ -1,17 +1,18 @@
/**
* Copyright 2021-2023, XGBoost Contributors
* Copyright 2021-2024, XGBoost Contributors
*/
#ifndef XGBOOST_C_API_C_API_UTILS_H_
#define XGBOOST_C_API_C_API_UTILS_H_
#include <algorithm>
#include <cstddef>
#include <functional>
#include <memory> // for shared_ptr
#include <string> // for string
#include <tuple> // for make_tuple
#include <utility> // for move
#include <vector>
#include <algorithm> // for min
#include <cstddef> // for size_t
#include <functional> // for multiplies
#include <memory> // for shared_ptr
#include <numeric> // for accumulate
#include <string> // for string
#include <tuple> // for make_tuple
#include <utility> // for move
#include <vector> // for vector
#include "../common/json_utils.h" // for TypeCheck
#include "xgboost/c_api.h"

View File

@@ -2,6 +2,8 @@
* Copyright 2023 XGBoost contributors
*/
#if defined(XGBOOST_USE_NCCL)
#include <numeric> // for accumulate
#include "comm.cuh"
#include "nccl_device_communicator.cuh"

View File

@@ -11,7 +11,7 @@
#include "xgboost/logging.h"
namespace xgboost::error {
std::string DeprecatedFunc(StringView old, StringView since, StringView replacement) {
[[nodiscard]] std::string DeprecatedFunc(StringView old, StringView since, StringView replacement) {
std::stringstream ss;
ss << "`" << old << "` is deprecated since" << since << ", use `" << replacement << "` instead.";
return ss.str();

View File

@@ -89,7 +89,7 @@ void WarnDeprecatedGPUId();
void WarnEmptyDataset();
std::string DeprecatedFunc(StringView old, StringView since, StringView replacement);
[[nodiscard]] std::string DeprecatedFunc(StringView old, StringView since, StringView replacement);
constexpr StringView InvalidCUDAOrdinal() {
return "Invalid device. `device` is required to be CUDA and there must be at least one GPU "

View File

@@ -6,7 +6,6 @@
#include <algorithm>
#include <cstdint>
#include <mutex>
#include "xgboost/data.h"
#include "xgboost/host_device_vector.h"

View File

@@ -4,6 +4,7 @@
#include "quantile.h"
#include <limits>
#include <numeric> // for partial_sum
#include <utility>
#include "../collective/aggregator.h"

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020-2023 by XGBoost Contributors
* Copyright 2020-2024, XGBoost Contributors
*/
#include <thrust/binary_search.h>
#include <thrust/execution_policy.h>
@@ -8,8 +8,8 @@
#include <thrust/transform_scan.h>
#include <thrust/unique.h>
#include <limits> // std::numeric_limits
#include <memory>
#include <limits> // std::numeric_limits
#include <numeric> // for partial_sum
#include <utility>
#include "../collective/communicator-inl.cuh"

View File

@@ -1,8 +1,9 @@
/**
* Copyright 2020-2024, XGBoost Contributors
*/
#ifndef XGBOOST_COMMON_QUANTILE_CUH_
#define XGBOOST_COMMON_QUANTILE_CUH_
#include <memory>
#include "xgboost/span.h"
#include "xgboost/data.h"
#include "device_helpers.cuh"

View File

@@ -11,7 +11,6 @@
#include <cmath> // for abs
#include <cstdint> // for uint64_t, int32_t, uint8_t, uint32_t
#include <cstring> // for size_t, strcmp, memcpy
#include <exception> // for exception
#include <iostream> // for operator<<, basic_ostream, basic_ostream::op...
#include <map> // for map, operator!=
#include <numeric> // for accumulate, partial_sum
@@ -22,7 +21,6 @@
#include "../collective/communicator.h" // for Operation
#include "../common/algorithm.h" // for StableSort
#include "../common/api_entry.h" // for XGBAPIThreadLocalEntry
#include "../common/common.h" // for Split
#include "../common/error_msg.h" // for GroupSize, GroupWeight, InfInData
#include "../common/group_data.h" // for ParallelGroupBuilder
#include "../common/io.h" // for PeekableInStream
@@ -473,11 +471,11 @@ void MetaInfo::SetInfo(Context const& ctx, StringView key, StringView interface_
<< ", must have at least 1 column even if it's empty.";
auto const& first = get<Object const>(array.front());
auto ptr = ArrayInterfaceHandler::GetPtrFromArrayData<void*>(first);
is_cuda = ArrayInterfaceHandler::IsCudaPtr(ptr);
is_cuda = first.find("stream") != first.cend() || ArrayInterfaceHandler::IsCudaPtr(ptr);
} else {
auto const& first = get<Object const>(j_interface);
auto ptr = ArrayInterfaceHandler::GetPtrFromArrayData<void*>(first);
is_cuda = ArrayInterfaceHandler::IsCudaPtr(ptr);
is_cuda = first.find("stream") != first.cend() || ArrayInterfaceHandler::IsCudaPtr(ptr);
}
if (is_cuda) {
@@ -567,46 +565,6 @@ void MetaInfo::SetInfoFromHost(Context const& ctx, StringView key, Json arr) {
}
}
/**
 * @brief Set a meta info field from a raw host buffer typed by the legacy DataType enum.
 *
 * The buffer is viewed as a 1-dimensional CPU tensor, described as an array-interface
 * JSON object, and forwarded to SetInfoFromHost.
 *
 * @param ctx   Context forwarded to SetInfoFromHost.
 * @param key   Name of the field to set; must not be null.
 * @param dptr  Pointer to the first element of the buffer.
 * @param dtype Element type used to interpret `dptr`.
 * @param num   Number of elements in the buffer.
 */
void MetaInfo::SetInfo(Context const& ctx, const char* key, const void* dptr, DataType dtype,
                       size_t num) {
  CHECK(key);
  // Wrap the typed pointer in a tensor view, turn it into an array interface and set the field.
  auto set_from = [&](auto cast_d_ptr) {
    using T = std::remove_pointer_t<decltype(cast_d_ptr)>;
    auto t = linalg::TensorView<T, 1>(common::Span<T>{cast_d_ptr, num}, {num}, DeviceOrd::CPU());
    CHECK(t.CContiguous());
    Json interface{linalg::ArrayInterface(t)};
    assert(ArrayInterface<1>{interface}.is_contiguous);
    this->SetInfoFromHost(ctx, key, interface);
  };
  // Legacy code using XGBoost dtype, which is a small subset of array interface types.
  switch (dtype) {
    case xgboost::DataType::kFloat32:
      set_from(reinterpret_cast<const float*>(dptr));
      break;
    case xgboost::DataType::kDouble:
      set_from(reinterpret_cast<const double*>(dptr));
      break;
    case xgboost::DataType::kUInt32:
      set_from(reinterpret_cast<const uint32_t*>(dptr));
      break;
    case xgboost::DataType::kUInt64:
      set_from(reinterpret_cast<const uint64_t*>(dptr));
      break;
    default:
      // Stream the code as an int: a uint8_t would be written as a single character.
      LOG(FATAL) << "Unknown data type: " << static_cast<int>(dtype);
  }
}
void MetaInfo::GetInfo(char const* key, bst_ulong* out_len, DataType dtype,
const void** out_dptr) const {
if (dtype == DataType::kFloat32) {

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2021-2023, XGBoost contributors
* Copyright 2021-2024, XGBoost contributors
*/
#include "file_iterator.h"
@@ -10,7 +10,10 @@
#include <ostream> // for operator<<, basic_ostream, istringstream
#include <vector> // for vector
#include "../common/common.h" // for Split
#include "../common/common.h" // for Split
#include "xgboost/linalg.h" // for ArrayInterfaceStr, MakeVec
#include "xgboost/linalg.h"
#include "xgboost/logging.h" // for CHECK
#include "xgboost/string_view.h" // for operator<<, StringView
namespace xgboost::data {
@@ -28,10 +31,10 @@ std::string ValidateFileFormat(std::string const& uri) {
for (size_t i = 0; i < arg_list.size(); ++i) {
std::istringstream is(arg_list[i]);
std::pair<std::string, std::string> kv;
CHECK(std::getline(is, kv.first, '=')) << "Invalid uri argument format"
<< " for key in arg " << i + 1;
CHECK(std::getline(is, kv.second)) << "Invalid uri argument format"
<< " for value in arg " << i + 1;
CHECK(std::getline(is, kv.first, '='))
<< "Invalid uri argument format" << " for key in arg " << i + 1;
CHECK(std::getline(is, kv.second))
<< "Invalid uri argument format" << " for value in arg " << i + 1;
args.insert(kv);
}
if (args.find("format") == args.cend()) {
@@ -48,4 +51,41 @@ std::string ValidateFileFormat(std::string const& uri) {
return name_args[0] + "?" + name_args[1] + '#' + name_args_cache[1];
}
}
/**
 * @brief Advance the parser by one block and pass that block to the proxy DMatrix.
 *
 * The offset, index and value arrays of the new block are described as array-interface
 * strings (stored in `indptr_`, `indices_` and `values_`) and handed to the proxy as
 * CSR data. Label, qid and weight are set afterwards for each one the block provides.
 *
 * @return Non-zero when a block was loaded, zero when the parser has no more blocks.
 */
int FileIterator::Next() {
  CHECK(parser_);
  if (!parser_->Next()) {
    // Stop iteration
    return false;
  }

  row_block_ = parser_->Value();
  // Length shared by the index and value arrays of this block.
  auto const n_entries = row_block_.offset[row_block_.size];

  indptr_ = linalg::Make1dInterface(row_block_.offset, row_block_.size + 1);
  values_ = linalg::Make1dInterface(row_block_.value, n_entries);
  indices_ = linalg::Make1dInterface(row_block_.index, n_entries);

  // std::max_element returns the end iterator for an empty range, which must not be
  // dereferenced, so a block without entries reports zero columns.
  size_t n_columns = 0;
  if (n_entries != 0) {
    n_columns = *std::max_element(row_block_.index, row_block_.index + n_entries);
    // dmlc parser converts 1-based indexing back to 0-based indexing so we can ignore
    // this condition and just add 1 to n_columns
    n_columns += 1;
  }
  XGProxyDMatrixSetDataCSR(proxy_, indptr_.c_str(), indices_.c_str(), values_.c_str(), n_columns);

  if (row_block_.label) {
    auto str = linalg::Make1dInterface(row_block_.label, row_block_.size);
    XGDMatrixSetInfoFromInterface(proxy_, "label", str.c_str());
  }
  if (row_block_.qid) {
    auto str = linalg::Make1dInterface(row_block_.qid, row_block_.size);
    XGDMatrixSetInfoFromInterface(proxy_, "qid", str.c_str());
  }
  if (row_block_.weight) {
    auto str = linalg::Make1dInterface(row_block_.weight, row_block_.size);
    XGDMatrixSetInfoFromInterface(proxy_, "weight", str.c_str());
  }
  // Continue iteration
  return true;
}
} // namespace xgboost::data

View File

@@ -1,20 +1,16 @@
/**
* Copyright 2021-2023, XGBoost contributors
* Copyright 2021-2024, XGBoost contributors
*/
#ifndef XGBOOST_DATA_FILE_ITERATOR_H_
#define XGBOOST_DATA_FILE_ITERATOR_H_
#include <algorithm> // for max_element
#include <cstddef> // for size_t
#include <cstdint> // for uint32_t
#include <memory> // for unique_ptr
#include <string> // for string
#include <utility> // for move
#include "dmlc/data.h" // for RowBlock, Parser
#include "xgboost/c_api.h" // for XGDMatrixSetDenseInfo, XGDMatrixFree, XGProxyDMatrixCreate
#include "xgboost/linalg.h" // for ArrayInterfaceStr, MakeVec
#include "xgboost/logging.h" // for CHECK
#include "xgboost/c_api.h" // for XGDMatrixFree, XGProxyDMatrixCreate
namespace xgboost::data {
[[nodiscard]] std::string ValidateFileFormat(std::string const& uri);
@@ -53,41 +49,7 @@ class FileIterator {
XGDMatrixFree(proxy_);
}
/**
 * @brief Advance the parser by one block and pass that block to the proxy DMatrix.
 *
 * The offset, index and value arrays of the new block are described as array-interface
 * strings (stored in `indptr_`, `indices_` and `values_`) and handed to the proxy as
 * CSR data. Label, qid and weight are set afterwards for each one the block provides.
 *
 * @return Non-zero when a block was loaded, zero when the parser has no more blocks.
 */
int Next() {
  CHECK(parser_);
  if (!parser_->Next()) {
    // Stop iteration
    return false;
  }

  row_block_ = parser_->Value();
  using linalg::MakeVec;
  // Length shared by the index and value arrays of this block.
  auto const n_entries = row_block_.offset[row_block_.size];

  indptr_ = ArrayInterfaceStr(MakeVec(row_block_.offset, row_block_.size + 1));
  values_ = ArrayInterfaceStr(MakeVec(row_block_.value, n_entries));
  indices_ = ArrayInterfaceStr(MakeVec(row_block_.index, n_entries));

  // std::max_element returns the end iterator for an empty range, which must not be
  // dereferenced, so a block without entries reports zero columns.
  size_t n_columns = 0;
  if (n_entries != 0) {
    n_columns = *std::max_element(row_block_.index, row_block_.index + n_entries);
    // dmlc parser converts 1-based indexing back to 0-based indexing so we can ignore
    // this condition and just add 1 to n_columns
    n_columns += 1;
  }
  XGProxyDMatrixSetDataCSR(proxy_, indptr_.c_str(), indices_.c_str(),
                           values_.c_str(), n_columns);

  // NOTE(review): all three fields are passed with dtype code 1; confirm that this code
  // matches the element type of each array, in particular `row_block_.qid`.
  if (row_block_.label) {
    XGDMatrixSetDenseInfo(proxy_, "label", row_block_.label, row_block_.size, 1);
  }
  if (row_block_.qid) {
    XGDMatrixSetDenseInfo(proxy_, "qid", row_block_.qid, row_block_.size, 1);
  }
  if (row_block_.weight) {
    XGDMatrixSetDenseInfo(proxy_, "weight", row_block_.weight, row_block_.size, 1);
  }
  // Continue iteration
  return true;
}
int Next();
auto Proxy() -> decltype(proxy_) { return proxy_; }

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2014-2023 by Contributors
* Copyright 2014-2024, XGBoost Contributors
* \file gbtree.cc
* \brief gradient boosted tree implementation.
* \author Tianqi Chen
@@ -11,14 +11,12 @@
#include <algorithm>
#include <cstdint> // std::int32_t
#include <map>
#include <memory>
#include <numeric> // for iota
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "../common/common.h"
#include "../common/timer.h"
#include "../tree/param.h" // TrainParam
#include "gbtree_model.h"

View File

@@ -10,15 +10,15 @@
#include <array>
#include <cmath>
#include <numeric> // for accumulate
#include "../collective/communicator-inl.h"
#include "../common/common.h" // MetricNoCache
#include "../common/common.h" // for AssertGPUSupport
#include "../common/math.h"
#include "../common/optional_weight.h" // OptionalWeights
#include "../common/pseudo_huber.h"
#include "../common/quantile_loss_utils.h" // QuantileLossParam
#include "../common/threading_utils.h"
#include "metric_common.h"
#include "metric_common.h" // MetricNoCache
#include "xgboost/collective/result.h" // for SafeColl
#include "xgboost/metric.h"

View File

@@ -9,8 +9,6 @@
#include <string>
#include "../collective/aggregator.h"
#include "../collective/communicator-inl.h"
#include "../common/common.h"
#include "xgboost/metric.h"
namespace xgboost {

View File

@@ -9,8 +9,8 @@
#include <array>
#include <atomic>
#include <cmath>
#include <numeric> // for accumulate
#include "../collective/communicator-inl.h"
#include "../common/math.h"
#include "../common/threading_utils.h"
#include "metric_common.h" // MetricNoCache

View File

@@ -9,10 +9,9 @@
#include <array>
#include <memory>
#include <numeric> // for accumulate
#include <vector>
#include "../collective/communicator-inl.h"
#include "../common/math.h"
#include "../common/survival_util.h"
#include "../common/threading_utils.h"
#include "metric_common.h" // MetricNoCache