Cleanup set info. (#10139)

- Use the array interface internally.
- Deprecate `XGDMatrixSetDenseInfo`.
- Deprecate `XGDMatrixSetUIntInfo`.
- Move the handling of `DataType` into the deprecated C function.

---------

Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
This commit is contained in:
Jiaming Yuan 2024-03-26 23:26:24 +08:00 committed by GitHub
parent 6a7c6a8ae6
commit 230010d9a0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
37 changed files with 246 additions and 268 deletions

View File

@ -110,7 +110,7 @@ jobs:
name: Test R package on Debian
runs-on: ubuntu-latest
container:
image: rhub/debian-gcc-devel
image: rhub/debian-gcc-release
steps:
- name: Install system dependencies
@ -130,12 +130,12 @@ jobs:
- name: Install dependencies
shell: bash -l {0}
run: |
/tmp/R-devel/bin/Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')"
Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')"
- name: Test R
shell: bash -l {0}
run: |
python3 tests/ci_build/test_r_package.py --r=/tmp/R-devel/bin/R --build-tool=autotools --task=check
python3 tests/ci_build/test_r_package.py --r=/usr/bin/R --build-tool=autotools --task=check
- uses: dorny/paths-filter@v2
id: changes
@ -147,4 +147,4 @@ jobs:
- name: Run document check
if: steps.changes.outputs.r_package == 'true'
run: |
python3 tests/ci_build/test_r_package.py --r=/tmp/R-devel/bin/R --task=doc
python3 tests/ci_build/test_r_package.py --r=/usr/bin/R --task=doc

View File

@ -1,5 +1,5 @@
/**
* Copyright 2015~2023 by XGBoost Contributors
* Copyright 2015-2024, XGBoost Contributors
* \file c_api.h
* \author Tianqi Chen
* \brief C API of XGBoost, used for interfacing to other languages.
@ -639,21 +639,14 @@ XGB_DLL int XGDMatrixSetInfoFromInterface(DMatrixHandle handle,
* \param len length of array
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGDMatrixSetFloatInfo(DMatrixHandle handle,
const char *field,
const float *array,
XGB_DLL int XGDMatrixSetFloatInfo(DMatrixHandle handle, const char *field, const float *array,
bst_ulong len);
/*!
* \brief set uint32 vector to a content in info
* \param handle a instance of data matrix
* \param field field name
* \param array pointer to unsigned int vector
* \param len length of array
* \return 0 when success, -1 when failure happens
/**
* @deprecated since 2.1.0
*
* Use @ref XGDMatrixSetInfoFromInterface instead.
*/
XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle,
const char *field,
const unsigned *array,
XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle, const char *field, const unsigned *array,
bst_ulong len);
/*!
@ -725,42 +718,13 @@ XGB_DLL int XGDMatrixGetStrFeatureInfo(DMatrixHandle handle, const char *field,
bst_ulong *size,
const char ***out_features);
/*!
* \brief Set meta info from dense matrix. Valid field names are:
/**
* @deprecated since 2.1.0
*
* - label
* - weight
* - base_margin
* - group
* - label_lower_bound
* - label_upper_bound
* - feature_weights
*
* \param handle An instance of data matrix
* \param field Field name
* \param data Pointer to consecutive memory storing data.
* \param size Size of the data, this is relative to size of type. (Meaning NOT number
* of bytes.)
* \param type Indicator of data type. This is defined in xgboost::DataType enum class.
* - float = 1
* - double = 2
* - uint32_t = 3
* - uint64_t = 4
* \return 0 when success, -1 when failure happens
* Use @ref XGDMatrixSetInfoFromInterface instead.
*/
XGB_DLL int XGDMatrixSetDenseInfo(DMatrixHandle handle, const char *field,
void const *data, bst_ulong size, int type);
/*!
* \brief (deprecated) Use XGDMatrixSetUIntInfo instead. Set group of the training matrix
* \param handle a instance of data matrix
* \param group pointer to group size
* \param len length of array
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGDMatrixSetGroup(DMatrixHandle handle,
const unsigned *group,
bst_ulong len);
XGB_DLL int XGDMatrixSetDenseInfo(DMatrixHandle handle, const char *field, void const *data,
bst_ulong size, int type);
/*!
* \brief get float info vector from matrix.

View File

@ -19,7 +19,6 @@
#include <algorithm>
#include <limits>
#include <memory>
#include <numeric>
#include <string>
#include <utility>
#include <vector>
@ -137,14 +136,6 @@ class MetaInfo {
* \param fo The output stream.
*/
void SaveBinary(dmlc::Stream* fo) const;
/*!
* \brief Set information in the meta info.
* \param key The key of the information.
* \param dptr The data pointer of the source array.
* \param dtype The type of the source data.
* \param num Number of elements in the source array.
*/
void SetInfo(Context const& ctx, const char* key, const void* dptr, DataType dtype, size_t num);
/*!
* \brief Set information in the meta info with array interface.
* \param key The key of the information.
@ -517,10 +508,6 @@ class DMatrix {
DMatrix() = default;
/*! \brief meta information of the dataset */
virtual MetaInfo& Info() = 0;
virtual void SetInfo(const char* key, const void* dptr, DataType dtype, size_t num) {
auto const& ctx = *this->Ctx();
this->Info().SetInfo(ctx, key, dptr, dtype, num);
}
virtual void SetInfo(const char* key, std::string const& interface_str) {
auto const& ctx = *this->Ctx();
this->Info().SetInfo(ctx, key, StringView{interface_str});

View File

@ -190,13 +190,14 @@ constexpr auto ArrToTuple(T (&arr)[N]) {
// uint division optimization inspired by the CIndexer in cupy. Division operation is
// slow on both CPU and GPU, especially 64 bit integer. So here we first try to avoid 64
// bit when the index is smaller, then try to avoid division when it's exp of 2.
template <typename I, int32_t D>
template <typename I, std::int32_t D>
LINALG_HD auto UnravelImpl(I idx, common::Span<size_t const, D> shape) {
size_t index[D]{0};
std::size_t index[D]{0};
static_assert(std::is_signed<decltype(D)>::value,
"Don't change the type without changing the for loop.");
auto const sptr = shape.data();
for (int32_t dim = D; --dim > 0;) {
auto s = static_cast<std::remove_const_t<std::remove_reference_t<I>>>(shape[dim]);
auto s = static_cast<std::remove_const_t<std::remove_reference_t<I>>>(sptr[dim]);
if (s & (s - 1)) {
auto t = idx / s;
index[dim] = idx - t * s;
@ -745,6 +746,14 @@ auto ArrayInterfaceStr(TensorView<T, D> const &t) {
return str;
}
/**
 * @brief Render a 1-dimensional array-interface JSON document for a contiguous host buffer.
 *
 * @tparam T   Element type of the buffer.
 * @param vec  Pointer to the first element.
 * @param len  Number of elements in the buffer.
 *
 * @return The array-interface JSON document as a string.
 */
template <typename T>
auto Make1dInterface(T const *vec, std::size_t len) {
  // A default (CPU) context suffices: the view only describes host memory.
  Context ctx;
  return linalg::ArrayInterfaceStr(linalg::MakeTensorView(&ctx, common::Span{vec, len}, len));
}
/**
* \brief A tensor storage. To use it for other functionality like slicing one needs to
* obtain a view first. This way we can use it on both host and device.

View File

@ -30,9 +30,8 @@
#define XGBOOST_SPAN_H_
#include <xgboost/base.h>
#include <xgboost/logging.h>
#include <cinttypes> // size_t
#include <cstddef> // size_t
#include <cstdio>
#include <iterator>
#include <limits> // numeric_limits
@ -73,8 +72,7 @@
#endif // defined(_MSC_VER) && _MSC_VER < 1910
namespace xgboost {
namespace common {
namespace xgboost::common {
#if defined(__CUDA_ARCH__)
// Usual logging facility is not available inside device code.
@ -707,8 +705,8 @@ class IterSpan {
return it_ + size();
}
};
} // namespace common
} // namespace xgboost
} // namespace xgboost::common
#if defined(_MSC_VER) &&_MSC_VER < 1910
#undef constexpr

View File

@ -408,7 +408,8 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixSetFloatI
jfloat* array = jenv->GetFloatArrayElements(jarray, NULL);
bst_ulong len = (bst_ulong)jenv->GetArrayLength(jarray);
int ret = XGDMatrixSetFloatInfo(handle, field, (float const *)array, len);
auto str = xgboost::linalg::Make1dInterface(array, len);
int ret = XGDMatrixSetInfoFromInterface(handle, field, str.c_str());
JVM_CHECK_CALL(ret);
//release
if (field) jenv->ReleaseStringUTFChars(jfield, field);
@ -427,7 +428,8 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixSetUIntIn
const char* field = jenv->GetStringUTFChars(jfield, 0);
jint* array = jenv->GetIntArrayElements(jarray, NULL);
bst_ulong len = (bst_ulong)jenv->GetArrayLength(jarray);
int ret = XGDMatrixSetUIntInfo(handle, (char const *)field, (unsigned int const *)array, len);
auto str = xgboost::linalg::Make1dInterface(array, len);
int ret = XGDMatrixSetInfoFromInterface(handle, field, str.c_str());
JVM_CHECK_CALL(ret);
//release
if (field) jenv->ReleaseStringUTFChars(jfield, (const char *)field);
@ -730,8 +732,8 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterPredictFr
if (jmargin) {
margin = jenv->GetFloatArrayElements(jmargin, nullptr);
JVM_CHECK_CALL(XGProxyDMatrixCreate(&proxy));
JVM_CHECK_CALL(
XGDMatrixSetFloatInfo(proxy, "base_margin", margin, jenv->GetArrayLength(jmargin)));
auto str = xgboost::linalg::Make1dInterface(margin, jenv->GetArrayLength(jmargin));
JVM_CHECK_CALL(XGDMatrixSetInfoFromInterface(proxy, "base_margin", str.c_str()));
}
bst_ulong const *out_shape;

View File

@ -1,5 +1,5 @@
/**
* Copyright 2014-2024 by XGBoost Contributors
* Copyright 2014-2024, XGBoost Contributors
*/
#include "xgboost/c_api.h"
@ -614,8 +614,8 @@ XGB_DLL int XGDMatrixSetFloatInfo(DMatrixHandle handle, const char *field, const
API_BEGIN();
CHECK_HANDLE();
xgboost_CHECK_C_ARG_PTR(field);
auto const& p_fmat = *static_cast<std::shared_ptr<DMatrix> *>(handle);
p_fmat->SetInfo(field, info, xgboost::DataType::kFloat32, len);
auto const &p_fmat = *static_cast<std::shared_ptr<DMatrix> *>(handle);
p_fmat->SetInfo(field, linalg::Make1dInterface(info, len));
API_END();
}
@ -634,8 +634,9 @@ XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle, const char *field, const
API_BEGIN();
CHECK_HANDLE();
xgboost_CHECK_C_ARG_PTR(field);
LOG(WARNING) << error::DeprecatedFunc(__func__, "2.1.0", "XGDMatrixSetInfoFromInterface");
auto const &p_fmat = *static_cast<std::shared_ptr<DMatrix> *>(handle);
p_fmat->SetInfo(field, info, xgboost::DataType::kUInt32, len);
p_fmat->SetInfo(field, linalg::Make1dInterface(info, len));
API_END();
}
@ -679,19 +680,52 @@ XGB_DLL int XGDMatrixSetDenseInfo(DMatrixHandle handle, const char *field, void
xgboost::bst_ulong size, int type) {
API_BEGIN();
CHECK_HANDLE();
LOG(WARNING) << error::DeprecatedFunc(__func__, "2.1.0", "XGDMatrixSetInfoFromInterface");
auto const &p_fmat = *static_cast<std::shared_ptr<DMatrix> *>(handle);
CHECK(type >= 1 && type <= 4);
xgboost_CHECK_C_ARG_PTR(field);
p_fmat->SetInfo(field, data, static_cast<DataType>(type), size);
API_END();
}
XGB_DLL int XGDMatrixSetGroup(DMatrixHandle handle, const unsigned *group, xgboost::bst_ulong len) {
API_BEGIN();
CHECK_HANDLE();
LOG(WARNING) << "XGDMatrixSetGroup is deprecated, use `XGDMatrixSetUIntInfo` instead.";
auto const &p_fmat = *static_cast<std::shared_ptr<DMatrix> *>(handle);
p_fmat->SetInfo("group", group, xgboost::DataType::kUInt32, len);
Context ctx;
auto dtype = static_cast<DataType>(type);
std::string str;
auto proc = [&](auto cast_d_ptr) {
using T = std::remove_pointer_t<decltype(cast_d_ptr)>;
auto t = linalg::TensorView<T, 1>(
common::Span<T>{cast_d_ptr, static_cast<typename common::Span<T>::index_type>(size)},
{size}, DeviceOrd::CPU());
CHECK(t.CContiguous());
Json interface{linalg::ArrayInterface(t)};
CHECK(ArrayInterface<1>{interface}.is_contiguous);
str = Json::Dump(interface);
return str;
};
// Legacy code using XGBoost dtype, which is a small subset of array interface types.
switch (dtype) {
case xgboost::DataType::kFloat32: {
auto cast_ptr = reinterpret_cast<const float *>(data);
p_fmat->Info().SetInfo(ctx, field, proc(cast_ptr));
break;
}
case xgboost::DataType::kDouble: {
auto cast_ptr = reinterpret_cast<const double *>(data);
p_fmat->Info().SetInfo(ctx, field, proc(cast_ptr));
break;
}
case xgboost::DataType::kUInt32: {
auto cast_ptr = reinterpret_cast<const uint32_t *>(data);
p_fmat->Info().SetInfo(ctx, field, proc(cast_ptr));
break;
}
case xgboost::DataType::kUInt64: {
auto cast_ptr = reinterpret_cast<const uint64_t *>(data);
p_fmat->Info().SetInfo(ctx, field, proc(cast_ptr));
break;
}
default:
LOG(FATAL) << "Unknown data type" << static_cast<uint8_t>(dtype);
}
API_END();
}
@ -987,7 +1021,7 @@ XGB_DLL int XGBoosterBoostOneIter(BoosterHandle handle, DMatrixHandle dtrain, bs
bst_float *hess, xgboost::bst_ulong len) {
API_BEGIN();
CHECK_HANDLE();
error::DeprecatedFunc(__func__, "2.1.0", "XGBoosterTrainOneIter");
LOG(WARNING) << error::DeprecatedFunc(__func__, "2.1.0", "XGBoosterTrainOneIter");
auto *learner = static_cast<Learner *>(handle);
auto ctx = learner->Ctx()->MakeCPU();

View File

@ -1,17 +1,18 @@
/**
* Copyright 2021-2023, XGBoost Contributors
* Copyright 2021-2024, XGBoost Contributors
*/
#ifndef XGBOOST_C_API_C_API_UTILS_H_
#define XGBOOST_C_API_C_API_UTILS_H_
#include <algorithm>
#include <cstddef>
#include <functional>
#include <memory> // for shared_ptr
#include <string> // for string
#include <tuple> // for make_tuple
#include <utility> // for move
#include <vector>
#include <algorithm> // for min
#include <cstddef> // for size_t
#include <functional> // for multiplies
#include <memory> // for shared_ptr
#include <numeric> // for accumulate
#include <string> // for string
#include <tuple> // for make_tuple
#include <utility> // for move
#include <vector> // for vector
#include "../common/json_utils.h" // for TypeCheck
#include "xgboost/c_api.h"

View File

@ -2,6 +2,8 @@
* Copyright 2023 XGBoost contributors
*/
#if defined(XGBOOST_USE_NCCL)
#include <numeric> // for accumulate
#include "comm.cuh"
#include "nccl_device_communicator.cuh"

View File

@ -11,7 +11,7 @@
#include "xgboost/logging.h"
namespace xgboost::error {
std::string DeprecatedFunc(StringView old, StringView since, StringView replacement) {
[[nodiscard]] std::string DeprecatedFunc(StringView old, StringView since, StringView replacement) {
std::stringstream ss;
ss << "`" << old << "` is deprecated since" << since << ", use `" << replacement << "` instead.";
return ss.str();

View File

@ -89,7 +89,7 @@ void WarnDeprecatedGPUId();
void WarnEmptyDataset();
std::string DeprecatedFunc(StringView old, StringView since, StringView replacement);
[[nodiscard]] std::string DeprecatedFunc(StringView old, StringView since, StringView replacement);
constexpr StringView InvalidCUDAOrdinal() {
return "Invalid device. `device` is required to be CUDA and there must be at least one GPU "

View File

@ -6,7 +6,6 @@
#include <algorithm>
#include <cstdint>
#include <mutex>
#include "xgboost/data.h"
#include "xgboost/host_device_vector.h"

View File

@ -4,6 +4,7 @@
#include "quantile.h"
#include <limits>
#include <numeric> // for partial_sum
#include <utility>
#include "../collective/aggregator.h"

View File

@ -1,5 +1,5 @@
/**
* Copyright 2020-2023 by XGBoost Contributors
* Copyright 2020-2024, XGBoost Contributors
*/
#include <thrust/binary_search.h>
#include <thrust/execution_policy.h>
@ -8,8 +8,8 @@
#include <thrust/transform_scan.h>
#include <thrust/unique.h>
#include <limits> // std::numeric_limits
#include <memory>
#include <limits> // std::numeric_limits
#include <numeric> // for partial_sum
#include <utility>
#include "../collective/communicator-inl.cuh"

View File

@ -1,8 +1,9 @@
/**
* Copyright 2020-2024, XGBoost Contributors
*/
#ifndef XGBOOST_COMMON_QUANTILE_CUH_
#define XGBOOST_COMMON_QUANTILE_CUH_
#include <memory>
#include "xgboost/span.h"
#include "xgboost/data.h"
#include "device_helpers.cuh"

View File

@ -11,7 +11,6 @@
#include <cmath> // for abs
#include <cstdint> // for uint64_t, int32_t, uint8_t, uint32_t
#include <cstring> // for size_t, strcmp, memcpy
#include <exception> // for exception
#include <iostream> // for operator<<, basic_ostream, basic_ostream::op...
#include <map> // for map, operator!=
#include <numeric> // for accumulate, partial_sum
@ -22,7 +21,6 @@
#include "../collective/communicator.h" // for Operation
#include "../common/algorithm.h" // for StableSort
#include "../common/api_entry.h" // for XGBAPIThreadLocalEntry
#include "../common/common.h" // for Split
#include "../common/error_msg.h" // for GroupSize, GroupWeight, InfInData
#include "../common/group_data.h" // for ParallelGroupBuilder
#include "../common/io.h" // for PeekableInStream
@ -473,11 +471,11 @@ void MetaInfo::SetInfo(Context const& ctx, StringView key, StringView interface_
<< ", must have at least 1 column even if it's empty.";
auto const& first = get<Object const>(array.front());
auto ptr = ArrayInterfaceHandler::GetPtrFromArrayData<void*>(first);
is_cuda = ArrayInterfaceHandler::IsCudaPtr(ptr);
is_cuda = first.find("stream") != first.cend() || ArrayInterfaceHandler::IsCudaPtr(ptr);
} else {
auto const& first = get<Object const>(j_interface);
auto ptr = ArrayInterfaceHandler::GetPtrFromArrayData<void*>(first);
is_cuda = ArrayInterfaceHandler::IsCudaPtr(ptr);
is_cuda = first.find("stream") != first.cend() || ArrayInterfaceHandler::IsCudaPtr(ptr);
}
if (is_cuda) {
@ -567,46 +565,6 @@ void MetaInfo::SetInfoFromHost(Context const& ctx, StringView key, Json arr) {
}
}
/**
 * Legacy meta-info setter: wraps a raw host buffer, typed by the XGBoost
 * `DataType` enum, into a 1-d array-interface JSON document and forwards it
 * to `SetInfoFromHost`. Superseded by the array-interface overload.
 */
void MetaInfo::SetInfo(Context const& ctx, const char* key, const void* dptr, DataType dtype,
size_t num) {
CHECK(key);
// Build a contiguous 1-d tensor view over the caller's buffer and render its
// array-interface JSON; `cast_d_ptr` only carries the element type T.
auto proc = [&](auto cast_d_ptr) {
using T = std::remove_pointer_t<decltype(cast_d_ptr)>;
auto t = linalg::TensorView<T, 1>(common::Span<T>{cast_d_ptr, num}, {num}, DeviceOrd::CPU());
CHECK(t.CContiguous());
Json interface {
linalg::ArrayInterface(t)
};
// Debug-only sanity check that the generated interface is marked contiguous.
assert(ArrayInterface<1>{interface}.is_contiguous);
return interface;
};
// Legacy code using XGBoost dtype, which is a small subset of array interface types.
switch (dtype) {
case xgboost::DataType::kFloat32: {
auto cast_ptr = reinterpret_cast<const float*>(dptr);
this->SetInfoFromHost(ctx, key, proc(cast_ptr));
break;
}
case xgboost::DataType::kDouble: {
auto cast_ptr = reinterpret_cast<const double*>(dptr);
this->SetInfoFromHost(ctx, key, proc(cast_ptr));
break;
}
case xgboost::DataType::kUInt32: {
auto cast_ptr = reinterpret_cast<const uint32_t*>(dptr);
this->SetInfoFromHost(ctx, key, proc(cast_ptr));
break;
}
case xgboost::DataType::kUInt64: {
auto cast_ptr = reinterpret_cast<const uint64_t*>(dptr);
this->SetInfoFromHost(ctx, key, proc(cast_ptr));
break;
}
default:
// Any value outside the four supported dtypes aborts.
LOG(FATAL) << "Unknown data type" << static_cast<uint8_t>(dtype);
}
}
void MetaInfo::GetInfo(char const* key, bst_ulong* out_len, DataType dtype,
const void** out_dptr) const {
if (dtype == DataType::kFloat32) {

View File

@ -1,5 +1,5 @@
/**
* Copyright 2021-2023, XGBoost contributors
* Copyright 2021-2024, XGBoost contributors
*/
#include "file_iterator.h"
@ -10,7 +10,10 @@
#include <ostream> // for operator<<, basic_ostream, istringstream
#include <vector> // for vector
#include "../common/common.h" // for Split
#include "../common/common.h" // for Split
#include "xgboost/linalg.h" // for ArrayInterfaceStr, MakeVec
#include "xgboost/linalg.h"
#include "xgboost/logging.h" // for CHECK
#include "xgboost/string_view.h" // for operator<<, StringView
namespace xgboost::data {
@ -28,10 +31,10 @@ std::string ValidateFileFormat(std::string const& uri) {
for (size_t i = 0; i < arg_list.size(); ++i) {
std::istringstream is(arg_list[i]);
std::pair<std::string, std::string> kv;
CHECK(std::getline(is, kv.first, '=')) << "Invalid uri argument format"
<< " for key in arg " << i + 1;
CHECK(std::getline(is, kv.second)) << "Invalid uri argument format"
<< " for value in arg " << i + 1;
CHECK(std::getline(is, kv.first, '='))
<< "Invalid uri argument format" << " for key in arg " << i + 1;
CHECK(std::getline(is, kv.second))
<< "Invalid uri argument format" << " for value in arg " << i + 1;
args.insert(kv);
}
if (args.find("format") == args.cend()) {
@ -48,4 +51,41 @@ std::string ValidateFileFormat(std::string const& uri) {
return name_args[0] + "?" + name_args[1] + '#' + name_args_cache[1];
}
}
/**
 * Advance the underlying dmlc parser by one batch and publish it to the proxy
 * DMatrix as CSR data plus optional label/qid/weight meta info.
 *
 * Returns 1 (true) when a batch was produced, 0 (false) at end of data.
 */
int FileIterator::Next() {
CHECK(parser_);
if (parser_->Next()) {
row_block_ = parser_->Value();
// Keep the JSON strings alive in members: the proxy only stores pointers
// into them via the array-interface documents below.
indptr_ = linalg::Make1dInterface(row_block_.offset, row_block_.size + 1);
values_ = linalg::Make1dInterface(row_block_.value, row_block_.offset[row_block_.size]);
indices_ = linalg::Make1dInterface(row_block_.index, row_block_.offset[row_block_.size]);
// NOTE(review): assumes the batch has at least one entry; max_element on an
// empty range would be dereferencing end() — presumably the parser never
// yields an empty non-final block. TODO confirm.
size_t n_columns =
*std::max_element(row_block_.index, row_block_.index + row_block_.offset[row_block_.size]);
// dmlc parser converts 1-based indexing back to 0-based indexing so we can ignore
// this condition and just add 1 to n_columns
n_columns += 1;
XGProxyDMatrixSetDataCSR(proxy_, indptr_.c_str(), indices_.c_str(), values_.c_str(), n_columns);
// Optional per-row meta info; each field is forwarded through the
// array-interface C API only when the parser supplied it.
if (row_block_.label) {
auto str = linalg::Make1dInterface(row_block_.label, row_block_.size);
XGDMatrixSetInfoFromInterface(proxy_, "label", str.c_str());
}
if (row_block_.qid) {
auto str = linalg::Make1dInterface(row_block_.qid, row_block_.size);
XGDMatrixSetInfoFromInterface(proxy_, "qid", str.c_str());
}
if (row_block_.weight) {
auto str = linalg::Make1dInterface(row_block_.weight, row_block_.size);
XGDMatrixSetInfoFromInterface(proxy_, "weight", str.c_str());
}
// Continue iteration
return true;
} else {
// Stop iteration
return false;
}
}
} // namespace xgboost::data

View File

@ -1,20 +1,16 @@
/**
* Copyright 2021-2023, XGBoost contributors
* Copyright 2021-2024, XGBoost contributors
*/
#ifndef XGBOOST_DATA_FILE_ITERATOR_H_
#define XGBOOST_DATA_FILE_ITERATOR_H_
#include <algorithm> // for max_element
#include <cstddef> // for size_t
#include <cstdint> // for uint32_t
#include <memory> // for unique_ptr
#include <string> // for string
#include <utility> // for move
#include "dmlc/data.h" // for RowBlock, Parser
#include "xgboost/c_api.h" // for XGDMatrixSetDenseInfo, XGDMatrixFree, XGProxyDMatrixCreate
#include "xgboost/linalg.h" // for ArrayInterfaceStr, MakeVec
#include "xgboost/logging.h" // for CHECK
#include "xgboost/c_api.h" // for XGDMatrixFree, XGProxyDMatrixCreate
namespace xgboost::data {
[[nodiscard]] std::string ValidateFileFormat(std::string const& uri);
@ -53,41 +49,7 @@ class FileIterator {
XGDMatrixFree(proxy_);
}
// Advance the dmlc parser by one batch; push the batch into the proxy DMatrix
// as CSR data plus optional label/qid/weight meta info. Returns 1 (true) on a
// new batch, 0 (false) at end of data.
int Next() {
CHECK(parser_);
if (parser_->Next()) {
row_block_ = parser_->Value();
using linalg::MakeVec;
// The JSON strings are kept in members: the proxy stores pointers into
// them via the array-interface documents.
indptr_ = ArrayInterfaceStr(MakeVec(row_block_.offset, row_block_.size + 1));
values_ = ArrayInterfaceStr(MakeVec(row_block_.value, row_block_.offset[row_block_.size]));
indices_ = ArrayInterfaceStr(MakeVec(row_block_.index, row_block_.offset[row_block_.size]));
// NOTE(review): assumes a non-empty batch; max_element on an empty range
// would dereference end(). Presumably the parser never yields one — confirm.
size_t n_columns = *std::max_element(row_block_.index,
row_block_.index + row_block_.offset[row_block_.size]);
// dmlc parser converts 1-based indexing back to 0-based indexing so we can ignore
// this condition and just add 1 to n_columns
n_columns += 1;
XGProxyDMatrixSetDataCSR(proxy_, indptr_.c_str(), indices_.c_str(),
values_.c_str(), n_columns);
// Optional meta info; trailing argument 1 is the legacy dtype code for
// float32 (DataType enum) expected by XGDMatrixSetDenseInfo.
if (row_block_.label) {
XGDMatrixSetDenseInfo(proxy_, "label", row_block_.label, row_block_.size, 1);
}
if (row_block_.qid) {
XGDMatrixSetDenseInfo(proxy_, "qid", row_block_.qid, row_block_.size, 1);
}
if (row_block_.weight) {
XGDMatrixSetDenseInfo(proxy_, "weight", row_block_.weight, row_block_.size, 1);
}
// Continue iteration
return true;
} else {
// Stop iteration
return false;
}
}
int Next();
auto Proxy() -> decltype(proxy_) { return proxy_; }

View File

@ -1,5 +1,5 @@
/**
* Copyright 2014-2023 by Contributors
* Copyright 2014-2024, XGBoost Contributors
* \file gbtree.cc
* \brief gradient boosted tree implementation.
* \author Tianqi Chen
@ -11,14 +11,12 @@
#include <algorithm>
#include <cstdint> // std::int32_t
#include <map>
#include <memory>
#include <numeric> // for iota
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "../common/common.h"
#include "../common/timer.h"
#include "../tree/param.h" // TrainParam
#include "gbtree_model.h"

View File

@ -10,15 +10,15 @@
#include <array>
#include <cmath>
#include <numeric> // for accumulate
#include "../collective/communicator-inl.h"
#include "../common/common.h" // MetricNoCache
#include "../common/common.h" // for AssertGPUSupport
#include "../common/math.h"
#include "../common/optional_weight.h" // OptionalWeights
#include "../common/pseudo_huber.h"
#include "../common/quantile_loss_utils.h" // QuantileLossParam
#include "../common/threading_utils.h"
#include "metric_common.h"
#include "metric_common.h" // MetricNoCache
#include "xgboost/collective/result.h" // for SafeColl
#include "xgboost/metric.h"

View File

@ -9,8 +9,6 @@
#include <string>
#include "../collective/aggregator.h"
#include "../collective/communicator-inl.h"
#include "../common/common.h"
#include "xgboost/metric.h"
namespace xgboost {

View File

@ -9,8 +9,8 @@
#include <array>
#include <atomic>
#include <cmath>
#include <numeric> // for accumulate
#include "../collective/communicator-inl.h"
#include "../common/math.h"
#include "../common/threading_utils.h"
#include "metric_common.h" // MetricNoCache

View File

@ -9,10 +9,9 @@
#include <array>
#include <memory>
#include <numeric> // for accumulate
#include <vector>
#include "../collective/communicator-inl.h"
#include "../common/math.h"
#include "../common/survival_util.h"
#include "../common/threading_utils.h"
#include "metric_common.h" // MetricNoCache

View File

@ -3,6 +3,8 @@
*/
#include <gtest/gtest.h>
#include <numeric> // for iota
#include "../../../src/collective/allreduce.h"
#include "../../../src/collective/coll.h" // for Coll
#include "../../../src/collective/tracker.h"

View File

@ -1,11 +1,12 @@
/**
* Copyright 2023, XGBoost Contributors
* Copyright 2023-2024, XGBoost Contributors
*/
#pragma once
#include <gtest/gtest.h>
#include <chrono> // for seconds
#include <cstdint> // for int32_t
#include <fstream> // for ifstream
#include <string> // for string
#include <thread> // for thread
#include <utility> // for move

View File

@ -1,10 +1,9 @@
/**
* Copyright 2019-2023 by XGBoost Contributors
* Copyright 2019-2024, XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <vector>
#include <string>
#include <utility>
#include "../../../src/common/hist_util.h"
#include "../../../src/data/gradient_index.h"
@ -135,7 +134,7 @@ TEST(CutsBuilder, SearchGroupInd) {
group[2] = 7;
group[3] = 5;
p_mat->SetInfo("group", group.data(), DataType::kUInt32, kNumGroups);
p_mat->SetInfo("group", Make1dInterfaceTest(group.data(), group.size()));
HistogramCuts hmat;
@ -348,7 +347,8 @@ void TestSketchFromWeights(bool with_group) {
for (size_t i = 0; i < kGroups; ++i) {
groups[i] = kRows / kGroups;
}
info.SetInfo(ctx, "group", groups.data(), DataType::kUInt32, kGroups);
auto sg = linalg::Make1dInterface(groups.data(), kGroups);
info.SetInfo(ctx, "group", sg.c_str());
}
info.num_row_ = kRows;
@ -356,10 +356,10 @@ void TestSketchFromWeights(bool with_group) {
// Assign weights.
if (with_group) {
m->SetInfo("group", groups.data(), DataType::kUInt32, kGroups);
m->SetInfo("group", Make1dInterfaceTest(groups.data(), kGroups));
}
m->SetInfo("weight", h_weights.data(), DataType::kFloat32, h_weights.size());
m->SetInfo("weight", Make1dInterfaceTest(h_weights.data(), h_weights.size()));
m->Info().num_col_ = kCols;
m->Info().num_row_ = kRows;
ASSERT_EQ(cuts.Ptrs().size(), kCols + 1);

View File

@ -1,5 +1,5 @@
/**
* Copyright 2019-2023 by XGBoost Contributors
* Copyright 2019-2024, XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <thrust/device_vector.h>
@ -682,7 +682,7 @@ TEST(HistUtil, DeviceSketchFromGroupWeights) {
for (size_t i = 0; i < kGroups; ++i) {
groups[i] = kRows / kGroups;
}
m->SetInfo("group", groups.data(), DataType::kUInt32, kGroups);
m->SetInfo("group", Make1dInterfaceTest(groups.data(), kGroups));
HistogramCuts weighted_cuts = DeviceSketch(&ctx, m.get(), kBins, 0);
// sketch with no weight
@ -727,7 +727,7 @@ void TestAdapterSketchFromWeights(bool with_group) {
for (size_t i = 0; i < kGroups; ++i) {
groups[i] = kRows / kGroups;
}
info.SetInfo(ctx, "group", groups.data(), DataType::kUInt32, kGroups);
info.SetInfo(ctx, "group", Make1dInterfaceTest(groups.data(), kGroups));
}
info.weights_.SetDevice(DeviceOrd::CUDA(0));
@ -746,10 +746,10 @@ void TestAdapterSketchFromWeights(bool with_group) {
auto dmat = GetDMatrixFromData(storage.HostVector(), kRows, kCols);
if (with_group) {
dmat->Info().SetInfo(ctx, "group", groups.data(), DataType::kUInt32, kGroups);
dmat->Info().SetInfo(ctx, "group", Make1dInterfaceTest(groups.data(), kGroups));
}
dmat->Info().SetInfo(ctx, "weight", h_weights.data(), DataType::kFloat32, h_weights.size());
dmat->Info().SetInfo(ctx, "weight", Make1dInterfaceTest(h_weights.data(), h_weights.size()));
dmat->Info().num_col_ = kCols;
dmat->Info().num_row_ = kRows;
ASSERT_EQ(cuts.Ptrs().size(), kCols + 1);

View File

@ -1,11 +1,12 @@
/**
* Copyright 2018-2023 by XGBoost Contributors
* Copyright 2018-2024, XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <xgboost/base.h>
#include <xgboost/span.h>
#include <xgboost/host_device_vector.h>
#include <xgboost/span.h>
#include <numeric> // for iota
#include <vector>
#include "../../../src/common/transform.h"

View File

@ -1,10 +1,11 @@
/**
* Copyright 2021-2023, XGBoost Contributors
* Copyright 2021-2024, XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <xgboost/host_device_vector.h>
#include "../helpers.h"
#include "../../../src/data/array_interface.h"
#include "../helpers.h"
namespace xgboost {

View File

@ -10,7 +10,6 @@
#include <memory>
#include <string>
#include "../../../src/common/version.h"
#include "../filesystem.h" // dmlc::TemporaryDirectory
#include "../helpers.h" // for GMockTHrow
#include "xgboost/base.h"
@ -23,23 +22,22 @@ TEST(MetaInfo, GetSet) {
double double2[2] = {1.0, 2.0};
EXPECT_EQ(info.labels.Size(), 0);
info.SetInfo(ctx, "label", double2, xgboost::DataType::kFloat32, 2);
info.SetInfo(ctx, "label", Make1dInterfaceTest(double2, 2));
EXPECT_EQ(info.labels.Size(), 2);
float float2[2] = {1.0f, 2.0f};
EXPECT_EQ(info.GetWeight(1), 1.0f)
<< "When no weights are given, was expecting default value 1";
info.SetInfo(ctx, "weight", float2, xgboost::DataType::kFloat32, 2);
EXPECT_EQ(info.GetWeight(1), 1.0f) << "When no weights are given, was expecting default value 1";
info.SetInfo(ctx, "weight", Make1dInterfaceTest(float2, 2));
EXPECT_EQ(info.GetWeight(1), 2.0f);
uint32_t uint32_t2[2] = {1U, 2U};
EXPECT_EQ(info.base_margin_.Size(), 0);
info.SetInfo(ctx, "base_margin", uint32_t2, xgboost::DataType::kUInt32, 2);
info.SetInfo(ctx, "base_margin", Make1dInterfaceTest(uint32_t2, 2));
EXPECT_EQ(info.base_margin_.Size(), 2);
uint64_t uint64_t2[2] = {1U, 2U};
EXPECT_EQ(info.group_ptr_.size(), 0);
info.SetInfo(ctx, "group", uint64_t2, xgboost::DataType::kUInt64, 2);
info.SetInfo(ctx, "group", Make1dInterfaceTest(uint64_t2, 2));
ASSERT_EQ(info.group_ptr_.size(), 3);
EXPECT_EQ(info.group_ptr_[2], 3);
@ -135,9 +133,9 @@ TEST(MetaInfo, SaveLoadBinary) {
};
std::vector<float> values (kRows);
std::generate(values.begin(), values.end(), generator);
info.SetInfo(ctx, "label", values.data(), xgboost::DataType::kFloat32, kRows);
info.SetInfo(ctx, "weight", values.data(), xgboost::DataType::kFloat32, kRows);
info.SetInfo(ctx, "base_margin", values.data(), xgboost::DataType::kFloat32, kRows);
info.SetInfo(ctx, "label", Make1dInterfaceTest(values.data(), kRows));
info.SetInfo(ctx, "weight", Make1dInterfaceTest(values.data(), kRows));
info.SetInfo(ctx, "base_margin", Make1dInterfaceTest(values.data(), kRows));
info.num_row_ = kRows;
info.num_col_ = kCols;
@ -271,7 +269,7 @@ TEST(MetaInfo, CPUQid) {
qid[i] = i;
}
info.SetInfo(ctx, "qid", qid.data(), xgboost::DataType::kUInt32, info.num_row_);
info.SetInfo(ctx, "qid", Make1dInterfaceTest(qid.data(), info.num_row_));
ASSERT_EQ(info.group_ptr_.size(), info.num_row_ + 1);
ASSERT_EQ(info.group_ptr_.front(), 0);
ASSERT_EQ(info.group_ptr_.back(), info.num_row_);
@ -288,14 +286,12 @@ TEST(MetaInfo, Validate) {
info.num_col_ = 3;
std::vector<xgboost::bst_group_t> groups (11);
Context ctx;
info.SetInfo(ctx, "group", groups.data(), xgboost::DataType::kUInt32, 11);
info.SetInfo(ctx, "group", Make1dInterfaceTest(groups.data(), groups.size()));
EXPECT_THROW(info.Validate(FstCU()), dmlc::Error);
std::vector<float> labels(info.num_row_ + 1);
EXPECT_THROW(
{
info.SetInfo(ctx, "label", labels.data(), xgboost::DataType::kFloat32, info.num_row_ + 1);
},
{ info.SetInfo(ctx, "label", Make1dInterfaceTest(labels.data(), info.num_row_ + 1)); },
dmlc::Error);
// Make overflow data, which can happen when users pass group structure as int
@ -305,13 +301,13 @@ TEST(MetaInfo, Validate) {
groups.push_back(1562500);
}
groups.push_back(static_cast<xgboost::bst_group_t>(-1));
EXPECT_THROW(info.SetInfo(ctx, "group", groups.data(), xgboost::DataType::kUInt32, groups.size()),
EXPECT_THROW(info.SetInfo(ctx, "group", Make1dInterfaceTest(groups.data(), groups.size())),
dmlc::Error);
#if defined(XGBOOST_USE_CUDA)
info.group_ptr_.clear();
labels.resize(info.num_row_);
info.SetInfo(ctx, "label", labels.data(), xgboost::DataType::kFloat32, info.num_row_);
info.SetInfo(ctx, "label", Make1dInterfaceTest(labels.data(), info.num_row_));
info.labels.SetDevice(FstCU());
EXPECT_THROW(info.Validate(DeviceOrd::CUDA(1)), dmlc::Error);
@ -340,8 +336,8 @@ TEST(MetaInfo, HostExtend) {
for (size_t g = 0; g < kRows / per_group; ++g) {
groups.emplace_back(per_group);
}
lhs.SetInfo(ctx, "group", groups.data(), xgboost::DataType::kUInt32, groups.size());
rhs.SetInfo(ctx, "group", groups.data(), xgboost::DataType::kUInt32, groups.size());
lhs.SetInfo(ctx, "group", Make1dInterfaceTest(groups.data(), groups.size()));
rhs.SetInfo(ctx, "group", Make1dInterfaceTest(groups.data(), groups.size()));
lhs.Extend(rhs, true, true);
ASSERT_EQ(lhs.num_row_, kRows * 2);

View File

@ -408,7 +408,7 @@ class Dart : public testing::TestWithParam<char const*> {
for (size_t i = 0; i < kRows; ++i) {
labels[i] = i % 2;
}
p_mat->SetInfo("label", labels.data(), DataType::kFloat32, kRows);
p_mat->SetInfo("label", Make1dInterfaceTest(labels.data(), kRows));
auto learner = std::unique_ptr<Learner>(Learner::Create({p_mat}));
learner->SetParam("booster", "dart");

View File

@ -1,8 +1,11 @@
/**
* Copyright 2020-2024, XGBoost contributors
*/
#include <xgboost/c_api.h>
#include "helpers.h"
#include "../../src/data/device_adapter.cuh"
#include "../../src/data/iterative_dmatrix.h"
#include "helpers.h"
namespace xgboost {

View File

@ -15,19 +15,18 @@
#include <cstdint> // std::int32_t
#include <cstdio>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <thread>
#include <vector>
#include "../../src/collective/communicator-inl.h"
#include "../../src/common/common.h"
#include "../../src/common/threading_utils.h"
#include "../../src/data/array_interface.h"
#include "filesystem.h" // dmlc::TemporaryDirectory
#include "xgboost/linalg.h"
#if !defined(_OPENMP)
#include <thread>
#endif
#if defined(__CUDACC__)
#define DeclareUnifiedTest(name) GPU ## name
@ -333,7 +332,7 @@ inline std::vector<float> GenerateRandomCategoricalSingleColumn(int n, size_t nu
std::vector<float> x(n);
std::mt19937 rng(0);
std::uniform_int_distribution<size_t> dist(0, num_categories - 1);
std::generate(x.begin(), x.end(), [&]() { return dist(rng); });
std::generate(x.begin(), x.end(), [&]() { return static_cast<float>(dist(rng)); });
// Make sure each category is present
for (size_t i = 0; i < num_categories; i++) {
x[i] = static_cast<decltype(x)::value_type>(i);
@ -494,6 +493,16 @@ inline int Next(DataIterHandle self) {
return static_cast<ArrayIterForTest*>(self)->Next();
}
/**
 * @brief Build an array-interface JSON string for a 1-d host buffer.
 *
 * The returned pointer refers to thread-local storage and stays valid only
 * until the next call to this helper on the same thread.
 */
template <typename T>
char const* Make1dInterfaceTest(T const* vec, std::size_t len) {
  // Keep the string alive past the return so callers can use the C-string view.
  static thread_local std::string interface_str;
  interface_str = linalg::Make1dInterface(vec, len);
  return interface_str.c_str();
}
class RMMAllocator;
using RMMAllocatorPtr = std::unique_ptr<RMMAllocator, void(*)(RMMAllocator*)>;
RMMAllocatorPtr SetUpRMMResourceForCppTests(int argc, char** argv);

View File

@ -5,10 +5,9 @@
#include <xgboost/json.h>
#include <xgboost/metric.h>
#include <map>
#include <memory>
#include <numeric> // for iota
#include "../../../src/common/linalg_op.h"
#include "../helpers.h"
namespace xgboost::metric {

View File

@ -1,14 +1,15 @@
/*!
* Copyright 2018-2023 XGBoost contributors
/**
* Copyright 2018-2024, XGBoost contributors
*/
#include <gtest/gtest.h>
#include <xgboost/context.h>
#include <xgboost/objective.h>
#include "../../../src/objective/adaptive.h"
#include "../../../src/tree/param.h" // for TrainParam
#include "../helpers.h"
#include <numeric> // for iota
#include "../../../src/objective/adaptive.h"
#include "../../../src/tree/param.h" // for TrainParam
#include "../helpers.h"
#include "test_regression_obj.h"
namespace xgboost {

View File

@ -12,7 +12,6 @@
#include <cinttypes> // for int32_t, int64_t, uint32_t
#include <cstddef> // for size_t
#include <iosfwd> // for ofstream
#include <iterator> // for back_insert_iterator, back_inserter
#include <limits> // for numeric_limits
#include <map> // for map
#include <memory> // for unique_ptr, shared_ptr, __shared_ptr_...
@ -30,7 +29,6 @@
#include "../../src/common/random.h" // for GlobalRandom
#include "dmlc/io.h" // for Stream
#include "dmlc/omp.h" // for omp_get_max_threads
#include "dmlc/registry.h" // for Registry
#include "filesystem.h" // for TemporaryDirectory
#include "helpers.h" // for GetBaseScore, RandomDataGenerator
#include "objective_helpers.h" // for MakeObjNamesForTest, ObjTestNameGenerator
@ -103,9 +101,9 @@ TEST(Learner, CheckGroup) {
labels[i] = i % 2;
}
p_mat->SetInfo("weight", static_cast<void *>(weight.data()), DataType::kFloat32, kNumGroups);
p_mat->SetInfo("group", group.data(), DataType::kUInt32, kNumGroups);
p_mat->SetInfo("label", labels.data(), DataType::kFloat32, kNumRows);
p_mat->SetInfo("weight", Make1dInterfaceTest(weight.data(), kNumGroups));
p_mat->SetInfo("group", Make1dInterfaceTest(group.data(), kNumGroups));
p_mat->SetInfo("label", Make1dInterfaceTest(labels.data(), kNumRows));
std::vector<std::shared_ptr<xgboost::DMatrix>> mat = {p_mat};
auto learner = std::unique_ptr<Learner>(Learner::Create(mat));
@ -115,7 +113,7 @@ TEST(Learner, CheckGroup) {
group.resize(kNumGroups+1);
group[3] = 4;
group[4] = 1;
p_mat->SetInfo("group", group.data(), DataType::kUInt32, kNumGroups+1);
p_mat->SetInfo("group", Make1dInterfaceTest(group.data(), kNumGroups+1));
EXPECT_ANY_THROW(learner->UpdateOneIter(0, p_mat));
}
@ -132,7 +130,7 @@ TEST(Learner, SLOW_CheckMultiBatch) { // NOLINT
for (size_t i = 0; i < num_row; ++i) {
labels[i] = i % 2;
}
dmat->SetInfo("label", labels.data(), DataType::kFloat32, num_row);
dmat->SetInfo("label", Make1dInterfaceTest(labels.data(), num_row));
std::vector<std::shared_ptr<DMatrix>> mat{dmat};
auto learner = std::unique_ptr<Learner>(Learner::Create(mat));
learner->SetParams(Args{{"objective", "binary:logistic"}});

View File

@ -239,4 +239,18 @@ void TestAtomicAdd() {
// Thin gtest entry point: all the actual work and assertions are delegated
// to TestAtomicAdd() (defined above; presumably exercises the 64-bit integer
// atomic-add path used by the histogram kernels — confirm at its definition).
TEST(Histogram, AtomicAddInt64) {
  TestAtomicAdd();
}
// Round-tripping a gradient pair through the quantiser (fixed point and back)
// must reproduce the original floating-point gradient and hessian.
TEST(Histogram, Quantiser) {
  auto ctx = MakeCUDACtx(0);
  std::size_t const kSamples{16};
  HostDeviceVector<GradientPair> grad_pairs(kSamples, GradientPair{1.0, 1.0});
  grad_pairs.SetDevice(ctx.Device());
  auto quantiser = GradientQuantiser(&ctx, grad_pairs.DeviceSpan(), MetaInfo());
  for (auto const& pair : grad_pairs.ConstHostVector()) {
    auto roundtrip = quantiser.ToFloatingPoint(quantiser.ToFixedPoint(pair));
    ASSERT_EQ(roundtrip.GetGrad(), 1.0);
    ASSERT_EQ(roundtrip.GetHess(), 1.0);
  }
}
} // namespace xgboost::tree