temp merge, disable 1 line, SetValid

This commit is contained in:
Your Name
2023-10-12 16:16:44 -07:00
492 changed files with 15533 additions and 9376 deletions

View File

@@ -3,7 +3,7 @@
*/
#include "xgboost/c_api.h"
#include <algorithm> // for copy
#include <algorithm> // for copy, transform
#include <cinttypes> // for strtoimax
#include <cmath> // for nan
#include <cstring> // for strcmp
@@ -20,9 +20,12 @@
#include "../collective/communicator-inl.h" // for Allreduce, Broadcast, Finalize, GetProcessor...
#include "../common/api_entry.h" // for XGBAPIThreadLocalEntry
#include "../common/charconv.h" // for from_chars, to_chars, NumericLimits, from_ch...
#include "../common/hist_util.h" // for HistogramCuts
#include "../common/io.h" // for FileExtension, LoadSequentialFile, MemoryBuf...
#include "../common/linalg_op.h" // for ElementWiseTransformHost
#include "../common/threading_utils.h" // for OmpGetNumThreads, ParallelFor
#include "../data/adapter.h" // for ArrayAdapter, DenseAdapter, RecordBatchesIte...
#include "../data/ellpack_page.h" // for EllpackPage
#include "../data/proxy_dmatrix.h" // for DMatrixProxy
#include "../data/simple_dmatrix.h" // for SimpleDMatrix
#include "c_api_error.h" // for xgboost_CHECK_C_ARG_PTR, API_END, API_BEGIN
@@ -66,6 +69,7 @@ XGB_DLL void XGBoostVersion(int* major, int* minor, int* patch) {
}
}
static_assert(DMLC_CXX11_THREAD_LOCAL, "XGBoost depends on thread-local storage.");
using GlobalConfigAPIThreadLocalStore = dmlc::ThreadLocalStore<XGBAPIThreadLocalEntry>;
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
@@ -81,13 +85,6 @@ void XGBBuildInfoDevice(Json *p_info) {
} // namespace xgboost
#endif
namespace {
// Emit a deprecation warning pointing callers at the replacement API.
//
// @param old         Name of the deprecated C API function.
// @param since       Version in which it was deprecated.
// @param replacement Name of the function callers should use instead.
void DeprecatedFunc(StringView old, StringView since, StringView replacement) {
  // Trailing space after "since" so the version number is not glued to the
  // preceding word (previously emitted e.g. "deprecated since1.7.0").
  LOG(WARNING) << "`" << old << "` is deprecated since " << since << ", use `" << replacement
               << "` instead.";
}
} // anonymous namespace
XGB_DLL int XGBuildInfo(char const **out) {
API_BEGIN();
xgboost_CHECK_C_ARG_PTR(out);
@@ -328,7 +325,7 @@ XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatr
int nthread, int max_bin,
DMatrixHandle *out) {
API_BEGIN();
DeprecatedFunc(__func__, "1.7.0", "XGQuantileDMatrixCreateFromCallback");
LOG(WARNING) << error::DeprecatedFunc(__func__, "1.7.0", "XGQuantileDMatrixCreateFromCallback");
*out = new std::shared_ptr<xgboost::DMatrix>{
xgboost::DMatrix::Create(iter, proxy, nullptr, reset, next, missing, nthread, max_bin)};
API_END();
@@ -432,7 +429,7 @@ XGB_DLL int XGDMatrixCreateFromCSREx(const size_t *indptr, const unsigned *indic
const bst_float *data, size_t nindptr, size_t nelem,
size_t num_col, DMatrixHandle *out) {
API_BEGIN();
DeprecatedFunc(__func__, "2.0.0", "XGDMatrixCreateFromCSR");
LOG(WARNING) << error::DeprecatedFunc(__func__, "2.0.0", "XGDMatrixCreateFromCSR");
data::CSRAdapter adapter(indptr, indices, data, nindptr - 1, nelem, num_col);
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, std::nan(""), 1));
API_END();
@@ -465,8 +462,11 @@ XGB_DLL int XGDMatrixCreateFromDense(char const *data,
auto config = Json::Load(StringView{c_json_config});
float missing = GetMissing(config);
auto n_threads = OptionalArg<Integer, int64_t>(config, "nthread", 0);
auto data_split_mode =
static_cast<DataSplitMode>(OptionalArg<Integer, int64_t>(config, "data_split_mode", 0));
xgboost_CHECK_C_ARG_PTR(out);
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, n_threads));
*out = new std::shared_ptr<DMatrix>(
DMatrix::Create(&adapter, missing, n_threads, "", data_split_mode));
API_END();
}
@@ -493,7 +493,7 @@ XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t *col_ptr, const unsigned *indi
const bst_float *data, size_t nindptr, size_t, size_t num_row,
DMatrixHandle *out) {
API_BEGIN();
DeprecatedFunc(__func__, "2.0.0", "XGDMatrixCreateFromCSC");
LOG(WARNING) << error::DeprecatedFunc(__func__, "2.0.0", "XGDMatrixCreateFromCSC");
data::CSCAdapter adapter(col_ptr, indices, data, nindptr - 1, num_row);
xgboost_CHECK_C_ARG_PTR(out);
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, std::nan(""), 1));
@@ -721,8 +721,7 @@ XGB_DLL int XGDMatrixGetUIntInfo(const DMatrixHandle handle,
API_END();
}
XGB_DLL int XGDMatrixNumRow(const DMatrixHandle handle,
xgboost::bst_ulong *out) {
XGB_DLL int XGDMatrixNumRow(DMatrixHandle handle, xgboost::bst_ulong *out) {
API_BEGIN();
CHECK_HANDLE();
auto p_m = CastDMatrixHandle(handle);
@@ -731,8 +730,7 @@ XGB_DLL int XGDMatrixNumRow(const DMatrixHandle handle,
API_END();
}
XGB_DLL int XGDMatrixNumCol(const DMatrixHandle handle,
xgboost::bst_ulong *out) {
XGB_DLL int XGDMatrixNumCol(DMatrixHandle handle, xgboost::bst_ulong *out) {
API_BEGIN();
CHECK_HANDLE();
auto p_m = CastDMatrixHandle(handle);
@@ -784,6 +782,104 @@ XGB_DLL int XGDMatrixGetDataAsCSR(DMatrixHandle const handle, char const *config
API_END();
}
namespace {
// Copy the quantile cut values of `p_m` into a per-feature (indptr, data) pair
// for the C API.  For each numeric feature the feature's minimum value is
// prepended to its cut values; categorical features store only their cut
// values.  Only the first batch is read — the cuts appear to be shared by all
// pages of the matrix (hence the unconditional `break`) — TODO confirm.
template <typename Page>
void GetCutImpl(Context const *ctx, std::shared_ptr<DMatrix> p_m,
                std::vector<std::uint64_t> *p_indptr, std::vector<float> *p_data) {
  auto &indptr = *p_indptr;
  auto &data = *p_data;
  for (auto const &page : p_m->GetBatches<Page>(ctx, {})) {
    auto const &cut = page.Cuts();
    auto const &ptrs = cut.Ptrs();
    indptr.resize(ptrs.size());  // one entry per feature, plus the final sentinel
    auto const &vals = cut.Values();
    auto const &mins = cut.MinValues();
    bst_feature_t n_features = p_m->Info().num_col_;
    auto ft = p_m->Info().feature_types.ConstHostSpan();
    // Categorical features contribute no min-value entry to `data`.
    std::size_t n_categories = std::count_if(ft.cbegin(), ft.cend(),
                                             [](auto t) { return t == FeatureType::kCategorical; });
    data.resize(vals.size() + n_features - n_categories);  // |vals| + |mins|
    // i: write cursor into `data`; n_numeric: numeric features seen so far.
    std::size_t i{0}, n_numeric{0};
    for (bst_feature_t fidx = 0; fidx < n_features; ++fidx) {
      CHECK_LT(i, data.size());
      bool is_numeric = !common::IsCat(ft, fidx);
      if (is_numeric) {
        // Prepend the feature's minimum value ahead of its cut values.
        data[i] = mins[fidx];
        i++;
      }
      auto beg = ptrs[fidx];
      auto end = ptrs[fidx + 1];
      CHECK_LE(end, data.size());
      std::copy(vals.cbegin() + beg, vals.cbegin() + end, data.begin() + i);
      i += (end - beg);
      // shift by min values.
      indptr[fidx] = ptrs[fidx] + n_numeric;
      if (is_numeric) {
        n_numeric++;
      }
    }
    CHECK_EQ(n_numeric, n_features - n_categories);
    indptr.back() = data.size();
    CHECK_EQ(indptr.back(), vals.size() + mins.size() - n_categories);
    break;
  }
}
}  // namespace
// C API entry: export the quantile cuts of a DMatrix as two JSON
// array-interface strings — `out_indptr` (per-feature offsets, uint64) and
// `out_data` (cut values, float).  Both point into thread-local buffers owned
// by the DMatrix, so they stay valid only until the next such call on the
// same thread.
XGB_DLL int XGDMatrixGetQuantileCut(DMatrixHandle const handle, char const *config,
                                    char const **out_indptr, char const **out_data) {
  API_BEGIN();
  CHECK_HANDLE();
  auto p_m = CastDMatrixHandle(handle);
  xgboost_CHECK_C_ARG_PTR(config);
  xgboost_CHECK_C_ARG_PTR(out_indptr);
  xgboost_CHECK_C_ARG_PTR(out_data);
  // Parsed for validation; no keys are read here as far as visible — presumably
  // reserved for future options.
  auto jconfig = Json::Load(StringView{config});
  // Cuts only exist once computed: at construction for a QuantileDMatrix, or
  // during training for a regular DMatrix.
  if (!p_m->PageExists<GHistIndexMatrix>() && !p_m->PageExists<EllpackPage>()) {
    LOG(FATAL) << "The quantile cut hasn't been generated yet. Unless this is a `QuantileDMatrix`, "
                  "quantile cut is generated during training.";
  }
  // Get return buffer (thread-local, reused across calls).
  auto &data = p_m->GetThreadLocal().ret_vec_float;
  auto &indptr = p_m->GetThreadLocal().ret_vec_u64;
  if (p_m->PageExists<GHistIndexMatrix>()) {
    // CPU histogram page: make sure the context targets the CPU.
    auto ctx = p_m->Ctx()->IsCPU() ? *p_m->Ctx() : p_m->Ctx()->MakeCPU();
    GetCutImpl<GHistIndexMatrix>(&ctx, p_m, &indptr, &data);
  } else {
    // GPU (ellpack) page: make sure the context targets a CUDA device.
    auto ctx = p_m->Ctx()->IsCUDA() ? *p_m->Ctx() : p_m->Ctx()->MakeCUDA(0);
    GetCutImpl<EllpackPage>(&ctx, p_m, &indptr, &data);
  }
  // Create a CPU context for serializing the host-side result vectors.
  Context ctx;
  // Get return buffer for the serialized strings.
  auto &ret_vec_str = p_m->GetThreadLocal().ret_vec_str;
  ret_vec_str.clear();
  ret_vec_str.emplace_back(linalg::ArrayInterfaceStr(
      linalg::MakeTensorView(&ctx, common::Span{indptr.data(), indptr.size()}, indptr.size())));
  ret_vec_str.emplace_back(linalg::ArrayInterfaceStr(
      linalg::MakeTensorView(&ctx, common::Span{data.data(), data.size()}, data.size())));
  // Expose stable char pointers to the caller.
  auto &charp_vecs = p_m->GetThreadLocal().ret_vec_charp;
  charp_vecs.resize(ret_vec_str.size());
  std::transform(ret_vec_str.cbegin(), ret_vec_str.cend(), charp_vecs.begin(),
                 [](auto const &str) { return str.c_str(); });
  *out_indptr = charp_vecs[0];
  *out_data = charp_vecs[1];
  API_END();
}
// xgboost implementation
XGB_DLL int XGBoosterCreate(const DMatrixHandle dmats[],
xgboost::bst_ulong len,
@@ -876,28 +972,71 @@ XGB_DLL int XGBoosterUpdateOneIter(BoosterHandle handle,
API_END();
}
XGB_DLL int XGBoosterBoostOneIter(BoosterHandle handle,
DMatrixHandle dtrain,
bst_float *grad,
bst_float *hess,
xgboost::bst_ulong len) {
XGB_DLL int XGBoosterBoostOneIter(BoosterHandle handle, DMatrixHandle dtrain, bst_float *grad,
bst_float *hess, xgboost::bst_ulong len) {
API_BEGIN();
CHECK_HANDLE();
HostDeviceVector<GradientPair> tmp_gpair;
auto* bst = static_cast<Learner*>(handle);
auto* dtr =
static_cast<std::shared_ptr<DMatrix>*>(dtrain);
tmp_gpair.Resize(len);
std::vector<GradientPair>& tmp_gpair_h = tmp_gpair.HostVector();
if (len > 0) {
xgboost_CHECK_C_ARG_PTR(grad);
xgboost_CHECK_C_ARG_PTR(hess);
}
for (xgboost::bst_ulong i = 0; i < len; ++i) {
tmp_gpair_h[i] = GradientPair(grad[i], hess[i]);
}
error::DeprecatedFunc(__func__, "2.1.0", "XGBoosterTrainOneIter");
auto *learner = static_cast<Learner *>(handle);
auto ctx = learner->Ctx()->MakeCPU();
bst->BoostOneIter(0, *dtr, &tmp_gpair);
auto t_grad = linalg::MakeTensorView(&ctx, common::Span{grad, len}, len);
auto t_hess = linalg::MakeTensorView(&ctx, common::Span{hess, len}, len);
auto s_grad = linalg::ArrayInterfaceStr(t_grad);
auto s_hess = linalg::ArrayInterfaceStr(t_hess);
return XGBoosterTrainOneIter(handle, dtrain, 0, s_grad.c_str(), s_hess.c_str());
API_END();
}
namespace xgboost {
// Copy user-supplied CUDA gradient/hessian arrays into a GradientPair matrix.
// Without CUDA/HIP support this body fails with AssertGPUSupport(); with GPU
// support this is only a declaration and the definition lives in the CUDA
// translation unit.
void CopyGradientFromCUDAArrays(Context const *, ArrayInterface<2, false> const &,
                                ArrayInterface<2, false> const &, linalg::Matrix<GradientPair> *)
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
{
  common::AssertGPUSupport();
}
#else
    ;  // NOLINT
#endif
}  // namespace xgboost
// Train one boosting iteration with a caller-supplied custom objective: the
// gradient and hessian arrive as JSON array-interface strings, which may
// describe either CPU or CUDA buffers (but both must be on the same device).
XGB_DLL int XGBoosterTrainOneIter(BoosterHandle handle, DMatrixHandle dtrain, int iter,
                                  char const *grad, char const *hess) {
  API_BEGIN();
  CHECK_HANDLE();
  xgboost_CHECK_C_ARG_PTR(grad);
  xgboost_CHECK_C_ARG_PTR(hess);
  auto p_fmat = CastDMatrixHandle(dtrain);
  ArrayInterface<2, false> i_grad{StringView{grad}};
  ArrayInterface<2, false> i_hess{StringView{hess}};
  StringView msg{"Mismatched shape between the gradient and hessian."};
  CHECK_EQ(i_grad.Shape(0), i_hess.Shape(0)) << msg;
  CHECK_EQ(i_grad.Shape(1), i_hess.Shape(1)) << msg;
  linalg::Matrix<GradientPair> gpair;
  auto grad_is_cuda = ArrayInterfaceHandler::IsCudaPtr(i_grad.data);
  auto hess_is_cuda = ArrayInterfaceHandler::IsCudaPtr(i_hess.data);
  // Rows of the gradient must match the training data.
  CHECK_EQ(i_grad.Shape(0), p_fmat->Info().num_row_)
      << "Mismatched size between the gradient and training data.";
  CHECK_EQ(grad_is_cuda, hess_is_cuda) << "gradient and hessian should be on the same device.";
  auto *learner = static_cast<Learner *>(handle);
  auto ctx = learner->Ctx();
  if (!grad_is_cuda) {
    // CPU path: interleave grad/hess into GradientPair elements in parallel,
    // dispatching on the runtime dtype of each input array.
    gpair.Reshape(i_grad.Shape(0), i_grad.Shape(1));
    auto const shape = gpair.Shape();  // NOTE(review): appears unused here — confirm
    auto h_gpair = gpair.HostView();
    DispatchDType(i_grad, DeviceOrd::CPU(), [&](auto &&t_grad) {
      DispatchDType(i_hess, DeviceOrd::CPU(), [&](auto &&t_hess) {
        common::ParallelFor(h_gpair.Size(), ctx->Threads(),
                            detail::CustomGradHessOp{t_grad, t_hess, h_gpair});
      });
    });
  } else {
    // CUDA path: definition lives in the CUDA translation unit.
    CopyGradientFromCUDAArrays(ctx, i_grad, i_hess, &gpair);
  }
  learner->BoostOneIter(iter, p_fmat, &gpair);
  API_END();
}
@@ -1025,7 +1164,6 @@ void InplacePredictImpl(std::shared_ptr<DMatrix> p_m, char const *c_json_config,
const float **out_result) {
xgboost_CHECK_C_ARG_PTR(c_json_config);
auto config = Json::Load(StringView{c_json_config});
CHECK_EQ(get<Integer const>(config["cache_id"]), 0) << "Cache ID is not supported yet";
HostDeviceVector<float> *p_predt{nullptr};
auto type = PredictionType(RequiredArg<Integer>(config, "type", __func__));
@@ -1044,6 +1182,7 @@ void InplacePredictImpl(std::shared_ptr<DMatrix> p_m, char const *c_json_config,
xgboost_CHECK_C_ARG_PTR(out_dim);
CalcPredictShape(strict_shape, type, n_samples, n_features, chunksize, learner->Groups(),
learner->BoostedRounds(), &shape, out_dim);
CHECK_GE(p_predt->Size(), n_samples);
xgboost_CHECK_C_ARG_PTR(out_result);
xgboost_CHECK_C_ARG_PTR(out_shape);
@@ -1126,12 +1265,12 @@ XGB_DLL int XGBoosterLoadModel(BoosterHandle handle, const char* fname) {
return str;
};
if (common::FileExtension(fname) == "json") {
auto str = read_file();
Json in{Json::Load(StringView{str})};
auto buffer = read_file();
Json in{Json::Load(StringView{buffer.data(), buffer.size()})};
static_cast<Learner*>(handle)->LoadModel(in);
} else if (common::FileExtension(fname) == "ubj") {
auto str = read_file();
Json in = Json::Load(StringView{str}, std::ios::binary);
auto buffer = read_file();
Json in = Json::Load(StringView{buffer.data(), buffer.size()}, std::ios::binary);
static_cast<Learner *>(handle)->LoadModel(in);
} else {
std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname, "r"));
@@ -1246,7 +1385,7 @@ XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle, xgboost::bst_ulong *out_l
raw_str.resize(0);
common::MemoryBufferStream fo(&raw_str);
DeprecatedFunc(__func__, "1.6.0", "XGBoosterSaveModelToBuffer");
LOG(WARNING) << error::DeprecatedFunc(__func__, "1.6.0", "XGBoosterSaveModelToBuffer");
learner->Configure();
learner->SaveModel(&fo);

View File

@@ -1,8 +1,12 @@
/**
* Copyright 2019-2023 by XGBoost Contributors
*/
#include "../common/api_entry.h" // XGBAPIThreadLocalEntry
#include <thrust/transform.h> // for transform
#include "../common/api_entry.h" // for XGBAPIThreadLocalEntry
#include "../common/cuda_context.cuh" // for CUDAContext
#include "../common/threading_utils.h"
#include "../data/array_interface.h" // for DispatchDType, ArrayInterface
#include "../data/device_adapter.cuh"
#include "../data/proxy_dmatrix.h"
#include "c_api_error.h"
@@ -13,7 +17,6 @@
#include "xgboost/learner.h"
namespace xgboost {
void XGBBuildInfoDevice(Json *p_info) {
auto &info = *p_info;
@@ -72,6 +75,27 @@ void XGBoostAPIGuard::RestoreGPUAttribute() {
hipSetDevice(device_id_);
#endif
}
// Interleave user-supplied CUDA gradient/hessian arrays into a single
// GradientPair matrix, allocated on the device that owns the input buffers.
void CopyGradientFromCUDAArrays(Context const *ctx, ArrayInterface<2, false> const &grad,
                                ArrayInterface<2, false> const &hess,
                                linalg::Matrix<GradientPair> *out_gpair) {
  auto grad_dev = dh::CudaGetPointerDevice(grad.data);
  auto hess_dev = dh::CudaGetPointerDevice(hess.data);
  CHECK_EQ(grad_dev, hess_dev) << "gradient and hessian should be on the same device.";
  auto &gpair = *out_gpair;
  // Place the output on the same device as the inputs.
  gpair.SetDevice(grad_dev);
  gpair.Reshape(grad.Shape(0), grad.Shape(1));
  auto d_gpair = gpair.View(DeviceOrd::CUDA(grad_dev));
  auto cuctx = ctx->CUDACtx();
  // Dispatch on the runtime dtypes of both arrays, then fuse them elementwise
  // on the device stream.
  DispatchDType(grad, DeviceOrd::CUDA(grad_dev), [&](auto &&t_grad) {
    DispatchDType(hess, DeviceOrd::CUDA(hess_dev), [&](auto &&t_hess) {
      CHECK_EQ(t_grad.Size(), t_hess.Size());
      thrust::for_each_n(cuctx->CTP(), thrust::make_counting_iterator(0ul), t_grad.Size(),
                         detail::CustomGradHessOp{t_grad, t_hess, d_gpair});
    });
  });
}
} // namespace xgboost
using namespace xgboost; // NOLINT
@@ -109,7 +133,7 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data,
API_END();
}
int InplacePreidctCuda(BoosterHandle handle, char const *c_array_interface,
int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface,
char const *c_json_config, std::shared_ptr<DMatrix> p_m,
xgboost::bst_ulong const **out_shape, xgboost::bst_ulong *out_dim,
const float **out_result) {
@@ -124,7 +148,6 @@ int InplacePreidctCuda(BoosterHandle handle, char const *c_array_interface,
proxy->SetCUDAArray(c_array_interface);
auto config = Json::Load(StringView{c_json_config});
CHECK_EQ(get<Integer const>(config["cache_id"]), 0) << "Cache ID is not supported yet";
auto *learner = static_cast<Learner *>(handle);
HostDeviceVector<float> *p_predt{nullptr};
@@ -135,7 +158,10 @@ int InplacePreidctCuda(BoosterHandle handle, char const *c_array_interface,
RequiredArg<Integer>(config, "iteration_begin", __func__),
RequiredArg<Integer>(config, "iteration_end", __func__));
CHECK(p_predt);
CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead());
if (learner->Ctx()->IsCUDA()) {
CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead());
}
p_predt->SetDevice(proxy->DeviceIdx());
auto &shape = learner->GetThreadLocal().prediction_shape;
size_t n_samples = p_m->Info().num_row_;
@@ -163,7 +189,7 @@ XGB_DLL int XGBoosterPredictFromCudaColumnar(BoosterHandle handle, char const *c
if (m) {
p_m = *static_cast<std::shared_ptr<DMatrix> *>(m);
}
return InplacePreidctCuda(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
return InplacePreidctCUDA(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
out_result);
}
@@ -176,6 +202,6 @@ XGB_DLL int XGBoosterPredictFromCudaArray(BoosterHandle handle, char const *c_js
p_m = *static_cast<std::shared_ptr<DMatrix> *>(m);
}
xgboost_CHECK_C_ARG_PTR(out_result);
return InplacePreidctCuda(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
return InplacePreidctCUDA(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
out_result);
}

View File

@@ -1,5 +1,5 @@
/*!
* Copyright (c) 2015-2022 by Contributors
/**
* Copyright 2015-2023, XGBoost Contributors
* \file c_api_error.h
* \brief Error handling for C API.
*/
@@ -35,8 +35,8 @@
} \
return 0; // NOLINT(*)
#define CHECK_HANDLE() if (handle == nullptr) \
LOG(FATAL) << "DMatrix/Booster has not been initialized or has already been disposed.";
#define CHECK_HANDLE() \
if (handle == nullptr) ::xgboost::detail::EmptyHandle();
/*!
* \brief Set the last error message needed by C API

View File

@@ -7,8 +7,10 @@
#include <algorithm>
#include <cstddef>
#include <functional>
#include <memory> // std::shared_ptr
#include <string>
#include <memory> // for shared_ptr
#include <string> // for string
#include <tuple> // for make_tuple
#include <utility> // for move
#include <vector>
#include "xgboost/c_api.h"
@@ -16,7 +18,7 @@
#include "xgboost/feature_map.h" // for FeatureMap
#include "xgboost/json.h"
#include "xgboost/learner.h"
#include "xgboost/linalg.h" // ArrayInterfaceHandler
#include "xgboost/linalg.h" // ArrayInterfaceHandler, MakeTensorView, ArrayInterfaceStr
#include "xgboost/logging.h"
#include "xgboost/string_view.h" // StringView
@@ -287,6 +289,19 @@ inline std::shared_ptr<DMatrix> CastDMatrixHandle(DMatrixHandle const handle) {
}
namespace detail {
// Abort with a uniform error message for C API calls made with a null handle.
inline void EmptyHandle() {
  LOG(FATAL) << "DMatrix/Booster has not been initialized or has already been disposed.";
}
// Fetch the learner context behind a C API booster handle, failing loudly on a
// null (uninitialized or disposed) handle.
inline xgboost::Context const *BoosterCtx(BoosterHandle handle) {
  if (handle == nullptr) {
    EmptyHandle();
  }
  auto *learner = static_cast<xgboost::Learner *>(handle);
  CHECK(learner);
  return learner->Ctx();
}
template <typename PtrT, typename I, typename T>
void MakeSparseFromPtr(PtrT const *p_indptr, I const *p_indices, T const *p_data,
std::size_t nindptr, std::string *indptr_str, std::string *indices_str,
@@ -334,6 +349,40 @@ void MakeSparseFromPtr(PtrT const *p_indptr, I const *p_indices, T const *p_data
Json::Dump(jindices, indices_str);
Json::Dump(jdata, data_str);
}
/**
 * @brief Make array interface for other language bindings.
 *
 * Wraps the caller's gradient and hessian buffers in (n_samples, n_targets)
 * tensor views and serializes each as a JSON array-interface string.
 *
 * @return Tuple of (gradient interface string, hessian interface string).
 */
template <typename G, typename H>
auto MakeGradientInterface(Context const *ctx, G const *grad, H const *hess, linalg::Order order,
                           std::size_t n_samples, std::size_t n_targets) {
  std::size_t n_total = n_samples * n_targets;
  auto grad_view =
      linalg::MakeTensorView(ctx, order, common::Span{grad, n_total}, n_samples, n_targets);
  auto grad_str = linalg::ArrayInterfaceStr(grad_view);
  auto hess_view =
      linalg::MakeTensorView(ctx, order, common::Span{hess, n_total}, n_samples, n_targets);
  auto hess_str = linalg::ArrayInterfaceStr(hess_view);
  return std::make_tuple(grad_str, hess_str);
}
// Functor that interleaves separate gradient/hessian matrices into a single
// matrix of GradientPair.  operator() is XGBOOST_DEVICE, so the same functor
// is used from both common::ParallelFor (CPU) and thrust kernels (GPU).
template <typename G, typename H>
struct CustomGradHessOp {
  linalg::MatrixView<G> t_grad;
  linalg::MatrixView<H> t_hess;
  linalg::MatrixView<GradientPair> d_gpair;
  CustomGradHessOp(linalg::MatrixView<G> t_grad, linalg::MatrixView<H> t_hess,
                   linalg::MatrixView<GradientPair> d_gpair)
      : t_grad{std::move(t_grad)}, t_hess{std::move(t_hess)}, d_gpair{std::move(d_gpair)} {}
  // i is a flattened element index, decomposed into (row, column) via
  // linalg::UnravelIndex against the gradient's shape.
  XGBOOST_DEVICE void operator()(std::size_t i) {
    auto [m, n] = linalg::UnravelIndex(i, t_grad.Shape(0), t_grad.Shape(1));
    auto g = t_grad(m, n);
    auto h = t_hess(m, n);
    // from struct of arrays to array of structs.
    d_gpair(m, n) = GradientPair{static_cast<float>(g), static_cast<float>(h)};
  }
};
} // namespace detail
} // namespace xgboost
#endif // XGBOOST_C_API_C_API_UTILS_H_

View File

@@ -345,10 +345,10 @@ class CLI {
void LoadModel(std::string const& path, Learner* learner) const {
if (common::FileExtension(path) == "json") {
auto str = common::LoadSequentialFile(path);
CHECK_GT(str.size(), 2);
CHECK_EQ(str[0], '{');
Json in{Json::Load({str.c_str(), str.size()})};
auto buffer = common::LoadSequentialFile(path);
CHECK_GT(buffer.size(), 2);
CHECK_EQ(buffer[0], '{');
Json in{Json::Load({buffer.data(), buffer.size()})};
learner->LoadModel(in);
} else {
std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(path.c_str(), "r"));
@@ -514,7 +514,9 @@ class CLI {
};
} // namespace xgboost
int main(int argc, char *argv[]) {
int main(int argc, char* argv[]) {
LOG(WARNING)
<< "The command line interface is deprecated and will be removed in future releases.";
try {
xgboost::CLI cli(argc, argv);
return cli.Run();

View File

@@ -0,0 +1,40 @@
/**
* Copyright 2023 by XGBoost contributors
*
* Higher level functions built on top the Communicator API, taking care of behavioral differences
* between row-split vs column-split distributed training, and horizontal vs vertical federated
* learning.
*/
#pragma once
#include <xgboost/data.h>
#include <limits>
#include <string>
#include <utility>
#include <vector>
#include "communicator-inl.cuh"
namespace xgboost {
namespace collective {
/**
 * @brief Find the global sum of the given values across all workers.
 *
 * Only meaningful when the data is split row-wise (horizontally).  With a
 * column-wise (vertical) split the inputs are left untouched.
 *
 * @tparam T The type of the values.
 * @param info MetaInfo about the DMatrix.
 * @param device The device id.
 * @param values Pointer to the inputs to sum.
 * @param size Number of values to sum.
 */
template <typename T>
void GlobalSum(MetaInfo const& info, int device, T* values, size_t size) {
  if (!info.IsRowSplit()) {
    return;  // vertical split: every worker keeps its local values
  }
  collective::AllReduce<collective::Operation::kSum>(device, values, size);
}
} // namespace collective
} // namespace xgboost

View File

@@ -26,7 +26,6 @@ namespace collective {
* applied there, with the results broadcast to other workers.
*
* @tparam Function The function used to calculate the results.
* @tparam Args Arguments to the function.
* @param info MetaInfo about the DMatrix.
* @param buffer The buffer storing the results.
* @param size The size of the buffer.
@@ -57,6 +56,52 @@ void ApplyWithLabels(MetaInfo const& info, void* buffer, size_t size, Function&&
}
}
/**
 * @brief Apply the given function where the labels are.
 *
 * Normally all the workers have access to the labels, so the function is just applied locally. In
 * vertical federated learning, we assume labels are only available on worker 0, so the function is
 * applied there, with the results broadcast to other workers.
 *
 * @tparam T Type of the HostDeviceVector storing the results.
 * @tparam Function The function used to calculate the results.
 * @param info MetaInfo about the DMatrix.
 * @param result The HostDeviceVector storing the results.
 * @param function The function used to calculate the results.
 */
template <typename T, typename Function>
void ApplyWithLabels(MetaInfo const& info, HostDeviceVector<T>* result, Function&& function) {
  if (info.IsVerticalFederated()) {
    // We assume labels are only available on worker 0, so the calculation is done there and result
    // broadcast to other workers.
    std::string message;
    if (collective::GetRank() == 0) {
      try {
        std::forward<Function>(function)();
      } catch (dmlc::Error& e) {
        // Capture the error text so the failure can be re-raised everywhere.
        message = e.what();
      }
    }
    // Broadcast the (possibly empty) error message so all workers fail together.
    collective::Broadcast(&message, 0);
    if (!message.empty()) {
      LOG(FATAL) << &message[0];
      return;  // defensive; LOG(FATAL) is presumably non-returning — confirm
    }
    // Worker 0 knows the result size; share it, then the payload itself.
    std::size_t size{};
    if (collective::GetRank() == 0) {
      size = result->Size();
    }
    collective::Broadcast(&size, sizeof(std::size_t), 0);
    result->Resize(size);
    collective::Broadcast(result->HostPointer(), size * sizeof(T), 0);
  } else {
    // Non-federated (or horizontal) case: labels are available locally.
    std::forward<Function>(function)();
  }
}
/**
* @brief Find the global max of the given value across all workers.
*

View File

@@ -57,6 +57,20 @@ inline void AllReduce(int device, double *send_receive_buffer, size_t count) {
Communicator::GetDevice(device)->AllReduce(send_receive_buffer, count, DataType::kDouble, op);
}
/**
 * @brief Gather values from all processes.
 *
 * This assumes all ranks have the same size.
 *
 * @param device ID of the device.
 * @param send_buffer Buffer storing the data to be sent.
 * @param receive_buffer Buffer storing the gathered data.
 * @param send_size Size of the sent data in bytes.
 */
inline void AllGather(int device, void const *send_buffer, void *receive_buffer,
                      std::size_t send_size) {
  Communicator::GetDevice(device)->AllGather(send_buffer, receive_buffer, send_size);
}
/**
* @brief Gather variable-length values from all processes.
* @param device ID of the device.

View File

@@ -41,7 +41,8 @@ void Communicator::Init(Json const& config) {
#endif
break;
}
case CommunicatorType::kInMemory: {
case CommunicatorType::kInMemory:
case CommunicatorType::kInMemoryNccl: {
communicator_.reset(InMemoryCommunicator::Create(config));
break;
}

View File

@@ -29,13 +29,22 @@ DeviceCommunicator* Communicator::GetDevice(int device_ordinal) {
old_device_ordinal = device_ordinal;
old_world_size = communicator_->GetWorldSize();
#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
if (type_ != CommunicatorType::kFederated) {
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, Get()));
} else {
device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal, Get()));
switch (type_) {
case CommunicatorType::kRabit:
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false));
break;
case CommunicatorType::kFederated:
case CommunicatorType::kInMemory:
device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal));
break;
case CommunicatorType::kInMemoryNccl:
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, true));
break;
default:
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false));
}
#else
device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal, Get()));
device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal));
#endif
}
return device_communicator_.get();

View File

@@ -69,7 +69,7 @@ enum class Operation {
class DeviceCommunicator;
enum class CommunicatorType { kUnknown, kRabit, kFederated, kInMemory };
enum class CommunicatorType { kUnknown, kRabit, kFederated, kInMemory, kInMemoryNccl };
/** \brief Case-insensitive string comparison. */
inline int CompareStringsCaseInsensitive(const char *s1, const char *s2) {
@@ -220,6 +220,8 @@ class Communicator {
result = CommunicatorType::kFederated;
} else if (!CompareStringsCaseInsensitive("in-memory", str)) {
result = CommunicatorType::kInMemory;
} else if (!CompareStringsCaseInsensitive("in-memory-nccl", str)) {
result = CommunicatorType::kInMemoryNccl;
} else {
LOG(FATAL) << "Unknown communicator type " << str;
}

View File

@@ -27,6 +27,17 @@ class DeviceCommunicator {
virtual void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
Operation op) = 0;
/**
 * @brief Gather values from all processes.
 *
 * This assumes all ranks have the same size.
 *
 * @param send_buffer Buffer storing the data to be sent.
 * @param receive_buffer Buffer storing the gathered data.
 * @param send_size Size of the sent data in bytes.
 */
virtual void AllGather(void const *send_buffer, void *receive_buffer, std::size_t send_size) = 0;
/**
* @brief Gather variable-length values from all processes.
* @param send_buffer Buffer storing the input data.

View File

@@ -11,21 +11,18 @@ namespace collective {
class DeviceCommunicatorAdapter : public DeviceCommunicator {
public:
DeviceCommunicatorAdapter(int device_ordinal, Communicator *communicator)
: device_ordinal_{device_ordinal}, communicator_{communicator} {
explicit DeviceCommunicatorAdapter(int device_ordinal)
: device_ordinal_{device_ordinal}, world_size_{GetWorldSize()}, rank_{GetRank()} {
if (device_ordinal_ < 0) {
LOG(FATAL) << "Invalid device ordinal: " << device_ordinal_;
}
if (communicator_ == nullptr) {
LOG(FATAL) << "Communicator cannot be null.";
}
}
~DeviceCommunicatorAdapter() override = default;
void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
Operation op) override {
if (communicator_->GetWorldSize() == 1) {
if (world_size_ == 1) {
return;
}
@@ -35,62 +32,82 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
dh::safe_cuda(hipSetDevice(device_ordinal_));
#endif
auto size = count * GetTypeSize(data_type);
host_buffer_.reserve(size);
host_buffer_.resize(size);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpy(host_buffer_.data(), send_receive_buffer, size, cudaMemcpyDefault));
communicator_->AllReduce(host_buffer_.data(), count, data_type, op);
Allreduce(host_buffer_.data(), count, data_type, op);
dh::safe_cuda(cudaMemcpy(send_receive_buffer, host_buffer_.data(), size, cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(host_buffer_.data(), send_receive_buffer, size, hipMemcpyDefault));
communicator_->AllReduce(host_buffer_.data(), count, data_type, op);
AllReduce(host_buffer_.data(), count, data_type, op);
dh::safe_cuda(hipMemcpy(send_receive_buffer, host_buffer_.data(), size, hipMemcpyDefault));
#endif
}
// Device AllGather staged through a host buffer: copy the local slice into
// this rank's offset, run the host-side Allgather, then copy the fully
// gathered buffer back to the device.
void AllGather(void const *send_buffer, void *receive_buffer, std::size_t send_size) override {
  if (world_size_ == 1) {
    // NOTE(review): with a single worker receive_buffer is not filled from
    // send_buffer — confirm callers handle this.
    return;
  }
#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_ordinal_));
  host_buffer_.resize(send_size * world_size_);
  dh::safe_cuda(cudaMemcpy(host_buffer_.data() + rank_ * send_size, send_buffer, send_size,
                           cudaMemcpyDefault));
  Allgather(host_buffer_.data(), host_buffer_.size());
  dh::safe_cuda(
      cudaMemcpy(receive_buffer, host_buffer_.data(), host_buffer_.size(), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
  dh::safe_cuda(hipSetDevice(device_ordinal_));
  host_buffer_.resize(send_size * world_size_);
  dh::safe_cuda(hipMemcpy(host_buffer_.data() + rank_ * send_size, send_buffer, send_size,
                          hipMemcpyDefault));
  Allgather(host_buffer_.data(), host_buffer_.size());
  dh::safe_cuda(
      hipMemcpy(receive_buffer, host_buffer_.data(), host_buffer_.size(), hipMemcpyDefault));
#endif
}
void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector<std::size_t> *segments,
dh::caching_device_vector<char> *receive_buffer) override {
if (communicator_->GetWorldSize() == 1) {
if (world_size_ == 1) {
return;
}
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_ordinal_));
#else
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_ordinal_));
#endif
int const world_size = communicator_->GetWorldSize();
int const rank = communicator_->GetRank();
segments->clear();
segments->resize(world_size, 0);
segments->at(rank) = length_bytes;
communicator_->AllReduce(segments->data(), segments->size(), DataType::kUInt64,
Operation::kMax);
segments->resize(world_size_, 0);
segments->at(rank_) = length_bytes;
Allreduce(segments->data(), segments->size(), DataType::kUInt64, Operation::kMax);
auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), 0UL);
receive_buffer->resize(total_bytes);
host_buffer_.reserve(total_bytes);
host_buffer_.resize(total_bytes);
size_t offset = 0;
for (int32_t i = 0; i < world_size; ++i) {
for (int32_t i = 0; i < world_size_; ++i) {
size_t as_bytes = segments->at(i);
if (i == rank) {
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank),
hipMemcpyDefault));
#else
dh::safe_cuda(cudaMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank),
if (i == rank_) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank_),
cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank_),
hipMemcpyDefault));
#endif
}
communicator_->Broadcast(host_buffer_.data() + offset, as_bytes, i);
Broadcast(host_buffer_.data() + offset, as_bytes, i);
offset += as_bytes;
}
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(receive_buffer->data().get(), host_buffer_.data(), total_bytes,
hipMemcpyDefault));
#else
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpy(receive_buffer->data().get(), host_buffer_.data(), total_bytes,
cudaMemcpyDefault));
#endif
@@ -102,7 +119,8 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
private:
int const device_ordinal_;
Communicator *communicator_;
int const world_size_;
int const rank_;
/// Host buffer used to call communicator functions.
std::vector<char> host_buffer_{};
};

View File

@@ -0,0 +1,229 @@
/*!
* Copyright 2023 XGBoost contributors
*/
#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
#include "nccl_device_communicator.cuh"
namespace xgboost {
namespace collective {
/**
 * @brief Construct an NCCL communicator bound to a single GPU.
 *
 * Validates the device ordinal, verifies that no two ranks share the same GPU,
 * and initializes the NCCL communicator across all ranks. For a single-worker
 * run no NCCL state is created at all.
 *
 * @param device_ordinal CUDA device id this rank runs on; must be >= 0.
 * @param needs_sync     Whether extra stream synchronization is required
 *                       (used later by BitwiseAllReduce).
 */
NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sync)
    : device_ordinal_{device_ordinal},
      needs_sync_{needs_sync},
      world_size_{GetWorldSize()},
      rank_{GetRank()} {
  if (device_ordinal_ < 0) {
    LOG(FATAL) << "Invalid device ordinal: " << device_ordinal_;
  }
  if (world_size_ == 1) {
    // Nothing to communicate with; skip NCCL initialization entirely.
    return;
  }

  // Gather every rank's GPU UUID so we can detect ranks sharing one device.
  std::vector<uint64_t> uuids(world_size_ * kUuidLength, 0);
  auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
  auto s_this_uuid = s_uuid.subspan(rank_ * kUuidLength, kUuidLength);
  GetCudaUUID(s_this_uuid);

  // TODO(rongou): replace this with allgather.
  // Each rank writes only its own slot (rest are zero), so a sum acts as a gather.
  Allreduce(uuids.data(), uuids.size(), DataType::kUInt64, Operation::kSum);

  std::vector<xgboost::common::Span<uint64_t, kUuidLength>> converted(world_size_);
  size_t j = 0;
  for (size_t i = 0; i < uuids.size(); i += kUuidLength) {
    converted[j] = xgboost::common::Span<uint64_t, kUuidLength>{uuids.data() + i, kUuidLength};
    j++;
  }

  // NOTE(review): std::unique only collapses *adjacent* duplicates — this
  // presumably assumes ranks on the same device occupy consecutive slots;
  // confirm against the launcher's rank assignment.
  auto iter = std::unique(converted.begin(), converted.end());
  auto n_uniques = std::distance(converted.begin(), iter);
  CHECK_EQ(n_uniques, world_size_)
      << "Multiple processes within communication group running on same CUDA "
      << "device is not supported. " << PrintUUID(s_this_uuid) << "\n";

  nccl_unique_id_ = GetUniqueId();
  dh::safe_cuda(cudaSetDevice(device_ordinal_));
  dh::safe_nccl(ncclCommInitRank(&nccl_comm_, world_size_, nccl_unique_id_, rank_));
}
/**
 * @brief Tear down the NCCL communicator and optionally report statistics.
 *
 * In a single-worker run the constructor created no NCCL state, so there is
 * nothing to destroy. Statistics are only printed at debug verbosity.
 */
NcclDeviceCommunicator::~NcclDeviceCommunicator() {
  if (world_size_ != 1) {
    if (nccl_comm_) {
      dh::safe_nccl(ncclCommDestroy(nccl_comm_));
    }
    if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
      LOG(CONSOLE) << "======== NCCL Statistics========";
      LOG(CONSOLE) << "AllReduce calls: " << allreduce_calls_;
      LOG(CONSOLE) << "AllReduce total MiB communicated: " << allreduce_bytes_ / 1048576;
    }
  }
}
namespace {
/**
 * @brief Map an xgboost collective DataType onto the matching ncclDataType_t.
 *
 * Aborts via LOG(FATAL) on an unrecognized data type.
 */
ncclDataType_t GetNcclDataType(DataType const &data_type) {
  switch (data_type) {
    case DataType::kInt8:
      return ncclInt8;
    case DataType::kUInt8:
      return ncclUint8;
    case DataType::kInt32:
      return ncclInt32;
    case DataType::kUInt32:
      return ncclUint32;
    case DataType::kInt64:
      return ncclInt64;
    case DataType::kUInt64:
      return ncclUint64;
    case DataType::kFloat:
      return ncclFloat;
    case DataType::kDouble:
      return ncclDouble;
    default:
      LOG(FATAL) << "Unknown data type.";
  }
  return ncclInt8;  // Unreachable: LOG(FATAL) aborts; keeps the compiler quiet.
}
/// @brief True iff @p op is one of the bitwise reductions (AND/OR/XOR),
/// which NCCL does not support natively.
bool IsBitwiseOp(Operation const &op) {
  switch (op) {
    case Operation::kBitwiseAND:
    case Operation::kBitwiseOR:
    case Operation::kBitwiseXOR:
      return true;
    default:
      return false;
  }
}
/**
 * @brief Map an arithmetic reduce Operation onto the matching ncclRedOp_t.
 *
 * Bitwise operations are intentionally absent here — they are routed through
 * BitwiseAllReduce instead. Aborts via LOG(FATAL) on anything unsupported.
 */
ncclRedOp_t GetNcclRedOp(Operation const &op) {
  switch (op) {
    case Operation::kMax:
      return ncclMax;
    case Operation::kMin:
      return ncclMin;
    case Operation::kSum:
      return ncclSum;
    default:
      LOG(FATAL) << "Unsupported reduce operation.";
  }
  return ncclMax;  // Unreachable: LOG(FATAL) aborts; keeps the compiler quiet.
}
/**
 * @brief Locally reduce per-rank byte buffers (already gathered on this device)
 *        with a bitwise functor.
 *
 * @param out_buffer    Device output, `size` bytes.
 * @param device_buffer Device input laid out as world_size contiguous segments
 *                      of `size` bytes each (rank r's bytes start at r * size).
 * @param func          Binary bitwise functor applied byte-wise (e.g. bit_and).
 * @param world_size    Number of ranks / segments in device_buffer.
 * @param size          Bytes per segment; one GPU thread handles one byte index.
 */
template <typename Func>
void RunBitwiseAllreduce(char *out_buffer, char const *device_buffer, Func func, int world_size,
                         std::size_t size) {
  dh::LaunchN(size, [=] __device__(std::size_t idx) {
    // Fold rank 0's byte with the byte at the same offset from every other rank.
    auto result = device_buffer[idx];
    for (auto rank = 1; rank < world_size; rank++) {
      result = func(result, device_buffer[rank * size + idx]);
    }
    out_buffer[idx] = result;
  });
}
} // anonymous namespace
void NcclDeviceCommunicator::BitwiseAllReduce(void *send_receive_buffer, std::size_t count,
DataType data_type, Operation op) {
auto const size = count * GetTypeSize(data_type);
dh::caching_device_vector<char> buffer(size * world_size_);
auto *device_buffer = buffer.data().get();
// First gather data from all the workers.
dh::safe_nccl(ncclAllGather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type),
nccl_comm_, dh::DefaultStream()));
if (needs_sync_) {
dh::DefaultStream().Sync();
}
// Then reduce locally.
auto *out_buffer = static_cast<char *>(send_receive_buffer);
switch (op) {
case Operation::kBitwiseAND:
RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_and<char>(), world_size_, size);
break;
case Operation::kBitwiseOR:
RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_or<char>(), world_size_, size);
break;
case Operation::kBitwiseXOR:
RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_xor<char>(), world_size_, size);
break;
default:
LOG(FATAL) << "Not a bitwise reduce operation.";
}
}
/**
 * @brief Allreduce @p count elements of @p data_type in place with @p op.
 *
 * Arithmetic reductions go straight to NCCL; bitwise ones take the emulated
 * BitwiseAllReduce path. A single-worker run is a no-op.
 */
void NcclDeviceCommunicator::AllReduce(void *send_receive_buffer, std::size_t count,
                                       DataType data_type, Operation op) {
  if (world_size_ == 1) {
    return;  // Nothing to reduce against.
  }
  dh::safe_cuda(cudaSetDevice(device_ordinal_));
  if (!IsBitwiseOp(op)) {
    dh::safe_nccl(ncclAllReduce(send_receive_buffer, send_receive_buffer, count,
                                GetNcclDataType(data_type), GetNcclRedOp(op), nccl_comm_,
                                dh::DefaultStream()));
  } else {
    // NCCL has no bitwise reductions; emulate with allgather + local reduce.
    BitwiseAllReduce(send_receive_buffer, count, data_type, op);
  }
  // Bookkeeping for the debug statistics printed by the destructor.
  allreduce_bytes_ += count * GetTypeSize(data_type);
  allreduce_calls_ += 1;
}
/**
 * @brief Gather @p send_size bytes from every rank into @p receive_buffer.
 *
 * The payload is treated as raw bytes (ncclInt8); @p receive_buffer must hold
 * world_size * send_size bytes. A single-worker run is a no-op.
 */
void NcclDeviceCommunicator::AllGather(void const *send_buffer, void *receive_buffer,
                                       std::size_t send_size) {
  if (world_size_ != 1) {
    dh::safe_cuda(cudaSetDevice(device_ordinal_));
    dh::safe_nccl(ncclAllGather(send_buffer, receive_buffer, send_size, ncclInt8, nccl_comm_,
                                dh::DefaultStream()));
  }
}
/**
 * @brief Variable-length allgather over NCCL.
 *
 * First exchanges the per-rank byte counts (each rank fills only its own slot,
 * so a max-reduction yields everyone's length), then broadcasts each rank's
 * payload into its slice of @p receive_buffer inside one NCCL group.
 *
 * @param send_buffer    Device buffer with this rank's payload.
 * @param length_bytes   Number of bytes this rank contributes.
 * @param segments       Out: per-rank byte counts (resized to world size).
 * @param receive_buffer Out: device buffer with all payloads concatenated.
 */
void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_bytes,
                                        std::vector<std::size_t> *segments,
                                        dh::caching_device_vector<char> *receive_buffer) {
  if (world_size_ == 1) {
    return;
  }
  dh::safe_cuda(cudaSetDevice(device_ordinal_));

  segments->clear();
  segments->resize(world_size_, 0);
  segments->at(rank_) = length_bytes;
  Allreduce(segments->data(), segments->size(), DataType::kUInt64, Operation::kMax);
  // Accumulate in std::size_t: the previous 0UL init is only 32 bits on LLP64
  // platforms (e.g. 64-bit Windows) and could overflow the total byte count.
  auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), std::size_t{0});
  receive_buffer->resize(total_bytes);

  size_t offset = 0;
  dh::safe_nccl(ncclGroupStart());
  for (int32_t i = 0; i < world_size_; ++i) {
    size_t as_bytes = segments->at(i);
    // For rank i's broadcast, only rank i's send_buffer is read; the other
    // ranks just receive into their slice.
    dh::safe_nccl(ncclBroadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes,
                                ncclChar, i, nccl_comm_, dh::DefaultStream()));
    offset += as_bytes;
  }
  dh::safe_nccl(ncclGroupEnd());
}
/// @brief Block until all NCCL work queued on the default stream has finished.
/// No-op for a single-worker run.
void NcclDeviceCommunicator::Synchronize() {
  if (world_size_ != 1) {
    dh::safe_cuda(cudaSetDevice(device_ordinal_));
    dh::DefaultStream().Sync();
  }
}
} // namespace collective
} // namespace xgboost
#endif

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2022 XGBoost contributors
* Copyright 2022-2023 XGBoost contributors
*/
#pragma once
@@ -12,136 +12,27 @@ namespace collective {
class NcclDeviceCommunicator : public DeviceCommunicator {
public:
NcclDeviceCommunicator(int device_ordinal, Communicator *communicator)
: device_ordinal_{device_ordinal}, communicator_{communicator} {
if (device_ordinal_ < 0) {
LOG(FATAL) << "Invalid device ordinal: " << device_ordinal_;
}
if (communicator_ == nullptr) {
LOG(FATAL) << "Communicator cannot be null.";
}
int32_t const rank = communicator_->GetRank();
int32_t const world = communicator_->GetWorldSize();
if (world == 1) {
return;
}
std::vector<uint64_t> uuids(world * kUuidLength, 0);
auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
auto s_this_uuid = s_uuid.subspan(rank * kUuidLength, kUuidLength);
GetCudaUUID(s_this_uuid);
// TODO(rongou): replace this with allgather.
communicator_->AllReduce(uuids.data(), uuids.size(), DataType::kUInt64, Operation::kSum);
std::vector<xgboost::common::Span<uint64_t, kUuidLength>> converted(world);
size_t j = 0;
for (size_t i = 0; i < uuids.size(); i += kUuidLength) {
converted[j] = xgboost::common::Span<uint64_t, kUuidLength>{uuids.data() + i, kUuidLength};
j++;
}
auto iter = std::unique(converted.begin(), converted.end());
auto n_uniques = std::distance(converted.begin(), iter);
CHECK_EQ(n_uniques, world)
<< "Multiple processes within communication group running on same CUDA "
<< "device is not supported. " << PrintUUID(s_this_uuid) << "\n";
nccl_unique_id_ = GetUniqueId();
dh::safe_nccl(ncclCommInitRank(&nccl_comm_, world, nccl_unique_id_, rank));
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipStreamCreate(&cuda_stream_));
#else
dh::safe_cuda(cudaStreamCreate(&cuda_stream_));
#endif
}
~NcclDeviceCommunicator() override {
if (communicator_->GetWorldSize() == 1) {
return;
}
if (cuda_stream_) {
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipStreamDestroy(cuda_stream_));
#else
dh::safe_cuda(cudaStreamDestroy(cuda_stream_));
#endif
}
if (nccl_comm_) {
dh::safe_nccl(ncclCommDestroy(nccl_comm_));
}
if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
LOG(CONSOLE) << "======== NCCL Statistics========";
LOG(CONSOLE) << "AllReduce calls: " << allreduce_calls_;
LOG(CONSOLE) << "AllReduce total MiB communicated: " << allreduce_bytes_ / 1048576;
}
}
/**
* @brief Construct a new NCCL communicator.
* @param device_ordinal The GPU device id.
* @param needs_sync Whether extra CUDA stream synchronization is needed.
*
* In multi-GPU tests when multiple NCCL communicators are created in the same process, sometimes
* a deadlock happens because NCCL kernels are blocking. The extra CUDA stream synchronization
* makes sure that the NCCL kernels are caught up, thus avoiding the deadlock.
*
* The Rabit communicator runs with one process per GPU, so the additional synchronization is not
* needed. The in-memory communicator is used in tests with multiple threads, each thread
* representing a rank/worker, so the additional synchronization is needed to avoid deadlocks.
*/
explicit NcclDeviceCommunicator(int device_ordinal, bool needs_sync);
~NcclDeviceCommunicator() override;
void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
Operation op) override {
if (communicator_->GetWorldSize() == 1) {
return;
}
dh::safe_cuda(cudaSetDevice(device_ordinal_));
dh::safe_nccl(ncclAllReduce(send_receive_buffer, send_receive_buffer, count,
GetNcclDataType(data_type), GetNcclRedOp(op), nccl_comm_,
cuda_stream_));
allreduce_bytes_ += count * GetTypeSize(data_type);
allreduce_calls_ += 1;
}
Operation op) override;
void AllGather(void const *send_buffer, void *receive_buffer, std::size_t send_size) override;
void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector<std::size_t> *segments,
dh::caching_device_vector<char> *receive_buffer) override {
if (communicator_->GetWorldSize() == 1) {
return;
}
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_ordinal_));
#else
dh::safe_cuda(cudaSetDevice(device_ordinal_));
#endif
int const world_size = communicator_->GetWorldSize();
int const rank = communicator_->GetRank();
segments->clear();
segments->resize(world_size, 0);
segments->at(rank) = length_bytes;
communicator_->AllReduce(segments->data(), segments->size(), DataType::kUInt64,
Operation::kMax);
auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), 0UL);
receive_buffer->resize(total_bytes);
size_t offset = 0;
dh::safe_nccl(ncclGroupStart());
for (int32_t i = 0; i < world_size; ++i) {
size_t as_bytes = segments->at(i);
dh::safe_nccl(ncclBroadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes,
ncclChar, i, nccl_comm_, cuda_stream_));
offset += as_bytes;
}
dh::safe_nccl(ncclGroupEnd());
}
void Synchronize() override {
if (communicator_->GetWorldSize() == 1) {
return;
}
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_ordinal_));
dh::safe_cuda(hipStreamSynchronize(cuda_stream_));
#else
dh::safe_cuda(cudaSetDevice(device_ordinal_));
dh::safe_cuda(cudaStreamSynchronize(cuda_stream_));
#endif
}
dh::caching_device_vector<char> *receive_buffer) override;
void Synchronize() override;
private:
static constexpr std::size_t kUuidLength =
@@ -182,79 +73,21 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
/**
 * @brief Create an NCCL unique id on the root rank and share it with all ranks.
 *
 * This block was a corrupted merge with old (`communicator_->`) and new
 * (`rank_` member / free `Broadcast`) lines interleaved; this is the
 * reconstructed merged version. Every rank must call this, since the id is
 * distributed via a collective broadcast.
 */
ncclUniqueId GetUniqueId() {
  static const int kRootRank = 0;
  ncclUniqueId id;
  if (rank_ == kRootRank) {
    dh::safe_nccl(ncclGetUniqueId(&id));
  }
  Broadcast(static_cast<void *>(&id), sizeof(ncclUniqueId), static_cast<int>(kRootRank));
  return id;
}
static ncclDataType_t GetNcclDataType(DataType const &data_type) {
ncclDataType_t result;
switch (data_type) {
case DataType::kInt8:
result = ncclInt8;
break;
case DataType::kUInt8:
result = ncclUint8;
break;
case DataType::kInt32:
result = ncclInt32;
break;
case DataType::kUInt32:
result = ncclUint32;
break;
case DataType::kInt64:
result = ncclInt64;
break;
case DataType::kUInt64:
result = ncclUint64;
break;
case DataType::kFloat:
result = ncclFloat;
break;
case DataType::kDouble:
result = ncclDouble;
break;
default:
LOG(FATAL) << "Unknown data type.";
}
return result;
}
static ncclRedOp_t GetNcclRedOp(Operation const &op) {
ncclRedOp_t result;
switch (op) {
case Operation::kMax:
result = ncclMax;
break;
case Operation::kMin:
result = ncclMin;
break;
case Operation::kSum:
result = ncclSum;
break;
case Operation::kBitwiseAND:
case Operation::kBitwiseOR:
case Operation::kBitwiseXOR:
LOG(FATAL) << "Not implemented yet.";
default:
LOG(FATAL) << "Unknown reduce operation.";
}
return result;
}
void BitwiseAllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
Operation op);
int const device_ordinal_;
Communicator *communicator_;
bool const needs_sync_;
int const world_size_;
int const rank_;
ncclComm_t nccl_comm_{};
#if defined(XGBOOST_USE_HIP)
hipStream_t cuda_stream_{};
#else
cudaStream_t cuda_stream_{};
#endif
ncclUniqueId nccl_unique_id_{};
size_t allreduce_bytes_{0}; // Keep statistics of the number of bytes communicated.
size_t allreduce_calls_{0}; // Keep statistics of the number of reduce calls.

View File

@@ -1,19 +1,22 @@
/*!
* Copyright (c) 2022 by XGBoost Contributors
/**
* Copyright 2022-2023 by XGBoost Contributors
*/
#include "xgboost/collective/socket.h"
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t
#include <cstring> // std::memcpy, std::memset
#include <filesystem> // for path
#include <system_error> // std::error_code, std::system_category
#include "rabit/internal/socket.h" // for PollHelper
#include "xgboost/collective/result.h" // for Result
#if defined(__unix__) || defined(__APPLE__)
#include <netdb.h> // getaddrinfo, freeaddrinfo
#endif // defined(__unix__) || defined(__APPLE__)
namespace xgboost {
namespace collective {
namespace xgboost::collective {
SockAddress MakeSockAddress(StringView host, in_port_t port) {
struct addrinfo hints;
std::memset(&hints, 0, sizeof(hints));
@@ -71,7 +74,12 @@ std::size_t TCPSocket::Recv(std::string *p_str) {
return bytes;
}
std::error_code Connect(SockAddress const &addr, TCPSocket *out) {
[[nodiscard]] Result Connect(xgboost::StringView host, std::int32_t port, std::int32_t retry,
std::chrono::seconds timeout,
xgboost::collective::TCPSocket *out_conn) {
auto addr = MakeSockAddress(xgboost::StringView{host}, port);
auto &conn = *out_conn;
sockaddr const *addr_handle{nullptr};
socklen_t addr_len{0};
if (addr.IsV4()) {
@@ -81,14 +89,67 @@ std::error_code Connect(SockAddress const &addr, TCPSocket *out) {
addr_handle = reinterpret_cast<const sockaddr *>(&addr.V6().Handle());
addr_len = sizeof(addr.V6().Handle());
}
auto socket = TCPSocket::Create(addr.Domain());
CHECK_EQ(static_cast<std::int32_t>(socket.Domain()), static_cast<std::int32_t>(addr.Domain()));
auto rc = connect(socket.Handle(), addr_handle, addr_len);
if (rc != 0) {
return std::error_code{errno, std::system_category()};
conn = TCPSocket::Create(addr.Domain());
CHECK_EQ(static_cast<std::int32_t>(conn.Domain()), static_cast<std::int32_t>(addr.Domain()));
conn.SetNonBlock(true);
Result last_error;
auto log_failure = [&host, &last_error](Result err, char const *file, std::int32_t line) {
last_error = std::move(err);
LOG(WARNING) << std::filesystem::path{file}.filename().string() << "(" << line
<< "): Failed to connect to:" << host << " Error:" << last_error.Report();
};
for (std::int32_t attempt = 0; attempt < std::max(retry, 1); ++attempt) {
if (attempt > 0) {
LOG(WARNING) << "Retrying connection to " << host << " for the " << attempt << " time.";
#if defined(_MSC_VER) || defined(__MINGW32__)
Sleep(attempt << 1);
#else
sleep(attempt << 1);
#endif
}
auto rc = connect(conn.Handle(), addr_handle, addr_len);
if (rc != 0) {
auto errcode = system::LastError();
if (!system::ErrorWouldBlock(errcode)) {
log_failure(Fail("connect failed.", std::error_code{errcode, std::system_category()}),
__FILE__, __LINE__);
continue;
}
rabit::utils::PollHelper poll;
poll.WatchWrite(conn);
auto result = poll.Poll(timeout);
if (!result.OK()) {
log_failure(std::move(result), __FILE__, __LINE__);
continue;
}
if (!poll.CheckWrite(conn)) {
log_failure(Fail("poll failed.", std::error_code{errcode, std::system_category()}),
__FILE__, __LINE__);
continue;
}
result = conn.GetSockError();
if (!result.OK()) {
log_failure(std::move(result), __FILE__, __LINE__);
continue;
}
conn.SetNonBlock(false);
return Success();
} else {
conn.SetNonBlock(false);
return Success();
}
}
*out = std::move(socket);
return std::make_error_code(std::errc{});
std::stringstream ss;
ss << "Failed to connect to " << host << ":" << port;
conn.Close();
return Fail(ss.str(), std::move(last_error));
}
} // namespace collective
} // namespace xgboost
} // namespace xgboost::collective

View File

@@ -188,7 +188,7 @@ void SegmentedArgSort(Context const *ctx, Span<U> values, Span<V> group_ptr,
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
sorted_idx.size_bytes(), hipMemcpyDeviceToDevice));
#else
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice));
#endif

View File

@@ -24,6 +24,8 @@ struct XGBAPIThreadLocalEntry {
std::vector<const char *> ret_vec_charp;
/*! \brief returning float vector. */
std::vector<float> ret_vec_float;
/*! \brief returning uint vector. */
std::vector<std::uint64_t> ret_vec_u64;
/*! \brief temp variable of gradient pairs. */
std::vector<GradientPair> tmp_gpair;
/*! \brief Temp variable for returning prediction result. */

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2019 by Contributors
/**
* Copyright 2019-2023, XGBoost Contributors
* \file bitfield.h
*/
#ifndef XGBOOST_COMMON_BITFIELD_H_
@@ -54,14 +54,17 @@ __forceinline__ __device__ BitFieldAtomicType AtomicAnd(BitFieldAtomicType* addr
}
#endif // defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
/*!
* \brief A non-owning type with auxiliary methods defined for manipulating bits.
/**
* @brief A non-owning type with auxiliary methods defined for manipulating bits.
*
* \tparam Direction Whether the bits start from left or from right.
* @tparam VT Underlying value type, must be an unsigned integer.
* @tparam Direction Whether the bits start from left or from right.
* @tparam IsConst Whether the view is const.
*/
template <typename VT, typename Direction, bool IsConst = false>
struct BitFieldContainer {
using value_type = std::conditional_t<IsConst, VT const, VT>; // NOLINT
using size_type = size_t; // NOLINT
using index_type = size_t; // NOLINT
using pointer = value_type*; // NOLINT
@@ -74,8 +77,9 @@ struct BitFieldContainer {
};
private:
common::Span<value_type> bits_;
static_assert(!std::is_signed<VT>::value, "Must use unsiged type as underlying storage.");
value_type* bits_{nullptr};
size_type n_values_{0};
static_assert(!std::is_signed<VT>::value, "Must use an unsiged type as the underlying storage.");
public:
XGBOOST_DEVICE static Pos ToBitPos(index_type pos) {
@@ -90,13 +94,15 @@ struct BitFieldContainer {
public:
BitFieldContainer() = default;
XGBOOST_DEVICE explicit BitFieldContainer(common::Span<value_type> bits) : bits_{bits} {}
XGBOOST_DEVICE BitFieldContainer(BitFieldContainer const& other) : bits_{other.bits_} {}
XGBOOST_DEVICE explicit BitFieldContainer(common::Span<value_type> bits)
: bits_{bits.data()}, n_values_{bits.size()} {}
BitFieldContainer(BitFieldContainer const& other) = default;
BitFieldContainer(BitFieldContainer&& other) = default;
BitFieldContainer &operator=(BitFieldContainer const &that) = default;
BitFieldContainer &operator=(BitFieldContainer &&that) = default;
XGBOOST_DEVICE common::Span<value_type> Bits() { return bits_; }
XGBOOST_DEVICE common::Span<value_type const> Bits() const { return bits_; }
XGBOOST_DEVICE auto Bits() { return common::Span<value_type>{bits_, NumValues()}; }
XGBOOST_DEVICE auto Bits() const { return common::Span<value_type const>{bits_, NumValues()}; }
/*\brief Compute the size of needed memory allocation. The returned value is in terms
* of number of elements with `BitFieldContainer::value_type'.
@@ -107,17 +113,17 @@ struct BitFieldContainer {
#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_AMD__)
__device__ BitFieldContainer& operator|=(BitFieldContainer const& rhs) {
auto tid = blockIdx.x * blockDim.x + threadIdx.x;
size_t min_size = min(bits_.size(), rhs.bits_.size());
size_t min_size = min(NumValues(), rhs.NumValues());
if (tid < min_size) {
bits_[tid] |= rhs.bits_[tid];
Data()[tid] |= rhs.Data()[tid];
}
return *this;
}
#else
BitFieldContainer& operator|=(BitFieldContainer const& rhs) {
size_t min_size = std::min(bits_.size(), rhs.bits_.size());
size_t min_size = std::min(NumValues(), rhs.NumValues());
for (size_t i = 0; i < min_size; ++i) {
bits_[i] |= rhs.bits_[i];
Data()[i] |= rhs.Data()[i];
}
return *this;
}
@@ -125,75 +131,85 @@ struct BitFieldContainer {
#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_AMD__)
__device__ BitFieldContainer& operator&=(BitFieldContainer const& rhs) {
size_t min_size = min(bits_.size(), rhs.bits_.size());
size_t min_size = min(NumValues(), rhs.NumValues());
auto tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < min_size) {
bits_[tid] &= rhs.bits_[tid];
Data()[tid] &= rhs.Data()[tid];
}
return *this;
}
#else
BitFieldContainer& operator&=(BitFieldContainer const& rhs) {
size_t min_size = std::min(bits_.size(), rhs.bits_.size());
size_t min_size = std::min(NumValues(), rhs.NumValues());
for (size_t i = 0; i < min_size; ++i) {
bits_[i] &= rhs.bits_[i];
Data()[i] &= rhs.Data()[i];
}
return *this;
}
#endif // defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_AMD__)
__device__ auto Set(index_type pos) {
__device__ auto Set(index_type pos) noexcept(true) {
Pos pos_v = Direction::Shift(ToBitPos(pos));
value_type& value = bits_[pos_v.int_pos];
value_type& value = Data()[pos_v.int_pos];
value_type set_bit = kOne << pos_v.bit_pos;
using Type = typename dh::detail::AtomicDispatcher<sizeof(value_type)>::Type;
atomicOr(reinterpret_cast<Type *>(&value), set_bit);
}
__device__ void Clear(index_type pos) {
__device__ void Clear(index_type pos) noexcept(true) {
Pos pos_v = Direction::Shift(ToBitPos(pos));
value_type& value = bits_[pos_v.int_pos];
value_type& value = Data()[pos_v.int_pos];
value_type clear_bit = ~(kOne << pos_v.bit_pos);
using Type = typename dh::detail::AtomicDispatcher<sizeof(value_type)>::Type;
atomicAnd(reinterpret_cast<Type *>(&value), clear_bit);
}
#else
void Set(index_type pos) {
void Set(index_type pos) noexcept(true) {
Pos pos_v = Direction::Shift(ToBitPos(pos));
value_type& value = bits_[pos_v.int_pos];
value_type& value = Data()[pos_v.int_pos];
value_type set_bit = kOne << pos_v.bit_pos;
value |= set_bit;
}
void Clear(index_type pos) {
void Clear(index_type pos) noexcept(true) {
Pos pos_v = Direction::Shift(ToBitPos(pos));
value_type& value = bits_[pos_v.int_pos];
value_type& value = Data()[pos_v.int_pos];
value_type clear_bit = ~(kOne << pos_v.bit_pos);
value &= clear_bit;
}
#endif // defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_AMD__)
XGBOOST_DEVICE bool Check(Pos pos_v) const {
XGBOOST_DEVICE bool Check(Pos pos_v) const noexcept(true) {
pos_v = Direction::Shift(pos_v);
SPAN_LT(pos_v.int_pos, bits_.size());
value_type const value = bits_[pos_v.int_pos];
assert(pos_v.int_pos < NumValues());
value_type const value = Data()[pos_v.int_pos];
value_type const test_bit = kOne << pos_v.bit_pos;
value_type result = test_bit & value;
return static_cast<bool>(result);
}
XGBOOST_DEVICE bool Check(index_type pos) const {
[[nodiscard]] XGBOOST_DEVICE bool Check(index_type pos) const noexcept(true) {
Pos pos_v = ToBitPos(pos);
return Check(pos_v);
}
/**
* @brief Returns the total number of bits that can be viewed. This is equal to or
* larger than the acutal number of valid bits.
*/
[[nodiscard]] XGBOOST_DEVICE size_type Capacity() const noexcept(true) {
return kValueSize * NumValues();
}
/**
* @brief Number of storage unit used in this bit field.
*/
[[nodiscard]] XGBOOST_DEVICE size_type NumValues() const noexcept(true) { return n_values_; }
XGBOOST_DEVICE size_t Size() const { return kValueSize * bits_.size(); }
XGBOOST_DEVICE pointer Data() const noexcept(true) { return bits_; }
XGBOOST_DEVICE pointer Data() const { return bits_.data(); }
inline friend std::ostream &
operator<<(std::ostream &os, BitFieldContainer<VT, Direction, IsConst> field) {
os << "Bits " << "storage size: " << field.bits_.size() << "\n";
for (typename common::Span<value_type>::index_type i = 0; i < field.bits_.size(); ++i) {
std::bitset<BitFieldContainer<VT, Direction, IsConst>::kValueSize> bset(field.bits_[i]);
inline friend std::ostream& operator<<(std::ostream& os,
BitFieldContainer<VT, Direction, IsConst> field) {
os << "Bits "
<< "storage size: " << field.NumValues() << "\n";
for (typename common::Span<value_type>::index_type i = 0; i < field.NumValues(); ++i) {
std::bitset<BitFieldContainer<VT, Direction, IsConst>::kValueSize> bset(field.Data()[i]);
os << bset << "\n";
}
return os;

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2020-2022 by XGBoost Contributors
/**
* Copyright 2020-2023, XGBoost Contributors
* \file categorical.h
*/
#ifndef XGBOOST_COMMON_CATEGORICAL_H_
@@ -10,7 +10,6 @@
#include "bitfield.h"
#include "xgboost/base.h"
#include "xgboost/data.h"
#include "xgboost/parameter.h"
#include "xgboost/span.h"
namespace xgboost {
@@ -53,7 +52,7 @@ inline XGBOOST_DEVICE bool InvalidCat(float cat) {
*
* Go to left if it's NOT the matching category, which matches one-hot encoding.
*/
inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, float cat) {
inline XGBOOST_DEVICE bool Decision(common::Span<CatBitField::value_type const> cats, float cat) {
KCatBitField const s_cats(cats);
if (XGBOOST_EXPECT(InvalidCat(cat), false)) {
return true;

View File

@@ -1,16 +1,27 @@
/*!
* Copyright 2017-2022 by XGBoost Contributors
/**
* Copyright 2017-2023, XGBoost Contributors
* \brief Utility for fast column-wise access
*/
#include "column_matrix.h"
namespace xgboost {
namespace common {
#include <algorithm> // for transform
#include <cstddef> // for size_t
#include <cstdint> // for uint64_t, uint8_t
#include <limits> // for numeric_limits
#include <type_traits> // for remove_reference_t
#include <vector> // for vector
#include "../data/gradient_index.h" // for GHistIndexMatrix
#include "io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
#include "xgboost/base.h" // for bst_feaature_t
#include "xgboost/span.h" // for Span
namespace xgboost::common {
void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_threshold) {
auto const nfeature = gmat.Features();
const size_t nrow = gmat.Size();
// identify type of each column
type_.resize(nfeature);
type_ = common::MakeFixedVecWithMalloc(nfeature, ColumnType{});
uint32_t max_val = std::numeric_limits<uint32_t>::max();
for (bst_feature_t fid = 0; fid < nfeature; ++fid) {
@@ -34,7 +45,7 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
// want to compute storage boundary for each feature
// using variants of prefix sum scan
feature_offsets_.resize(nfeature + 1);
feature_offsets_ = common::MakeFixedVecWithMalloc(nfeature + 1, std::size_t{0});
size_t accum_index = 0;
feature_offsets_[0] = accum_index;
for (bst_feature_t fid = 1; fid < nfeature + 1; ++fid) {
@@ -49,9 +60,11 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
SetTypeSize(gmat.MaxNumBinPerFeat());
auto storage_size =
feature_offsets_.back() * static_cast<std::underlying_type_t<BinTypeSize>>(bins_type_size_);
index_.resize(storage_size, 0);
index_ = common::MakeFixedVecWithMalloc(storage_size, std::uint8_t{0});
if (!all_dense_column) {
row_ind_.resize(feature_offsets_[nfeature]);
row_ind_ = common::MakeFixedVecWithMalloc(feature_offsets_[nfeature], std::size_t{0});
}
// store least bin id for each feature
@@ -59,7 +72,51 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
any_missing_ = !gmat.IsDense();
missing_flags_.clear();
missing_ = MissingIndicator{0, false};
}
} // namespace common
} // namespace xgboost
// IO procedures for external memory.
/**
 * @brief Load a ColumnMatrix from an aligned stream (external-memory path).
 *
 * Field order is the on-disk format and must mirror ColumnMatrix::Write
 * exactly: index, type, row_ind, feature_offsets, missing bits, then the
 * scalar bin-type size and the any-missing flag.
 *
 * @param fi         Stream to read from.
 * @param index_base Per-feature starting bin ids, owned by the caller; only
 *                   the pointer is stored here.
 * @return true on success, false as soon as any field fails to read.
 */
bool ColumnMatrix::Read(AlignedResourceReadStream* fi, uint32_t const* index_base) {
  if (!common::ReadVec(fi, &index_)) {
    return false;
  }
  if (!common::ReadVec(fi, &type_)) {
    return false;
  }
  if (!common::ReadVec(fi, &row_ind_)) {
    return false;
  }
  if (!common::ReadVec(fi, &feature_offsets_)) {
    return false;
  }
  if (!common::ReadVec(fi, &missing_.storage)) {
    return false;
  }
  // Rebuild the bit-field view over the freshly loaded storage.
  missing_.InitView();

  index_base_ = index_base;

  if (!fi->Read(&bins_type_size_)) {
    return false;
  }
  if (!fi->Read(&any_missing_)) {
    return false;
  }
  return true;
}
/**
 * @brief Serialize this ColumnMatrix to an aligned stream.
 *
 * Field order defines the on-disk format and must mirror ColumnMatrix::Read
 * exactly. Note that index_base_ is not written: Read takes it from the
 * caller instead.
 *
 * @param fo Stream to write to.
 * @return Total number of bytes written.
 */
std::size_t ColumnMatrix::Write(AlignedFileWriteStream* fo) const {
  std::size_t bytes{0};
  bytes += common::WriteVec(fo, index_);
  bytes += common::WriteVec(fo, type_);
  bytes += common::WriteVec(fo, row_ind_);
  bytes += common::WriteVec(fo, feature_offsets_);
  bytes += common::WriteVec(fo, missing_.storage);
  bytes += fo->Write(bins_type_size_);
  bytes += fo->Write(any_missing_);
  return bytes;
}
} // namespace xgboost::common

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2017-2022 by Contributors
/**
* Copyright 2017-2023, XGBoost Contributors
* \file column_matrix.h
* \brief Utility for fast column-wise access
* \author Philip Cho
@@ -8,25 +8,30 @@
#ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_
#define XGBOOST_COMMON_COLUMN_MATRIX_H_
#include <dmlc/endian.h>
#include <algorithm>
#include <cstddef> // for size_t, byte
#include <cstdint> // for uint8_t
#include <limits>
#include <memory>
#include <utility> // std::move
#include <vector>
#include <type_traits> // for enable_if_t, is_same_v, is_signed_v
#include <utility> // for move
#include "../data/adapter.h"
#include "../data/gradient_index.h"
#include "algorithm.h"
#include "bitfield.h" // for RBitField8
#include "hist_util.h"
#include "ref_resource_view.h" // for RefResourceView
#include "xgboost/base.h" // for bst_bin_t
#include "xgboost/span.h" // for Span
namespace xgboost {
namespace common {
namespace xgboost::common {
class ColumnMatrix;
class AlignedFileWriteStream;
class AlignedResourceReadStream;
/*! \brief column type */
enum ColumnType : uint8_t { kDenseColumn, kSparseColumn };
enum ColumnType : std::uint8_t { kDenseColumn, kSparseColumn };
/*! \brief a column storage, to be used with ApplySplit. Note that each
bin id is stored as index[i] + index_base.
@@ -41,12 +46,12 @@ class Column {
: index_(index), index_base_(least_bin_idx) {}
virtual ~Column() = default;
bst_bin_t GetGlobalBinIdx(size_t idx) const {
[[nodiscard]] bst_bin_t GetGlobalBinIdx(size_t idx) const {
return index_base_ + static_cast<bst_bin_t>(index_[idx]);
}
/* returns number of elements in column */
size_t Size() const { return index_.size(); }
[[nodiscard]] size_t Size() const { return index_.size(); }
private:
/* bin indexes in range [0, max_bins - 1] */
@@ -63,7 +68,7 @@ class SparseColumnIter : public Column<BinIdxT> {
common::Span<const size_t> row_ind_;
size_t idx_;
size_t const* RowIndices() const { return row_ind_.data(); }
[[nodiscard]] size_t const* RowIndices() const { return row_ind_.data(); }
public:
SparseColumnIter(common::Span<const BinIdxT> index, bst_bin_t least_bin_idx,
@@ -81,7 +86,7 @@ class SparseColumnIter : public Column<BinIdxT> {
SparseColumnIter(SparseColumnIter const&) = delete;
SparseColumnIter(SparseColumnIter&&) = default;
size_t GetRowIdx(size_t idx) const { return RowIndices()[idx]; }
[[nodiscard]] size_t GetRowIdx(size_t idx) const { return RowIndices()[idx]; }
bst_bin_t operator[](size_t rid) {
const size_t column_size = this->Size();
if (!((idx_) < column_size)) {
@@ -101,25 +106,28 @@ class SparseColumnIter : public Column<BinIdxT> {
}
};
/**
* @brief Column stored as a dense vector. It might still contain missing values as
* indicated by the missing flags.
*/
template <typename BinIdxT, bool any_missing>
class DenseColumnIter : public Column<BinIdxT> {
public:
using ByteType = bool;
private:
using Base = Column<BinIdxT>;
/* flags for missing values in dense columns */
std::vector<ByteType> const& missing_flags_;
LBitField32 missing_flags_;
size_t feature_offset_;
public:
explicit DenseColumnIter(common::Span<const BinIdxT> index, bst_bin_t index_base,
std::vector<ByteType> const& missing_flags, size_t feature_offset)
LBitField32 missing_flags, size_t feature_offset)
: Base{index, index_base}, missing_flags_{missing_flags}, feature_offset_{feature_offset} {}
DenseColumnIter(DenseColumnIter const&) = delete;
DenseColumnIter(DenseColumnIter&&) = default;
bool IsMissing(size_t ridx) const { return missing_flags_[feature_offset_ + ridx]; }
[[nodiscard]] bool IsMissing(size_t ridx) const {
return missing_flags_.Check(feature_offset_ + ridx);
}
bst_bin_t operator[](size_t ridx) const {
if (any_missing) {
@@ -131,12 +139,64 @@ class DenseColumnIter : public Column<BinIdxT> {
};
/**
* \brief Column major matrix for gradient index. This matrix contains both dense column
* and sparse column, the type of the column is controlled by sparse threshold. When the
* number of missing values in a column is below the threshold it's classified as dense
* column.
* @brief Column major matrix for gradient index on CPU.
*
* This matrix contains both dense columns and sparse columns, the type of the column
* is controlled by the sparse threshold parameter. When the number of missing values
* in a column is below the threshold it's classified as dense column.
*/
class ColumnMatrix {
/**
* @brief A bit set for indicating whether an element in a dense column is missing.
*/
struct MissingIndicator {
  using BitFieldT = LBitField32;
  using T = typename BitFieldT::value_type;

  /** @brief Non-owning bit-field view over @ref storage; a set bit marks a missing element. */
  BitFieldT missing;
  /** @brief Owning buffer that backs the @ref missing view. */
  RefResourceView<T> storage;

  static_assert(std::is_same_v<T, std::uint32_t>);

  /** @brief Fill word for the storage: all ones (everything missing) or all zeros. */
  template <typename U>
  [[nodiscard]] std::enable_if_t<!std::is_signed_v<U>, U> static InitValue(bool init) {
    return init ? ~U{0} : U{0};
  }

  MissingIndicator() = default;
  /**
   * @param n_elements Size of the bit set
   * @param init Initialize the indicator to true or false.
   */
  MissingIndicator(std::size_t n_elements, bool init) {
    auto m_size = missing.ComputeStorageSize(n_elements);
    storage = common::MakeFixedVecWithMalloc(m_size, InitValue<T>(init));
    this->InitView();
  }
  /** @brief Set the i^th element to be a valid element (instead of missing). */
  void SetValid(typename LBitField32::index_type i) {
    // Re-enabled (was commented out as a temporary hack).  Leaving this a no-op
    // keeps every dense entry flagged as missing after it is written, which
    // breaks DenseColumnIter::IsMissing for valid elements.
    missing.Clear(i);
  }
  /** @brief assign the storage to the view. */
  void InitView() {
    missing = LBitField32{Span{storage.data(), storage.size()}};
  }

  /**
   * @brief Grow the bit set so it can hold at least `n_elements` bits.
   *
   * Only valid for in-memory (malloc-backed) storage; newly appended words are
   * filled according to `init`.
   */
  void GrowTo(std::size_t n_elements, bool init) {
    CHECK(storage.Resource()->Type() == ResourceHandler::kMalloc)
        << "[Internal Error]: Cannot grow the vector when external memory is used.";
    auto m_size = missing.ComputeStorageSize(n_elements);
    CHECK_GE(m_size, storage.size());
    if (m_size == storage.size()) {
      return;
    }
    // Grow the underlying resource, then rebuild the view: the buffer may have
    // been reallocated to a different address.
    auto resource = std::dynamic_pointer_cast<common::MallocResource>(storage.Resource());
    CHECK(resource);
    resource->Resize(m_size * sizeof(T), InitValue<std::byte>(init));
    storage = RefResourceView<T>{resource->DataAs<T>(), m_size, resource};
    this->InitView();
  }
};
void InitStorage(GHistIndexMatrix const& gmat, double sparse_threshold);
template <typename ColumnBinT, typename BinT, typename RIdx>
@@ -144,9 +204,10 @@ class ColumnMatrix {
if (type_[fid] == kDenseColumn) {
ColumnBinT* begin = &local_index[feature_offsets_[fid]];
begin[rid] = bin_id - index_base_[fid];
// not thread-safe with bool vector. FIXME(jiamingy): We can directly assign
// kMissingId to the index to avoid missing flags.
missing_flags_[feature_offsets_[fid] + rid] = false;
// not thread-safe with bit field.
// FIXME(jiamingy): We can directly assign kMissingId to the index to avoid missing
// flags.
missing_.SetValid(feature_offsets_[fid] + rid);
} else {
ColumnBinT* begin = &local_index[feature_offsets_[fid]];
begin[num_nonzeros_[fid]] = bin_id - index_base_[fid];
@@ -156,9 +217,10 @@ class ColumnMatrix {
}
public:
using ByteType = bool;
// get number of features
bst_feature_t GetNumFeature() const { return static_cast<bst_feature_t>(type_.size()); }
[[nodiscard]] bst_feature_t GetNumFeature() const {
return static_cast<bst_feature_t>(type_.size());
}
ColumnMatrix() = default;
ColumnMatrix(GHistIndexMatrix const& gmat, double sparse_threshold) {
@@ -166,7 +228,7 @@ class ColumnMatrix {
}
/**
* \brief Initialize ColumnMatrix from GHistIndexMatrix with reference to the original
* @brief Initialize ColumnMatrix from GHistIndexMatrix with reference to the original
* SparsePage.
*/
void InitFromSparse(SparsePage const& page, const GHistIndexMatrix& gmat, double sparse_threshold,
@@ -178,8 +240,8 @@ class ColumnMatrix {
}
/**
* \brief Initialize ColumnMatrix from GHistIndexMatrix without reference to actual
* data.
* @brief Initialize ColumnMatrix from GHistIndexMatrix without reference to actual
* data.
*
* This function requires a binary search for each bin to get back the feature index
* for those bins.
@@ -199,7 +261,7 @@ class ColumnMatrix {
}
}
bool IsInitialized() const { return !type_.empty(); }
[[nodiscard]] bool IsInitialized() const { return !type_.empty(); }
/**
* \brief Push batch of data for Quantile DMatrix support.
@@ -257,7 +319,7 @@ class ColumnMatrix {
reinterpret_cast<const BinIdxType*>(&index_[feature_offset * bins_type_size_]),
column_size};
return std::move(DenseColumnIter<BinIdxType, any_missing>{
bin_index, static_cast<bst_bin_t>(index_base_[fidx]), missing_flags_, feature_offset});
bin_index, static_cast<bst_bin_t>(index_base_[fidx]), missing_.missing, feature_offset});
}
// all columns are dense column and has no missing value
@@ -265,7 +327,8 @@ class ColumnMatrix {
template <typename RowBinIdxT>
void SetIndexNoMissing(bst_row_t base_rowid, RowBinIdxT const* row_index, const size_t n_samples,
const size_t n_features, int32_t n_threads) {
missing_flags_.resize(feature_offsets_[n_features], false);
missing_.GrowTo(feature_offsets_[n_features], false);
DispatchBinType(bins_type_size_, [&](auto t) {
using ColumnBinT = decltype(t);
auto column_index = Span<ColumnBinT>{reinterpret_cast<ColumnBinT*>(index_.data()),
@@ -290,9 +353,15 @@ class ColumnMatrix {
void SetIndexMixedColumns(size_t base_rowid, Batch const& batch, const GHistIndexMatrix& gmat,
float missing) {
auto n_features = gmat.Features();
missing_flags_.resize(feature_offsets_[n_features], true);
auto const* row_index = gmat.index.data<uint32_t>() + gmat.row_ptr[base_rowid];
num_nonzeros_.resize(n_features, 0);
missing_.GrowTo(feature_offsets_[n_features], true);
auto const* row_index = gmat.index.data<std::uint32_t>() + gmat.row_ptr[base_rowid];
if (num_nonzeros_.empty()) {
num_nonzeros_ = common::MakeFixedVecWithMalloc(n_features, std::size_t{0});
} else {
CHECK_EQ(num_nonzeros_.size(), n_features);
}
auto is_valid = data::IsValidFunctor{missing};
DispatchBinType(bins_type_size_, [&](auto t) {
@@ -321,8 +390,9 @@ class ColumnMatrix {
*/
void SetIndexMixedColumns(const GHistIndexMatrix& gmat) {
auto n_features = gmat.Features();
missing_flags_.resize(feature_offsets_[n_features], true);
num_nonzeros_.resize(n_features, 0);
missing_ = MissingIndicator{feature_offsets_[n_features], true};
num_nonzeros_ = common::MakeFixedVecWithMalloc(n_features, std::size_t{0});
DispatchBinType(bins_type_size_, [&](auto t) {
using ColumnBinT = decltype(t);
@@ -335,106 +405,35 @@ class ColumnMatrix {
});
}
BinTypeSize GetTypeSize() const { return bins_type_size_; }
auto GetColumnType(bst_feature_t fidx) const { return type_[fidx]; }
[[nodiscard]] BinTypeSize GetTypeSize() const { return bins_type_size_; }
[[nodiscard]] auto GetColumnType(bst_feature_t fidx) const { return type_[fidx]; }
// And this returns part of state
bool AnyMissing() const { return any_missing_; }
[[nodiscard]] bool AnyMissing() const { return any_missing_; }
// IO procedures for external memory.
bool Read(dmlc::SeekStream* fi, uint32_t const* index_base) {
fi->Read(&index_);
#if !DMLC_LITTLE_ENDIAN
// s390x
std::vector<std::underlying_type<ColumnType>::type> int_types;
fi->Read(&int_types);
type_.resize(int_types.size());
std::transform(
int_types.begin(), int_types.end(), type_.begin(),
[](std::underlying_type<ColumnType>::type i) { return static_cast<ColumnType>(i); });
#else
fi->Read(&type_);
#endif // !DMLC_LITTLE_ENDIAN
fi->Read(&row_ind_);
fi->Read(&feature_offsets_);
std::vector<std::uint8_t> missing;
fi->Read(&missing);
missing_flags_.resize(missing.size());
std::transform(missing.cbegin(), missing.cend(), missing_flags_.begin(),
[](std::uint8_t flag) { return !!flag; });
index_base_ = index_base;
#if !DMLC_LITTLE_ENDIAN
std::underlying_type<BinTypeSize>::type v;
fi->Read(&v);
bins_type_size_ = static_cast<BinTypeSize>(v);
#else
fi->Read(&bins_type_size_);
#endif
fi->Read(&any_missing_);
return true;
}
size_t Write(dmlc::Stream* fo) const {
size_t bytes{0};
auto write_vec = [&](auto const& vec) {
fo->Write(vec);
bytes += vec.size() * sizeof(typename std::remove_reference_t<decltype(vec)>::value_type) +
sizeof(uint64_t);
};
write_vec(index_);
#if !DMLC_LITTLE_ENDIAN
// s390x
std::vector<std::underlying_type<ColumnType>::type> int_types(type_.size());
std::transform(type_.begin(), type_.end(), int_types.begin(), [](ColumnType t) {
return static_cast<std::underlying_type<ColumnType>::type>(t);
});
write_vec(int_types);
#else
write_vec(type_);
#endif // !DMLC_LITTLE_ENDIAN
write_vec(row_ind_);
write_vec(feature_offsets_);
// dmlc can not handle bool vector
std::vector<std::uint8_t> missing(missing_flags_.size());
std::transform(missing_flags_.cbegin(), missing_flags_.cend(), missing.begin(),
[](bool flag) { return static_cast<std::uint8_t>(flag); });
write_vec(missing);
#if !DMLC_LITTLE_ENDIAN
auto v = static_cast<std::underlying_type<BinTypeSize>::type>(bins_type_size_);
fo->Write(v);
#else
fo->Write(bins_type_size_);
#endif // DMLC_LITTLE_ENDIAN
bytes += sizeof(bins_type_size_);
fo->Write(any_missing_);
bytes += sizeof(any_missing_);
return bytes;
}
[[nodiscard]] bool Read(AlignedResourceReadStream* fi, uint32_t const* index_base);
[[nodiscard]] std::size_t Write(AlignedFileWriteStream* fo) const;
[[nodiscard]] MissingIndicator const& Missing() const { return missing_; }
private:
std::vector<uint8_t> index_;
RefResourceView<std::uint8_t> index_;
std::vector<ColumnType> type_;
/* indptr of a CSC matrix. */
std::vector<size_t> row_ind_;
/* indicate where each column's index and row_ind is stored. */
std::vector<size_t> feature_offsets_;
/* The number of nnz of each column. */
std::vector<size_t> num_nonzeros_;
RefResourceView<ColumnType> type_;
/** @brief indptr of a CSC matrix. */
RefResourceView<std::size_t> row_ind_;
/** @brief indicate where each column's index and row_ind is stored. */
RefResourceView<std::size_t> feature_offsets_;
/** @brief The number of nnz of each column. */
RefResourceView<std::size_t> num_nonzeros_;
// index_base_[fid]: least bin id for feature fid
uint32_t const* index_base_;
std::vector<ByteType> missing_flags_;
std::uint32_t const* index_base_;
MissingIndicator missing_;
BinTypeSize bins_type_size_;
bool any_missing_;
};
} // namespace common
} // namespace xgboost
} // namespace xgboost::common
#endif // XGBOOST_COMMON_COLUMN_MATRIX_H_

View File

@@ -1,16 +1,17 @@
/*!
* Copyright 2015-2019 by Contributors
* \file common.cc
* \brief Enable all kinds of global variables in common.
/**
* Copyright 2015-2023 by Contributors
*/
#include <dmlc/thread_local.h>
#include <xgboost/logging.h>
#include "common.h"
#include "./random.h"
namespace xgboost {
namespace common {
#include <dmlc/thread_local.h> // for ThreadLocalStore
#include <cstdint> // for uint8_t
#include <cstdio> // for snprintf, size_t
#include <string> // for string
#include "./random.h" // for GlobalRandomEngine, GlobalRandom
namespace xgboost::common {
/*! \brief thread local entry for random. */
struct RandomThreadLocalEntry {
/*! \brief the random engine instance. */
@@ -19,15 +20,43 @@ struct RandomThreadLocalEntry {
using RandomThreadLocalStore = dmlc::ThreadLocalStore<RandomThreadLocalEntry>;
GlobalRandomEngine& GlobalRandom() {
return RandomThreadLocalStore::Get()->engine;
GlobalRandomEngine &GlobalRandom() { return RandomThreadLocalStore::Get()->engine; }
/**
 * @brief JSON-style escaping of a string.
 *
 * Appends the escaped representation of @p string to @p p_buffer (the buffer is
 * not cleared first).  A backslash immediately followed by 'u' is copied through
 * un-doubled so already-escaped "\uXXXX" sequences are preserved; all other
 * backslashes, quotes, and control characters are escaped.
 *
 * @param string   Input text to escape.
 * @param p_buffer Output buffer the escaped text is appended to.
 */
void EscapeU8(std::string const &string, std::string *p_buffer) {
  auto &buffer = *p_buffer;
  for (size_t i = 0; i < string.length(); i++) {
    const auto ch = string[i];
    if (ch == '\\') {
      // Bounds fix: the previous check (`i < string.size()`) is always true
      // inside the loop, so the lookahead indexed one past the last character
      // when the string ends with a backslash.
      if (i + 1 < string.size() && string[i + 1] == 'u') {
        buffer += "\\";
      } else {
        buffer += "\\\\";
      }
    } else if (ch == '"') {
      buffer += "\\\"";
    } else if (ch == '\b') {
      buffer += "\\b";
    } else if (ch == '\f') {
      buffer += "\\f";
    } else if (ch == '\n') {
      buffer += "\\n";
    } else if (ch == '\r') {
      buffer += "\\r";
    } else if (ch == '\t') {
      buffer += "\\t";
    } else if (static_cast<uint8_t>(ch) <= 0x1f) {
      // Remaining control characters (e.g. unit separator) become \u00XX.
      // Cast through uint8_t so a negative `char` never reaches %x as a
      // negative int ( %x requires an unsigned argument ).
      char buf[8];
      snprintf(buf, sizeof buf, "\\u%04x", static_cast<unsigned>(static_cast<uint8_t>(ch)));
      buffer += buf;
    } else {
      buffer += ch;
    }
  }
}
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
int AllVisibleGPUs() {
return 0;
}
#endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
int AllVisibleGPUs() { return 0; }
#endif // !defined(XGBOOST_USE_CUDA)
} // namespace common
} // namespace xgboost
} // namespace xgboost::common

View File

@@ -6,20 +6,19 @@
#ifndef XGBOOST_COMMON_COMMON_H_
#define XGBOOST_COMMON_COMMON_H_
#include <xgboost/base.h>
#include <xgboost/logging.h>
#include <xgboost/span.h>
#include <algorithm> // for max
#include <array> // for array
#include <cmath> // for ceil
#include <cstddef> // for size_t
#include <cstdint> // for int32_t, int64_t
#include <sstream> // for basic_istream, operator<<, istringstream
#include <string> // for string, basic_string, getline, char_traits
#include <tuple> // for make_tuple
#include <utility> // for forward, index_sequence, make_index_sequence
#include <vector> // for vector
#include <algorithm>
#include <exception>
#include <functional>
#include <limits>
#include <numeric>
#include <sstream>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>
#include "xgboost/base.h" // for XGBOOST_DEVICE
#include "xgboost/logging.h" // for LOG, LOG_FATAL, LogMessageFatal
#if defined(__CUDACC__)
#include <thrust/system/cuda/error.h>
@@ -74,8 +73,7 @@ inline hipError_t ThrowOnCudaError(hipError_t code, const char *file, int line)
#endif
} // namespace dh
namespace xgboost {
namespace common {
namespace xgboost::common {
/*!
* \brief Split a string by delimiter
* \param s String to be split.
@@ -91,19 +89,13 @@ inline std::vector<std::string> Split(const std::string& s, char delim) {
return ret;
}
void EscapeU8(std::string const &string, std::string *p_buffer);
template <typename T>
XGBOOST_DEVICE T Max(T a, T b) {
return a < b ? b : a;
}
// simple routine to convert any data to string
template<typename T>
inline std::string ToString(const T& data) {
std::ostringstream os;
os << data;
return os.str();
}
template <typename T1, typename T2>
XGBOOST_DEVICE T1 DivRoundUp(const T1 a, const T2 b) {
return static_cast<T1>(std::ceil(static_cast<double>(a) / b));
@@ -217,6 +209,5 @@ template <typename Indexable>
XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) {
return indptr[group + 1] - 1;
}
} // namespace common
} // namespace xgboost
} // namespace xgboost::common
#endif // XGBOOST_COMMON_COMMON_H_

View File

@@ -482,7 +482,7 @@ struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
cub::CachingDeviceAllocator& GetGlobalCachingAllocator() {
// Configure allocator with maximum cached bin size of ~1GB and no limit on
// maximum cached bytes
static cub::CachingDeviceAllocator *allocator = new cub::CachingDeviceAllocator(2, 9, 29);
thread_local cub::CachingDeviceAllocator *allocator = new cub::CachingDeviceAllocator(2, 9, 29);
return *allocator;
}
pointer allocate(size_t n) { // NOLINT
@@ -1178,7 +1178,13 @@ inline void CUDAEvent::Record(CUDAStreamView stream) { // NOLINT
dh::safe_cuda(cudaEventRecord(event_, cudaStream_t{stream}));
}
inline CUDAStreamView DefaultStream() { return CUDAStreamView{cudaStreamLegacy}; }
inline CUDAStreamView DefaultStream() {
#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM
return CUDAStreamView{cudaStreamPerThread};
#else
return CUDAStreamView{cudaStreamLegacy};
#endif
}
class CUDAStream {
cudaStream_t stream_;

70
src/common/error_msg.cc Normal file
View File

@@ -0,0 +1,70 @@
/**
* Copyright 2023 by XGBoost contributors
*/
#include "error_msg.h"
#include <mutex> // for call_once, once_flag
#include <sstream> // for stringstream
#include "../collective/communicator-inl.h" // for GetRank
#include "xgboost/context.h" // for Context
#include "xgboost/logging.h"
namespace xgboost::error {
/**
 * @brief Build a standard deprecation message.
 *
 * @param old         Name of the deprecated parameter/function.
 * @param since       Version in which it was deprecated.
 * @param replacement Name of the replacement the user should switch to.
 * @return The formatted warning text.
 */
std::string DeprecatedFunc(StringView old, StringView since, StringView replacement) {
  std::stringstream ss;
  // Keep the space after "since": without it the message rendered as
  // "is deprecated since2.0.0".
  ss << "`" << old << "` is deprecated since " << since << ", use `" << replacement
     << "` instead.";
  return ss.str();
}
// Warn that the `gpu_hist` tree method is deprecated and GPU training should be
// requested through the `device` parameter.  NOTE(review): unlike the other
// warnings in this file this one is not rate-limited with std::call_once, so it
// fires on every call -- confirm that is intended.
void WarnDeprecatedGPUHist() {
  // The message mixes an ordinary literal with a raw string so the example line
  // can contain unescaped double quotes.
  auto msg =
      "The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` "
      R"(parameter to CUDA instead.
E.g. tree_method = "hist", device = "cuda"
)";
  LOG(WARNING) << msg;
}
// Warn, at most once per process, that a manually specified `updater` overrides
// `tree_method`.
void WarnManualUpdater() {
  static std::once_flag warned;
  auto emit = [] {
    LOG(WARNING) << "You have manually specified the `updater` parameter. The `tree_method` "
                    "parameter will be ignored. Incorrect sequence of updaters will produce "
                    "undefined behavior. For common uses, we recommend using `tree_method` "
                    "parameter instead.";
  };
  std::call_once(warned, emit);
}
// Warn, at most once per process, that `gpu_id` is deprecated in favour of the
// `device` parameter.
void WarnDeprecatedGPUId() {
  static std::once_flag warned;
  std::call_once(warned, [] {
    std::string message = DeprecatedFunc("gpu_id", "2.0.0", "device");
    message.append(" E.g. device=cpu/cuda/cuda:0");
    LOG(WARNING) << message;
  });
}
// Warn, at most once per process, that this worker received an empty dataset.
// The rank identifies which worker saw no data.
void WarnEmptyDataset() {
  static std::once_flag warned;
  auto log_empty = [] { LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank(); };
  std::call_once(warned, log_empty);
}
// Warn (once per process) that prediction fell back to DMatrix because the
// booster and the input data live on different devices.
void MismatchedDevices(Context const* booster, Context const* data) {
  // std::call_once: inplace prediction can be invoked per batch, and repeating
  // this message would flood the logs.
  static std::once_flag flag;
  std::call_once(flag, [&] {
    LOG(WARNING)
        << "Falling back to prediction using DMatrix due to mismatched devices. This might "
           "lead to higher memory usage and slower performance. XGBoost is running on: "
        << booster->DeviceName() << ", while the input data is on: " << data->DeviceName() << ".\n"
        << R"(Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.
This warning will only be shown once.
)";
  });
}
} // namespace xgboost::error

View File

@@ -6,6 +6,13 @@
#ifndef XGBOOST_COMMON_ERROR_MSG_H_
#define XGBOOST_COMMON_ERROR_MSG_H_
#include <cinttypes> // for uint64_t
#include <limits> // for numeric_limits
#include <string> // for string
#include "xgboost/base.h" // for bst_feature_t
#include "xgboost/context.h" // for Context
#include "xgboost/logging.h"
#include "xgboost/string_view.h" // for StringView
namespace xgboost::error {
@@ -33,5 +40,62 @@ constexpr StringView InconsistentMaxBin() {
return "Inconsistent `max_bin`. `max_bin` should be the same across different QuantileDMatrix, "
"and consistent with the Booster being trained.";
}
constexpr StringView UnknownDevice() { return "Unknown device type."; }
/**
 * @brief Validate that the number of features is representable by `bst_feature_t`.
 *
 * Aborts via CHECK when @p n_features exceeds the supported maximum.
 */
inline void MaxFeatureSize(std::uint64_t n_features) {
  // Hoist the limit into a local and reuse it in the message so the check and
  // the reported number cannot drift apart.
  auto max_n_features = std::numeric_limits<bst_feature_t>::max();
  CHECK_LE(n_features, max_n_features)
      << "Unfortunately, XGBoost does not support data matrices with " << max_n_features
      << " features or greater";
}
constexpr StringView InplacePredictProxy() {
return "Inplace predict accepts only DMatrixProxy as input.";
}
/**
 * @brief Abort with a helpful message when the sample size exceeds what the
 *        current updater supports.
 *
 * @param n Maximum number of samples the updater can handle.
 */
inline void MaxSampleSize(std::size_t n) {
  // Space after the colon so the message reads "samples: 123", not "samples:123".
  LOG(FATAL) << "Sample size too large for the current updater. Maximum number of samples: " << n
             << ". Consider using a different updater or tree_method.";
}
constexpr StringView OldSerialization() {
return R"doc(If you are loading a serialized model (like pickle in Python, RDS in R) or
configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:
https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html
for more details about differences between saving model and serializing.
)doc";
}
// Warn about loading an old-style serialized model.
inline void WarnOldSerialization() {
  // Emitting the notice once per thread is enough; repeating it would be very
  // verbose in distributed environments.
  static thread_local bool already_warned{false};
  if (!already_warned) {
    LOG(WARNING) << OldSerialization();
    already_warned = true;
  }
}
void WarnDeprecatedGPUHist();
void WarnManualUpdater();
void WarnDeprecatedGPUId();
void WarnEmptyDataset();
std::string DeprecatedFunc(StringView old, StringView since, StringView replacement);
constexpr StringView InvalidCUDAOrdinal() {
return "Invalid device. `device` is required to be CUDA and there must be at least one GPU "
"available for using GPU.";
}
void MismatchedDevices(Context const* booster, Context const* data);
} // namespace xgboost::error
#endif // XGBOOST_COMMON_ERROR_MSG_H_

View File

@@ -8,12 +8,12 @@
#include <vector>
#include "../common/common.h"
#include "column_matrix.h"
#include "../data/adapter.h" // for SparsePageAdapterBatch
#include "../data/gradient_index.h" // for GHistIndexMatrix
#include "quantile.h"
#include "xgboost/base.h"
#include "xgboost/context.h" // Context
#include "xgboost/data.h" // SparsePage, SortedCSCPage
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for SparsePage, SortedCSCPage
#if defined(XGBOOST_MM_PREFETCH_PRESENT)
#include <xmmintrin.h>
@@ -24,15 +24,13 @@
#define PREFETCH_READ_T0(addr) do {} while (0)
#endif // defined(XGBOOST_MM_PREFETCH_PRESENT)
namespace xgboost {
namespace common {
namespace xgboost::common {
HistogramCuts::HistogramCuts() {
cut_ptrs_.HostVector().emplace_back(0);
}
HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins, bool use_sorted,
Span<float> const hessian) {
Span<float const> hessian) {
HistogramCuts out;
auto const &info = m->Info();
auto n_threads = ctx->Threads();
@@ -69,25 +67,14 @@ HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins
return out;
}
/*!
* \brief fill a histogram by zeros in range [begin, end)
*/
void InitilizeHistByZeroes(GHistRow hist, size_t begin, size_t end) {
#if defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
std::fill(hist.begin() + begin, hist.begin() + end, xgboost::GradientPairPrecise());
#else // defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
memset(hist.data() + begin, '\0', (end - begin) * sizeof(xgboost::GradientPairPrecise));
#endif // defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
}
/*!
* \brief Increment hist as dst += add in range [begin, end)
*/
void IncrementHist(GHistRow dst, const GHistRow add, size_t begin, size_t end) {
double* pdst = reinterpret_cast<double*>(dst.data());
void IncrementHist(GHistRow dst, ConstGHistRow add, std::size_t begin, std::size_t end) {
double *pdst = reinterpret_cast<double *>(dst.data());
const double *padd = reinterpret_cast<const double *>(add.data());
for (size_t i = 2 * begin; i < 2 * end; ++i) {
for (std::size_t i = 2 * begin; i < 2 * end; ++i) {
pdst[i] += padd[i];
}
}
@@ -209,18 +196,23 @@ void RowsWiseBuildHistKernel(Span<GradientPair const> gpair,
const size_t size = row_indices.Size();
const size_t *rid = row_indices.begin;
auto const *pgh = reinterpret_cast<const float *>(gpair.data());
auto const *p_gpair = reinterpret_cast<const float *>(gpair.data());
const BinIdxType *gradient_index = gmat.index.data<BinIdxType>();
auto const &row_ptr = gmat.row_ptr.data();
auto base_rowid = gmat.base_rowid;
const uint32_t *offsets = gmat.index.Offset();
auto get_row_ptr = [&](size_t ridx) {
uint32_t const *offsets = gmat.index.Offset();
// There's no feature-based compression if missing value is present.
if (kAnyMissing) {
CHECK(!offsets);
} else {
CHECK(offsets);
}
auto get_row_ptr = [&](bst_row_t ridx) {
return kFirstPage ? row_ptr[ridx] : row_ptr[ridx - base_rowid];
};
auto get_rid = [&](size_t ridx) {
return kFirstPage ? ridx : (ridx - base_rowid);
};
auto get_rid = [&](bst_row_t ridx) { return kFirstPage ? ridx : (ridx - base_rowid); };
const size_t n_features =
get_row_ptr(row_indices.begin[0] + 1) - get_row_ptr(row_indices.begin[0]);
@@ -230,7 +222,7 @@ void RowsWiseBuildHistKernel(Span<GradientPair const> gpair,
// So we need to multiply each row-index/bin-index by 2
// to work with gradient pairs as a singe row FP array
for (size_t i = 0; i < size; ++i) {
for (std::size_t i = 0; i < size; ++i) {
const size_t icol_start =
kAnyMissing ? get_row_ptr(rid[i]) : get_rid(rid[i]) * n_features;
const size_t icol_end =
@@ -248,7 +240,7 @@ void RowsWiseBuildHistKernel(Span<GradientPair const> gpair,
kAnyMissing ? get_row_ptr(rid[i + Prefetch::kPrefetchOffset] + 1)
: icol_start_prefetch + n_features;
PREFETCH_READ_T0(pgh + two * rid[i + Prefetch::kPrefetchOffset]);
PREFETCH_READ_T0(p_gpair + two * rid[i + Prefetch::kPrefetchOffset]);
for (size_t j = icol_start_prefetch; j < icol_end_prefetch;
j += Prefetch::GetPrefetchStep<uint32_t>()) {
PREFETCH_READ_T0(gradient_index + j);
@@ -257,12 +249,12 @@ void RowsWiseBuildHistKernel(Span<GradientPair const> gpair,
const BinIdxType *gr_index_local = gradient_index + icol_start;
// The trick with pgh_t buffer helps the compiler to generate faster binary.
const float pgh_t[] = {pgh[idx_gh], pgh[idx_gh + 1]};
const float pgh_t[] = {p_gpair[idx_gh], p_gpair[idx_gh + 1]};
for (size_t j = 0; j < row_size; ++j) {
const uint32_t idx_bin = two * (static_cast<uint32_t>(gr_index_local[j]) +
(kAnyMissing ? 0 : offsets[j]));
const uint32_t idx_bin =
two * (static_cast<uint32_t>(gr_index_local[j]) + (kAnyMissing ? 0 : offsets[j]));
auto hist_local = hist_data + idx_bin;
*(hist_local) += pgh_t[0];
*(hist_local) += pgh_t[0];
*(hist_local + 1) += pgh_t[1];
}
}
@@ -283,12 +275,10 @@ void ColsWiseBuildHistKernel(Span<GradientPair const> gpair,
auto const &row_ptr = gmat.row_ptr.data();
auto base_rowid = gmat.base_rowid;
const uint32_t *offsets = gmat.index.Offset();
auto get_row_ptr = [&](size_t ridx) {
auto get_row_ptr = [&](bst_row_t ridx) {
return kFirstPage ? row_ptr[ridx] : row_ptr[ridx - base_rowid];
};
auto get_rid = [&](size_t ridx) {
return kFirstPage ? ridx : (ridx - base_rowid);
};
auto get_rid = [&](bst_row_t ridx) { return kFirstPage ? ridx : (ridx - base_rowid); };
const size_t n_features = gmat.cut.Ptrs().size() - 1;
const size_t n_columns = n_features;
@@ -350,9 +340,8 @@ void BuildHistDispatch(Span<GradientPair const> gpair, const RowSetCollection::E
}
template <bool any_missing>
void GHistBuilder::BuildHist(Span<GradientPair const> gpair,
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
GHistRow hist, bool force_read_by_column) const {
void BuildHist(Span<GradientPair const> gpair, const RowSetCollection::Elem row_indices,
const GHistIndexMatrix &gmat, GHistRow hist, bool force_read_by_column) {
/* force_read_by_column is used for testing the columnwise building of histograms.
* default force_read_by_column = false
*/
@@ -369,14 +358,13 @@ void GHistBuilder::BuildHist(Span<GradientPair const> gpair,
});
}
template void GHistBuilder::BuildHist<true>(Span<GradientPair const> gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix &gmat, GHistRow hist,
bool force_read_by_column) const;
template void BuildHist<true>(Span<GradientPair const> gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix &gmat, GHistRow hist,
bool force_read_by_column);
template void GHistBuilder::BuildHist<false>(Span<GradientPair const> gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix &gmat, GHistRow hist,
bool force_read_by_column) const;
} // namespace common
} // namespace xgboost
template void BuildHist<false>(Span<GradientPair const> gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix &gmat, GHistRow hist,
bool force_read_by_column);
} // namespace xgboost::common

View File

@@ -19,16 +19,14 @@
#include <vector>
#include "categorical.h"
#include "cuda_context.cuh" // for CUDAContext
#include "device_helpers.cuh"
#include "hist_util.cuh"
#include "hist_util.h"
#include "math.h" // NOLINT
#include "quantile.h"
#include "xgboost/host_device_vector.h"
namespace xgboost {
namespace common {
namespace xgboost::common {
constexpr float SketchContainer::kFactor;
namespace detail {
@@ -87,13 +85,13 @@ size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz,
return peak;
}
size_t SketchBatchNumElements(size_t sketch_batch_num_elements,
bst_row_t num_rows, bst_feature_t columns,
size_t nnz, int device,
size_t num_cuts, bool has_weight) {
size_t SketchBatchNumElements(size_t sketch_batch_num_elements, bst_row_t num_rows,
bst_feature_t columns, size_t nnz, int device, size_t num_cuts,
bool has_weight) {
auto constexpr kIntMax = static_cast<std::size_t>(std::numeric_limits<std::int32_t>::max());
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
// device available memory is not accurate when rmm is used.
return nnz;
return std::min(nnz, kIntMax);
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
if (sketch_batch_num_elements == 0) {
@@ -106,256 +104,279 @@ size_t SketchBatchNumElements(size_t sketch_batch_num_elements,
sketch_batch_num_elements = std::min(num_rows * static_cast<size_t>(columns), nnz);
}
}
return sketch_batch_num_elements;
return std::min(sketch_batch_num_elements, kIntMax);
}
void SortByWeight(dh::device_vector<float>* weights,
dh::device_vector<Entry>* sorted_entries) {
void SortByWeight(dh::device_vector<float>* weights, dh::device_vector<Entry>* sorted_entries) {
// Sort both entries and wegihts.
dh::XGBDeviceAllocator<char> alloc;
CHECK_EQ(weights->size(), sorted_entries->size());
#if defined(XGBOOST_USE_CUDA)
thrust::sort_by_key(thrust::cuda::par(alloc), sorted_entries->begin(),
sorted_entries->end(), weights->begin(),
detail::EntryCompareOp());
#elif defined(XGBOOST_USE_HIP)
thrust::sort_by_key(thrust::hip::par(alloc), sorted_entries->begin(),
sorted_entries->end(), weights->begin(),
detail::EntryCompareOp());
#endif
thrust::sort_by_key(thrust::cuda::par(alloc), sorted_entries->begin(), sorted_entries->end(),
weights->begin(), detail::EntryCompareOp());
// Scan weights
dh::XGBCachingDeviceAllocator<char> caching;
#if defined(XGBOOST_USE_CUDA)
thrust::inclusive_scan_by_key(thrust::cuda::par(caching),
sorted_entries->begin(), sorted_entries->end(),
weights->begin(), weights->begin(),
[=] __device__(const Entry& a, const Entry& b) {
return a.index == b.index;
});
thrust::inclusive_scan_by_key(
thrust::cuda::par(caching), sorted_entries->begin(), sorted_entries->end(), weights->begin(),
weights->begin(),
[=] __device__(const Entry& a, const Entry& b) { return a.index == b.index; });
#elif defined(XGBOOST_USE_HIP)
thrust::inclusive_scan_by_key(thrust::hip::par(caching),
sorted_entries->begin(), sorted_entries->end(),
weights->begin(), weights->begin(),
[=] __device__(const Entry& a, const Entry& b) {
return a.index == b.index;
});
thrust::sort_by_key(thrust::hip::par(alloc), sorted_entries->begin(), sorted_entries->end(),
weights->begin(), detail::EntryCompareOp());
// Scan weights
dh::XGBCachingDeviceAllocator<char> caching;
thrust::inclusive_scan_by_key(
thrust::hip::par(caching), sorted_entries->begin(), sorted_entries->end(), weights->begin(),
weights->begin(),
[=] __device__(const Entry& a, const Entry& b) { return a.index == b.index; });
#endif
}
void RemoveDuplicatedCategories(
int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
dh::device_vector<Entry> *p_sorted_entries,
dh::caching_device_vector<size_t> *p_column_sizes_scan) {
void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
dh::device_vector<Entry>* p_sorted_entries,
dh::device_vector<float>* p_sorted_weights,
dh::caching_device_vector<size_t>* p_column_sizes_scan) {
info.feature_types.SetDevice(device);
auto d_feature_types = info.feature_types.ConstDeviceSpan();
CHECK(!d_feature_types.empty());
auto &column_sizes_scan = *p_column_sizes_scan;
auto &sorted_entries = *p_sorted_entries;
auto& column_sizes_scan = *p_column_sizes_scan;
auto& sorted_entries = *p_sorted_entries;
// Removing duplicated entries in categorical features.
// We don't need to accumulate weight for duplicated entries as there's no weighted
// sketching for categorical features, the categories are the cut values.
dh::caching_device_vector<size_t> new_column_scan(column_sizes_scan.size());
dh::SegmentedUnique(column_sizes_scan.data().get(),
column_sizes_scan.data().get() + column_sizes_scan.size(),
sorted_entries.begin(), sorted_entries.end(),
new_column_scan.data().get(), sorted_entries.begin(),
[=] __device__(Entry const &l, Entry const &r) {
if (l.index == r.index) {
if (IsCat(d_feature_types, l.index)) {
return l.fvalue == r.fvalue;
}
}
return false;
});
std::size_t n_uniques{0};
if (p_sorted_weights) {
using Pair = thrust::tuple<Entry, float>;
auto d_sorted_entries = dh::ToSpan(sorted_entries);
auto d_sorted_weights = dh::ToSpan(*p_sorted_weights);
auto val_in_it = thrust::make_zip_iterator(d_sorted_entries.data(), d_sorted_weights.data());
auto val_out_it = thrust::make_zip_iterator(d_sorted_entries.data(), d_sorted_weights.data());
n_uniques = dh::SegmentedUnique(
column_sizes_scan.data().get(), column_sizes_scan.data().get() + column_sizes_scan.size(),
val_in_it, val_in_it + sorted_entries.size(), new_column_scan.data().get(), val_out_it,
[=] __device__(Pair const& l, Pair const& r) {
Entry const& le = thrust::get<0>(l);
Entry const& re = thrust::get<0>(r);
if (le.index == re.index && IsCat(d_feature_types, le.index)) {
return le.fvalue == re.fvalue;
}
return false;
});
p_sorted_weights->resize(n_uniques);
} else {
n_uniques = dh::SegmentedUnique(
column_sizes_scan.data().get(), column_sizes_scan.data().get() + column_sizes_scan.size(),
sorted_entries.begin(), sorted_entries.end(), new_column_scan.data().get(),
sorted_entries.begin(), [=] __device__(Entry const& l, Entry const& r) {
if (l.index == r.index) {
if (IsCat(d_feature_types, l.index)) {
return l.fvalue == r.fvalue;
}
}
return false;
});
}
sorted_entries.resize(n_uniques);
// Renew the column scan and cut scan based on categorical data.
auto d_old_column_sizes_scan = dh::ToSpan(column_sizes_scan);
dh::caching_device_vector<SketchContainer::OffsetT> new_cuts_size(
info.num_col_ + 1);
dh::caching_device_vector<SketchContainer::OffsetT> new_cuts_size(info.num_col_ + 1);
CHECK_EQ(new_column_scan.size(), new_cuts_size.size());
dh::LaunchN(
new_column_scan.size(),
[=, d_new_cuts_size = dh::ToSpan(new_cuts_size),
d_old_column_sizes_scan = dh::ToSpan(column_sizes_scan),
d_new_columns_ptr = dh::ToSpan(new_column_scan)] __device__(size_t idx) {
d_old_column_sizes_scan[idx] = d_new_columns_ptr[idx];
if (idx == d_new_columns_ptr.size() - 1) {
return;
}
if (IsCat(d_feature_types, idx)) {
// Cut size is the same as number of categories in input.
d_new_cuts_size[idx] =
d_new_columns_ptr[idx + 1] - d_new_columns_ptr[idx];
} else {
d_new_cuts_size[idx] = d_cuts_ptr[idx + 1] - d_cuts_ptr[idx];
}
});
dh::LaunchN(new_column_scan.size(),
[=, d_new_cuts_size = dh::ToSpan(new_cuts_size),
d_old_column_sizes_scan = dh::ToSpan(column_sizes_scan),
d_new_columns_ptr = dh::ToSpan(new_column_scan)] __device__(size_t idx) {
d_old_column_sizes_scan[idx] = d_new_columns_ptr[idx];
if (idx == d_new_columns_ptr.size() - 1) {
return;
}
if (IsCat(d_feature_types, idx)) {
// Cut size is the same as number of categories in input.
d_new_cuts_size[idx] = d_new_columns_ptr[idx + 1] - d_new_columns_ptr[idx];
} else {
d_new_cuts_size[idx] = d_cuts_ptr[idx + 1] - d_cuts_ptr[idx];
}
});
// Turn size into ptr.
thrust::exclusive_scan(thrust::device, new_cuts_size.cbegin(),
new_cuts_size.cend(), d_cuts_ptr.data());
thrust::exclusive_scan(thrust::device, new_cuts_size.cbegin(), new_cuts_size.cend(),
d_cuts_ptr.data());
}
} // namespace detail
void ProcessBatch(int device, MetaInfo const &info, const SparsePage &page,
size_t begin, size_t end, SketchContainer *sketch_container,
int num_cuts_per_feature, size_t num_columns) {
dh::XGBCachingDeviceAllocator<char> alloc;
void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo const& info,
std::size_t begin, std::size_t end,
SketchContainer* sketch_container, // <- output sketch
int num_cuts_per_feature, common::Span<float const> sample_weight) {
dh::device_vector<Entry> sorted_entries;
if (page.data.DeviceCanRead()) {
const auto& device_data = page.data.ConstDevicePointer();
sorted_entries = dh::device_vector<Entry>(device_data + begin, device_data + end);
// direct copy if data is already on device
auto const& d_data = page.data.ConstDevicePointer();
sorted_entries = dh::device_vector<Entry>(d_data + begin, d_data + end);
} else {
const auto& host_data = page.data.ConstHostVector();
sorted_entries = dh::device_vector<Entry>(host_data.begin() + begin,
host_data.begin() + end);
const auto& h_data = page.data.ConstHostVector();
sorted_entries = dh::device_vector<Entry>(h_data.begin() + begin, h_data.begin() + end);
}
#if defined(XGBOOST_USE_CUDA)
thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(),
sorted_entries.end(), detail::EntryCompareOp());
#elif defined(XGBOOST_USE_HIP)
thrust::sort(thrust::hip::par(alloc), sorted_entries.begin(),
sorted_entries.end(), detail::EntryCompareOp());
#endif
bst_row_t base_rowid = page.base_rowid;
dh::device_vector<float> entry_weight;
auto cuctx = ctx->CUDACtx();
if (!sample_weight.empty()) {
// Expand sample weight into entry weight.
CHECK_EQ(sample_weight.size(), info.num_row_);
entry_weight.resize(sorted_entries.size());
auto d_temp_weight = dh::ToSpan(entry_weight);
page.offset.SetDevice(ctx->Device());
auto row_ptrs = page.offset.ConstDeviceSpan();
thrust::for_each_n(cuctx->CTP(), thrust::make_counting_iterator(0ul), entry_weight.size(),
[=] __device__(std::size_t idx) {
std::size_t element_idx = idx + begin;
std::size_t ridx = dh::SegmentId(row_ptrs, element_idx);
d_temp_weight[idx] = sample_weight[ridx + base_rowid];
});
detail::SortByWeight(&entry_weight, &sorted_entries);
} else {
thrust::sort(cuctx->CTP(), sorted_entries.begin(), sorted_entries.end(),
detail::EntryCompareOp());
}
HostDeviceVector<SketchContainer::OffsetT> cuts_ptr;
dh::caching_device_vector<size_t> column_sizes_scan;
data::IsValidFunctor dummy_is_valid(std::numeric_limits<float>::quiet_NaN());
auto batch_it = dh::MakeTransformIterator<data::COOTuple>(
sorted_entries.data().get(),
[] __device__(Entry const &e) -> data::COOTuple {
return {0, e.index, e.fvalue}; // row_idx is not needed for scanning column size.
sorted_entries.data().get(), [] __device__(Entry const& e) -> data::COOTuple {
return {0, e.index, e.fvalue}; // row_idx is not needed for scaning column size.
});
detail::GetColumnSizesScan(device, num_columns, num_cuts_per_feature,
detail::GetColumnSizesScan(ctx->Ordinal(), info.num_col_, num_cuts_per_feature,
IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr,
&column_sizes_scan);
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
if (sketch_container->HasCategorical()) {
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
&sorted_entries, &column_sizes_scan);
auto p_weight = entry_weight.empty() ? nullptr : &entry_weight;
detail::RemoveDuplicatedCategories(ctx->Ordinal(), info, d_cuts_ptr, &sorted_entries, p_weight,
&column_sizes_scan);
}
auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
CHECK_EQ(d_cuts_ptr.size(), column_sizes_scan.size());
// add cuts into sketches
sketch_container->Push(dh::ToSpan(sorted_entries), dh::ToSpan(column_sizes_scan),
d_cuts_ptr, h_cuts_ptr.back());
// Add cuts into sketches
sketch_container->Push(dh::ToSpan(sorted_entries), dh::ToSpan(column_sizes_scan), d_cuts_ptr,
h_cuts_ptr.back(), dh::ToSpan(entry_weight));
sorted_entries.clear();
sorted_entries.shrink_to_fit();
CHECK_EQ(sorted_entries.capacity(), 0);
CHECK_NE(cuts_ptr.Size(), 0);
}
void ProcessWeightedBatch(int device, const SparsePage& page,
MetaInfo const& info, size_t begin, size_t end,
SketchContainer* sketch_container, int num_cuts_per_feature,
size_t num_columns,
bool is_ranking, Span<bst_group_t const> d_group_ptr) {
auto weights = info.weights_.ConstDeviceSpan();
// Unify group weight, Hessian, and sample weight into sample weight.
[[nodiscard]] Span<float const> UnifyWeight(CUDAContext const* cuctx, MetaInfo const& info,
common::Span<float const> hessian,
HostDeviceVector<float>* p_out_weight) {
if (hessian.empty()) {
if (info.IsRanking() && !info.weights_.Empty()) {
common::Span<float const> group_weight = info.weights_.ConstDeviceSpan();
dh::device_vector<bst_group_t> group_ptr(info.group_ptr_);
auto d_group_ptr = dh::ToSpan(group_ptr);
CHECK_GE(d_group_ptr.size(), 2) << "Must have at least 1 group for ranking.";
auto d_weight = info.weights_.ConstDeviceSpan();
CHECK_EQ(d_weight.size(), d_group_ptr.size() - 1)
<< "Weight size should equal to number of groups.";
p_out_weight->Resize(info.num_row_);
auto d_weight_out = p_out_weight->DeviceSpan();
dh::XGBCachingDeviceAllocator<char> alloc;
const auto& host_data = page.data.ConstHostVector();
dh::device_vector<Entry> sorted_entries(host_data.begin() + begin,
host_data.begin() + end);
// Binary search to assign weights to each element
dh::device_vector<float> temp_weights(sorted_entries.size());
auto d_temp_weights = temp_weights.data().get();
page.offset.SetDevice(device);
auto row_ptrs = page.offset.ConstDeviceSpan();
size_t base_rowid = page.base_rowid;
if (is_ranking) {
CHECK_GE(d_group_ptr.size(), 2)
<< "Must have at least 1 group for ranking.";
CHECK_EQ(weights.size(), d_group_ptr.size() - 1)
<< "Weight size should equal to number of groups.";
dh::LaunchN(temp_weights.size(), [=] __device__(size_t idx) {
size_t element_idx = idx + begin;
size_t ridx = dh::SegmentId(row_ptrs, element_idx);
bst_group_t group_idx = dh::SegmentId(d_group_ptr, ridx + base_rowid);
d_temp_weights[idx] = weights[group_idx];
});
} else {
dh::LaunchN(temp_weights.size(), [=] __device__(size_t idx) {
size_t element_idx = idx + begin;
size_t ridx = dh::SegmentId(row_ptrs, element_idx);
d_temp_weights[idx] = weights[ridx + base_rowid];
});
}
detail::SortByWeight(&temp_weights, &sorted_entries);
HostDeviceVector<SketchContainer::OffsetT> cuts_ptr;
dh::caching_device_vector<size_t> column_sizes_scan;
data::IsValidFunctor dummy_is_valid(std::numeric_limits<float>::quiet_NaN());
auto batch_it = dh::MakeTransformIterator<data::COOTuple>(
sorted_entries.data().get(),
[] __device__(Entry const &e) -> data::COOTuple {
return {0, e.index, e.fvalue}; // row_idx is not needed for scaning column size.
});
detail::GetColumnSizesScan(device, num_columns, num_cuts_per_feature,
IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr,
&column_sizes_scan);
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
if (sketch_container->HasCategorical()) {
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
&sorted_entries, &column_sizes_scan);
}
auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
// Extract cuts
sketch_container->Push(dh::ToSpan(sorted_entries),
dh::ToSpan(column_sizes_scan), d_cuts_ptr,
h_cuts_ptr.back(), dh::ToSpan(temp_weights));
sorted_entries.clear();
sorted_entries.shrink_to_fit();
}
HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins,
size_t sketch_batch_num_elements) {
dmat->Info().feature_types.SetDevice(device);
dmat->Info().feature_types.ConstDevicePointer(); // pull to device early
// Configure batch size based on available memory
bool has_weights = dmat->Info().weights_.Size() > 0;
size_t num_cuts_per_feature =
detail::RequiredSampleCutsPerColumn(max_bins, dmat->Info().num_row_);
sketch_batch_num_elements = detail::SketchBatchNumElements(
sketch_batch_num_elements,
dmat->Info().num_row_,
dmat->Info().num_col_,
dmat->Info().num_nonzero_,
device, num_cuts_per_feature, has_weights);
HistogramCuts cuts;
SketchContainer sketch_container(dmat->Info().feature_types, max_bins, dmat->Info().num_col_,
dmat->Info().num_row_, device);
dmat->Info().weights_.SetDevice(device);
for (const auto& batch : dmat->GetBatches<SparsePage>()) {
size_t batch_nnz = batch.data.Size();
auto const& info = dmat->Info();
for (auto begin = 0ull; begin < batch_nnz; begin += sketch_batch_num_elements) {
size_t end = std::min(batch_nnz, static_cast<std::size_t>(begin + sketch_batch_num_elements));
if (has_weights) {
bool is_ranking = HostSketchContainer::UseGroup(dmat->Info());
dh::caching_device_vector<uint32_t> groups(info.group_ptr_.cbegin(),
info.group_ptr_.cend());
ProcessWeightedBatch(
device, batch, dmat->Info(), begin, end,
&sketch_container,
num_cuts_per_feature,
dmat->Info().num_col_,
is_ranking, dh::ToSpan(groups));
} else {
ProcessBatch(device, dmat->Info(), batch, begin, end, &sketch_container,
num_cuts_per_feature, dmat->Info().num_col_);
}
thrust::for_each_n(cuctx->CTP(), thrust::make_counting_iterator(0ul), d_weight_out.size(),
[=] XGBOOST_DEVICE(std::size_t i) {
auto gidx = dh::SegmentId(d_group_ptr, i);
d_weight_out[i] = d_weight[gidx];
});
return p_out_weight->ConstDeviceSpan();
} else {
return info.weights_.ConstDeviceSpan();
}
}
sketch_container.MakeCuts(&cuts);
// sketch with hessian as weight
p_out_weight->Resize(info.num_row_);
auto d_weight_out = p_out_weight->DeviceSpan();
if (!info.weights_.Empty()) {
// merge sample weight with hessian
auto d_weight = info.weights_.ConstDeviceSpan();
if (info.IsRanking()) {
dh::device_vector<bst_group_t> group_ptr(info.group_ptr_);
CHECK_EQ(hessian.size(), d_weight_out.size());
auto d_group_ptr = dh::ToSpan(group_ptr);
CHECK_GE(d_group_ptr.size(), 2) << "Must have at least 1 group for ranking.";
CHECK_EQ(d_weight.size(), d_group_ptr.size() - 1)
<< "Weight size should equal to number of groups.";
thrust::for_each_n(cuctx->CTP(), thrust::make_counting_iterator(0ul), hessian.size(),
[=] XGBOOST_DEVICE(std::size_t i) {
d_weight_out[i] = d_weight[dh::SegmentId(d_group_ptr, i)] * hessian(i);
});
} else {
CHECK_EQ(hessian.size(), info.num_row_);
CHECK_EQ(hessian.size(), d_weight.size());
CHECK_EQ(hessian.size(), d_weight_out.size());
thrust::for_each_n(
cuctx->CTP(), thrust::make_counting_iterator(0ul), hessian.size(),
[=] XGBOOST_DEVICE(std::size_t i) { d_weight_out[i] = d_weight[i] * hessian(i); });
}
} else {
// copy hessian as weight
CHECK_EQ(d_weight_out.size(), hessian.size());
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(d_weight_out.data(), hessian.data(), hessian.size_bytes(),
cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_weight_out.data(), hessian.data(), hessian.size_bytes(),
hipMemcpyDefault));
#endif
}
return d_weight_out;
}
HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_bin_t max_bin,
Span<float const> hessian,
std::size_t sketch_batch_num_elements) {
auto const& info = p_fmat->Info();
bool has_weight = !info.weights_.Empty();
info.feature_types.SetDevice(ctx->Device());
HostDeviceVector<float> weight;
weight.SetDevice(ctx->Device());
// Configure batch size based on available memory
std::size_t num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(max_bin, info.num_row_);
sketch_batch_num_elements = detail::SketchBatchNumElements(
sketch_batch_num_elements, info.num_row_, info.num_col_, info.num_nonzero_, ctx->Ordinal(),
num_cuts_per_feature, has_weight);
CUDAContext const* cuctx = ctx->CUDACtx();
info.weights_.SetDevice(ctx->Device());
auto d_weight = UnifyWeight(cuctx, info, hessian, &weight);
HistogramCuts cuts;
SketchContainer sketch_container(info.feature_types, max_bin, info.num_col_, info.num_row_,
ctx->Ordinal());
CHECK_EQ(has_weight || !hessian.empty(), !d_weight.empty());
for (const auto& page : p_fmat->GetBatches<SparsePage>()) {
std::size_t page_nnz = page.data.Size();
for (auto begin = 0ull; begin < page_nnz; begin += sketch_batch_num_elements) {
std::size_t end =
std::min(page_nnz, static_cast<std::size_t>(begin + sketch_batch_num_elements));
ProcessWeightedBatch(ctx, page, info, begin, end, &sketch_container, num_cuts_per_feature,
d_weight);
}
}
sketch_container.MakeCuts(&cuts, p_fmat->Info().IsColumnSplit());
return cuts;
}
} // namespace common
} // namespace xgboost
} // namespace xgboost::common

View File

@@ -11,18 +11,17 @@
#include <cstddef> // for size_t
#include "../data/device_adapter.cuh"
#include "../data/adapter.h" // for IsValidFunctor
#include "device_helpers.cuh"
#include "hist_util.h"
#include "quantile.cuh"
#include "timer.h"
#include "xgboost/span.h" // for IterSpan
#if defined(XGBOOST_USE_HIP)
namespace cub = hipcub;
#endif
namespace xgboost {
namespace common {
namespace xgboost::common {
namespace cuda {
/**
* copy and paste of the host version, we can't make it a __host__ __device__ function as
@@ -148,12 +147,12 @@ void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan<BatchIt> batch_iter
CHECK(!force_use_u64);
auto kernel = GetColumnSizeSharedMemKernel<kBlockThreads, std::uint32_t, BatchIt>;
auto grid_size = EstimateGridSize<kBlockThreads>(device, kernel, required_shared_memory);
dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory, dh::DefaultStream()}(
dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory}(
kernel, batch_iter, is_valid, out_column_size);
} else {
auto kernel = GetColumnSizeSharedMemKernel<kBlockThreads, std::size_t, BatchIt>;
auto grid_size = EstimateGridSize<kBlockThreads>(device, kernel, required_shared_memory);
dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory, dh::DefaultStream()}(
dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory}(
kernel, batch_iter, is_valid, out_column_size);
}
} else {
@@ -262,16 +261,41 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Ran
void SortByWeight(dh::device_vector<float>* weights,
dh::device_vector<Entry>* sorted_entries);
void RemoveDuplicatedCategories(
int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
dh::device_vector<Entry> *p_sorted_entries,
dh::caching_device_vector<size_t> *p_column_sizes_scan);
void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
dh::device_vector<Entry>* p_sorted_entries,
dh::device_vector<float>* p_sorted_weights,
dh::caching_device_vector<size_t>* p_column_sizes_scan);
} // namespace detail
// Compute sketch on DMatrix.
// sketch_batch_num_elements 0 means autodetect. Only modify this for testing.
HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins,
size_t sketch_batch_num_elements = 0);
/**
* @brief Compute sketch on DMatrix with GPU and Hessian as weight.
*
* @param ctx Runtime context
* @param p_fmat Training feature matrix
* @param max_bin Maximum number of bins for each feature
* @param hessian Hessian vector.
* @param sketch_batch_num_elements 0 means autodetect. Only modify this for testing.
*
* @return Quantile cuts
*/
HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_bin_t max_bin,
Span<float const> hessian,
std::size_t sketch_batch_num_elements = 0);
/**
* @brief Compute sketch on DMatrix with GPU.
*
* @param ctx Runtime context
* @param p_fmat Training feature matrix
* @param max_bin Maximum number of bins for each feature
* @param sketch_batch_num_elements 0 means autodetect. Only modify this for testing.
*
* @return Quantile cuts
*/
inline HistogramCuts DeviceSketch(Context const* ctx, DMatrix* p_fmat, bst_bin_t max_bin,
std::size_t sketch_batch_num_elements = 0) {
return DeviceSketchWithHessian(ctx, p_fmat, max_bin, {}, sketch_batch_num_elements);
}
template <typename AdapterBatch>
void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
@@ -303,8 +327,8 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
if (sketch_container->HasCategorical()) {
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
&sorted_entries, &column_sizes_scan);
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, nullptr,
&column_sizes_scan);
}
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
@@ -408,8 +432,8 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
if (sketch_container->HasCategorical()) {
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
&sorted_entries, &column_sizes_scan);
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, &temp_weights,
&column_sizes_scan);
}
auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
@@ -471,7 +495,5 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
}
}
}
} // namespace common
} // namespace xgboost
} // namespace xgboost::common
#endif // COMMON_HIST_UTIL_CUH_

View File

@@ -16,11 +16,9 @@
#include <vector>
#include "categorical.h"
#include "common.h"
#include "quantile.h"
#include "row_set.h"
#include "threading_utils.h"
#include "timer.h"
#include "xgboost/base.h" // for bst_feature_t, bst_bin_t
#include "xgboost/data.h"
@@ -84,7 +82,7 @@ class HistogramCuts {
return *this;
}
uint32_t FeatureBins(bst_feature_t feature) const {
[[nodiscard]] bst_bin_t FeatureBins(bst_feature_t feature) const {
return cut_ptrs_.ConstHostVector().at(feature + 1) - cut_ptrs_.ConstHostVector()[feature];
}
@@ -92,8 +90,8 @@ class HistogramCuts {
std::vector<float> const& Values() const { return cut_values_.ConstHostVector(); }
std::vector<float> const& MinValues() const { return min_vals_.ConstHostVector(); }
bool HasCategorical() const { return has_categorical_; }
float MaxCategory() const { return max_cat_; }
[[nodiscard]] bool HasCategorical() const { return has_categorical_; }
[[nodiscard]] float MaxCategory() const { return max_cat_; }
/**
* \brief Set meta info about categorical features.
*
@@ -105,12 +103,13 @@ class HistogramCuts {
max_cat_ = max_cat;
}
size_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); }
[[nodiscard]] bst_bin_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); }
// Return the index of a cut point that is strictly greater than the input
// value, or the last available index if none exists
bst_bin_t SearchBin(float value, bst_feature_t column_id, std::vector<uint32_t> const& ptrs,
std::vector<float> const& values) const {
[[nodiscard]] bst_bin_t SearchBin(float value, bst_feature_t column_id,
std::vector<uint32_t> const& ptrs,
std::vector<float> const& values) const {
auto end = ptrs[column_id + 1];
auto beg = ptrs[column_id];
auto it = std::upper_bound(values.cbegin() + beg, values.cbegin() + end, value);
@@ -119,20 +118,20 @@ class HistogramCuts {
return idx;
}
bst_bin_t SearchBin(float value, bst_feature_t column_id) const {
[[nodiscard]] bst_bin_t SearchBin(float value, bst_feature_t column_id) const {
return this->SearchBin(value, column_id, Ptrs(), Values());
}
/**
* \brief Search the bin index for numerical feature.
*/
bst_bin_t SearchBin(Entry const& e) const { return SearchBin(e.fvalue, e.index); }
[[nodiscard]] bst_bin_t SearchBin(Entry const& e) const { return SearchBin(e.fvalue, e.index); }
/**
* \brief Search the bin index for categorical feature.
*/
bst_bin_t SearchCatBin(float value, bst_feature_t fidx, std::vector<uint32_t> const& ptrs,
std::vector<float> const& vals) const {
[[nodiscard]] bst_bin_t SearchCatBin(float value, bst_feature_t fidx,
std::vector<uint32_t> const& ptrs,
std::vector<float> const& vals) const {
auto end = ptrs.at(fidx + 1) + vals.cbegin();
auto beg = ptrs[fidx] + vals.cbegin();
// Truncates the value in case it's not perfectly rounded.
@@ -143,12 +142,14 @@ class HistogramCuts {
}
return bin_idx;
}
bst_bin_t SearchCatBin(float value, bst_feature_t fidx) const {
[[nodiscard]] bst_bin_t SearchCatBin(float value, bst_feature_t fidx) const {
auto const& ptrs = this->Ptrs();
auto const& vals = this->Values();
return this->SearchCatBin(value, fidx, ptrs, vals);
}
bst_bin_t SearchCatBin(Entry const& e) const { return SearchCatBin(e.fvalue, e.index); }
[[nodiscard]] bst_bin_t SearchCatBin(Entry const& e) const {
return SearchCatBin(e.fvalue, e.index);
}
/**
* \brief Return numerical bin value given bin index.
@@ -171,7 +172,7 @@ class HistogramCuts {
* but consumes more memory.
*/
HistogramCuts SketchOnDMatrix(Context const* ctx, DMatrix* m, bst_bin_t max_bins,
bool use_sorted = false, Span<float> const hessian = {});
bool use_sorted = false, Span<float const> hessian = {});
enum BinTypeSize : uint8_t {
kUint8BinsTypeSize = 1,
@@ -200,13 +201,33 @@ auto DispatchBinType(BinTypeSize type, Fn&& fn) {
}
/**
* \brief Optionally compressed gradient index. The compression works only with dense
* @brief Optionally compressed gradient index. The compression works only with dense
* data.
*
* The main body of construction code is in gradient_index.cc, this struct is only a
* storage class.
* view class.
*/
struct Index {
class Index {
private:
void SetBinTypeSize(BinTypeSize binTypeSize) {
binTypeSize_ = binTypeSize;
switch (binTypeSize) {
case kUint8BinsTypeSize:
func_ = &GetValueFromUint8;
break;
case kUint16BinsTypeSize:
func_ = &GetValueFromUint16;
break;
case kUint32BinsTypeSize:
func_ = &GetValueFromUint32;
break;
default:
CHECK(binTypeSize == kUint8BinsTypeSize || binTypeSize == kUint16BinsTypeSize ||
binTypeSize == kUint32BinsTypeSize);
}
}
public:
// Inside the compressor, bin_idx is the index for cut value across all features. By
// subtracting it with starting pointer of each feature, we can reduce it to smaller
// value and store it with smaller types. Usable only with dense data.
@@ -230,10 +251,24 @@ struct Index {
}
Index() { SetBinTypeSize(binTypeSize_); }
Index(const Index& i) = delete;
Index& operator=(Index i) = delete;
Index(Index const& i) = delete;
Index& operator=(Index const& i) = delete;
Index(Index&& i) = delete;
Index& operator=(Index&& i) = delete;
/** @brief Move assignment for lazy initialization. */
Index& operator=(Index&& i) = default;
/**
* @brief Construct the index from data.
*
* @param data Storage for compressed histogram bin.
* @param bin_size Number of bytes for each bin.
*/
Index(Span<std::uint8_t> data, BinTypeSize bin_size) : data_{data} {
this->SetBinTypeSize(bin_size);
}
uint32_t operator[](size_t i) const {
if (!bin_offset_.empty()) {
// dense, compressed
@@ -244,26 +279,7 @@ struct Index {
return func_(data_.data(), i);
}
}
void SetBinTypeSize(BinTypeSize binTypeSize) {
binTypeSize_ = binTypeSize;
switch (binTypeSize) {
case kUint8BinsTypeSize:
func_ = &GetValueFromUint8;
break;
case kUint16BinsTypeSize:
func_ = &GetValueFromUint16;
break;
case kUint32BinsTypeSize:
func_ = &GetValueFromUint32;
break;
default:
CHECK(binTypeSize == kUint8BinsTypeSize || binTypeSize == kUint16BinsTypeSize ||
binTypeSize == kUint32BinsTypeSize);
}
}
BinTypeSize GetBinTypeSize() const {
return binTypeSize_;
}
[[nodiscard]] BinTypeSize GetBinTypeSize() const { return binTypeSize_; }
template <typename T>
T const* data() const { // NOLINT
return reinterpret_cast<T const*>(data_.data());
@@ -272,30 +288,27 @@ struct Index {
T* data() { // NOLINT
return reinterpret_cast<T*>(data_.data());
}
uint32_t const* Offset() const { return bin_offset_.data(); }
size_t OffsetSize() const { return bin_offset_.size(); }
size_t Size() const { return data_.size() / (binTypeSize_); }
[[nodiscard]] std::uint32_t const* Offset() const { return bin_offset_.data(); }
[[nodiscard]] std::size_t OffsetSize() const { return bin_offset_.size(); }
[[nodiscard]] std::size_t Size() const { return data_.size() / (binTypeSize_); }
void Resize(const size_t n_bytes) {
data_.resize(n_bytes);
}
// set the offset used in compression, cut_ptrs is the CSC indptr in HistogramCuts
void SetBinOffset(std::vector<uint32_t> const& cut_ptrs) {
bin_offset_.resize(cut_ptrs.size() - 1); // resize to number of features.
std::copy_n(cut_ptrs.begin(), bin_offset_.size(), bin_offset_.begin());
}
std::vector<uint8_t>::const_iterator begin() const { // NOLINT
return data_.begin();
auto begin() const { // NOLINT
return data_.data();
}
std::vector<uint8_t>::const_iterator end() const { // NOLINT
return data_.end();
auto end() const { // NOLINT
return data_.data() + data_.size();
}
std::vector<uint8_t>::iterator begin() { // NOLINT
return data_.begin();
auto begin() { // NOLINT
return data_.data();
}
std::vector<uint8_t>::iterator end() { // NOLINT
return data_.end();
auto end() { // NOLINT
return data_.data() + data_.size();
}
private:
@@ -310,12 +323,12 @@ struct Index {
using Func = uint32_t (*)(uint8_t const*, size_t);
std::vector<uint8_t> data_;
Span<std::uint8_t> data_;
// starting position of each feature inside the cut values (the indptr of the CSC cut matrix
// HistogramCuts without the last entry.) Used for bin compression.
std::vector<uint32_t> bin_offset_;
BinTypeSize binTypeSize_ {kUint8BinsTypeSize};
BinTypeSize binTypeSize_{kUint8BinsTypeSize};
Func func_;
};
@@ -349,16 +362,12 @@ bst_bin_t XGBOOST_HOST_DEV_INLINE BinarySearchBin(std::size_t begin, std::size_t
}
using GHistRow = Span<xgboost::GradientPairPrecise>;
/*!
* \brief fill a histogram by zeros
*/
void InitilizeHistByZeroes(GHistRow hist, size_t begin, size_t end);
using ConstGHistRow = Span<xgboost::GradientPairPrecise const>;
/*!
* \brief Increment hist as dst += add in range [begin, end)
*/
void IncrementHist(GHistRow dst, const GHistRow add, size_t begin, size_t end);
void IncrementHist(GHistRow dst, ConstGHistRow add, std::size_t begin, std::size_t end);
/*!
* \brief Copy hist from src to dst in range [begin, end)
@@ -381,12 +390,7 @@ class HistCollection {
constexpr uint32_t kMax = std::numeric_limits<uint32_t>::max();
const size_t id = row_ptr_.at(nid);
CHECK_NE(id, kMax);
GradientPairPrecise* ptr = nullptr;
if (contiguous_allocation_) {
ptr = const_cast<GradientPairPrecise*>(data_[0].data() + nbins_*id);
} else {
ptr = const_cast<GradientPairPrecise*>(data_[id].data());
}
GradientPairPrecise* ptr = const_cast<GradientPairPrecise*>(data_[id].data());
return {ptr, nbins_};
}
@@ -431,23 +435,12 @@ class HistCollection {
data_[row_ptr_[nid]].resize(nbins_, {0, 0});
}
}
// allocate common buffer contiguously for all nodes, need for single Allreduce call
void AllocateAllData() {
const size_t new_size = nbins_*data_.size();
contiguous_allocation_ = true;
if (data_[0].size() != new_size) {
data_[0].resize(new_size);
}
}
private:
/*! \brief number of all bins over all features */
uint32_t nbins_ = 0;
/*! \brief amount of active nodes in hist collection */
uint32_t n_nodes_added_ = 0;
/*! \brief flag to identify contiguous memory allocation */
bool contiguous_allocation_ = false;
std::vector<std::vector<GradientPairPrecise>> data_;
/*! \brief row_ptr_[nid] locates bin for histogram of node nid */
@@ -503,7 +496,7 @@ class ParallelGHistBuilder {
GHistRow hist = idx == -1 ? targeted_hists_[nid] : hist_buffer_[idx];
if (!hist_was_used_[tid * nodes_ + nid]) {
InitilizeHistByZeroes(hist, 0, hist.size());
std::fill_n(hist.data(), hist.size(), GradientPairPrecise{});
hist_was_used_[tid * nodes_ + nid] = static_cast<int>(true);
}
@@ -533,7 +526,7 @@ class ParallelGHistBuilder {
if (!is_updated) {
// In distributed mode - some tree nodes can be empty on local machines,
// So we need just set local hist by zeros in this case
InitilizeHistByZeroes(dst, begin, end);
std::fill(dst.data() + begin, dst.data() + end, GradientPairPrecise{});
}
}
@@ -583,6 +576,8 @@ class ParallelGHistBuilder {
}
}
[[nodiscard]] bst_bin_t TotalBins() const { return nbins_; }
private:
void MatchNodeNidPairToHist() {
size_t hist_allocated_additionally = 0;
@@ -628,27 +623,10 @@ class ParallelGHistBuilder {
std::map<std::pair<size_t, size_t>, int> tid_nid_to_hist_;
};
/*!
* \brief builder for histograms of gradient statistics
*/
class GHistBuilder {
public:
GHistBuilder() = default;
explicit GHistBuilder(uint32_t nbins): nbins_{nbins} {}
// construct a histogram via histogram aggregation
template <bool any_missing>
void BuildHist(Span<GradientPair const> gpair, const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat, GHistRow hist,
bool force_read_by_column = false) const;
uint32_t GetNumBins() const {
return nbins_;
}
private:
/*! \brief number of all bins over all features */
uint32_t nbins_ { 0 };
};
// construct a histogram via histogram aggregation
template <bool any_missing>
void BuildHist(Span<GradientPair const> gpair, const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat, GHistRow hist, bool force_read_by_column = false);
} // namespace common
} // namespace xgboost
#endif // XGBOOST_COMMON_HIST_UTIL_H_

View File

@@ -168,6 +168,9 @@ bool HostDeviceVector<T>::DeviceCanWrite() const {
template <typename T>
void HostDeviceVector<T>::SetDevice(int) const {}
template <typename T>
void HostDeviceVector<T>::SetDevice(DeviceOrd) const {}
// explicit instantiations are required, as HostDeviceVector isn't header-only
template class HostDeviceVector<bst_float>;
template class HostDeviceVector<double>;

View File

@@ -434,6 +434,11 @@ void HostDeviceVector<T>::SetDevice(int device) const {
impl_->SetDevice(device);
}
template <typename T>
void HostDeviceVector<T>::SetDevice(DeviceOrd device) const {
impl_->SetDevice(device.ordinal);
}
template <typename T>
void HostDeviceVector<T>::Resize(size_t new_size, T v) {
impl_->Resize(new_size, v);

View File

@@ -1,24 +1,48 @@
/*!
* Copyright (c) by XGBoost Contributors 2019-2022
/**
* Copyright 2019-2023, by XGBoost Contributors
*/
#if defined(__unix__)
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#if !defined(NOMINMAX) && defined(_WIN32)
#define NOMINMAX
#endif // !defined(NOMINMAX)
#if !defined(xgboost_IS_WIN)
#if defined(_MSC_VER) || defined(__MINGW32__)
#define xgboost_IS_WIN 1
#endif // defined(_MSC_VER) || defined(__MINGW32__)
#endif // !defined(xgboost_IS_WIN)
#if defined(__unix__) || defined(__APPLE__)
#include <fcntl.h> // for open, O_RDONLY
#include <sys/mman.h> // for mmap, mmap64, munmap
#include <unistd.h> // for close, getpagesize
#elif defined(xgboost_IS_WIN)
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif // defined(__unix__)
#include <algorithm>
#include <fstream>
#include <string>
#include <memory>
#include <utility>
#include <cstdio>
#include "xgboost/logging.h"
#include <algorithm> // for copy, transform
#include <cctype> // for tolower
#include <cerrno> // for errno
#include <cstddef> // for size_t
#include <cstdint> // for int32_t, uint32_t
#include <cstring> // for memcpy
#include <filesystem> // for filesystem, weakly_canonical
#include <fstream> // for ifstream
#include <iterator> // for distance
#include <limits> // for numeric_limits
#include <memory> // for unique_ptr
#include <string> // for string
#include <system_error> // for error_code, system_category
#include <utility> // for move
#include <vector> // for vector
#include "io.h"
#include "xgboost/collective/socket.h" // for LastError
#include "xgboost/logging.h"
namespace xgboost {
namespace common {
namespace xgboost::common {
size_t PeekableInStream::Read(void* dptr, size_t size) {
size_t nbuffer = buffer_.length() - buffer_ptr_;
if (nbuffer == 0) return strm_->Read(dptr, size);
@@ -94,52 +118,50 @@ void FixedSizeStream::Take(std::string* out) {
*out = std::move(buffer_);
}
std::string LoadSequentialFile(std::string uri, bool stream) {
namespace {
// Get system alignment value for IO with mmap.
std::size_t GetMmapAlignment() {
#if defined(xgboost_IS_WIN)
SYSTEM_INFO sys_info;
GetSystemInfo(&sys_info);
// During testing, `sys_info.dwPageSize` is of size 4096 while `dwAllocationGranularity` is of
// size 65536.
return sys_info.dwAllocationGranularity;
#else
return getpagesize();
#endif
}
auto SystemErrorMsg() {
std::int32_t errsv = system::LastError();
auto err = std::error_code{errsv, std::system_category()};
return err.message();
}
} // anonymous namespace
std::vector<char> LoadSequentialFile(std::string uri) {
auto OpenErr = [&uri]() {
std::string msg;
msg = "Opening " + uri + " failed: ";
msg += strerror(errno);
msg += SystemErrorMsg();
LOG(FATAL) << msg;
};
auto parsed = dmlc::io::URI(uri.c_str());
CHECK((parsed.protocol == "file://" || parsed.protocol.length() == 0))
<< "Only local file is supported.";
// Read from file.
if ((parsed.protocol == "file://" || parsed.protocol.length() == 0) && !stream) {
std::string buffer;
// Open in binary mode so that correct file size can be computed with
// seekg(). This accommodates Windows platform:
// https://docs.microsoft.com/en-us/cpp/standard-library/basic-istream-class?view=vs-2019#seekg
std::ifstream ifs(uri, std::ios_base::binary | std::ios_base::in);
if (!ifs) {
// https://stackoverflow.com/a/17338934
OpenErr();
}
ifs.seekg(0, std::ios_base::end);
const size_t file_size = static_cast<size_t>(ifs.tellg());
ifs.seekg(0, std::ios_base::beg);
buffer.resize(file_size + 1);
ifs.read(&buffer[0], file_size);
buffer.back() = '\0';
return buffer;
auto path = std::filesystem::weakly_canonical(std::filesystem::u8path(uri));
std::ifstream ifs(path, std::ios_base::binary | std::ios_base::in);
if (!ifs) {
// https://stackoverflow.com/a/17338934
OpenErr();
}
// Read from remote.
std::unique_ptr<dmlc::Stream> fs{dmlc::Stream::Create(uri.c_str(), "r")};
std::string buffer;
size_t constexpr kInitialSize = 4096;
size_t size {kInitialSize}, total {0};
while (true) {
buffer.resize(total + size);
size_t read = fs->Read(&buffer[total], size);
total += read;
if (read < size) {
break;
}
size *= 2;
}
buffer.resize(total);
auto file_size = std::filesystem::file_size(path);
std::vector<char> buffer(file_size);
ifs.read(&buffer[0], file_size);
return buffer;
}
@@ -155,5 +177,159 @@ std::string FileExtension(std::string fname, bool lower) {
return "";
}
}
} // namespace common
} // namespace xgboost
// For some reason, NVCC 12.1 marks the function deleted if we expose it in the header.
// NVCC 11.8 doesn't allow `noexcept(false) = default` altogether.
ResourceHandler::~ResourceHandler() noexcept(false) {} // NOLINT
struct MMAPFile {
#if defined(xgboost_IS_WIN)
HANDLE fd{INVALID_HANDLE_VALUE};
HANDLE file_map{INVALID_HANDLE_VALUE};
#else
std::int32_t fd{0};
#endif
std::byte* base_ptr{nullptr};
std::size_t base_size{0};
std::size_t delta{0};
std::string path;
MMAPFile() = default;
#if defined(xgboost_IS_WIN)
MMAPFile(HANDLE fd, HANDLE fm, std::byte* base_ptr, std::size_t base_size, std::size_t delta,
std::string path)
: fd{fd},
file_map{fm},
base_ptr{base_ptr},
base_size{base_size},
delta{delta},
path{std::move(path)} {}
#else
MMAPFile(std::int32_t fd, std::byte* base_ptr, std::size_t base_size, std::size_t delta,
std::string path)
: fd{fd}, base_ptr{base_ptr}, base_size{base_size}, delta{delta}, path{std::move(path)} {}
#endif
};
std::unique_ptr<MMAPFile> Open(std::string path, std::size_t offset, std::size_t length) {
if (length == 0) {
return std::make_unique<MMAPFile>();
}
#if defined(xgboost_IS_WIN)
HANDLE fd = CreateFile(path.c_str(), GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING,
FILE_ATTRIBUTE_NORMAL | FILE_FLAG_OVERLAPPED, nullptr);
CHECK_NE(fd, INVALID_HANDLE_VALUE) << "Failed to open:" << path << ". " << SystemErrorMsg();
#else
auto fd = open(path.c_str(), O_RDONLY);
CHECK_GE(fd, 0) << "Failed to open:" << path << ". " << SystemErrorMsg();
#endif
std::byte* ptr{nullptr};
// Round down for alignment.
auto view_start = offset / GetMmapAlignment() * GetMmapAlignment();
auto view_size = length + (offset - view_start);
#if defined(__linux__) || defined(__GLIBC__)
int prot{PROT_READ};
ptr = reinterpret_cast<std::byte*>(mmap64(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start));
madvise(ptr, view_size, MADV_WILLNEED);
CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg();
auto handle =
std::make_unique<MMAPFile>(fd, ptr, view_size, offset - view_start, std::move(path));
#elif defined(xgboost_IS_WIN)
auto file_size = GetFileSize(fd, nullptr);
DWORD access = PAGE_READONLY;
auto map_file = CreateFileMapping(fd, nullptr, access, 0, file_size, nullptr);
access = FILE_MAP_READ;
std::uint32_t loff = static_cast<std::uint32_t>(view_start);
std::uint32_t hoff = view_start >> 32;
CHECK(map_file) << "Failed to map: " << path << ". " << SystemErrorMsg();
ptr = reinterpret_cast<std::byte*>(MapViewOfFile(map_file, access, hoff, loff, view_size));
CHECK_NE(ptr, nullptr) << "Failed to map: " << path << ". " << SystemErrorMsg();
auto handle = std::make_unique<MMAPFile>(fd, map_file, ptr, view_size, offset - view_start,
std::move(path));
#else
CHECK_LE(offset, std::numeric_limits<off_t>::max())
<< "File size has exceeded the limit on the current system.";
int prot{PROT_READ};
ptr = reinterpret_cast<std::byte*>(mmap(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start));
CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg();
auto handle =
std::make_unique<MMAPFile>(fd, ptr, view_size, offset - view_start, std::move(path));
#endif // defined(__linux__)
return handle;
}
MmapResource::MmapResource(std::string path, std::size_t offset, std::size_t length)
: ResourceHandler{kMmap}, handle_{Open(std::move(path), offset, length)}, n_{length} {}
MmapResource::~MmapResource() noexcept(false) {
if (!handle_) {
return;
}
#if defined(xgboost_IS_WIN)
if (handle_->base_ptr) {
CHECK(UnmapViewOfFile(handle_->base_ptr)) "Faled to call munmap: " << SystemErrorMsg();
}
if (handle_->fd != INVALID_HANDLE_VALUE) {
CHECK(CloseHandle(handle_->fd)) << "Failed to close handle: " << SystemErrorMsg();
}
if (handle_->file_map != INVALID_HANDLE_VALUE) {
CHECK(CloseHandle(handle_->file_map)) << "Failed to close mapping object: " << SystemErrorMsg();
}
#else
if (handle_->base_ptr) {
CHECK_NE(munmap(handle_->base_ptr, handle_->base_size), -1)
<< "Faled to call munmap: " << handle_->path << ". " << SystemErrorMsg();
}
if (handle_->fd != 0) {
CHECK_NE(close(handle_->fd), -1)
<< "Faled to close: " << handle_->path << ". " << SystemErrorMsg();
}
#endif
}
[[nodiscard]] void* MmapResource::Data() {
if (!handle_) {
return nullptr;
}
return handle_->base_ptr + handle_->delta;
}
[[nodiscard]] std::size_t MmapResource::Size() const { return n_; }
// For some reason, NVCC 12.1 marks the function deleted if we expose it in the header.
// NVCC 11.8 doesn't allow `noexcept(false) = default` altogether.
AlignedResourceReadStream::~AlignedResourceReadStream() noexcept(false) {} // NOLINT
PrivateMmapConstStream::~PrivateMmapConstStream() noexcept(false) {} // NOLINT
AlignedFileWriteStream::AlignedFileWriteStream(StringView path, StringView flags)
: pimpl_{dmlc::Stream::Create(path.c_str(), flags.c_str())} {}
[[nodiscard]] std::size_t AlignedFileWriteStream::DoWrite(const void* ptr,
std::size_t n_bytes) noexcept(true) {
pimpl_->Write(ptr, n_bytes);
return n_bytes;
}
AlignedMemWriteStream::AlignedMemWriteStream(std::string* p_buf)
: pimpl_{std::make_unique<MemoryBufferStream>(p_buf)} {}
AlignedMemWriteStream::~AlignedMemWriteStream() = default;
[[nodiscard]] std::size_t AlignedMemWriteStream::DoWrite(const void* ptr,
std::size_t n_bytes) noexcept(true) {
this->pimpl_->Write(ptr, n_bytes);
return n_bytes;
}
[[nodiscard]] std::size_t AlignedMemWriteStream::Tell() const noexcept(true) {
return this->pimpl_->Tell();
}
} // namespace xgboost::common
#if defined(xgboost_IS_WIN)
#undef xgboost_IS_WIN
#endif // defined(xgboost_IS_WIN)

View File

@@ -1,23 +1,32 @@
/*!
* Copyright by XGBoost Contributors 2014-2022
/**
* Copyright 2014-2023, XGBoost Contributors
* \file io.h
* \brief general stream interface for serialization, I/O
* \author Tianqi Chen
*/
#ifndef XGBOOST_COMMON_IO_H_
#define XGBOOST_COMMON_IO_H_
#include <dmlc/io.h>
#include <rabit/rabit.h>
#include <string>
#include <cstring>
#include <fstream>
#include <algorithm> // for min, fill_n, copy_n
#include <array> // for array
#include <cstddef> // for byte, size_t
#include <cstdlib> // for malloc, realloc, free
#include <cstring> // for memcpy
#include <fstream> // for ifstream
#include <limits> // for numeric_limits
#include <memory> // for unique_ptr
#include <string> // for string
#include <type_traits> // for alignment_of_v, enable_if_t
#include <utility> // for move
#include <vector> // for vector
#include "common.h"
#include "xgboost/string_view.h" // for StringView
namespace xgboost {
namespace common {
namespace xgboost::common {
using MemoryFixSizeBuffer = rabit::utils::MemoryFixSizeBuffer;
using MemoryBufferStream = rabit::utils::MemoryBufferStream;
@@ -56,8 +65,8 @@ class FixedSizeStream : public PeekableInStream {
size_t Read(void* dptr, size_t size) override;
size_t PeekRead(void* dptr, size_t size) override;
size_t Size() const { return buffer_.size(); }
size_t Tell() const { return pointer_; }
[[nodiscard]] std::size_t Size() const { return buffer_.size(); }
[[nodiscard]] std::size_t Tell() const { return pointer_; }
void Seek(size_t pos);
void Write(const void*, size_t) override {
@@ -75,16 +84,14 @@ class FixedSizeStream : public PeekableInStream {
std::string buffer_;
};
/*!
* \brief Helper function for loading consecutive file to avoid dmlc Stream when possible.
/**
* @brief Helper function for loading consecutive file.
*
* \param uri URI or file name to file.
* \param stream Use dmlc Stream unconditionally if set to true. Used for running test
* without remote filesystem.
* @param uri URI or file name to file.
*
* \return File content.
* @return File content.
*/
std::string LoadSequentialFile(std::string uri, bool stream = false);
std::vector<char> LoadSequentialFile(std::string uri);
/**
* \brief Get file extension from file name.
@@ -127,6 +134,318 @@ inline std::string ReadAll(std::string const &path) {
return content;
}
} // namespace common
} // namespace xgboost
struct MMAPFile;
/**
* @brief Handler for one-shot resource. Unlike `std::pmr::*`, the resource handler is
* fixed once it's constructed. Users cannot use mutable operations like resize
* without acquiring the specific resource first.
*/
class ResourceHandler {
public:
// RTTI
enum Kind : std::uint8_t {
kMalloc = 0,
kMmap = 1,
};
private:
Kind kind_{kMalloc};
public:
virtual void* Data() = 0;
template <typename T>
[[nodiscard]] T* DataAs() {
return reinterpret_cast<T*>(this->Data());
}
[[nodiscard]] virtual std::size_t Size() const = 0;
[[nodiscard]] auto Type() const { return kind_; }
// Allow exceptions for cleaning up resource.
virtual ~ResourceHandler() noexcept(false);
explicit ResourceHandler(Kind kind) : kind_{kind} {}
// Use shared_ptr to manage a pool like resource handler. All copy and assignment
// operators are disabled.
ResourceHandler(ResourceHandler const& that) = delete;
ResourceHandler& operator=(ResourceHandler const& that) = delete;
ResourceHandler(ResourceHandler&& that) = delete;
ResourceHandler& operator=(ResourceHandler&& that) = delete;
/**
* @brief Wether two resources have the same type. (both malloc or both mmap).
*/
[[nodiscard]] bool IsSameType(ResourceHandler const& that) const {
return this->Type() == that.Type();
}
};
class MallocResource : public ResourceHandler {
void* ptr_{nullptr};
std::size_t n_{0};
void Clear() noexcept(true) {
std::free(ptr_);
ptr_ = nullptr;
n_ = 0;
}
public:
explicit MallocResource(std::size_t n_bytes) : ResourceHandler{kMalloc} { this->Resize(n_bytes); }
~MallocResource() noexcept(true) override { this->Clear(); }
void* Data() override { return ptr_; }
[[nodiscard]] std::size_t Size() const override { return n_; }
/**
* @brief Resize the resource to n_bytes. Unlike std::vector::resize, it prefers realloc
* over malloc.
*
* @tparam force_malloc Force the use of malloc over realloc. Used for testing.
*
* @param n_bytes The new size.
*/
template <bool force_malloc = false>
void Resize(std::size_t n_bytes, std::byte init = std::byte{0}) {
// realloc(ptr, 0) works, but is deprecated.
if (n_bytes == 0) {
this->Clear();
return;
}
// If realloc fails, we need to copy the data ourselves.
bool need_copy{false};
void* new_ptr{nullptr};
// use realloc first, it can handle nullptr.
if constexpr (!force_malloc) {
new_ptr = std::realloc(ptr_, n_bytes);
}
// retry with malloc if realloc fails
if (!new_ptr) {
// ptr_ is preserved if realloc fails
new_ptr = std::malloc(n_bytes);
need_copy = true;
}
if (!new_ptr) {
// malloc fails
LOG(FATAL) << "bad_malloc: Failed to allocate " << n_bytes << " bytes.";
}
if (need_copy) {
std::copy_n(reinterpret_cast<std::byte*>(ptr_), n_, reinterpret_cast<std::byte*>(new_ptr));
}
// default initialize
std::fill_n(reinterpret_cast<std::byte*>(new_ptr) + n_, n_bytes - n_, init);
// free the old ptr if malloc is used.
if (need_copy) {
this->Clear();
}
ptr_ = new_ptr;
n_ = n_bytes;
}
};
/**
* @brief A class for wrapping mmap as a resource for RAII.
*/
class MmapResource : public ResourceHandler {
std::unique_ptr<MMAPFile> handle_;
std::size_t n_;
public:
MmapResource(std::string path, std::size_t offset, std::size_t length);
~MmapResource() noexcept(false) override;
[[nodiscard]] void* Data() override;
[[nodiscard]] std::size_t Size() const override;
};
/**
* @param Alignment for resource read stream and aligned write stream.
*/
constexpr std::size_t IOAlignment() {
// For most of the pod types in XGBoost, 8 byte is sufficient.
return 8;
}
/**
* @brief Wrap resource into a dmlc stream.
*
* This class is to facilitate the use of mmap. Caller can optionally use the `Read()`
* method or the `Consume()` method. The former copies data into output, while the latter
* makes copy only if it's a primitive type.
*
* Input is required to be aligned to IOAlignment().
*/
class AlignedResourceReadStream {
std::shared_ptr<ResourceHandler> resource_;
std::size_t curr_ptr_{0};
// Similar to SEEK_END in libc
static std::size_t constexpr kSeekEnd = std::numeric_limits<std::size_t>::max();
public:
explicit AlignedResourceReadStream(std::shared_ptr<ResourceHandler> resource)
: resource_{std::move(resource)} {}
[[nodiscard]] std::shared_ptr<ResourceHandler> Share() noexcept(true) { return resource_; }
/**
* @brief Consume n_bytes of data, no copying is performed.
*
* @return A pair with the beginning pointer and the number of available bytes, which
* may be smaller than requested.
*/
[[nodiscard]] auto Consume(std::size_t n_bytes) noexcept(true) {
auto res_size = resource_->Size();
auto data = reinterpret_cast<std::byte*>(resource_->Data());
auto ptr = data + curr_ptr_;
// Move the cursor
auto aligned_n_bytes = DivRoundUp(n_bytes, IOAlignment()) * IOAlignment();
auto aligned_forward = std::min(res_size - curr_ptr_, aligned_n_bytes);
std::size_t forward = std::min(res_size - curr_ptr_, n_bytes);
curr_ptr_ += aligned_forward;
return std::pair{ptr, forward};
}
template <typename T>
[[nodiscard]] auto Consume(T* out) noexcept(false) -> std::enable_if_t<std::is_pod_v<T>, bool> {
auto [ptr, size] = this->Consume(sizeof(T));
if (size != sizeof(T)) {
return false;
}
CHECK_EQ(reinterpret_cast<std::uintptr_t>(ptr) % std::alignment_of_v<T>, 0);
*out = *reinterpret_cast<T*>(ptr);
return true;
}
[[nodiscard]] virtual std::size_t Tell() noexcept(true) { return curr_ptr_; }
/**
* @brief Read n_bytes of data, output is copied into ptr.
*/
[[nodiscard]] std::size_t Read(void* ptr, std::size_t n_bytes) noexcept(true) {
auto [res_ptr, forward] = this->Consume(n_bytes);
if (forward != 0) {
std::memcpy(ptr, res_ptr, forward);
}
return forward;
}
/**
* @brief Read a primitive type.
*
* @return Whether the read is successful.
*/
template <typename T>
[[nodiscard]] auto Read(T* out) noexcept(false) -> std::enable_if_t<std::is_pod_v<T>, bool> {
return this->Consume(out);
}
/**
* @brief Read a vector.
*
* @return Whether the read is successful.
*/
template <typename T>
[[nodiscard]] bool Read(std::vector<T>* out) noexcept(true) {
std::uint64_t n{0};
if (!this->Consume(&n)) {
return false;
}
out->resize(n);
auto n_bytes = sizeof(T) * n;
if (this->Read(out->data(), n_bytes) != n_bytes) {
return false;
}
return true;
}
virtual ~AlignedResourceReadStream() noexcept(false);
};
/**
* @brief Private mmap file as a read-only stream.
*
* It can calculate alignment automatically based on system page size (or allocation
* granularity on Windows).
*
* The file is required to be aligned by IOAlignment().
*/
class PrivateMmapConstStream : public AlignedResourceReadStream {
public:
/**
* @brief Construct a private mmap stream.
*
* @param path File path.
* @param offset See the `offset` parameter of `mmap` for details.
* @param length See the `length` parameter of `mmap` for details.
*/
explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length)
: AlignedResourceReadStream{std::make_shared<MmapResource>(path, offset, length)} {}
~PrivateMmapConstStream() noexcept(false) override;
};
/**
* @brief Base class for write stream with alignment defined by IOAlignment().
*/
class AlignedWriteStream {
protected:
[[nodiscard]] virtual std::size_t DoWrite(const void* ptr,
std::size_t n_bytes) noexcept(true) = 0;
public:
virtual ~AlignedWriteStream() = default;
[[nodiscard]] std::size_t Write(const void* ptr, std::size_t n_bytes) noexcept(false) {
auto aligned_n_bytes = DivRoundUp(n_bytes, IOAlignment()) * IOAlignment();
auto w_n_bytes = this->DoWrite(ptr, n_bytes);
CHECK_EQ(w_n_bytes, n_bytes);
auto remaining = aligned_n_bytes - n_bytes;
if (remaining > 0) {
std::array<std::uint8_t, IOAlignment()> padding;
std::memset(padding.data(), '\0', padding.size());
w_n_bytes = this->DoWrite(padding.data(), remaining);
CHECK_EQ(w_n_bytes, remaining);
}
return aligned_n_bytes;
}
template <typename T>
[[nodiscard]] std::enable_if_t<std::is_pod_v<T>, std::size_t> Write(T const& v) {
return this->Write(&v, sizeof(T));
}
};
/**
* @brief Output stream backed by a file. Aligned to IOAlignment() bytes.
*/
class AlignedFileWriteStream : public AlignedWriteStream {
std::unique_ptr<dmlc::Stream> pimpl_;
protected:
[[nodiscard]] std::size_t DoWrite(const void* ptr, std::size_t n_bytes) noexcept(true) override;
public:
AlignedFileWriteStream() = default;
AlignedFileWriteStream(StringView path, StringView flags);
~AlignedFileWriteStream() override = default;
};
/**
* @brief Output stream backed by memory buffer. Aligned to IOAlignment() bytes.
*/
class AlignedMemWriteStream : public AlignedFileWriteStream {
std::unique_ptr<MemoryBufferStream> pimpl_;
protected:
[[nodiscard]] std::size_t DoWrite(const void* ptr, std::size_t n_bytes) noexcept(true) override;
public:
explicit AlignedMemWriteStream(std::string* p_buf);
~AlignedMemWriteStream() override;
[[nodiscard]] std::size_t Tell() const noexcept(true);
};
} // namespace xgboost::common
#endif // XGBOOST_COMMON_IO_H_

View File

@@ -1,23 +1,29 @@
/*!
* Copyright (c) by Contributors 2019-2022
/**
* Copyright 2019-2023, XGBoost Contributors
*/
#include "xgboost/json.h"
#include <dmlc/endian.h>
#include <array> // for array
#include <cctype> // for isdigit
#include <cmath> // for isinf, isnan
#include <cstdio> // for EOF
#include <cstdlib> // for size_t, strtof
#include <cstring> // for memcpy
#include <initializer_list> // for initializer_list
#include <iterator> // for distance
#include <limits> // for numeric_limits
#include <memory> // for allocator
#include <sstream> // for operator<<, basic_ostream, operator&, ios, stringstream
#include <system_error> // for errc
#include <cctype>
#include <cmath>
#include <cstddef>
#include <iterator>
#include <limits>
#include <sstream>
#include "./math.h"
#include "charconv.h"
#include "xgboost/base.h"
#include "xgboost/json_io.h"
#include "xgboost/logging.h"
#include "xgboost/string_view.h"
#include "./math.h" // for CheckNAN
#include "charconv.h" // for to_chars, NumericLimits, from_chars, to_chars_result
#include "common.h" // for EscapeU8
#include "xgboost/base.h" // for XGBOOST_EXPECT
#include "xgboost/intrusive_ptr.h" // for IntrusivePtr
#include "xgboost/json_io.h" // for JsonReader, UBJReader, UBJWriter, JsonWriter, ToBigEn...
#include "xgboost/logging.h" // for LOG, LOG_FATAL, LogMessageFatal, LogCheck_NE, CHECK
#include "xgboost/string_view.h" // for StringView, operator<<
namespace xgboost {
@@ -57,12 +63,12 @@ void JsonWriter::Visit(JsonObject const* obj) {
}
void JsonWriter::Visit(JsonNumber const* num) {
char number[NumericLimits<float>::kToCharsSize];
auto res = to_chars(number, number + sizeof(number), num->GetNumber());
std::array<char, NumericLimits<float>::kToCharsSize> number;
auto res = to_chars(number.data(), number.data() + number.size(), num->GetNumber());
auto end = res.ptr;
auto ori_size = stream_->size();
stream_->resize(stream_->size() + end - number);
std::memcpy(stream_->data() + ori_size, number, end - number);
stream_->resize(stream_->size() + end - number.data());
std::memcpy(stream_->data() + ori_size, number.data(), end - number.data());
}
void JsonWriter::Visit(JsonInteger const* num) {
@@ -88,43 +94,15 @@ void JsonWriter::Visit(JsonNull const* ) {
}
void JsonWriter::Visit(JsonString const* str) {
std::string buffer;
buffer += '"';
auto const& string = str->GetString();
for (size_t i = 0; i < string.length(); i++) {
const char ch = string[i];
if (ch == '\\') {
if (i < string.size() && string[i+1] == 'u') {
buffer += "\\";
} else {
buffer += "\\\\";
}
} else if (ch == '"') {
buffer += "\\\"";
} else if (ch == '\b') {
buffer += "\\b";
} else if (ch == '\f') {
buffer += "\\f";
} else if (ch == '\n') {
buffer += "\\n";
} else if (ch == '\r') {
buffer += "\\r";
} else if (ch == '\t') {
buffer += "\\t";
} else if (static_cast<uint8_t>(ch) <= 0x1f) {
// Unit separator
char buf[8];
snprintf(buf, sizeof buf, "\\u%04x", ch);
buffer += buf;
} else {
buffer += ch;
}
}
buffer += '"';
std::string buffer;
buffer += '"';
auto const& string = str->GetString();
common::EscapeU8(string, &buffer);
buffer += '"';
auto s = stream_->size();
stream_->resize(s + buffer.size());
std::memcpy(stream_->data() + s, buffer.data(), buffer.size());
auto s = stream_->size();
stream_->resize(s + buffer.size());
std::memcpy(stream_->data() + s, buffer.data(), buffer.size());
}
void JsonWriter::Visit(JsonBoolean const* boolean) {

View File

@@ -12,18 +12,17 @@
namespace xgboost {
namespace linalg {
template <typename T, int32_t D, typename Fn>
#if defined(XGBOOST_USE_HIP)
void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, hipStream_t s = nullptr)
#elif defined(XGBOOST_USE_CUDA)
#if defined(XGBOOST_USE_CUDA)
void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr)
#elif defined(XGBOOST_USE_HIP)
void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, hipStream_t s = nullptr)
#endif
{
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(t.DeviceIdx()));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(t.DeviceIdx()));
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(t.Device().ordinal));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(t.Device().ordinal));
#endif
static_assert(std::is_void<std::result_of_t<Fn(size_t, T&)>>::value,
"For function with return, use transform instead.");
if (t.Contiguous()) {

View File

@@ -134,12 +134,6 @@ inline float LogSum(Iterator begin, Iterator end) {
return mx + std::log(sum);
}
// comparator functions for sorting pairs in descending order
inline static bool CmpFirst(const std::pair<float, unsigned> &a,
const std::pair<float, unsigned> &b) {
return a.first > b.first;
}
// Redefined here to workaround a VC bug that doesn't support overloading for integer
// types.
template <typename T>

View File

@@ -10,6 +10,7 @@
#include <cstddef> // for size_t
#include <cstdint> // for int32_t
#include <iterator> // for iterator_traits
#include <numeric> // for accumulate
#include <vector>
#include "common.h" // AssertGPUSupport

View File

@@ -587,14 +587,14 @@ void SketchContainer::FixError() {
});
}
void SketchContainer::AllReduce() {
void SketchContainer::AllReduce(bool is_column_split) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#endif
auto world = collective::GetWorldSize();
if (world == 1) {
if (world == 1 || is_column_split) {
return;
}
@@ -672,7 +672,7 @@ struct InvalidCatOp {
};
} // anonymous namespace
void SketchContainer::MakeCuts(HistogramCuts* p_cuts) {
void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
timer_.Start(__func__);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
@@ -682,7 +682,7 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts) {
p_cuts->min_vals_.Resize(num_columns_);
// Sync between workers.
this->AllReduce();
this->AllReduce(is_column_split);
// Prune to final number of bins.
this->Prune(num_bins_ + 1);

View File

@@ -154,9 +154,9 @@ class SketchContainer {
Span<SketchEntry const> that);
/* \brief Merge quantiles from other GPU workers. */
void AllReduce();
void AllReduce(bool is_column_split);
/* \brief Create the final histogram cut values. */
void MakeCuts(HistogramCuts* cuts);
void MakeCuts(HistogramCuts* cuts, bool is_column_split);
Span<SketchEntry const> Data() const {
return {this->Current().data().get(), this->Current().size()};

View File

@@ -139,7 +139,7 @@ struct WeightOp {
void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
CUDAContext const* cuctx = ctx->CUDACtx();
group_ptr_.SetDevice(ctx->gpu_id);
group_ptr_.SetDevice(ctx->Device());
if (info.group_ptr_.empty()) {
group_ptr_.Resize(2, 0);
group_ptr_.HostVector()[1] = info.num_row_;
@@ -164,7 +164,7 @@ void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
max_group_size_ =
thrust::reduce(cuctx->CTP(), it, it + n_groups, 0ul, thrust::maximum<std::size_t>{});
threads_group_ptr_.SetDevice(ctx->gpu_id);
threads_group_ptr_.SetDevice(ctx->Device());
threads_group_ptr_.Resize(n_groups + 1, 0);
auto d_threads_group_ptr = threads_group_ptr_.DeviceSpan();
if (param_.HasTruncation()) {
@@ -179,7 +179,7 @@ void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
n_cuda_threads_ = info.num_row_ * param_.NumPair();
}
sorted_idx_cache_.SetDevice(ctx->gpu_id);
sorted_idx_cache_.SetDevice(ctx->Device());
sorted_idx_cache_.Resize(info.labels.Size(), 0);
auto weight = common::MakeOptionalWeights(ctx, info.weights_);
@@ -198,18 +198,18 @@ common::Span<std::size_t const> RankingCache::MakeRankOnCUDA(Context const* ctx,
void NDCGCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
CUDAContext const* cuctx = ctx->CUDACtx();
auto labels = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
auto labels = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
CheckNDCGLabels(this->Param(), labels, CheckNDCGOp{cuctx});
auto d_group_ptr = this->DataGroupPtr(ctx);
std::size_t n_groups = d_group_ptr.size() - 1;
inv_idcg_ = linalg::Zeros<double>(ctx, n_groups);
auto d_inv_idcg = inv_idcg_.View(ctx->gpu_id);
auto d_inv_idcg = inv_idcg_.View(ctx->Device());
cuda_impl::CalcQueriesInvIDCG(ctx, labels, d_group_ptr, d_inv_idcg, this->Param());
CHECK_GE(this->Param().NumPair(), 1ul);
discounts_.SetDevice(ctx->gpu_id);
discounts_.SetDevice(ctx->Device());
discounts_.Resize(MaxGroupSize());
auto d_discount = discounts_.DeviceSpan();
dh::LaunchN(MaxGroupSize(), cuctx->Stream(),
@@ -217,12 +217,12 @@ void NDCGCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
}
void PreCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
auto const d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
auto const d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
CheckPreLabels("pre", d_label, CheckMAPOp{ctx->CUDACtx()});
}
void MAPCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
auto const d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
auto const d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
CheckPreLabels("map", d_label, CheckMAPOp{ctx->CUDACtx()});
}
} // namespace xgboost::ltr

View File

@@ -12,7 +12,7 @@
#include <vector> // for vector
#include "dmlc/parameter.h" // for FieldEntry, DMLC_DECLARE_FIELD
#include "error_msg.h" // for GroupWeight, GroupSize
#include "error_msg.h" // for GroupWeight, GroupSize, InvalidCUDAOrdinal
#include "xgboost/base.h" // for XGBOOST_DEVICE, bst_group_t
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for MetaInfo
@@ -217,7 +217,7 @@ class RankingCache {
}
// Constructed as [1, n_samples] if group ptr is not supplied by the user
common::Span<bst_group_t const> DataGroupPtr(Context const* ctx) const {
group_ptr_.SetDevice(ctx->gpu_id);
group_ptr_.SetDevice(ctx->Device());
return ctx->IsCPU() ? group_ptr_.ConstHostSpan() : group_ptr_.ConstDeviceSpan();
}
@@ -228,7 +228,7 @@ class RankingCache {
// Create a rank list by model prediction
common::Span<std::size_t const> SortedIdx(Context const* ctx, common::Span<float const> predt) {
if (sorted_idx_cache_.Empty()) {
sorted_idx_cache_.SetDevice(ctx->gpu_id);
sorted_idx_cache_.SetDevice(ctx->Device());
sorted_idx_cache_.Resize(predt.size());
}
if (ctx->IsCPU()) {
@@ -240,17 +240,17 @@ class RankingCache {
// The function simply returns a uninitialized buffer as this is only used by the
// objective for creating pairs.
common::Span<std::size_t> SortedIdxY(Context const* ctx, std::size_t n_samples) {
CHECK(ctx->IsCUDA());
CHECK(ctx->IsCUDA()) << error::InvalidCUDAOrdinal();
if (y_sorted_idx_cache_.Empty()) {
y_sorted_idx_cache_.SetDevice(ctx->gpu_id);
y_sorted_idx_cache_.SetDevice(ctx->Device());
y_sorted_idx_cache_.Resize(n_samples);
}
return y_sorted_idx_cache_.DeviceSpan();
}
common::Span<float> RankedY(Context const* ctx, std::size_t n_samples) {
CHECK(ctx->IsCUDA());
CHECK(ctx->IsCUDA()) << error::InvalidCUDAOrdinal();
if (y_ranked_by_model_.Empty()) {
y_ranked_by_model_.SetDevice(ctx->gpu_id);
y_ranked_by_model_.SetDevice(ctx->Device());
y_ranked_by_model_.Resize(n_samples);
}
return y_ranked_by_model_.DeviceSpan();
@@ -266,21 +266,21 @@ class RankingCache {
linalg::VectorView<GradientPair> CUDARounding(Context const* ctx) {
if (roundings_.Size() == 0) {
roundings_.SetDevice(ctx->gpu_id);
roundings_.SetDevice(ctx->Device());
roundings_.Reshape(Groups());
}
return roundings_.View(ctx->gpu_id);
return roundings_.View(ctx->Device());
}
common::Span<double> CUDACostRounding(Context const* ctx) {
if (cost_rounding_.Size() == 0) {
cost_rounding_.SetDevice(ctx->gpu_id);
cost_rounding_.SetDevice(ctx->Device());
cost_rounding_.Resize(1);
}
return cost_rounding_.DeviceSpan();
}
template <typename Type>
common::Span<Type> MaxLambdas(Context const* ctx, std::size_t n) {
max_lambdas_.SetDevice(ctx->gpu_id);
max_lambdas_.SetDevice(ctx->Device());
std::size_t bytes = n * sizeof(Type);
if (bytes != max_lambdas_.Size()) {
max_lambdas_.Resize(bytes);
@@ -315,17 +315,17 @@ class NDCGCache : public RankingCache {
}
linalg::VectorView<double const> InvIDCG(Context const* ctx) const {
return inv_idcg_.View(ctx->gpu_id);
return inv_idcg_.View(ctx->Device());
}
common::Span<double const> Discount(Context const* ctx) const {
return ctx->IsCPU() ? discounts_.ConstHostSpan() : discounts_.ConstDeviceSpan();
}
linalg::VectorView<double> Dcg(Context const* ctx) {
if (dcg_.Size() == 0) {
dcg_.SetDevice(ctx->gpu_id);
dcg_.SetDevice(ctx->Device());
dcg_.Reshape(this->Groups());
}
return dcg_.View(ctx->gpu_id);
return dcg_.View(ctx->Device());
}
};
@@ -396,7 +396,7 @@ class PreCache : public RankingCache {
common::Span<double> Pre(Context const* ctx) {
if (pre_.Empty()) {
pre_.SetDevice(ctx->gpu_id);
pre_.SetDevice(ctx->Device());
pre_.Resize(this->Groups());
}
return ctx->IsCPU() ? pre_.HostSpan() : pre_.DeviceSpan();
@@ -427,21 +427,21 @@ class MAPCache : public RankingCache {
common::Span<double> NumRelevant(Context const* ctx) {
if (n_rel_.Empty()) {
n_rel_.SetDevice(ctx->gpu_id);
n_rel_.SetDevice(ctx->Device());
n_rel_.Resize(n_samples_);
}
return ctx->IsCPU() ? n_rel_.HostSpan() : n_rel_.DeviceSpan();
}
common::Span<double> Acc(Context const* ctx) {
if (acc_.Empty()) {
acc_.SetDevice(ctx->gpu_id);
acc_.SetDevice(ctx->Device());
acc_.Resize(n_samples_);
}
return ctx->IsCPU() ? acc_.HostSpan() : acc_.DeviceSpan();
}
common::Span<double> Map(Context const* ctx) {
if (map_.Empty()) {
map_.SetDevice(ctx->gpu_id);
map_.SetDevice(ctx->Device());
map_.Resize(this->Groups());
}
return ctx->IsCPU() ? map_.HostSpan() : map_.DeviceSpan();

View File

@@ -0,0 +1,193 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#ifndef XGBOOST_COMMON_REF_RESOURCE_VIEW_H_
#define XGBOOST_COMMON_REF_RESOURCE_VIEW_H_
#include <algorithm> // for fill_n
#include <cstdint> // for uint64_t
#include <cstring> // for memcpy
#include <memory> // for shared_ptr, make_shared
#include <type_traits> // for is_reference_v, remove_reference_t, is_same_v
#include <utility> // for swap, move
#include "io.h" // for ResourceHandler, AlignedResourceReadStream, MallocResource
#include "xgboost/logging.h"
#include "xgboost/span.h" // for Span
namespace xgboost::common {
/**
* @brief A vector-like type that holds a reference counted resource.
*
* The vector size is immutable after construction. This way we can swap the underlying
* resource when needed.
*/
template <typename T>
class RefResourceView {
static_assert(!std::is_reference_v<T>);
public:
using value_type = T; // NOLINT
using size_type = std::uint64_t; // NOLINT
private:
value_type* ptr_{nullptr};
size_type size_{0};
std::shared_ptr<common::ResourceHandler> mem_{nullptr};
protected:
void Init(value_type* ptr, size_type size, std::shared_ptr<common::ResourceHandler> mem) {
ptr_ = ptr;
size_ = size;
mem_ = std::move(mem);
}
public:
RefResourceView(value_type* ptr, size_type n, std::shared_ptr<common::ResourceHandler> mem)
: ptr_{ptr}, size_{n}, mem_{std::move(mem)} {
CHECK_GE(mem_->Size(), n);
}
/**
* @brief Construct a view on ptr with length n. The ptr is held by the mem resource.
*
* @param ptr The pointer to view.
* @param n The length of the view.
* @param mem The owner of the pointer.
* @param init Initialize the view with this value.
*/
RefResourceView(value_type* ptr, size_type n, std::shared_ptr<common::ResourceHandler> mem,
T const& init)
: RefResourceView{ptr, n, mem} {
if (n != 0) {
std::fill_n(ptr_, n, init);
}
}
~RefResourceView() = default;
RefResourceView() = default;
RefResourceView(RefResourceView const& that) = delete;
RefResourceView& operator=(RefResourceView const& that) = delete;
/**
* @brief We allow move assignment for lazy initialization.
*/
RefResourceView(RefResourceView&& that) = default;
RefResourceView& operator=(RefResourceView&& that) = default;
[[nodiscard]] size_type size() const { return size_; } // NOLINT
[[nodiscard]] size_type size_bytes() const { // NOLINT
return Span{data(), size()}.size_bytes();
}
[[nodiscard]] value_type* data() { return ptr_; }; // NOLINT
[[nodiscard]] value_type const* data() const { return ptr_; }; // NOLINT
[[nodiscard]] bool empty() const { return size() == 0; } // NOLINT
[[nodiscard]] auto cbegin() const { return data(); } // NOLINT
[[nodiscard]] auto begin() { return data(); } // NOLINT
[[nodiscard]] auto begin() const { return cbegin(); } // NOLINT
[[nodiscard]] auto cend() const { return data() + size(); } // NOLINT
[[nodiscard]] auto end() { return data() + size(); } // NOLINT
[[nodiscard]] auto end() const { return cend(); } // NOLINT
[[nodiscard]] auto const& front() const { return data()[0]; } // NOLINT
[[nodiscard]] auto& front() { return data()[0]; } // NOLINT
[[nodiscard]] auto const& back() const { return data()[size() - 1]; } // NOLINT
[[nodiscard]] auto& back() { return data()[size() - 1]; } // NOLINT
[[nodiscard]] value_type& operator[](size_type i) { return ptr_[i]; }
[[nodiscard]] value_type const& operator[](size_type i) const { return ptr_[i]; }
/**
* @brief Get the underlying resource.
*/
auto Resource() const { return mem_; }
};
/**
* @brief Read a vector from stream. Accepts both `std::vector` and `RefResourceView`.
*
* If the output vector is a referenced counted view, no copying occur.
*/
template <typename Vec>
[[nodiscard]] bool ReadVec(common::AlignedResourceReadStream* fi, Vec* vec) {
std::uint64_t n{0};
if (!fi->Read(&n)) {
return false;
}
if (n == 0) {
return true;
}
using T = typename Vec::value_type;
auto expected_bytes = sizeof(T) * n;
auto [ptr, n_bytes] = fi->Consume(expected_bytes);
if (n_bytes != expected_bytes) {
return false;
}
if constexpr (std::is_same_v<Vec, RefResourceView<T>>) {
*vec = RefResourceView<T>{reinterpret_cast<T*>(ptr), n, fi->Share()};
} else {
vec->resize(n);
std::memcpy(vec->data(), ptr, n_bytes);
}
return true;
}
/**
* @brief Write a vector to stream. Accepts both `std::vector` and `RefResourceView`.
*/
template <typename Vec>
[[nodiscard]] std::size_t WriteVec(AlignedFileWriteStream* fo, Vec const& vec) {
std::size_t bytes{0};
auto n = static_cast<std::uint64_t>(vec.size());
bytes += fo->Write(n);
if (n == 0) {
return sizeof(n);
}
using T = typename std::remove_reference_t<decltype(vec)>::value_type;
bytes += fo->Write(vec.data(), vec.size() * sizeof(T));
return bytes;
}
/**
* @brief Make a fixed size `RefResourceView` with malloc resource.
*/
template <typename T>
[[nodiscard]] RefResourceView<T> MakeFixedVecWithMalloc(std::size_t n_elements, T const& init) {
auto resource = std::make_shared<common::MallocResource>(n_elements * sizeof(T));
return RefResourceView{resource->DataAs<T>(), n_elements, resource, init};
}
template <typename T>
class ReallocVector : public RefResourceView<T> {
static_assert(!std::is_reference_v<T>);
static_assert(!std::is_const_v<T>);
static_assert(std::is_trivially_copyable_v<T>);
using Upper = RefResourceView<T>;
using size_type = typename Upper::size_type; // NOLINT
using value_type = typename Upper::value_type; // NOLINT
public:
ReallocVector() : RefResourceView<T>{MakeFixedVecWithMalloc(0, T{})} {}
ReallocVector(size_type n, value_type const& init)
: RefResourceView<T>{MakeFixedVecWithMalloc(n, init)} {}
ReallocVector(ReallocVector const& that) = delete;
ReallocVector(ReallocVector&& that) = delete;
ReallocVector& operator=(ReallocVector const& that) = delete;
ReallocVector& operator=(ReallocVector&& that) = delete;
void Resize(typename Upper::size_type new_size) {
auto resource = std::dynamic_pointer_cast<common::MallocResource>(this->Resource());
CHECK(resource);
resource->Resize(new_size * sizeof(T));
this->Init(resource->template DataAs<T>(), new_size, resource);
}
};
} // namespace xgboost::common
#endif // XGBOOST_COMMON_REF_RESOURCE_VIEW_H_

View File

@@ -20,9 +20,9 @@ namespace common {
void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
HostDeviceVector<float> const& weights, linalg::Tensor<float, 1>* out) {
if (!ctx->IsCPU()) {
weights.SetDevice(ctx->gpu_id);
weights.SetDevice(ctx->Device());
auto opt_weights = OptionalWeights(weights.ConstDeviceSpan());
auto t_v = t.View(ctx->gpu_id);
auto t_v = t.View(ctx->Device());
cuda_impl::Median(ctx, t_v, opt_weights, out);
}
@@ -59,7 +59,7 @@ void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<flo
auto ret = std::accumulate(tloc.cbegin(), tloc.cend(), .0f);
out->HostView()(0) = ret;
} else {
cuda_impl::Mean(ctx, v.View(ctx->gpu_id), out->View(ctx->gpu_id));
cuda_impl::Mean(ctx, v.View(ctx->Device()), out->View(ctx->Device()));
}
}
} // namespace common

View File

@@ -7,13 +7,14 @@
#include <dmlc/common.h>
#include <dmlc/omp.h>
#include <algorithm>
#include <cstdint> // for int32_t
#include <cstdlib> // for malloc, free
#include <limits>
#include <algorithm> // for min
#include <cstddef> // for size_t
#include <cstdint> // for int32_t
#include <cstdlib> // for malloc, free
#include <functional> // for function
#include <new> // for bad_alloc
#include <type_traits> // for is_signed
#include <vector>
#include <type_traits> // for is_signed, conditional_t, is_integral_v, invoke_result_t
#include <vector> // for vector
#include "xgboost/logging.h"
@@ -25,14 +26,14 @@ inline int32_t omp_get_thread_limit() __GOMP_NOTHROW { return 1; } // NOLINT
// MSVC doesn't implement the thread limit.
#if defined(_OPENMP) && defined(_MSC_VER)
#include <limits>
extern "C" {
inline int32_t omp_get_thread_limit() { return std::numeric_limits<int32_t>::max(); } // NOLINT
}
#endif // defined(_MSC_VER)
namespace xgboost {
namespace common {
namespace xgboost::common {
// Represent simple range of indexes [begin, end)
// Inspired by tbb::blocked_range
class Range1d {
@@ -69,7 +70,7 @@ class Range1d {
// [1,2], [3,4], [5,6], [7,8], [9]
// The class helps to process data in several tree nodes (non-balanced usually) in parallel
// Using nested parallelism (by nodes and by data in each node)
// it helps to improve CPU resources utilization
// it helps to improve CPU resources utilization
class BlockedSpace2d {
public:
// Example of space:
@@ -86,63 +87,72 @@ class BlockedSpace2d {
// dim1 - size of the first dimension in the space
// getter_size_dim2 - functor to get the second dimensions for each 'row' by row-index
// grain_size - max size of produced blocks
template<typename Func>
BlockedSpace2d(size_t dim1, Func getter_size_dim2, size_t grain_size) {
for (size_t i = 0; i < dim1; ++i) {
const size_t size = getter_size_dim2(i);
const size_t n_blocks = size/grain_size + !!(size % grain_size);
for (size_t iblock = 0; iblock < n_blocks; ++iblock) {
const size_t begin = iblock * grain_size;
const size_t end = std::min(begin + grain_size, size);
template <typename Getter>
BlockedSpace2d(std::size_t dim1, Getter&& getter_size_dim2, std::size_t grain_size) {
static_assert(std::is_integral_v<std::invoke_result_t<Getter, std::size_t>>);
for (std::size_t i = 0; i < dim1; ++i) {
std::size_t size = getter_size_dim2(i);
// Each row (second dim) is divided into n_blocks
std::size_t n_blocks = size / grain_size + !!(size % grain_size);
for (std::size_t iblock = 0; iblock < n_blocks; ++iblock) {
std::size_t begin = iblock * grain_size;
std::size_t end = std::min(begin + grain_size, size);
AddBlock(i, begin, end);
}
}
}
// Amount of blocks(tasks) in a space
size_t Size() const {
[[nodiscard]] std::size_t Size() const {
return ranges_.size();
}
// get index of the first dimension of i-th block(task)
size_t GetFirstDimension(size_t i) const {
[[nodiscard]] std::size_t GetFirstDimension(std::size_t i) const {
CHECK_LT(i, first_dimension_.size());
return first_dimension_[i];
}
// get a range of indexes for the second dimension of i-th block(task)
Range1d GetRange(size_t i) const {
[[nodiscard]] Range1d GetRange(std::size_t i) const {
CHECK_LT(i, ranges_.size());
return ranges_[i];
}
private:
void AddBlock(size_t first_dimension, size_t begin, size_t end) {
first_dimension_.push_back(first_dimension);
/**
* @brief Add a parallel block.
*
* @param first_dim The row index.
* @param begin The begin of the second dimension.
* @param end The end of the second dimension.
*/
void AddBlock(std::size_t first_dim, std::size_t begin, std::size_t end) {
first_dimension_.push_back(first_dim);
ranges_.emplace_back(begin, end);
}
std::vector<Range1d> ranges_;
std::vector<size_t> first_dimension_;
std::vector<std::size_t> first_dimension_;
};
// Wrapper to implement nested parallelism with simple omp parallel for
template <typename Func>
void ParallelFor2d(const BlockedSpace2d& space, int nthreads, Func func) {
const size_t num_blocks_in_space = space.Size();
CHECK_GE(nthreads, 1);
void ParallelFor2d(const BlockedSpace2d& space, int n_threads, Func&& func) {
static_assert(std::is_void_v<std::invoke_result_t<Func, std::size_t, Range1d>>);
std::size_t n_blocks_in_space = space.Size();
CHECK_GE(n_threads, 1);
dmlc::OMPException exc;
#pragma omp parallel num_threads(nthreads)
#pragma omp parallel num_threads(n_threads)
{
exc.Run([&]() {
size_t tid = omp_get_thread_num();
size_t chunck_size =
num_blocks_in_space / nthreads + !!(num_blocks_in_space % nthreads);
std::size_t tid = omp_get_thread_num();
std::size_t chunck_size = n_blocks_in_space / n_threads + !!(n_blocks_in_space % n_threads);
size_t begin = chunck_size * tid;
size_t end = std::min(begin + chunck_size, num_blocks_in_space);
std::size_t begin = chunck_size * tid;
std::size_t end = std::min(begin + chunck_size, n_blocks_in_space);
for (auto i = begin; i < end; i++) {
func(space.GetFirstDimension(i), space.GetRange(i));
}
@@ -303,7 +313,6 @@ class MemStackAllocator {
* \brief Constant that can be used for initializing static thread local memory.
*/
std::int32_t constexpr DefaultMaxThreads() { return 128; }
} // namespace common
} // namespace xgboost
} // namespace xgboost::common
#endif // XGBOOST_COMMON_THREADING_UTILS_H_

View File

@@ -3,53 +3,201 @@
*
* \brief Context object used for controlling runtime parameters.
*/
#include <xgboost/context.h>
#include "xgboost/context.h"
#include "common/common.h" // AssertGPUSupport
#include <algorithm> // for find_if
#include <charconv> // for from_chars
#include <iterator> // for distance
#include <optional> // for optional
#include <regex> // for regex_replace, regex_match
#include "common/common.h" // AssertGPUSupport
#include "common/error_msg.h" // WarnDeprecatedGPUId
#include "common/threading_utils.h"
#include "xgboost/string_view.h"
namespace xgboost {
DMLC_REGISTER_PARAMETER(Context);
std::int32_t constexpr Context::kCpuId;
bst_d_ordinal_t constexpr Context::kCpuId;
std::int64_t constexpr Context::kDefaultSeed;
Context::Context() : cfs_cpu_count_{common::GetCfsCPUCount()} {}
void Context::ConfigureGpuId(bool require_gpu) {
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
if (gpu_id == kCpuId) { // 0. User didn't specify the `gpu_id'
if (require_gpu) { // 1. `tree_method' or `predictor' or both are using
// GPU.
// 2. Use device 0 as default.
this->UpdateAllowUnknown(Args{{"gpu_id", "0"}});
}
}
namespace {
inline constexpr char const* kDevice = "device";
// 3. When booster is loaded from a memory image (Python pickle or R
// raw model), number of available GPUs could be different. Wrap around it.
int32_t n_gpus = common::AllVisibleGPUs();
if (n_gpus == 0) {
if (gpu_id != kCpuId) {
LOG(WARNING) << "No visible GPU is found, setting `gpu_id` to -1";
}
this->UpdateAllowUnknown(Args{{"gpu_id", std::to_string(kCpuId)}});
} else if (fail_on_invalid_gpu_id) {
CHECK(gpu_id == kCpuId || gpu_id < n_gpus)
<< "Only " << n_gpus << " GPUs are visible, gpu_id " << gpu_id << " is invalid.";
} else if (gpu_id != kCpuId && gpu_id >= n_gpus) {
LOG(WARNING) << "Only " << n_gpus << " GPUs are visible, setting `gpu_id` to "
<< gpu_id % n_gpus;
this->UpdateAllowUnknown(Args{{"gpu_id", std::to_string(gpu_id % n_gpus)}});
}
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
device = DeviceOrd::CPU();
return device;
}
#else
// Just set it to CPU, don't think about it.
this->UpdateAllowUnknown(Args{{"gpu_id", std::to_string(kCpuId)}});
(void)(require_gpu);
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
// Check CUDA on the current device, wrap the ordinal if necessary.
[[nodiscard]] DeviceOrd CUDAOrdinal(DeviceOrd device, bool fail_on_invalid) {
// When booster is loaded from a memory image (Python pickle or R raw model), number of
// available GPUs could be different. Wrap around it.
std::int32_t n_visible = common::AllVisibleGPUs();
if (n_visible == 0) {
if (device.IsCUDA()) {
LOG(WARNING) << "No visible GPU is found, setting device to CPU.";
}
device = DeviceOrd::CPU();
} else if (fail_on_invalid) {
CHECK(device.IsCPU() || device.ordinal < n_visible)
<< "Only " << n_visible << " GPUs are visible, ordinal " << device.ordinal
<< " is invalid.";
} else if (device.IsCUDA() && device.ordinal >= n_visible) {
device.ordinal = device.ordinal % n_visible;
LOG(WARNING) << "Only " << n_visible << " GPUs are visible, setting device ordinal to "
<< device.ordinal;
}
common::SetDevice(this->gpu_id);
if (device.IsCUDA()) {
common::SetDevice(device.ordinal);
}
return device;
}
#endif // !defined(XGBOOST_USE_CUDA)
[[nodiscard]] std::optional<std::int32_t> ParseInt(StringView ordinal) {
// Some basic checks to ensure valid `gpu_id` and device ordinal instead of directly parsing and
// letting go of unknown characters.
if (ordinal.empty()) {
return std::nullopt;
}
std::size_t offset{0};
if (ordinal[0] == '-') {
offset = 1;
}
if (ordinal.size() <= offset) {
return std::nullopt;
}
bool valid = std::all_of(ordinal.cbegin() + offset, ordinal.cend(),
[](auto c) { return std::isdigit(c); });
if (!valid) {
return std::nullopt;
}
std::int32_t parsed_id{Context::kCpuId};
auto res = std::from_chars(ordinal.c_str(), ordinal.c_str() + ordinal.size(), parsed_id);
if (res.ec != std::errc()) {
return std::nullopt;
}
return parsed_id;
}
[[nodiscard]] DeviceOrd MakeDeviceOrd(std::string const& input, bool fail_on_invalid_gpu_id) {
StringView msg{R"(Invalid argument for `device`. Expected to be one of the following:
- cpu
- cuda
- cuda:<device ordinal> # e.g. cuda:0
- gpu
- gpu:<device ordinal> # e.g. gpu:0
)"};
auto fatal = [&] { LOG(FATAL) << msg << "Got: `" << input << "`."; };
#if defined(__MINGW32__)
// mingw hangs on regex using rtools 430. Basic checks only.
CHECK_GE(input.size(), 3) << msg;
auto substr = input.substr(0, 3);
bool valid = substr == "cpu" || substr == "cud" || substr == "gpu";
CHECK(valid) << msg;
#else
std::regex pattern{"gpu(:[0-9]+)?|cuda(:[0-9]+)?|cpu"};
if (!std::regex_match(input, pattern)) {
fatal();
}
#endif // defined(__MINGW32__)
// handle alias
std::string s_device = std::regex_replace(input, std::regex{"gpu"}, DeviceSym::CUDA());
auto split_it = std::find(s_device.cbegin(), s_device.cend(), ':');
DeviceOrd device;
device.ordinal = Context::InvalidOrdinal(); // mark it invalid for check.
if (split_it == s_device.cend()) {
// no ordinal.
if (s_device == DeviceSym::CPU()) {
device = DeviceOrd::CPU();
} else if (s_device == DeviceSym::CUDA()) {
device = DeviceOrd::CUDA(0); // use 0 as default;
} else {
fatal();
}
} else {
// must be CUDA when ordinal is specifed.
// +1 for colon
std::size_t offset = std::distance(s_device.cbegin(), split_it) + 1;
// substr
StringView s_ordinal = {s_device.data() + offset, s_device.size() - offset};
if (s_ordinal.empty()) {
fatal();
}
auto opt_id = ParseInt(s_ordinal);
if (!opt_id.has_value()) {
fatal();
}
CHECK_LE(opt_id.value(), std::numeric_limits<bst_d_ordinal_t>::max())
<< "Ordinal value too large.";
device = DeviceOrd::CUDA(opt_id.value());
}
if (device.ordinal < Context::kCpuId) {
fatal();
}
device = CUDAOrdinal(device, fail_on_invalid_gpu_id);
return device;
}
} // namespace
void Context::ConfigureGpuId(bool require_gpu) {
if (this->IsCPU() && require_gpu) {
this->UpdateAllowUnknown(Args{{kDevice, DeviceSym::CUDA()}});
}
}
void Context::SetDeviceOrdinal(Args const& kwargs) {
auto gpu_id_it = std::find_if(kwargs.cbegin(), kwargs.cend(),
[](auto const& p) { return p.first == "gpu_id"; });
auto has_gpu_id = gpu_id_it != kwargs.cend();
auto device_it = std::find_if(kwargs.cbegin(), kwargs.cend(),
[](auto const& p) { return p.first == kDevice; });
auto has_device = device_it != kwargs.cend();
if (has_device && has_gpu_id) {
LOG(FATAL) << "Both `device` and `gpu_id` are specified. Use `device` instead.";
}
if (has_gpu_id) {
// Compatible with XGBoost < 2.0.0
error::WarnDeprecatedGPUId();
auto opt_id = ParseInt(StringView{gpu_id_it->second});
CHECK(opt_id.has_value()) << "Invalid value for `gpu_id`. Got:" << gpu_id_it->second;
if (opt_id.value() > Context::kCpuId) {
this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CUDA(opt_id.value()).Name()}});
} else {
this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CPU().Name()}});
}
return;
}
auto new_d = MakeDeviceOrd(this->device, this->fail_on_invalid_gpu_id);
if (!has_device) {
CHECK_EQ(new_d.ordinal, this->device_.ordinal); // unchanged
}
this->SetDevice(new_d);
if (this->IsCPU()) {
CHECK_EQ(this->device_.ordinal, kCpuId);
} else {
CHECK_GT(this->device_.ordinal, kCpuId);
}
}
std::int32_t Context::Threads() const {

View File

@@ -7,7 +7,7 @@
#include <dmlc/data.h>
#include <algorithm>
#include <cstddef> // std::size_t
#include <cstddef> // for size_t
#include <functional>
#include <limits>
#include <map>
@@ -17,6 +17,7 @@
#include <vector>
#include "../c_api/c_api_error.h"
#include "../common/error_msg.h" // for MaxFeatureSize
#include "../common/math.h"
#include "array_interface.h"
#include "arrow-cdi.h"
@@ -300,9 +301,9 @@ class ArrayAdapter : public detail::SingleBatchDataIter<ArrayAdapterBatch> {
array_interface_ = ArrayInterface<2>(get<Object const>(j));
batch_ = ArrayAdapterBatch{array_interface_};
}
ArrayAdapterBatch const& Value() const override { return batch_; }
size_t NumRows() const { return array_interface_.Shape(0); }
size_t NumColumns() const { return array_interface_.Shape(1); }
[[nodiscard]] ArrayAdapterBatch const& Value() const override { return batch_; }
[[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape(0); }
[[nodiscard]] std::size_t NumColumns() const { return array_interface_.Shape(1); }
private:
ArrayAdapterBatch batch_;
@@ -476,7 +477,6 @@ class CSCArrayAdapterBatch : public detail::NoMetaInfo {
ArrayInterface<1> indptr_;
ArrayInterface<1> indices_;
ArrayInterface<1> values_;
bst_row_t n_rows_;
class Line {
std::size_t column_idx_;
@@ -502,11 +502,8 @@ class CSCArrayAdapterBatch : public detail::NoMetaInfo {
static constexpr bool kIsRowMajor = false;
CSCArrayAdapterBatch(ArrayInterface<1> indptr, ArrayInterface<1> indices,
ArrayInterface<1> values, bst_row_t n_rows)
: indptr_{std::move(indptr)},
indices_{std::move(indices)},
values_{std::move(values)},
n_rows_{n_rows} {}
ArrayInterface<1> values)
: indptr_{std::move(indptr)}, indices_{std::move(indices)}, values_{std::move(values)} {}
std::size_t Size() const { return indptr_.n - 1; }
Line GetLine(std::size_t idx) const {
@@ -541,8 +538,7 @@ class CSCArrayAdapter : public detail::SingleBatchDataIter<CSCArrayAdapterBatch>
indices_{indices},
values_{values},
num_rows_{num_rows},
batch_{
CSCArrayAdapterBatch{indptr_, indices_, values_, static_cast<bst_row_t>(num_rows_)}} {}
batch_{CSCArrayAdapterBatch{indptr_, indices_, values_}} {}
// JVM package sends 0 as unknown
size_t NumRows() const { return num_rows_ == 0 ? kAdapterUnknownSize : num_rows_; }

View File

@@ -386,7 +386,7 @@ inline bool ArrayInterfaceHandler::IsCudaPtr(void const *) { return false; }
* numpy has the proper support even though it's in the __cuda_array_interface__
* protocol defined by numba.
*/
template <int32_t D, bool allow_mask = (D == 1)>
template <std::int32_t D, bool allow_mask = (D == 1)>
class ArrayInterface {
static_assert(D > 0, "Invalid dimension for array interface.");
@@ -457,7 +457,7 @@ class ArrayInterface {
explicit ArrayInterface(std::string const &str) : ArrayInterface{StringView{str}} {}
explicit ArrayInterface(StringView str) : ArrayInterface<D>{Json::Load(str)} {}
explicit ArrayInterface(StringView str) : ArrayInterface{Json::Load(str)} {}
void AssignType(StringView typestr) {
using T = ArrayInterfaceHandler::Type;
@@ -590,9 +590,9 @@ class ArrayInterface {
};
template <std::int32_t D, typename Fn>
void DispatchDType(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
void DispatchDType(ArrayInterface<D> const array, DeviceOrd device, Fn fn) {
// Only used for cuDF at the moment.
CHECK_EQ(array.valid.Size(), 0);
CHECK_EQ(array.valid.Capacity(), 0);
auto dispatch = [&](auto t) {
using T = std::remove_const_t<decltype(t)> const;
// Set the data size to max as we don't know the original size of a sliced array:

View File

@@ -4,42 +4,57 @@
*/
#include "xgboost/data.h"
#include <dmlc/registry.h>
#include <dmlc/registry.h> // for DMLC_REGISTRY_ENABLE, DMLC_REGISTRY_LINK_TAG
#include <array>
#include <cstddef>
#include <cstring>
#include <algorithm> // for copy, max, none_of, min
#include <atomic> // for atomic
#include <cmath> // for abs
#include <cstdint> // for uint64_t, int32_t, uint8_t, uint32_t
#include <cstring> // for size_t, strcmp, memcpy
#include <exception> // for exception
#include <iostream> // for operator<<, basic_ostream, basic_ostream::op...
#include <map> // for map, operator!=
#include <numeric> // for accumulate, partial_sum
#include <tuple> // for get, apply
#include <type_traits> // for remove_pointer_t, remove_reference
#include "../collective/communicator-inl.h"
#include "../collective/communicator.h"
#include "../common/algorithm.h" // for StableSort
#include "../common/api_entry.h" // for XGBAPIThreadLocalEntry
#include "../common/common.h"
#include "../common/error_msg.h" // for InfInData, GroupWeight, GroupSize
#include "../common/group_data.h"
#include "../common/io.h"
#include "../common/linalg_op.h"
#include "../common/math.h"
#include "../common/numeric.h" // for Iota
#include "../common/threading_utils.h"
#include "../common/version.h"
#include "../data/adapter.h"
#include "../data/iterative_dmatrix.h"
#include "./sparse_page_dmatrix.h"
#include "./sparse_page_source.h"
#include "dmlc/io.h"
#include "file_iterator.h"
#include "simple_dmatrix.h"
#include "sparse_page_writer.h"
#include "validation.h"
#include "xgboost/c_api.h"
#include "xgboost/context.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/learner.h"
#include "xgboost/linalg.h" // Vector
#include "xgboost/logging.h"
#include "xgboost/string_view.h"
#include "xgboost/version_config.h"
#include "../collective/communicator-inl.h" // for GetRank, GetWorldSize, Allreduce, IsFederated
#include "../collective/communicator.h" // for Operation
#include "../common/algorithm.h" // for StableSort
#include "../common/api_entry.h" // for XGBAPIThreadLocalEntry
#include "../common/common.h" // for Split
#include "../common/error_msg.h" // for GroupSize, GroupWeight, InfInData
#include "../common/group_data.h" // for ParallelGroupBuilder
#include "../common/io.h" // for PeekableInStream
#include "../common/linalg_op.h" // for ElementWiseTransformHost
#include "../common/math.h" // for CheckNAN
#include "../common/numeric.h" // for Iota, RunLengthEncode
#include "../common/threading_utils.h" // for ParallelFor
#include "../common/version.h" // for Version
#include "../data/adapter.h" // for COOTuple, FileAdapter, IsValidFunctor
#include "../data/iterative_dmatrix.h" // for IterativeDMatrix
#include "./sparse_page_dmatrix.h" // for SparsePageDMatrix
#include "array_interface.h" // for ArrayInterfaceHandler, ArrayInterface, Dispa...
#include "dmlc/base.h" // for BeginPtr
#include "dmlc/common.h" // for OMPException
#include "dmlc/data.h" // for Parser
#include "dmlc/endian.h" // for ByteSwap, DMLC_IO_NO_ENDIAN_SWAP
#include "dmlc/io.h" // for Stream
#include "dmlc/thread_local.h" // for ThreadLocalStore
#include "ellpack_page.h" // for EllpackPage
#include "file_iterator.h" // for ValidateFileFormat, FileIterator, Next, Reset
#include "gradient_index.h" // for GHistIndexMatrix
#include "simple_dmatrix.h" // for SimpleDMatrix
#include "sparse_page_writer.h" // for SparsePageFormatReg
#include "validation.h" // for LabelsCheck, WeightsCheck, ValidateQueryGroup
#include "xgboost/base.h" // for bst_group_t, bst_row_t, bst_float, bst_ulong
#include "xgboost/context.h" // for Context
#include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/learner.h" // for HostDeviceVector
#include "xgboost/linalg.h" // for Tensor, Stack, TensorView, Vector, ArrayInte...
#include "xgboost/logging.h" // for Error, LogCheck_EQ, CHECK, CHECK_EQ, LOG
#include "xgboost/span.h" // for Span, operator!=, SpanIterator
#include "xgboost/string_view.h" // for operator==, operator<<, StringView
namespace dmlc {
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::SparsePage>);
@@ -351,7 +366,7 @@ MetaInfo MetaInfo::Slice(common::Span<int32_t const> ridxs) const {
// Groups is maintained by a higher level Python function. We should aim at deprecating
// the slice function.
if (this->labels.Size() != this->num_row_) {
auto t_labels = this->labels.View(this->labels.Data()->DeviceIdx());
auto t_labels = this->labels.View(this->labels.Data()->Device());
out.labels.Reshape(ridxs.size(), labels.Shape(1));
out.labels.Data()->HostVector() =
Gather(this->labels.Data()->HostVector(), ridxs, t_labels.Stride(0));
@@ -379,7 +394,7 @@ MetaInfo MetaInfo::Slice(common::Span<int32_t const> ridxs) const {
if (this->base_margin_.Size() != this->num_row_) {
CHECK_EQ(this->base_margin_.Size() % this->num_row_, 0)
<< "Incorrect size of base margin vector.";
auto t_margin = this->base_margin_.View(this->base_margin_.Data()->DeviceIdx());
auto t_margin = this->base_margin_.View(this->base_margin_.Data()->Device());
out.base_margin_.Reshape(ridxs.size(), t_margin.Shape(1));
out.base_margin_.Data()->HostVector() =
Gather(this->base_margin_.Data()->HostVector(), ridxs, t_margin.Stride(0));
@@ -416,7 +431,8 @@ void CopyTensorInfoImpl(Context const& ctx, Json arr_interface, linalg::Tensor<T
p_out->Reshape(array.shape);
return;
}
CHECK(array.valid.Size() == 0) << "Meta info like label or weight can not have missing value.";
CHECK_EQ(array.valid.Capacity(), 0)
<< "Meta info like label or weight can not have missing value.";
if (array.is_contiguous && array.type == ToDType<T>::kType) {
// Handle contigious
p_out->ModifyInplace([&](HostDeviceVector<T>* data, common::Span<size_t, D> shape) {
@@ -429,10 +445,10 @@ void CopyTensorInfoImpl(Context const& ctx, Json arr_interface, linalg::Tensor<T
return;
}
p_out->Reshape(array.shape);
auto t_out = p_out->View(Context::kCpuId);
auto t_out = p_out->View(DeviceOrd::CPU());
CHECK(t_out.CContiguous());
auto const shape = t_out.Shape();
DispatchDType(array, Context::kCpuId, [&](auto&& in) {
DispatchDType(array, DeviceOrd::CPU(), [&](auto&& in) {
linalg::ElementWiseTransformHost(t_out, ctx.Threads(), [&](auto i, auto) {
return std::apply(in, linalg::UnravelIndex<D>(i, shape));
});
@@ -548,7 +564,7 @@ void MetaInfo::SetInfo(Context const& ctx, const char* key, const void* dptr, Da
CHECK(key);
auto proc = [&](auto cast_d_ptr) {
using T = std::remove_pointer_t<decltype(cast_d_ptr)>;
auto t = linalg::TensorView<T, 1>(common::Span<T>{cast_d_ptr, num}, {num}, Context::kCpuId);
auto t = linalg::TensorView<T, 1>(common::Span<T>{cast_d_ptr, num}, {num}, DeviceOrd::CPU());
CHECK(t.CContiguous());
Json interface {
linalg::ArrayInterface(t)
@@ -723,11 +739,14 @@ void MetaInfo::SynchronizeNumberOfColumns() {
namespace {
template <typename T>
void CheckDevice(std::int32_t device, HostDeviceVector<T> const& v) {
CHECK(v.DeviceIdx() == Context::kCpuId || device == Context::kCpuId || v.DeviceIdx() == device)
<< "Data is resided on a different device than `gpu_id`. "
<< "Device that data is on: " << v.DeviceIdx() << ", "
<< "`gpu_id` for XGBoost: " << device;
bool valid = v.Device().IsCPU() || device == Context::kCpuId || v.DeviceIdx() == device;
if (!valid) {
LOG(FATAL) << "Invalid device ordinal. Data is associated with a different device ordinal than "
"the booster. The device ordinal of the data is: "
<< v.DeviceIdx() << "; the device ordinal of the Booster is: " << device;
}
}
template <typename T, std::int32_t D>
void CheckDevice(std::int32_t device, linalg::Tensor<T, D> const& v) {
CheckDevice(device, *v.Data());
@@ -806,10 +825,10 @@ DMatrix::~DMatrix() {
}
}
DMatrix *TryLoadBinary(std::string fname, bool silent) {
int magic;
std::unique_ptr<dmlc::Stream> fi(
dmlc::Stream::Create(fname.c_str(), "r", true));
namespace {
DMatrix* TryLoadBinary(std::string fname, bool silent) {
std::int32_t magic;
std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r", true));
if (fi != nullptr) {
common::PeekableInStream is(fi.get());
if (is.PeekRead(&magic, sizeof(magic)) == sizeof(magic)) {
@@ -817,11 +836,10 @@ DMatrix *TryLoadBinary(std::string fname, bool silent) {
dmlc::ByteSwap(&magic, sizeof(magic), 1);
}
if (magic == data::SimpleDMatrix::kMagic) {
DMatrix *dmat = new data::SimpleDMatrix(&is);
DMatrix* dmat = new data::SimpleDMatrix(&is);
if (!silent) {
LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_
<< " matrix with " << dmat->Info().num_nonzero_
<< " entries loaded from " << fname;
LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with "
<< dmat->Info().num_nonzero_ << " entries loaded from " << fname;
}
return dmat;
}
@@ -829,6 +847,7 @@ DMatrix *TryLoadBinary(std::string fname, bool silent) {
}
return nullptr;
}
} // namespace
DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_split_mode) {
auto need_split = false;
@@ -840,7 +859,7 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
}
std::string fname, cache_file;
size_t dlm_pos = uri.find('#');
auto dlm_pos = uri.find('#');
if (dlm_pos != std::string::npos) {
cache_file = uri.substr(dlm_pos + 1, uri.length());
fname = uri.substr(0, dlm_pos);
@@ -852,14 +871,11 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
for (size_t i = 0; i < cache_shards.size(); ++i) {
size_t pos = cache_shards[i].rfind('.');
if (pos == std::string::npos) {
os << cache_shards[i]
<< ".r" << collective::GetRank()
<< "-" << collective::GetWorldSize();
os << cache_shards[i] << ".r" << collective::GetRank() << "-"
<< collective::GetWorldSize();
} else {
os << cache_shards[i].substr(0, pos)
<< ".r" << collective::GetRank()
<< "-" << collective::GetWorldSize()
<< cache_shards[i].substr(pos, cache_shards[i].length());
os << cache_shards[i].substr(0, pos) << ".r" << collective::GetRank() << "-"
<< collective::GetWorldSize() << cache_shards[i].substr(pos, cache_shards[i].length());
}
if (i + 1 != cache_shards.size()) {
os << ':';
@@ -890,12 +906,12 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
LOG(CONSOLE) << "Load part of data " << partid << " of " << npart << " parts";
}
data::ValidateFileFormat(fname);
DMatrix* dmat {nullptr};
DMatrix* dmat{nullptr};
if (cache_file.empty()) {
std::unique_ptr<dmlc::Parser<uint32_t>> parser(
dmlc::Parser<uint32_t>::Create(fname.c_str(), partid, npart, "auto"));
fname = data::ValidateFileFormat(fname);
std::unique_ptr<dmlc::Parser<std::uint32_t>> parser(
dmlc::Parser<std::uint32_t>::Create(fname.c_str(), partid, npart, "auto"));
data::FileAdapter adapter(parser.get());
dmat = DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), Context{}.Threads(),
cache_file, data_split_mode);

View File

@@ -45,7 +45,8 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
p_out->Reshape(array.shape);
return;
}
CHECK(array.valid.Size() == 0) << "Meta info like label or weight can not have missing value.";
CHECK_EQ(array.valid.Capacity(), 0)
<< "Meta info like label or weight can not have missing value.";
auto ptr_device = SetDeviceToPtr(array.data);
p_out->SetDevice(ptr_device);
@@ -67,7 +68,7 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
return;
}
p_out->Reshape(array.shape);
auto t = p_out->View(ptr_device);
auto t = p_out->View(DeviceOrd::CUDA(ptr_device));
linalg::ElementWiseTransformDevice(
t,
[=] __device__(size_t i, T) {

View File

@@ -3,12 +3,20 @@
*/
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
#include "ellpack_page.h"
#include <xgboost/data.h>
// dummy implementation of EllpackPage in case CUDA is not used
namespace xgboost {
class EllpackPageImpl {};
// Dummy CPU-only implementation: holds only the histogram cuts so that
// EllpackPage::Cuts() can return a valid reference when CUDA is disabled.
class EllpackPageImpl {
  common::HistogramCuts cuts_;

 public:
  // Accessors mirror the CUDA implementation's interface.
  [[nodiscard]] common::HistogramCuts& Cuts() { return cuts_; }
  [[nodiscard]] common::HistogramCuts const& Cuts() const { return cuts_; }
};
EllpackPage::EllpackPage() = default;
@@ -32,5 +40,16 @@ size_t EllpackPage::Size() const {
return 0;
}
// CPU-only stub: EllpackPage requires CUDA; calling this aborts the process.
[[nodiscard]] common::HistogramCuts& EllpackPage::Cuts() {
  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
                "EllpackPage is required";
  // Unreachable: LOG(FATAL) terminates; the return only satisfies the compiler.
  return impl_->Cuts();
}
// CPU-only stub (const overload): EllpackPage requires CUDA; calling this aborts.
[[nodiscard]] common::HistogramCuts const& EllpackPage::Cuts() const {
  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
                "EllpackPage is required";
  // Unreachable: LOG(FATAL) terminates; the return only satisfies the compiler.
  return impl_->Cuts();
}
} // namespace xgboost
#endif // XGBOOST_USE_CUDA || XGBOOST_USE_HIP

View File

@@ -4,12 +4,17 @@
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_output_iterator.h>
#include <algorithm> // for copy
#include <utility> // for move
#include <vector> // for vector
#include "../common/categorical.h"
#include "../common/cuda_context.cuh"
#include "../common/hist_util.cuh"
#include "../common/random.h"
#include "../common/transform_iterator.h" // MakeIndexTransformIter
#include "./ellpack_page.cuh"
#include "device_adapter.cuh" // for HasInfInData
#include "ellpack_page.h"
#include "gradient_index.h"
#include "xgboost/data.h"
@@ -32,6 +37,16 @@ size_t EllpackPage::Size() const { return impl_->Size(); }
void EllpackPage::SetBaseRowId(std::size_t row_id) { impl_->SetBaseRowId(row_id); }
[[nodiscard]] common::HistogramCuts& EllpackPage::Cuts() {
CHECK(impl_);
return impl_->Cuts();
}
[[nodiscard]] common::HistogramCuts const& EllpackPage::Cuts() const {
CHECK(impl_);
return impl_->Cuts();
}
// Bin each input data entry, store the bin indices in compressed form.
__global__ void CompressBinEllpackKernel(
common::CompressedBufferWriter wr,
@@ -128,7 +143,11 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchP
monitor_.Start("Quantiles");
// Create the quantile sketches for the dmatrix and initialize HistogramCuts.
row_stride = GetRowStride(dmat);
cuts_ = common::DeviceSketch(ctx->gpu_id, dmat, param.max_bin);
if (!param.hess.empty()) {
cuts_ = common::DeviceSketchWithHessian(ctx, dmat, param.max_bin, param.hess);
} else {
cuts_ = common::DeviceSketch(ctx, dmat, param.max_bin);
}
monitor_.Stop("Quantiles");
monitor_.Start("InitCompressedData");
@@ -343,7 +362,8 @@ void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span<size_t const>
auto d_csc_indptr = dh::ToSpan(csc_indptr);
auto bin_type = page.index.GetBinTypeSize();
common::CompressedBufferWriter writer{page.cut.TotalBins() + 1}; // +1 for null value
common::CompressedBufferWriter writer{page.cut.TotalBins() +
static_cast<std::size_t>(1)}; // +1 for null value
dh::LaunchN(row_stride * page.Size(), [=] __device__(size_t idx) mutable {
auto ridx = idx / row_stride;
@@ -387,8 +407,15 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
// copy gidx
common::CompressedByteT* d_compressed_buffer = gidx_buffer.DevicePointer();
dh::device_vector<size_t> row_ptr(page.row_ptr);
dh::device_vector<size_t> row_ptr(page.row_ptr.size());
auto d_row_ptr = dh::ToSpan(row_ptr);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
hipMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
#endif
auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft);
auto null = accessor.NullValue();

View File

@@ -1,17 +1,18 @@
/*!
* Copyright 2019 by XGBoost Contributors
/**
* Copyright 2019-2023, XGBoost Contributors
*/
#ifndef XGBOOST_DATA_ELLPACK_PAGE_H_
#define XGBOOST_DATA_ELLPACK_PAGE_H_
#ifndef XGBOOST_DATA_ELLPACK_PAGE_CUH_
#define XGBOOST_DATA_ELLPACK_PAGE_CUH_
#include <thrust/binary_search.h>
#include <xgboost/data.h>
#include "../common/categorical.h"
#include "../common/compressed_iterator.h"
#include "../common/device_helpers.cuh"
#include "../common/hist_util.h"
#include "../common/categorical.h"
#include <thrust/binary_search.h>
#include "ellpack_page.h"
namespace xgboost {
/** \brief Struct for accessing and manipulating an ELLPACK matrix on the
@@ -194,8 +195,8 @@ class EllpackPageImpl {
base_rowid = row_id;
}
common::HistogramCuts& Cuts() { return cuts_; }
common::HistogramCuts const& Cuts() const { return cuts_; }
[[nodiscard]] common::HistogramCuts& Cuts() { return cuts_; }
[[nodiscard]] common::HistogramCuts const& Cuts() const { return cuts_; }
/*! \return Estimation of memory cost of this page. */
static size_t MemCostBytes(size_t num_rows, size_t row_stride, const common::HistogramCuts&cuts) ;
@@ -256,4 +257,4 @@ inline size_t GetRowStride(DMatrix* dmat) {
}
} // namespace xgboost
#endif // XGBOOST_DATA_ELLPACK_PAGE_H_
#endif // XGBOOST_DATA_ELLPACK_PAGE_CUH_

59
src/data/ellpack_page.h Normal file
View File

@@ -0,0 +1,59 @@
/**
* Copyright 2017-2023 by XGBoost Contributors
*/
#ifndef XGBOOST_DATA_ELLPACK_PAGE_H_
#define XGBOOST_DATA_ELLPACK_PAGE_H_
#include <memory> // for unique_ptr
#include "../common/hist_util.h" // for HistogramCuts
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for DMatrix, BatchParam
namespace xgboost {
class EllpackPageImpl;
/**
* @brief A page stored in ELLPACK format.
*
* This class uses the PImpl idiom (https://en.cppreference.com/w/cpp/language/pimpl) to avoid
* including CUDA-specific implementation details in the header.
*/
class EllpackPage {
 public:
  /**
   * @brief Default constructor.
   *
   * This is used in the external memory case. An empty ELLPACK page is constructed with its content
   * set later by the reader.
   */
  EllpackPage();
  /**
   * @brief Constructor from an existing DMatrix.
   *
   * This is used in the in-memory case. The ELLPACK page is constructed from an existing DMatrix
   * in CSR format.
   */
  explicit EllpackPage(Context const* ctx, DMatrix* dmat, const BatchParam& param);
  /*! \brief Destructor. */
  ~EllpackPage();
  /*! \brief Move constructor; ownership of the implementation is transferred. */
  EllpackPage(EllpackPage&& that);
  /*! \return Number of instances in the page. */
  [[nodiscard]] size_t Size() const;
  /*! \brief Set the base row id for this page. */
  void SetBaseRowId(std::size_t row_id);
  /*! \brief Access the underlying implementation (CUDA-specific in GPU builds). */
  [[nodiscard]] const EllpackPageImpl* Impl() const { return impl_.get(); }
  /*! \brief Mutable access to the underlying implementation. */
  EllpackPageImpl* Impl() { return impl_.get(); }
  /*! \brief Histogram cut points used to bin this page. */
  [[nodiscard]] common::HistogramCuts& Cuts();
  /*! \brief Histogram cut points used to bin this page (const overload). */
  [[nodiscard]] common::HistogramCuts const& Cuts() const;

 private:
  // PImpl pointer keeping CUDA-specific details out of this header.
  std::unique_ptr<EllpackPageImpl> impl_;
};
} // namespace xgboost
#endif // XGBOOST_DATA_ELLPACK_PAGE_H_

View File

@@ -1,60 +1,59 @@
/*!
* Copyright 2019-2021 XGBoost contributors
/**
* Copyright 2019-2023, XGBoost contributors
*/
#include <xgboost/data.h>
#include <dmlc/registry.h>
#include <cstddef> // for size_t
#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
#include "../common/ref_resource_view.h" // for ReadVec, WriteVec
#include "ellpack_page.cuh"
#include "sparse_page_writer.h"
#include "histogram_cut_format.h"
namespace xgboost {
namespace data {
#include "histogram_cut_format.h" // for ReadHistogramCuts, WriteHistogramCuts
#include "sparse_page_writer.h" // for SparsePageFormat
namespace xgboost::data {
DMLC_REGISTRY_FILE_TAG(ellpack_page_raw_format);
class EllpackPageRawFormat : public SparsePageFormat<EllpackPage> {
public:
bool Read(EllpackPage* page, dmlc::SeekStream* fi) override {
bool Read(EllpackPage* page, common::AlignedResourceReadStream* fi) override {
auto* impl = page->Impl();
if (!ReadHistogramCuts(&impl->Cuts(), fi)) {
return false;
}
fi->Read(&impl->n_rows);
fi->Read(&impl->is_dense);
fi->Read(&impl->row_stride);
fi->Read(&impl->gidx_buffer.HostVector());
if (!fi->Read(&impl->n_rows)) {
return false;
}
if (!fi->Read(&impl->is_dense)) {
return false;
}
if (!fi->Read(&impl->row_stride)) {
return false;
}
if (!common::ReadVec(fi, &impl->gidx_buffer.HostVector())) {
return false;
}
if (!fi->Read(&impl->base_rowid)) {
return false;
}
return true;
}
size_t Write(const EllpackPage& page, dmlc::Stream* fo) override {
size_t bytes = 0;
size_t Write(const EllpackPage& page, common::AlignedFileWriteStream* fo) override {
std::size_t bytes{0};
auto* impl = page.Impl();
bytes += WriteHistogramCuts(impl->Cuts(), fo);
fo->Write(impl->n_rows);
bytes += sizeof(impl->n_rows);
fo->Write(impl->is_dense);
bytes += sizeof(impl->is_dense);
fo->Write(impl->row_stride);
bytes += sizeof(impl->row_stride);
bytes += fo->Write(impl->n_rows);
bytes += fo->Write(impl->is_dense);
bytes += fo->Write(impl->row_stride);
CHECK(!impl->gidx_buffer.ConstHostVector().empty());
fo->Write(impl->gidx_buffer.HostVector());
bytes += impl->gidx_buffer.ConstHostSpan().size_bytes() + sizeof(uint64_t);
fo->Write(impl->base_rowid);
bytes += sizeof(impl->base_rowid);
bytes += common::WriteVec(fo, impl->gidx_buffer.HostVector());
bytes += fo->Write(impl->base_rowid);
return bytes;
}
};
XGBOOST_REGISTER_ELLPACK_PAGE_FORMAT(raw)
.describe("Raw ELLPACK binary data format.")
.set_body([]() {
return new EllpackPageRawFormat();
});
} // namespace data
} // namespace xgboost
.set_body([]() { return new EllpackPageRawFormat(); });
} // namespace xgboost::data

View File

@@ -5,10 +5,10 @@
#include <utility>
#include "ellpack_page.cuh"
#include "ellpack_page.h" // for EllpackPage
#include "ellpack_page_source.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
void EllpackPageSource::Fetch() {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
@@ -31,5 +31,4 @@ void EllpackPageSource::Fetch() {
this->WriteCache();
}
}
} // namespace data
} // namespace xgboost
} // namespace xgboost::data

View File

@@ -6,17 +6,17 @@
#define XGBOOST_DATA_ELLPACK_PAGE_SOURCE_H_
#include <xgboost/data.h>
#include <memory>
#include <string>
#include <utility>
#include "../common/common.h"
#include "../common/hist_util.h"
#include "ellpack_page.h" // for EllpackPage
#include "sparse_page_source.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
class EllpackPageSource : public PageSourceIncMixIn<EllpackPage> {
bool is_dense_;
size_t row_stride_;
@@ -52,8 +52,7 @@ inline void EllpackPageSource::Fetch() {
(void)(is_dense_);
common::AssertGPUSupport();
}
#endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
} // namespace data
} // namespace xgboost
#endif // !defined(XGBOOST_USE_CUDA)
} // namespace xgboost::data
#endif // XGBOOST_DATA_ELLPACK_PAGE_SOURCE_H_

51
src/data/file_iterator.cc Normal file
View File

@@ -0,0 +1,51 @@
/**
* Copyright 2021-2023, XGBoost contributors
*/
#include "file_iterator.h"
#include <xgboost/logging.h> // for LogCheck_EQ, LogCheck_LE, CHECK_EQ, CHECK_LE, LOG, LOG_...
#include <filesystem> // for weakly_canonical, path, u8path
#include <map> // for map, operator==
#include <ostream> // for operator<<, basic_ostream, istringstream
#include <vector> // for vector
#include "../common/common.h" // for Split
#include "xgboost/string_view.h" // for operator<<, StringView
namespace xgboost::data {
/**
 * @brief Validate and normalize a data-file URI of the form `path?format=...[#cachefile]`.
 *
 * Splits off an optional `#cachefile` suffix, requires a `?format=` query component,
 * parses the query arguments, canonicalizes the file path, and reassembles the URI.
 */
std::string ValidateFileFormat(std::string const& uri) {
  auto cache_split = common::Split(uri, '#');
  CHECK_LE(cache_split.size(), 2)
      << "Only one `#` is allowed in file path for cachefile specification";

  auto query_split = common::Split(cache_split[0], '?');
  StringView msg{"URI parameter `format` is required for loading text data: filename?format=csv"};
  CHECK_EQ(query_split.size(), 2) << msg;

  // Parse `key=value` pairs separated by `&`.
  std::map<std::string, std::string> params;
  auto kv_list = common::Split(query_split[1], '&');
  for (size_t i = 0; i < kv_list.size(); ++i) {
    std::istringstream is(kv_list[i]);
    std::pair<std::string, std::string> kv;
    CHECK(std::getline(is, kv.first, '=')) << "Invalid uri argument format"
                                           << " for key in arg " << i + 1;
    CHECK(std::getline(is, kv.second)) << "Invalid uri argument format"
                                       << " for value in arg " << i + 1;
    params.insert(kv);
  }
  if (params.find("format") == params.cend()) {
    LOG(FATAL) << msg;
  }

  // Canonicalize the path component (resolves `.`/`..`, normalizes separators).
  auto raw_path = common::Split(uri, '?')[0];
  namespace fs = std::filesystem;
  query_split[0] = fs::weakly_canonical(fs::u8path(raw_path)).string();

  std::string result = query_split[0] + "?" + query_split[1];
  if (cache_split.size() == 2) {
    result += '#' + cache_split[1];
  }
  return result;
}
}  // namespace xgboost::data

View File

@@ -4,46 +4,20 @@
#ifndef XGBOOST_DATA_FILE_ITERATOR_H_
#define XGBOOST_DATA_FILE_ITERATOR_H_
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include <algorithm> // for max_element
#include <cstddef> // for size_t
#include <cstdint> // for uint32_t
#include <memory> // for unique_ptr
#include <string> // for string
#include <utility> // for move
#include "array_interface.h"
#include "dmlc/data.h"
#include "xgboost/c_api.h"
#include "xgboost/json.h"
#include "xgboost/linalg.h"
#include "dmlc/data.h" // for RowBlock, Parser
#include "xgboost/c_api.h" // for XGDMatrixSetDenseInfo, XGDMatrixFree, XGProxyDMatrixCreate
#include "xgboost/linalg.h" // for ArrayInterfaceStr, MakeVec
#include "xgboost/logging.h" // for CHECK
namespace xgboost {
namespace data {
inline void ValidateFileFormat(std::string const& uri) {
std::vector<std::string> name_cache = common::Split(uri, '#');
CHECK_LE(name_cache.size(), 2)
<< "Only one `#` is allowed in file path for cachefile specification";
std::vector<std::string> name_args = common::Split(name_cache[0], '?');
CHECK_LE(name_args.size(), 2) << "only one `?` is allowed in file path.";
StringView msg{"URI parameter `format` is required for loading text data: filename?format=csv"};
CHECK_EQ(name_args.size(), 2) << msg;
std::map<std::string, std::string> args;
std::vector<std::string> arg_list = common::Split(name_args[1], '&');
for (size_t i = 0; i < arg_list.size(); ++i) {
std::istringstream is(arg_list[i]);
std::pair<std::string, std::string> kv;
CHECK(std::getline(is, kv.first, '=')) << "Invalid uri argument format"
<< " for key in arg " << i + 1;
CHECK(std::getline(is, kv.second)) << "Invalid uri argument format"
<< " for value in arg " << i + 1;
args.insert(kv);
}
if (args.find("format") == args.cend()) {
LOG(FATAL) << msg;
}
}
namespace xgboost::data {
[[nodiscard]] std::string ValidateFileFormat(std::string const& uri);
/**
* An iterator for implementing external memory support with file inputs. Users of
@@ -72,8 +46,7 @@ class FileIterator {
public:
FileIterator(std::string uri, unsigned part_index, unsigned num_parts)
: uri_{std::move(uri)}, part_idx_{part_index}, n_parts_{num_parts} {
ValidateFileFormat(uri_);
: uri_{ValidateFileFormat(std::move(uri))}, part_idx_{part_index}, n_parts_{num_parts} {
XGProxyDMatrixCreate(&proxy_);
}
~FileIterator() {
@@ -132,6 +105,5 @@ inline int Next(DataIterHandle self) {
return static_cast<FileIterator*>(self)->Next();
}
} // namespace fileiter
} // namespace data
} // namespace xgboost
} // namespace xgboost::data
#endif // XGBOOST_DATA_FILE_ITERATOR_H_

View File

@@ -7,13 +7,12 @@
#include <algorithm>
#include <limits>
#include <memory>
#include <utility> // std::forward
#include <utility> // for forward
#include "../common/column_matrix.h"
#include "../common/hist_util.h"
#include "../common/numeric.h"
#include "../common/threading_utils.h"
#include "../common/transform_iterator.h" // MakeIndexTransformIter
#include "../common/transform_iterator.h" // for MakeIndexTransformIter
namespace xgboost {
@@ -21,7 +20,7 @@ GHistIndexMatrix::GHistIndexMatrix() : columns_{std::make_unique<common::ColumnM
GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
double sparse_thresh, bool sorted_sketch,
common::Span<float> hess)
common::Span<float const> hess)
: max_numeric_bins_per_feat{max_bins_per_feat} {
CHECK(p_fmat->SingleColBlock());
// We use sorted sketching for approx tree method since it's more efficient in
@@ -29,7 +28,7 @@ GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_
cut = common::SketchOnDMatrix(ctx, p_fmat, max_bins_per_feat, sorted_sketch, hess);
const uint32_t nbins = cut.Ptrs().back();
hit_count.resize(nbins, 0);
hit_count = common::MakeFixedVecWithMalloc(nbins, std::size_t{0});
hit_count_tloc_.resize(ctx->Threads() * nbins, 0);
size_t new_size = 1;
@@ -37,8 +36,7 @@ GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_
new_size += batch.Size();
}
row_ptr.resize(new_size);
row_ptr[0] = 0;
row_ptr = common::MakeFixedVecWithMalloc(new_size, std::size_t{0});
const bool isDense = p_fmat->IsDense();
this->isDense_ = isDense;
@@ -61,8 +59,8 @@ GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_
GHistIndexMatrix::GHistIndexMatrix(MetaInfo const &info, common::HistogramCuts &&cuts,
bst_bin_t max_bin_per_feat)
: row_ptr(info.num_row_ + 1, 0),
hit_count(cuts.TotalBins(), 0),
: row_ptr{common::MakeFixedVecWithMalloc(info.num_row_ + 1, std::size_t{0})},
hit_count{common::MakeFixedVecWithMalloc(cuts.TotalBins(), std::size_t{0})},
cut{std::forward<common::HistogramCuts>(cuts)},
max_numeric_bins_per_feat(max_bin_per_feat),
isDense_{info.num_col_ * info.num_row_ == info.num_nonzero_} {}
@@ -95,12 +93,10 @@ GHistIndexMatrix::GHistIndexMatrix(SparsePage const &batch, common::Span<Feature
isDense_{isDense} {
CHECK_GE(n_threads, 1);
CHECK_EQ(row_ptr.size(), 0);
// The number of threads is pegged to the batch size. If the OMP
// block is parallelized on anything other than the batch/block size,
// it should be reassigned
row_ptr.resize(batch.Size() + 1, 0);
row_ptr = common::MakeFixedVecWithMalloc(batch.Size() + 1, std::size_t{0});
const uint32_t nbins = cut.Ptrs().back();
hit_count.resize(nbins, 0);
hit_count = common::MakeFixedVecWithMalloc(nbins, std::size_t{0});
hit_count_tloc_.resize(n_threads * nbins, 0);
this->PushBatch(batch, ft, n_threads);
@@ -128,20 +124,45 @@ INSTANTIATION_PUSH(data::SparsePageAdapterBatch)
#undef INSTANTIATION_PUSH
void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) {
auto make_index = [this, n_index](auto t, common::BinTypeSize t_size) {
// Must resize instead of allocating a new one. This function is called everytime a
// new batch is pushed, and we grow the size accordingly without loosing the data the
// previous batches.
using T = decltype(t);
std::size_t n_bytes = sizeof(T) * n_index;
CHECK_GE(n_bytes, this->data.size());
auto resource = this->data.Resource();
decltype(this->data) new_vec;
if (!resource) {
CHECK(this->data.empty());
new_vec = common::MakeFixedVecWithMalloc(n_bytes, std::uint8_t{0});
} else {
CHECK(resource->Type() == common::ResourceHandler::kMalloc);
auto malloc_resource = std::dynamic_pointer_cast<common::MallocResource>(resource);
CHECK(malloc_resource);
malloc_resource->Resize(n_bytes);
// gcc-11.3 doesn't work if DataAs is used.
std::uint8_t *new_ptr = reinterpret_cast<std::uint8_t *>(malloc_resource->Data());
new_vec = {new_ptr, n_bytes / sizeof(std::uint8_t), malloc_resource};
}
this->data = std::move(new_vec);
this->index = common::Index{common::Span{data.data(), data.size()}, t_size};
};
if ((MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) &&
isDense) {
// compress dense index to uint8
index.SetBinTypeSize(common::kUint8BinsTypeSize);
index.Resize((sizeof(uint8_t)) * n_index);
make_index(std::uint8_t{}, common::kUint8BinsTypeSize);
} else if ((MaxNumBinPerFeat() - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
isDense) {
// compress dense index to uint16
index.SetBinTypeSize(common::kUint16BinsTypeSize);
index.Resize((sizeof(uint16_t)) * n_index);
make_index(std::uint16_t{}, common::kUint16BinsTypeSize);
} else {
index.SetBinTypeSize(common::kUint32BinsTypeSize);
index.Resize((sizeof(uint32_t)) * n_index);
// no compression
make_index(std::uint32_t{}, common::kUint32BinsTypeSize);
}
}
@@ -214,11 +235,11 @@ float GHistIndexMatrix::GetFvalue(std::vector<std::uint32_t> const &ptrs,
return std::numeric_limits<float>::quiet_NaN();
}
bool GHistIndexMatrix::ReadColumnPage(dmlc::SeekStream *fi) {
bool GHistIndexMatrix::ReadColumnPage(common::AlignedResourceReadStream *fi) {
return this->columns_->Read(fi, this->cut.Ptrs().data());
}
size_t GHistIndexMatrix::WriteColumnPage(dmlc::Stream *fo) const {
std::size_t GHistIndexMatrix::WriteColumnPage(common::AlignedFileWriteStream *fo) const {
return this->columns_->Write(fo);
}
} // namespace xgboost

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2022 by XGBoost Contributors
/**
* Copyright 2022-2023, XGBoost Contributors
*/
#include <memory> // std::unique_ptr
@@ -41,9 +41,9 @@ void SetIndexData(Context const* ctx, EllpackPageImpl const* page,
}
void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page,
std::vector<size_t>* p_out) {
common::RefResourceView<std::size_t>* p_out) {
auto& row_ptr = *p_out;
row_ptr.resize(page->Size() + 1, 0);
row_ptr = common::MakeFixedVecWithMalloc(page->Size() + 1, std::size_t{0});
if (page->is_dense) {
std::fill(row_ptr.begin() + 1, row_ptr.end(), page->row_stride);
} else {
@@ -95,7 +95,7 @@ GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info,
ctx, page, &hit_count_tloc_, [&](auto bin_idx, auto) { return bin_idx; }, this);
}
this->hit_count.resize(n_bins_total, 0);
this->hit_count = common::MakeFixedVecWithMalloc(n_bins_total, std::size_t{0});
this->GatherHitCount(ctx->Threads(), n_bins_total);
// sanity checks

View File

@@ -9,13 +9,14 @@
#include <atomic> // for atomic
#include <cinttypes> // for uint32_t
#include <cstddef> // for size_t
#include <memory>
#include <memory> // for make_unique
#include <vector>
#include "../common/categorical.h"
#include "../common/error_msg.h" // for InfInData
#include "../common/hist_util.h"
#include "../common/numeric.h"
#include "../common/ref_resource_view.h" // for RefResourceView
#include "../common/threading_utils.h"
#include "../common/transform_iterator.h" // for MakeIndexTransformIter
#include "adapter.h"
@@ -25,9 +26,11 @@
namespace xgboost {
namespace common {
class ColumnMatrix;
class AlignedFileWriteStream;
} // namespace common
/*!
* \brief preprocessed global index matrix, in CSR format
/**
* @brief preprocessed global index matrix, in CSR format.
*
* Transform floating values to integer index in histogram This is a global histogram
* index for CPU histogram. On GPU ellpack page is used.
@@ -133,20 +136,22 @@ class GHistIndexMatrix {
}
public:
/*! \brief row pointer to rows by element position */
std::vector<size_t> row_ptr;
/*! \brief The index data */
/** @brief row pointer to rows by element position */
common::RefResourceView<std::size_t> row_ptr;
/** @brief data storage for index. */
common::RefResourceView<std::uint8_t> data;
/** @brief The histogram index. */
common::Index index;
/*! \brief hit count of each index, used for constructing the ColumnMatrix */
std::vector<size_t> hit_count;
/*! \brief The corresponding cuts */
/** @brief hit count of each index, used for constructing the ColumnMatrix */
common::RefResourceView<std::size_t> hit_count;
/** @brief The corresponding cuts */
common::HistogramCuts cut;
/** \brief max_bin for each feature. */
/** @brief max_bin for each feature. */
bst_bin_t max_numeric_bins_per_feat;
/*! \brief base row index for current page (used by external memory) */
size_t base_rowid{0};
/** @brief base row index for current page (used by external memory) */
bst_row_t base_rowid{0};
bst_bin_t MaxNumBinPerFeat() const {
[[nodiscard]] bst_bin_t MaxNumBinPerFeat() const {
return std::max(static_cast<bst_bin_t>(cut.MaxCategory() + 1), max_numeric_bins_per_feat);
}
@@ -155,7 +160,7 @@ class GHistIndexMatrix {
 * \brief Constructor for SimpleDMatrix.
*/
GHistIndexMatrix(Context const* ctx, DMatrix* x, bst_bin_t max_bins_per_feat,
double sparse_thresh, bool sorted_sketch, common::Span<float> hess = {});
double sparse_thresh, bool sorted_sketch, common::Span<float const> hess = {});
/**
* \brief Constructor for Iterative DMatrix. Initialize basic information and prepare
* for push batch.
@@ -218,29 +223,30 @@ class GHistIndexMatrix {
}
}
bool IsDense() const {
return isDense_;
}
[[nodiscard]] bool IsDense() const { return isDense_; }
void SetDense(bool is_dense) { isDense_ = is_dense; }
/**
* \brief Get the local row index.
* @brief Get the local row index.
*/
size_t RowIdx(size_t ridx) const { return row_ptr[ridx - base_rowid]; }
[[nodiscard]] std::size_t RowIdx(size_t ridx) const { return row_ptr[ridx - base_rowid]; }
bst_row_t Size() const { return row_ptr.empty() ? 0 : row_ptr.size() - 1; }
bst_feature_t Features() const { return cut.Ptrs().size() - 1; }
[[nodiscard]] bst_row_t Size() const { return row_ptr.empty() ? 0 : row_ptr.size() - 1; }
[[nodiscard]] bst_feature_t Features() const { return cut.Ptrs().size() - 1; }
bool ReadColumnPage(dmlc::SeekStream* fi);
size_t WriteColumnPage(dmlc::Stream* fo) const;
[[nodiscard]] bool ReadColumnPage(common::AlignedResourceReadStream* fi);
[[nodiscard]] std::size_t WriteColumnPage(common::AlignedFileWriteStream* fo) const;
common::ColumnMatrix const& Transpose() const;
[[nodiscard]] common::ColumnMatrix const& Transpose() const;
bst_bin_t GetGindex(size_t ridx, size_t fidx) const;
[[nodiscard]] bst_bin_t GetGindex(size_t ridx, size_t fidx) const;
float GetFvalue(size_t ridx, size_t fidx, bool is_cat) const;
float GetFvalue(std::vector<std::uint32_t> const& ptrs, std::vector<float> const& values,
std::vector<float> const& mins, bst_row_t ridx, bst_feature_t fidx,
bool is_cat) const;
[[nodiscard]] float GetFvalue(size_t ridx, size_t fidx, bool is_cat) const;
[[nodiscard]] float GetFvalue(std::vector<std::uint32_t> const& ptrs,
std::vector<float> const& values, std::vector<float> const& mins,
bst_row_t ridx, bst_feature_t fidx, bool is_cat) const;
[[nodiscard]] common::HistogramCuts& Cuts() { return cut; }
[[nodiscard]] common::HistogramCuts const& Cuts() const { return cut; }
private:
std::unique_ptr<common::ColumnMatrix> columns_;
@@ -294,5 +300,5 @@ void AssignColumnBinIndex(GHistIndexMatrix const& page, Fn&& assign) {
}
});
}
} // namespace xgboost
} // namespace xgboost
#endif // XGBOOST_DATA_GRADIENT_INDEX_H_

View File

@@ -1,38 +1,49 @@
/*!
* Copyright 2021-2022 XGBoost contributors
/**
* Copyright 2021-2023 XGBoost contributors
*/
#include "sparse_page_writer.h"
#include "gradient_index.h"
#include "histogram_cut_format.h"
#include <cstddef> // for size_t
#include <cstdint> // for uint8_t
#include <type_traits> // for underlying_type_t
#include <vector> // for vector
namespace xgboost {
namespace data {
#include "../common/io.h" // for AlignedResourceReadStream
#include "../common/ref_resource_view.h" // for ReadVec, WriteVec
#include "gradient_index.h" // for GHistIndexMatrix
#include "histogram_cut_format.h" // for ReadHistogramCuts
#include "sparse_page_writer.h" // for SparsePageFormat
namespace xgboost::data {
class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
public:
bool Read(GHistIndexMatrix* page, dmlc::SeekStream* fi) override {
bool Read(GHistIndexMatrix* page, common::AlignedResourceReadStream* fi) override {
CHECK(fi);
if (!ReadHistogramCuts(&page->cut, fi)) {
return false;
}
// indptr
fi->Read(&page->row_ptr);
// data
std::vector<uint8_t> data;
if (!fi->Read(&data)) {
if (!common::ReadVec(fi, &page->row_ptr)) {
return false;
}
page->index.Resize(data.size());
std::copy(data.cbegin(), data.cend(), page->index.begin());
// bin type
// data
// - bin type
// Old gcc doesn't support reading from enum.
std::underlying_type_t<common::BinTypeSize> uint_bin_type{0};
if (!fi->Read(&uint_bin_type)) {
return false;
}
common::BinTypeSize size_type =
static_cast<common::BinTypeSize>(uint_bin_type);
page->index.SetBinTypeSize(size_type);
common::BinTypeSize size_type = static_cast<common::BinTypeSize>(uint_bin_type);
// - index buffer
if (!common::ReadVec(fi, &page->data)) {
return false;
}
// - index
page->index = common::Index{common::Span{page->data.data(), page->data.size()}, size_type};
// hit count
if (!fi->Read(&page->hit_count)) {
if (!common::ReadVec(fi, &page->hit_count)) {
return false;
}
if (!fi->Read(&page->max_numeric_bins_per_feat)) {
@@ -50,38 +61,35 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
page->index.SetBinOffset(page->cut.Ptrs());
}
page->ReadColumnPage(fi);
if (!page->ReadColumnPage(fi)) {
return false;
}
return true;
}
size_t Write(GHistIndexMatrix const &page, dmlc::Stream *fo) override {
size_t bytes = 0;
std::size_t Write(GHistIndexMatrix const& page, common::AlignedFileWriteStream* fo) override {
std::size_t bytes = 0;
bytes += WriteHistogramCuts(page.cut, fo);
// indptr
fo->Write(page.row_ptr);
bytes += page.row_ptr.size() * sizeof(decltype(page.row_ptr)::value_type) +
sizeof(uint64_t);
bytes += common::WriteVec(fo, page.row_ptr);
// data
std::vector<uint8_t> data(page.index.begin(), page.index.end());
fo->Write(data);
bytes += data.size() * sizeof(decltype(data)::value_type) + sizeof(uint64_t);
// bin type
std::underlying_type_t<common::BinTypeSize> uint_bin_type =
page.index.GetBinTypeSize();
fo->Write(uint_bin_type);
bytes += sizeof(page.index.GetBinTypeSize());
// - bin type
std::underlying_type_t<common::BinTypeSize> uint_bin_type = page.index.GetBinTypeSize();
bytes += fo->Write(uint_bin_type);
// - index buffer
std::vector<std::uint8_t> data(page.index.begin(), page.index.end());
bytes += fo->Write(static_cast<std::uint64_t>(data.size()));
if (!data.empty()) {
bytes += fo->Write(data.data(), data.size());
}
// hit count
fo->Write(page.hit_count);
bytes +=
page.hit_count.size() * sizeof(decltype(page.hit_count)::value_type) +
sizeof(uint64_t);
bytes += common::WriteVec(fo, page.hit_count);
// max_bins, base row, is_dense
fo->Write(page.max_numeric_bins_per_feat);
bytes += sizeof(page.max_numeric_bins_per_feat);
fo->Write(page.base_rowid);
bytes += sizeof(page.base_rowid);
fo->Write(page.IsDense());
bytes += sizeof(page.IsDense());
bytes += fo->Write(page.max_numeric_bins_per_feat);
bytes += fo->Write(page.base_rowid);
bytes += fo->Write(page.IsDense());
bytes += page.WriteColumnPage(fo);
return bytes;
@@ -93,6 +101,4 @@ DMLC_REGISTRY_FILE_TAG(gradient_index_format);
XGBOOST_REGISTER_GHIST_INDEX_PAGE_FORMAT(raw)
.describe("Raw GHistIndex binary data format.")
.set_body([]() { return new GHistIndexRawFormat(); });
} // namespace data
} // namespace xgboost
} // namespace xgboost::data

View File

@@ -1,10 +1,9 @@
/*!
* Copyright 2021-2022 by XGBoost Contributors
/**
* Copyright 2021-2023, XGBoost Contributors
*/
#include "gradient_index_page_source.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
void GradientIndexPageSource::Fetch() {
if (!this->ReadCache()) {
if (count_ != 0 && !sync_) {
@@ -21,5 +20,4 @@ void GradientIndexPageSource::Fetch() {
this->WriteCache();
}
}
} // namespace data
} // namespace xgboost
} // namespace xgboost::data

View File

@@ -1,36 +1,38 @@
/*!
* Copyright 2021 XGBoost contributors
/**
* Copyright 2021-2023, XGBoost contributors
*/
#ifndef XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_
#define XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_
#include "../common/hist_util.h"
#include <dmlc/io.h> // for Stream
namespace xgboost {
namespace data {
inline bool ReadHistogramCuts(common::HistogramCuts *cuts, dmlc::SeekStream *fi) {
if (!fi->Read(&cuts->cut_values_.HostVector())) {
#include <cstddef> // for size_t
#include "../common/hist_util.h" // for HistogramCuts
#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
#include "../common/ref_resource_view.h" // for WriteVec, ReadVec
namespace xgboost::data {
inline bool ReadHistogramCuts(common::HistogramCuts *cuts, common::AlignedResourceReadStream *fi) {
if (!common::ReadVec(fi, &cuts->cut_values_.HostVector())) {
return false;
}
if (!fi->Read(&cuts->cut_ptrs_.HostVector())) {
if (!common::ReadVec(fi, &cuts->cut_ptrs_.HostVector())) {
return false;
}
if (!fi->Read(&cuts->min_vals_.HostVector())) {
if (!common::ReadVec(fi, &cuts->min_vals_.HostVector())) {
return false;
}
return true;
}
inline size_t WriteHistogramCuts(common::HistogramCuts const &cuts, dmlc::Stream *fo) {
size_t bytes = 0;
fo->Write(cuts.cut_values_.ConstHostVector());
bytes += cuts.cut_values_.ConstHostSpan().size_bytes() + sizeof(uint64_t);
fo->Write(cuts.cut_ptrs_.ConstHostVector());
bytes += cuts.cut_ptrs_.ConstHostSpan().size_bytes() + sizeof(uint64_t);
fo->Write(cuts.min_vals_.ConstHostVector());
bytes += cuts.min_vals_.ConstHostSpan().size_bytes() + sizeof(uint64_t);
inline std::size_t WriteHistogramCuts(common::HistogramCuts const &cuts,
common::AlignedFileWriteStream *fo) {
std::size_t bytes = 0;
bytes += common::WriteVec(fo, cuts.Values());
bytes += common::WriteVec(fo, cuts.Ptrs());
bytes += common::WriteVec(fo, cuts.MinValues());
return bytes;
}
} // namespace data
} // namespace xgboost
} // namespace xgboost::data
#endif // XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_

View File

@@ -33,10 +33,11 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro
bool valid = iter.Next();
CHECK(valid) << "Iterative DMatrix must have at least 1 batch.";
auto d = MakeProxy(proxy_)->DeviceIdx();
auto pctx = MakeProxy(proxy_)->Ctx();
Context ctx;
ctx.UpdateAllowUnknown(Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(d)}});
ctx.UpdateAllowUnknown(
Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}});
// hardcoded parameter.
BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()};
@@ -240,9 +241,9 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p,
* Generate gradient index.
*/
this->ghist_ = std::make_unique<GHistIndexMatrix>(Info(), std::move(cuts), p.max_bin);
size_t rbegin = 0;
size_t prev_sum = 0;
size_t i = 0;
std::size_t rbegin = 0;
std::size_t prev_sum = 0;
std::size_t i = 0;
while (iter.Next()) {
HostAdapterDispatch(proxy, [&](auto const& batch) {
proxy->Info().num_nonzero_ = batch_nnz[i];

View File

@@ -31,10 +31,10 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
dh::XGBCachingDeviceAllocator<char> alloc;
auto num_rows = [&]() {
return Dispatch(proxy, [](auto const& value) { return value.NumRows(); });
return cuda_impl::Dispatch(proxy, [](auto const& value) { return value.NumRows(); });
};
auto num_cols = [&]() {
return Dispatch(proxy, [](auto const& value) { return value.NumCols(); });
return cuda_impl::Dispatch(proxy, [](auto const& value) { return value.NumCols(); });
};
size_t row_stride = 0;
@@ -86,7 +86,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
get_device());
auto* p_sketch = &sketch_containers.back();
proxy->Info().weights_.SetDevice(get_device());
Dispatch(proxy, [&](auto const& value) {
cuda_impl::Dispatch(proxy, [&](auto const& value) {
common::AdapterDeviceSketch(value, p.max_bin, proxy->Info(), missing, p_sketch);
});
}
@@ -94,7 +94,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
accumulated_rows += batch_rows;
dh::device_vector<size_t> row_counts(batch_rows + 1, 0);
common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
row_stride = std::max(row_stride, Dispatch(proxy, [=](auto const& value) {
row_stride = std::max(row_stride, cuda_impl::Dispatch(proxy, [=](auto const& value) {
return GetRowCounts(value, row_counts_span, get_device(), missing);
}));
@@ -129,7 +129,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
sketch_containers.clear();
sketch_containers.shrink_to_fit();
final_sketch.MakeCuts(&cuts);
final_sketch.MakeCuts(&cuts, this->info_.IsColumnSplit());
} else {
GetCutsFromRef(ctx, ref, Info().num_col_, p, &cuts);
}
@@ -137,7 +137,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
this->info_.num_row_ = accumulated_rows;
this->info_.num_nonzero_ = nnz;
auto init_page = [this, &proxy, &cuts, row_stride, accumulated_rows, get_device]() {
auto init_page = [this, &cuts, row_stride, accumulated_rows, get_device]() {
if (!ellpack_) {
// Should be put inside the while loop to protect against empty batch. In
// that case device id is invalid.
@@ -165,14 +165,14 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
auto rows = num_rows();
dh::device_vector<size_t> row_counts(rows + 1, 0);
common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
Dispatch(proxy, [=](auto const& value) {
cuda_impl::Dispatch(proxy, [=](auto const& value) {
return GetRowCounts(value, row_counts_span, get_device(), missing);
});
auto is_dense = this->IsDense();
proxy->Info().feature_types.SetDevice(get_device());
auto d_feature_types = proxy->Info().feature_types.ConstDeviceSpan();
auto new_impl = Dispatch(proxy, [&](auto const& value) {
auto new_impl = cuda_impl::Dispatch(proxy, [&](auto const& value) {
return EllpackPageImpl(value, missing, get_device(), is_dense, row_counts_span,
d_feature_types, row_stride, rows, cuts);
});

View File

@@ -1,14 +1,13 @@
/*!
* Copyright 2021 by Contributors
/**
* Copyright 2021-2023, XGBoost Contributors
* \file proxy_dmatrix.cc
*/
#include "proxy_dmatrix.h"
namespace xgboost {
namespace data {
void DMatrixProxy::SetArrayData(char const *c_interface) {
std::shared_ptr<ArrayAdapter> adapter{new ArrayAdapter(StringView{c_interface})};
namespace xgboost::data {
void DMatrixProxy::SetArrayData(StringView interface_str) {
std::shared_ptr<ArrayAdapter> adapter{new ArrayAdapter{interface_str}};
this->batch_ = adapter;
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
@@ -25,5 +24,38 @@ void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices,
this->Info().num_row_ = adapter->NumRows();
this->ctx_.gpu_id = Context::kCpuId;
}
} // namespace data
} // namespace xgboost
namespace cuda_impl {
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *ctx,
std::shared_ptr<DMatrixProxy> proxy, float missing);
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *, std::shared_ptr<DMatrixProxy>,
float) {
return nullptr;
}
#endif // XGBOOST_USE_CUDA
} // namespace cuda_impl
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *ctx,
std::shared_ptr<DMatrixProxy> proxy,
float missing) {
bool type_error{false};
std::shared_ptr<DMatrix> p_fmat{nullptr};
if (proxy->Ctx()->IsCPU()) {
p_fmat = data::HostAdapterDispatch<false>(
proxy.get(),
[&](auto const &adapter) {
auto p_fmat =
std::shared_ptr<DMatrix>(DMatrix::Create(adapter.get(), missing, ctx->Threads()));
return p_fmat;
},
&type_error);
} else {
p_fmat = cuda_impl::CreateDMatrixFromProxy(ctx, proxy, missing);
}
CHECK(p_fmat) << "Failed to fallback.";
p_fmat->Info() = proxy->Info().Copy();
return p_fmat;
}
} // namespace xgboost::data

View File

@@ -1,35 +1,47 @@
/*!
* Copyright 2020-2022, XGBoost contributors
/**
* Copyright 2020-2023, XGBoost contributors
*/
#include "proxy_dmatrix.h"
#include "device_adapter.cuh"
#include "proxy_dmatrix.cuh"
#include "proxy_dmatrix.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
void DMatrixProxy::FromCudaColumnar(StringView interface_str) {
std::shared_ptr<data::CudfAdapter> adapter{new CudfAdapter{interface_str}};
auto const& value = adapter->Value();
auto adapter{std::make_shared<CudfAdapter>(interface_str)};
this->batch_ = adapter;
ctx_.gpu_id = adapter->DeviceIdx();
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
if (ctx_.gpu_id < 0) {
if (adapter->DeviceIdx() < 0) {
// empty data
CHECK_EQ(this->Info().num_row_, 0);
ctx_.gpu_id = dh::CurrentDevice();
ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
return;
}
ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx());
}
void DMatrixProxy::FromCudaArray(StringView interface_str) {
std::shared_ptr<CupyAdapter> adapter(new CupyAdapter{StringView{interface_str}});
auto adapter(std::make_shared<CupyAdapter>(StringView{interface_str}));
this->batch_ = adapter;
ctx_.gpu_id = adapter->DeviceIdx();
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
if (ctx_.gpu_id < 0) {
if (adapter->DeviceIdx() < 0) {
// empty data
CHECK_EQ(this->Info().num_row_, 0);
ctx_.gpu_id = dh::CurrentDevice();
ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
return;
}
ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx());
}
} // namespace data
} // namespace xgboost
namespace cuda_impl {
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const* ctx,
std::shared_ptr<DMatrixProxy> proxy,
float missing) {
return Dispatch<false>(proxy.get(), [&](auto const& adapter) {
auto p_fmat = std::shared_ptr<DMatrix>{DMatrix::Create(adapter.get(), missing, ctx->Threads())};
return p_fmat;
});
}
} // namespace cuda_impl
} // namespace xgboost::data

View File

@@ -6,19 +6,34 @@
#include "device_adapter.cuh"
#include "proxy_dmatrix.h"
namespace xgboost::data {
template <typename Fn>
namespace xgboost::data::cuda_impl {
template <bool get_value = true, typename Fn>
decltype(auto) Dispatch(DMatrixProxy const* proxy, Fn fn) {
if (proxy->Adapter().type() == typeid(std::shared_ptr<CupyAdapter>)) {
auto value = std::any_cast<std::shared_ptr<CupyAdapter>>(proxy->Adapter())->Value();
return fn(value);
if constexpr (get_value) {
auto value = std::any_cast<std::shared_ptr<CupyAdapter>>(proxy->Adapter())->Value();
return fn(value);
} else {
auto value = std::any_cast<std::shared_ptr<CupyAdapter>>(proxy->Adapter());
return fn(value);
}
} else if (proxy->Adapter().type() == typeid(std::shared_ptr<CudfAdapter>)) {
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
return fn(value);
if constexpr (get_value) {
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
return fn(value);
} else {
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter());
return fn(value);
}
} else {
LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
return fn(value);
if constexpr (get_value) {
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
return fn(value);
} else {
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter());
return fn(value);
}
}
}
} // namespace xgboost::data
} // namespace xgboost::data::cuda_impl

View File

@@ -62,7 +62,7 @@ class DMatrixProxy : public DMatrix {
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
}
void SetArrayData(char const* c_interface);
void SetArrayData(StringView interface_str);
void SetCSRData(char const* c_indptr, char const* c_indices, char const* c_values,
bst_feature_t n_features, bool on_host);
@@ -114,28 +114,62 @@ inline DMatrixProxy* MakeProxy(DMatrixHandle proxy) {
return typed;
}
template <typename Fn>
/**
* @brief Dispatch function call based on input type.
*
 * @tparam get_value Whether the function Fn accepts an adapter batch or the adapter itself.
* @tparam Fn The type of the function to be dispatched.
*
* @param proxy The proxy object holding the reference to the input.
* @param fn The function to be dispatched.
 * @param[out] type_error Set to true if it's not null and the input data is not recognized by
 *                        the host.
*
* @return The return value of the function being dispatched.
*/
template <bool get_value = true, typename Fn>
decltype(auto) HostAdapterDispatch(DMatrixProxy const* proxy, Fn fn, bool* type_error = nullptr) {
if (proxy->Adapter().type() == typeid(std::shared_ptr<CSRArrayAdapter>)) {
auto value = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter())->Value();
if constexpr (get_value) {
auto value = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter())->Value();
return fn(value);
} else {
auto value = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter());
return fn(value);
}
if (type_error) {
*type_error = false;
}
return fn(value);
} else if (proxy->Adapter().type() == typeid(std::shared_ptr<ArrayAdapter>)) {
auto value = std::any_cast<std::shared_ptr<ArrayAdapter>>(proxy->Adapter())->Value();
if constexpr (get_value) {
auto value = std::any_cast<std::shared_ptr<ArrayAdapter>>(proxy->Adapter())->Value();
return fn(value);
} else {
auto value = std::any_cast<std::shared_ptr<ArrayAdapter>>(proxy->Adapter());
return fn(value);
}
if (type_error) {
*type_error = false;
}
return fn(value);
} else {
if (type_error) {
*type_error = true;
} else {
LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
}
return std::result_of_t<Fn(decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value()))>();
if constexpr (get_value) {
return std::result_of_t<Fn(
decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value()))>();
} else {
return std::result_of_t<Fn(decltype(std::declval<std::shared_ptr<ArrayAdapter>>()))>();
}
}
}
/**
* @brief Create a `SimpleDMatrix` instance from a `DMatrixProxy`.
*/
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const* ctx,
std::shared_ptr<DMatrixProxy> proxy, float missing);
} // namespace xgboost::data
#endif // XGBOOST_DATA_PROXY_DMATRIX_H_

View File

@@ -8,21 +8,21 @@
#include <algorithm>
#include <limits>
#include <numeric> // for accumulate
#include <type_traits>
#include <vector>
#include "../common/error_msg.h" // for InconsistentMaxBin
#include "../common/random.h"
#include "../common/threading_utils.h"
#include "../collective/communicator-inl.h" // for GetWorldSize, GetRank, Allgather
#include "../common/error_msg.h" // for InconsistentMaxBin
#include "./simple_batch_iterator.h"
#include "adapter.h"
#include "batch_utils.h" // for CheckEmpty, RegenGHist
#include "batch_utils.h" // for CheckEmpty, RegenGHist
#include "ellpack_page.h" // for EllpackPage
#include "gradient_index.h"
#include "xgboost/c_api.h"
#include "xgboost/data.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
MetaInfo& SimpleDMatrix::Info() { return info_; }
const MetaInfo& SimpleDMatrix::Info() const { return info_; }
@@ -97,6 +97,10 @@ BatchSet<SparsePage> SimpleDMatrix::GetRowBatches() {
BatchSet<CSCPage> SimpleDMatrix::GetColumnBatches(Context const* ctx) {
// column page doesn't exist, generate it
if (!column_page_) {
auto n = std::numeric_limits<decltype(Entry::index)>::max();
if (this->sparse_page_->Size() > n) {
error::MaxSampleSize(n);
}
column_page_.reset(new CSCPage(sparse_page_->GetTranspose(info_.num_col_, ctx->Threads())));
}
auto begin_iter = BatchIterator<CSCPage>(new SimpleBatchIteratorImpl<CSCPage>(column_page_));
@@ -106,6 +110,10 @@ BatchSet<CSCPage> SimpleDMatrix::GetColumnBatches(Context const* ctx) {
BatchSet<SortedCSCPage> SimpleDMatrix::GetSortedColumnBatches(Context const* ctx) {
// Sorted column page doesn't exist, generate it
if (!sorted_column_page_) {
auto n = std::numeric_limits<decltype(Entry::index)>::max();
if (this->sparse_page_->Size() > n) {
error::MaxSampleSize(n);
}
sorted_column_page_.reset(
new SortedCSCPage(sparse_page_->GetTranspose(info_.num_col_, ctx->Threads())));
sorted_column_page_->SortRows(ctx->Threads());
@@ -427,5 +435,4 @@ SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, i
fmat_ctx_ = ctx;
}
} // namespace data
} // namespace xgboost
} // namespace xgboost::data

View File

@@ -32,7 +32,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr
#endif
Context ctx;
ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(device)}});
ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}});
CHECK(adapter->NumRows() != kAdapterUnknownSize);
CHECK(adapter->NumColumns() != kAdapterUnknownSize);

View File

@@ -8,7 +8,6 @@
#include "./sparse_page_dmatrix.h"
#include "../collective/communicator-inl.h"
#include "./simple_batch_iterator.h"
#include "batch_utils.h" // for RegenGHist
#include "gradient_index.h"
@@ -165,7 +164,10 @@ BatchSet<SortedCSCPage> SparsePageDMatrix::GetSortedColumnBatches(Context const
BatchSet<GHistIndexMatrix> SparsePageDMatrix::GetGradientIndex(Context const *ctx,
const BatchParam &param) {
CHECK_GE(param.max_bin, 2);
if (param.Initialized()) {
CHECK_GE(param.max_bin, 2);
}
detail::CheckEmpty(batch_param_, param);
auto id = MakeCache(this, ".gradient_index.page", cache_prefix_, &cache_info_);
this->InitializeSparsePage(ctx);
if (!cache_info_.at(id)->written || detail::RegenGHist(batch_param_, param)) {

View File

@@ -1,17 +1,23 @@
/**
* Copyright 2021-2023 by XGBoost contributors
*/
#include <memory> // for unique_ptr
#include "../common/hist_util.cuh"
#include "batch_utils.h" // for CheckEmpty, RegenGHist
#include "../common/hist_util.h" // for HistogramCuts
#include "batch_utils.h" // for CheckEmpty, RegenGHist
#include "ellpack_page.cuh"
#include "sparse_page_dmatrix.h"
#include "sparse_page_source.h"
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for BatchParam
namespace xgboost::data {
BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
const BatchParam& param) {
CHECK(ctx->IsCUDA());
CHECK_GE(param.max_bin, 2);
if (param.Initialized()) {
CHECK_GE(param.max_bin, 2);
}
detail::CheckEmpty(batch_param_, param);
auto id = MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_);
size_t row_stride = 0;
@@ -21,8 +27,13 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
cache_info_.erase(id);
MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_);
std::unique_ptr<common::HistogramCuts> cuts;
cuts.reset(
new common::HistogramCuts{common::DeviceSketch(ctx->gpu_id, this, param.max_bin, 0)});
if (!param.hess.empty()) {
cuts = std::make_unique<common::HistogramCuts>(
common::DeviceSketchWithHessian(ctx, this, param.max_bin, param.hess));
} else {
cuts =
std::make_unique<common::HistogramCuts>(common::DeviceSketch(ctx, this, param.max_bin));
}
this->InitializeSparsePage(ctx); // reset after use.
row_stride = GetRowStride(this);
@@ -31,10 +42,10 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
batch_param_ = param;
auto ft = this->info_.feature_types.ConstDeviceSpan();
ellpack_page_source_.reset(); // release resources.
ellpack_page_source_.reset(new EllpackPageSource(
ellpack_page_source_.reset(); // make sure resource is released before making new ones.
ellpack_page_source_ = std::make_shared<EllpackPageSource>(
this->missing_, ctx->Threads(), this->Info().num_col_, this->n_batches_, cache_info_.at(id),
param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_, ctx->gpu_id));
param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_, ctx->gpu_id);
} else {
CHECK(sparse_page_source_);
ellpack_page_source_->Reset();

View File

@@ -7,9 +7,6 @@
#ifndef XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_
#define XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_
#include <xgboost/data.h>
#include <xgboost/logging.h>
#include <algorithm>
#include <map>
#include <memory>
@@ -20,35 +17,33 @@
#include "ellpack_page_source.h"
#include "gradient_index_page_source.h"
#include "sparse_page_source.h"
#include "xgboost/data.h"
#include "xgboost/logging.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
/**
* \brief DMatrix used for external memory.
*
* The external memory is created for controlling memory usage by splitting up data into
* multiple batches. However that doesn't mean we will actually process exact 1 batch at
* a time, which would be terribly slow considering that we have to loop through the
* whole dataset for every tree split. So we use async pre-fetch and let caller to decide
* how many batches it wants to process by returning data as shared pointer. The caller
* can use async function to process the data or just stage those batches, making the
* decision is out of the scope for sparse page dmatrix. These 2 optimizations might
* defeat the purpose of splitting up dataset since if you load all the batches then the
* memory usage is even worse than using a single batch. Essentially we need to control
* how many batches can be in memory at the same time.
* multiple batches. However that doesn't mean we will actually process exactly 1 batch
* at a time, which would be terribly slow considering that we have to loop through the
* whole dataset for every tree split. So we use async to pre-fetch pages and let the
* caller to decide how many batches it wants to process by returning data as a shared
* pointer. The caller can use async function to process the data or just stage those
* batches based on its use cases. These two optimizations might defeat the purpose of
* splitting up dataset since if you stage all the batches then the memory usage might be
* even worse than using a single batch. As a result, we must control how many batches can
* be in memory at any given time.
*
* Right now the write to the cache is sequential operation and is blocking, reading from
* cache is async but with a hard coded limit of 4 pages as an heuristic. So by sparse
* dmatrix itself there can be only 9 pages in main memory (might be of different types)
* at the same time: 1 page pending for write, 4 pre-fetched sparse pages, 4 pre-fetched
* dependent pages. If the caller stops iteration at the middle and start again, then the
* number of pages in memory can hit 16 due to pre-fetching, but this should be a bug in
* caller's code (XGBoost doesn't discard a large portion of data at the end, there's not
* sampling algo that samples only the first portion of data).
* Right now the write to the cache is a sequential operation and is blocking. Reading
 * from cache, on the other hand, is async but with a hard-coded limit of 3 pages as a
 * heuristic. So by sparse dmatrix itself there can be only 7 pages in main memory (might
* be of different types) at the same time: 1 page pending for write, 3 pre-fetched sparse
* pages, 3 pre-fetched dependent pages.
*
* Of course if the caller decides to retain some batches to perform parallel processing,
* then we might load all pages in memory, which is also considered as a bug in caller's
* code. So if the algo supports external memory, it must be careful that queue for async
* code. So if the algo supports external memory, it must be careful that queue for async
* call must have an upper limit.
*
* Another assumption we make is that the data must be immutable so caller should never
@@ -101,7 +96,7 @@ class SparsePageDMatrix : public DMatrix {
MetaInfo &Info() override;
const MetaInfo &Info() const override;
Context const *Ctx() const override { return &fmat_ctx_; }
// The only DMatrix implementation that returns false.
bool SingleColBlock() const override { return false; }
DMatrix *Slice(common::Span<int32_t const>) override {
LOG(FATAL) << "Slicing DMatrix is not supported for external memory.";
@@ -153,6 +148,5 @@ inline std::string MakeCache(SparsePageDMatrix *ptr, std::string format, std::st
}
return id;
}
} // namespace data
} // namespace xgboost
} // namespace xgboost::data
#endif // XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_

View File

@@ -1,59 +1,57 @@
/*!
* Copyright (c) 2015-2021 by Contributors
/**
* Copyright 2015-2023, XGBoost Contributors
* \file sparse_page_raw_format.cc
* Raw binary format of sparse page.
*/
#include <xgboost/data.h>
#include <dmlc/registry.h>
#include "xgboost/logging.h"
#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
#include "../common/ref_resource_view.h" // for WriteVec
#include "./sparse_page_writer.h"
#include "xgboost/data.h"
#include "xgboost/logging.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
DMLC_REGISTRY_FILE_TAG(sparse_page_raw_format);
template<typename T>
template <typename T>
class SparsePageRawFormat : public SparsePageFormat<T> {
public:
bool Read(T* page, dmlc::SeekStream* fi) override {
bool Read(T* page, common::AlignedResourceReadStream* fi) override {
auto& offset_vec = page->offset.HostVector();
if (!fi->Read(&offset_vec)) {
if (!common::ReadVec(fi, &offset_vec)) {
return false;
}
auto& data_vec = page->data.HostVector();
CHECK_NE(page->offset.Size(), 0U) << "Invalid SparsePage file";
data_vec.resize(offset_vec.back());
if (page->data.Size() != 0) {
size_t n_bytes = fi->Read(dmlc::BeginPtr(data_vec),
(page->data).Size() * sizeof(Entry));
CHECK_EQ(n_bytes, (page->data).Size() * sizeof(Entry))
<< "Invalid SparsePage file";
if (!common::ReadVec(fi, &data_vec)) {
return false;
}
}
if (!fi->Read(&page->base_rowid, sizeof(page->base_rowid))) {
return false;
}
fi->Read(&page->base_rowid, sizeof(page->base_rowid));
return true;
}
size_t Write(const T& page, dmlc::Stream* fo) override {
std::size_t Write(const T& page, common::AlignedFileWriteStream* fo) override {
const auto& offset_vec = page.offset.HostVector();
const auto& data_vec = page.data.HostVector();
CHECK(page.offset.Size() != 0 && offset_vec[0] == 0);
CHECK_EQ(offset_vec.back(), page.data.Size());
fo->Write(offset_vec);
auto bytes = page.MemCostBytes();
bytes += sizeof(uint64_t);
std::size_t bytes{0};
bytes += common::WriteVec(fo, offset_vec);
if (page.data.Size() != 0) {
fo->Write(dmlc::BeginPtr(data_vec), page.data.Size() * sizeof(Entry));
bytes += common::WriteVec(fo, data_vec);
}
fo->Write(&page.base_rowid, sizeof(page.base_rowid));
bytes += sizeof(page.base_rowid);
bytes += fo->Write(&page.base_rowid, sizeof(page.base_rowid));
return bytes;
}
private:
/*! \brief external memory column offset */
std::vector<size_t> disk_offset_;
};
XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(raw)
@@ -74,5 +72,4 @@ XGBOOST_REGISTER_SORTED_CSC_PAGE_FORMAT(raw)
return new SparsePageRawFormat<SortedCSCPage>();
});
} // namespace data
} // namespace xgboost
} // namespace xgboost::data

View File

@@ -1,33 +1,31 @@
/*!
* Copyright 2021 XGBoost contributors
/**
* Copyright 2021-2023, XGBoost contributors
*/
#include "../common/device_helpers.cuh" // for CurrentDevice
#include "proxy_dmatrix.cuh" // for Dispatch, DMatrixProxy
#include "simple_dmatrix.cuh" // for CopyToSparsePage
#include "sparse_page_source.h"
#include "proxy_dmatrix.cuh"
#include "simple_dmatrix.cuh"
namespace xgboost {
namespace data {
#include "xgboost/data.h" // for SparsePage
namespace xgboost::data {
namespace detail {
std::size_t NSamplesDevice(DMatrixProxy *proxy) {
return Dispatch(proxy, [](auto const &value) { return value.NumRows(); });
return cuda_impl::Dispatch(proxy, [](auto const &value) { return value.NumRows(); });
}
std::size_t NFeaturesDevice(DMatrixProxy *proxy) {
return Dispatch(proxy, [](auto const &value) { return value.NumCols(); });
return cuda_impl::Dispatch(proxy, [](auto const &value) { return value.NumCols(); });
}
} // namespace detail
void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page) {
void DevicePush(DMatrixProxy *proxy, float missing, SparsePage *page) {
auto device = proxy->DeviceIdx();
if (device < 0) {
device = dh::CurrentDevice();
}
CHECK_GE(device, 0);
Dispatch(proxy, [&](auto const &value) {
CopyToSparsePage(value, device, missing, page);
});
cuda_impl::Dispatch(proxy,
[&](auto const &value) { CopyToSparsePage(value, device, missing, page); });
}
} // namespace data
} // namespace xgboost
} // namespace xgboost::data

View File

@@ -1,45 +1,49 @@
/*!
* Copyright 2014-2022 by XGBoost Contributors
/**
* Copyright 2014-2023, XGBoost Contributors
* \file sparse_page_source.h
*/
#ifndef XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_
#define XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_
#include <algorithm> // std::min
#include <string>
#include <utility>
#include <vector>
#include <future>
#include <thread>
#include <algorithm> // for min
#include <atomic> // for atomic
#include <future> // for async
#include <map>
#include <memory>
#include <mutex> // for mutex
#include <string>
#include <thread>
#include <utility> // for pair, move
#include <vector>
#include "../common/common.h"
#include "../common/io.h" // for PrivateMmapConstStream
#include "../common/timer.h" // for Monitor, Timer
#include "adapter.h"
#include "proxy_dmatrix.h" // for DMatrixProxy
#include "sparse_page_writer.h" // for SparsePageFormat
#include "xgboost/base.h"
#include "xgboost/data.h"
#include "adapter.h"
#include "sparse_page_writer.h"
#include "proxy_dmatrix.h"
#include "../common/common.h"
#include "../common/timer.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
inline void TryDeleteCacheFile(const std::string& file) {
if (std::remove(file.c_str()) != 0) {
// Don't throw, this is called in a destructor.
LOG(WARNING) << "Couldn't remove external memory cache file " << file
<< "; you may want to remove it manually";
<< "; you may want to remove it manually";
}
}
/**
* @brief Information about the cache including path and page offsets.
*/
struct Cache {
// whether the write to the cache is complete
bool written;
std::string name;
std::string format;
// offset into binary cache file.
std::vector<size_t> offset;
std::vector<std::uint64_t> offset;
Cache(bool w, std::string n, std::string fmt)
: written{w}, name{std::move(n)}, format{std::move(fmt)} {
@@ -51,11 +55,24 @@ struct Cache {
return name + format;
}
std::string ShardName() {
[[nodiscard]] std::string ShardName() const {
return ShardName(this->name, this->format);
}
// The write is completed.
/**
* @brief Record a page with size of n_bytes.
*/
void Push(std::size_t n_bytes) { offset.push_back(n_bytes); }
/**
* @brief Returns the view start and length for the i^th page.
*/
[[nodiscard]] auto View(std::size_t i) const {
std::uint64_t off = offset.at(i);
std::uint64_t len = offset.at(i + 1) - offset[i];
return std::pair{off, len};
}
/**
* @brief Call this once the write for the cache is complete.
*/
void Commit() {
if (!written) {
std::partial_sum(offset.begin(), offset.end(), offset.begin());
@@ -64,7 +81,7 @@ struct Cache {
}
};
// Prevents multi-threaded call.
// Prevents multi-threaded call to `GetBatches`.
class TryLockGuard {
std::mutex& lock_;
@@ -77,74 +94,128 @@ class TryLockGuard {
}
};
// Similar to `dmlc::OMPException`, but doesn't need the threads to be joined before rethrow
class ExceHandler {
  std::mutex mutex_;
  // Set once any exception has been captured; `Rethrow` checks it cheaply without locking.
  std::atomic<bool> flag_{false};
  // The first captured exception; later ones are intentionally dropped.
  std::exception_ptr curr_exce_{nullptr};

  // Record the in-flight exception if none has been stored yet. Called from worker threads.
  void Capture() noexcept {
    std::lock_guard<std::mutex> guard{mutex_};
    if (!curr_exce_) {
      curr_exce_ = std::current_exception();
    }
    flag_ = true;
  }

 public:
  /**
   * @brief Invoke `fn`, capturing any exception instead of letting it propagate.
   *
   * @return The result of `fn()`, or a value-initialized result if `fn` threw.
   */
  template <typename Fn>
  decltype(auto) Run(Fn&& fn) noexcept(true) {
    try {
      return fn();
    } catch (...) {
      // The original handlers for dmlc::Error, std::exception and `...` were identical,
      // so a single catch-all preserves behavior while removing the triplicated bodies.
      this->Capture();
    }
    return std::invoke_result_t<Fn>();
  }

  /**
   * @brief Rethrow the first captured exception (if any) on the calling thread.
   */
  void Rethrow() noexcept(false) {
    if (flag_) {
      CHECK(curr_exce_);
      std::rethrow_exception(curr_exce_);
    }
  }
};
/**
* @brief Base class for all page sources. Handles fetching, writing, and iteration.
*/
template <typename S>
class SparsePageSourceImpl : public BatchIteratorImpl<S> {
protected:
// Prevents calling this iterator from multiple places(or threads).
std::mutex single_threaded_;
// The current page.
std::shared_ptr<S> page_;
bool at_end_ {false};
float missing_;
int nthreads_;
std::int32_t nthreads_;
bst_feature_t n_features_;
uint32_t count_{0};
uint32_t n_batches_ {0};
// Index to the current page.
std::uint32_t count_{0};
// Total number of batches.
std::uint32_t n_batches_{0};
std::shared_ptr<Cache> cache_info_;
std::unique_ptr<dmlc::Stream> fo_;
using Ring = std::vector<std::future<std::shared_ptr<S>>>;
// A ring storing futures to data. Since the DMatrix iterator is forward only, so we
// can pre-fetch data in a ring.
std::unique_ptr<Ring> ring_{new Ring};
// Catching exception in pre-fetch threads to prevent segfault. Not always work though,
// OOM error can be delayed due to lazy commit. On the bright side, if mmap is used then
// OOM error should be rare.
ExceHandler exce_;
common::Monitor monitor_;
bool ReadCache() {
CHECK(!at_end_);
if (!cache_info_->written) {
return false;
}
if (fo_) {
fo_.reset(); // flush the data to disk.
if (ring_->empty()) {
ring_->resize(n_batches_);
}
    // A heuristic for the number of pre-fetched batches. We can make it part of BatchParam
    // to let user adjust number of pre-fetched batches when needed.
uint32_t constexpr kPreFetch = 4;
uint32_t constexpr kPreFetch = 3;
size_t n_prefetch_batches = std::min(kPreFetch, n_batches_);
CHECK_GT(n_prefetch_batches, 0) << "total batches:" << n_batches_;
size_t fetch_it = count_;
std::size_t fetch_it = count_;
for (size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
exce_.Rethrow();
for (std::size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
fetch_it %= n_batches_; // ring
if (ring_->at(fetch_it).valid()) {
continue;
}
auto const *self = this; // make sure it's const
auto const* self = this; // make sure it's const
CHECK_LT(fetch_it, cache_info_->offset.size());
ring_->at(fetch_it) = std::async(std::launch::async, [fetch_it, self]() {
common::Timer timer;
timer.Start();
std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
auto n = self->cache_info_->ShardName();
size_t offset = self->cache_info_->offset.at(fetch_it);
std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(n.c_str())};
fi->Seek(offset);
CHECK_EQ(fi->Tell(), offset);
ring_->at(fetch_it) = std::async(std::launch::async, [fetch_it, self, this]() {
auto page = std::make_shared<S>();
CHECK(fmt->Read(page.get(), fi.get()));
LOG(INFO) << "Read a page in " << timer.ElapsedSeconds() << " seconds.";
this->exce_.Run([&] {
std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
auto name = self->cache_info_->ShardName();
auto [offset, length] = self->cache_info_->View(fetch_it);
auto fi = std::make_unique<common::PrivateMmapConstStream>(name, offset, length);
CHECK(fmt->Read(page.get(), fi.get()));
});
return page;
});
}
CHECK_EQ(std::count_if(ring_->cbegin(), ring_->cend(), [](auto const& f) { return f.valid(); }),
n_prefetch_batches)
<< "Sparse DMatrix assumes forward iteration.";
monitor_.Start("Wait");
page_ = (*ring_)[count_].get();
CHECK(!(*ring_)[count_].valid());
monitor_.Stop("Wait");
exce_.Rethrow();
return true;
}
@@ -153,29 +224,41 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
common::Timer timer;
timer.Start();
std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
if (!fo_) {
auto n = cache_info_->ShardName();
fo_.reset(dmlc::Stream::Create(n.c_str(), "w"));
}
auto bytes = fmt->Write(*page_, fo_.get());
timer.Stop();
auto name = cache_info_->ShardName();
std::unique_ptr<common::AlignedFileWriteStream> fo;
if (this->Iter() == 0) {
fo = std::make_unique<common::AlignedFileWriteStream>(StringView{name}, "wb");
} else {
fo = std::make_unique<common::AlignedFileWriteStream>(StringView{name}, "ab");
}
auto bytes = fmt->Write(*page_, fo.get());
timer.Stop();
// Not entirely accurate, the kernels doesn't have to flush the data.
LOG(INFO) << static_cast<double>(bytes) / 1024.0 / 1024.0 << " MB written in "
<< timer.ElapsedSeconds() << " seconds.";
cache_info_->offset.push_back(bytes);
cache_info_->Push(bytes);
}
virtual void Fetch() = 0;
public:
SparsePageSourceImpl(float missing, int nthreads, bst_feature_t n_features,
uint32_t n_batches, std::shared_ptr<Cache> cache)
: missing_{missing}, nthreads_{nthreads}, n_features_{n_features},
n_batches_{n_batches}, cache_info_{std::move(cache)} {}
SparsePageSourceImpl(float missing, int nthreads, bst_feature_t n_features, uint32_t n_batches,
std::shared_ptr<Cache> cache)
: missing_{missing},
nthreads_{nthreads},
n_features_{n_features},
n_batches_{n_batches},
cache_info_{std::move(cache)} {
monitor_.Init(typeid(S).name()); // not pretty, but works for basic profiling
}
SparsePageSourceImpl(SparsePageSourceImpl const &that) = delete;
~SparsePageSourceImpl() override {
// Don't orphan the threads.
for (auto& fu : *ring_) {
if (fu.valid()) {
fu.get();
@@ -183,18 +266,18 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
}
}
uint32_t Iter() const { return count_; }
[[nodiscard]] uint32_t Iter() const { return count_; }
const S &operator*() const override {
CHECK(page_);
return *page_;
}
std::shared_ptr<S const> Page() const override {
[[nodiscard]] std::shared_ptr<S const> Page() const override {
return page_;
}
bool AtEnd() const override {
[[nodiscard]] bool AtEnd() const override {
return at_end_;
}
@@ -202,20 +285,23 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
TryLockGuard guard{single_threaded_};
at_end_ = false;
count_ = 0;
// Pre-fetch for the next round of iterations.
this->Fetch();
}
};
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
// Push data from CUDA.
void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page);
#else
inline void DevicePush(DMatrixProxy*, float, SparsePage*) { common::AssertGPUSupport(); }
#endif
class SparsePageSource : public SparsePageSourceImpl<SparsePage> {
// This is the source from the user.
DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext> iter_;
DMatrixProxy* proxy_;
size_t base_row_id_ {0};
std::size_t base_row_id_{0};
void Fetch() final {
page_ = std::make_shared<SparsePage>();
@@ -244,7 +330,7 @@ class SparsePageSource : public SparsePageSourceImpl<SparsePage> {
iter_{iter}, proxy_{proxy} {
if (!cache_info_->written) {
iter_.Reset();
CHECK_EQ(iter_.Next(), 1) << "Must have at least 1 batch.";
CHECK(iter_.Next()) << "Must have at least 1 batch.";
}
this->Fetch();
}
@@ -259,6 +345,7 @@ class SparsePageSource : public SparsePageSourceImpl<SparsePage> {
}
if (at_end_) {
CHECK_EQ(cache_info_->offset.size(), n_batches_ + 1);
cache_info_->Commit();
if (n_batches_ != 0) {
CHECK_EQ(count_, n_batches_);
@@ -371,6 +458,5 @@ class SortedCSCPageSource : public PageSourceIncMixIn<SortedCSCPage> {
this->Fetch();
}
};
} // namespace data
} // namespace xgboost
} // namespace xgboost::data
#endif // XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_

View File

@@ -1,52 +1,44 @@
/*!
* Copyright (c) 2014-2019 by Contributors
/**
* Copyright 2014-2023, XGBoost Contributors
* \file sparse_page_writer.h
* \author Tianqi Chen
*/
#ifndef XGBOOST_DATA_SPARSE_PAGE_WRITER_H_
#define XGBOOST_DATA_SPARSE_PAGE_WRITER_H_
#include <xgboost/data.h>
#include <dmlc/io.h>
#include <vector>
#include <algorithm>
#include <cstring>
#include <string>
#include <utility>
#include <memory>
#include <functional>
#include <functional> // for function
#include <string> // for string
#if DMLC_ENABLE_STD_THREAD
#include <dmlc/concurrency.h>
#include <thread>
#endif // DMLC_ENABLE_STD_THREAD
namespace xgboost {
namespace data {
#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
#include "dmlc/io.h" // for Stream
#include "dmlc/registry.h" // for Registry, FunctionRegEntryBase
#include "xgboost/data.h" // for SparsePage,CSCPage,SortedCSCPage,EllpackPage ...
namespace xgboost::data {
template<typename T>
struct SparsePageFormatReg;
/*!
* \brief Format specification of SparsePage.
/**
* @brief Format specification of various data formats like SparsePage.
*/
template<typename T>
template <typename T>
class SparsePageFormat {
public:
/*! \brief virtual destructor */
virtual ~SparsePageFormat() = default;
/*!
* \brief Load all the segments into page, advance fi to end of the block.
* \param page The data to read page into.
* \param fi the input stream of the file
* \return true of the loading as successful, false if end of file was reached
/**
* @brief Load all the segments into page, advance fi to end of the block.
*
* @param page The data to read page into.
* @param fi the input stream of the file
   * @return true if the loading was successful, false if end of file was reached
*/
virtual bool Read(T* page, dmlc::SeekStream* fi) = 0;
/*!
* \brief save the data to fo, when a page was written.
* \param fo output stream
virtual bool Read(T* page, common::AlignedResourceReadStream* fi) = 0;
/**
* @brief save the data to fo, when a page was written.
*
* @param fo output stream
*/
virtual size_t Write(const T& page, dmlc::Stream* fo) = 0;
virtual size_t Write(const T& page, common::AlignedFileWriteStream* fo) = 0;
};
/*!
@@ -105,6 +97,5 @@ struct SparsePageFormatReg
DMLC_REGISTRY_REGISTER(SparsePageFormatReg<GHistIndexMatrix>, \
GHistIndexPageFmt, Name)
} // namespace data
} // namespace xgboost
} // namespace xgboost::data
#endif // XGBOOST_DATA_SPARSE_PAGE_WRITER_H_

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2014-2022 by XGBoost Contributors
/**
* Copyright 2014-2023, XGBoost Contributors
* \file gblinear.cc
* \brief Implementation of Linear booster, with L1/L2 regularization: Elastic Net
* the update rule is parallel coordinate descent (shotgun)
@@ -26,10 +26,9 @@
#include "../common/timer.h"
#include "../common/common.h"
#include "../common/threading_utils.h"
#include "../common/error_msg.h"
namespace xgboost {
namespace gbm {
namespace xgboost::gbm {
DMLC_REGISTRY_FILE_TAG(gblinear);
// training parameters
@@ -83,7 +82,16 @@ class GBLinear : public GradientBooster {
}
param_.UpdateAllowUnknown(cfg);
param_.CheckGPUSupport();
updater_.reset(LinearUpdater::Create(param_.updater, ctx_));
if (param_.updater == "gpu_coord_descent") {
LOG(WARNING) << error::DeprecatedFunc("gpu_coord_descent", "2.0.0",
R"(device="cuda", updater="coord_descent")");
}
if (param_.updater == "coord_descent" && ctx_->IsCUDA()) {
updater_.reset(LinearUpdater::Create("gpu_coord_descent", ctx_));
} else {
updater_.reset(LinearUpdater::Create(param_.updater, ctx_));
}
updater_->Configure(cfg);
monitor_.Init("GBLinear");
}
@@ -133,7 +141,7 @@ class GBLinear : public GradientBooster {
this->updater_->SaveConfig(&j_updater);
}
void DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair, PredictionCacheEntry*,
void DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair, PredictionCacheEntry*,
ObjFunction const*) override {
monitor_.Start("DoBoost");
@@ -172,11 +180,10 @@ class GBLinear : public GradientBooster {
}
void PredictContribution(DMatrix* p_fmat, HostDeviceVector<bst_float>* out_contribs,
uint32_t layer_begin, uint32_t /*layer_end*/, bool, int,
unsigned) override {
bst_layer_t layer_begin, bst_layer_t /*layer_end*/, bool) override {
model_.LazyInitModel();
LinearCheckLayer(layer_begin);
auto base_margin = p_fmat->Info().base_margin_.View(Context::kCpuId);
auto base_margin = p_fmat->Info().base_margin_.View(DeviceOrd::CPU());
const int ngroup = model_.learner_model_param->num_output_group;
const size_t ncolumns = model_.learner_model_param->num_feature + 1;
// allocate space for (#features + bias) times #groups times #rows
@@ -210,8 +217,8 @@ class GBLinear : public GradientBooster {
}
}
void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<bst_float>* out_contribs,
unsigned layer_begin, unsigned /*layer_end*/,
void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
bst_layer_t layer_begin, bst_layer_t /*layer_end*/,
bool) override {
LinearCheckLayer(layer_begin);
std::vector<bst_float>& contribs = out_contribs->HostVector();
@@ -224,9 +231,8 @@ class GBLinear : public GradientBooster {
std::fill(contribs.begin(), contribs.end(), 0);
}
std::vector<std::string> DumpModel(const FeatureMap& fmap,
bool with_stats,
std::string format) const override {
[[nodiscard]] std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats,
std::string format) const override {
return model_.DumpModel(fmap, with_stats, format);
}
@@ -244,10 +250,9 @@ class GBLinear : public GradientBooster {
// The bias is the last weight
out_scores->resize(model_.weight.size() - learner_model_param_->num_output_group, 0);
auto n_groups = learner_model_param_->num_output_group;
linalg::TensorView<float, 2> scores{
*out_scores,
{learner_model_param_->num_feature, n_groups},
Context::kCpuId};
auto scores = linalg::MakeTensorView(DeviceOrd::CPU(),
common::Span{out_scores->data(), out_scores->size()},
learner_model_param_->num_feature, n_groups);
for (size_t i = 0; i < learner_model_param_->num_feature; ++i) {
for (bst_group_t g = 0; g < n_groups; ++g) {
scores(i, g) = model_[i][g];
@@ -255,7 +260,7 @@ class GBLinear : public GradientBooster {
}
}
bool UseGPU() const override {
[[nodiscard]] bool UseGPU() const override {
if (param_.updater == "gpu_coord_descent") {
return true;
} else {
@@ -269,12 +274,12 @@ class GBLinear : public GradientBooster {
monitor_.Start("PredictBatchInternal");
model_.LazyInitModel();
std::vector<bst_float> &preds = *out_preds;
auto base_margin = p_fmat->Info().base_margin_.View(Context::kCpuId);
auto base_margin = p_fmat->Info().base_margin_.View(DeviceOrd::CPU());
// start collecting the prediction
const int ngroup = model_.learner_model_param->num_output_group;
preds.resize(p_fmat->Info().num_row_ * ngroup);
auto base_score = learner_model_param_->BaseScore(Context::kCpuId);
auto base_score = learner_model_param_->BaseScore(DeviceOrd::CPU());
for (const auto &page : p_fmat->GetBatches<SparsePage>()) {
auto const& batch = page.GetView();
// output convention: nrow * k, where nrow is number of rows
@@ -355,5 +360,4 @@ XGBOOST_REGISTER_GBM(GBLinear, "gblinear")
.set_body([](LearnerModelParam const* booster_config, Context const* ctx) {
return new GBLinear(booster_config, ctx);
});
} // namespace gbm
} // namespace xgboost
} // namespace xgboost::gbm

View File

@@ -9,7 +9,7 @@
#include <dmlc/omp.h>
#include <dmlc/parameter.h>
#include <algorithm>
#include <algorithm> // for equal
#include <cinttypes> // for uint32_t
#include <limits>
#include <memory>
@@ -18,9 +18,11 @@
#include <vector>
#include "../common/common.h"
#include "../common/error_msg.h" // for UnknownDevice, WarnOldSerialization, InplacePredictProxy
#include "../common/random.h"
#include "../common/threading_utils.h"
#include "../common/timer.h"
#include "../data/proxy_dmatrix.h" // for DMatrixProxy, HostAdapterDispatch
#include "gbtree_model.h"
#include "xgboost/base.h"
#include "xgboost/data.h"
@@ -38,9 +40,54 @@
namespace xgboost::gbm {
DMLC_REGISTRY_FILE_TAG(gbtree);
namespace {
/**
 * @brief Map the `tree_method` parameter to the `updater` parameter.
 *
 * @param ctx         Runtime context; decides between CPU and CUDA updaters.
 * @param tree_method The user-configured tree construction algorithm.
 * @return Comma-separated updater sequence for the chosen method and device.
 */
std::string MapTreeMethodToUpdaters(Context const* ctx, TreeMethod tree_method) {
  // Fail early when a CUDA device is requested but this is a CPU-only build.
  if (ctx->IsCUDA()) {
    common::AssertGPUSupport();
  }

  // `auto` maps to hist (the default since 2.0).
  if (tree_method == TreeMethod::kAuto || tree_method == TreeMethod::kHist) {
    return ctx->DispatchDevice([] { return "grow_quantile_histmaker"; },
                               [] { return "grow_gpu_hist"; });
  }
  if (tree_method == TreeMethod::kApprox) {
    return ctx->DispatchDevice([] { return "grow_histmaker"; }, [] { return "grow_gpu_approx"; });
  }
  if (tree_method == TreeMethod::kExact) {
    CHECK(ctx->IsCPU()) << "The `exact` tree method is not supported on GPU.";
    return "grow_colmaker,prune";
  }
  if (tree_method == TreeMethod::kGPUHist) {
    // Deprecated alias for hist on a CUDA device.
    common::AssertGPUSupport();
    error::WarnDeprecatedGPUHist();
    return "grow_gpu_hist";
  }

  auto method_id = static_cast<std::underlying_type_t<TreeMethod>>(tree_method);
  LOG(FATAL) << "Unknown tree_method: `" << method_id << "`.";
  LOG(FATAL) << "unreachable";
  return "";
}
/**
 * @brief Check whether the configured updater names match the instantiated updaters, so
 *        re-configuration can skip re-creating them.
 *
 * @param updater_seq Updater names parsed from the `updater` parameter, in order.
 * @param updaters    The updater instances currently held.
 * @return true iff both sequences have the same length and pairwise-equal names.
 */
bool UpdatersMatched(std::vector<std::string> const& updater_seq,
                     std::vector<std::unique_ptr<TreeUpdater>> const& updaters) {
  // Taking the name list by const reference avoids copying every string on each call
  // (the caller's rvalue from common::Split binds to const& just the same).
  if (updater_seq.size() != updaters.size()) {
    return false;
  }
  return std::equal(updater_seq.cbegin(), updater_seq.cend(), updaters.cbegin(),
                    [](std::string const& name, std::unique_ptr<TreeUpdater> const& up) {
                      return name == up->Name();
                    });
}
} // namespace
void GBTree::Configure(Args const& cfg) {
this->cfg_ = cfg;
std::string updater_seq = tparam_.updater_seq;
tparam_.UpdateAllowUnknown(cfg);
tree_param_.UpdateAllowUnknown(cfg);
@@ -53,15 +100,13 @@ void GBTree::Configure(Args const& cfg) {
// configure predictors
if (!cpu_predictor_) {
cpu_predictor_ = std::unique_ptr<Predictor>(
Predictor::Create("cpu_predictor", this->ctx_));
cpu_predictor_ = std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", this->ctx_));
}
cpu_predictor_->Configure(cfg);
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
auto n_gpus = common::AllVisibleGPUs();
if (!gpu_predictor_ && n_gpus != 0) {
gpu_predictor_ = std::unique_ptr<Predictor>(
Predictor::Create("gpu_predictor", this->ctx_));
if (!gpu_predictor_) {
gpu_predictor_ = std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", this->ctx_));
}
if (n_gpus != 0) {
gpu_predictor_->Configure(cfg);
@@ -70,139 +115,41 @@ void GBTree::Configure(Args const& cfg) {
#if defined(XGBOOST_USE_ONEAPI)
if (!oneapi_predictor_) {
oneapi_predictor_ = std::unique_ptr<Predictor>(
Predictor::Create("oneapi_predictor", this->ctx_));
oneapi_predictor_ =
std::unique_ptr<Predictor>(Predictor::Create("oneapi_predictor", this->ctx_));
}
oneapi_predictor_->Configure(cfg);
#endif // defined(XGBOOST_USE_ONEAPI)
monitor_.Init("GBTree");
specified_updater_ = std::any_of(cfg.cbegin(), cfg.cend(),
[](std::pair<std::string, std::string> const& arg) {
return arg.first == "updater";
});
if (specified_updater_ && !showed_updater_warning_) {
LOG(WARNING) << "DANGER AHEAD: You have manually specified `updater` "
"parameter. The `tree_method` parameter will be ignored. "
"Incorrect sequence of updaters will produce undefined "
"behavior. For common uses, we recommend using "
"`tree_method` parameter instead.";
    // Don't drive users to silent XGBoost.
showed_updater_warning_ = true;
}
this->ConfigureUpdaters();
if (updater_seq != tparam_.updater_seq) {
updaters_.clear();
this->InitUpdater(cfg);
} else {
for (auto &up : updaters_) {
up->Configure(cfg);
}
}
configured_ = true;
}
// FIXME(trivialfis): This handles updaters. Because the choice of updaters depends on
// whether external memory is used and how large is dataset. We can remove the dependency
// on DMatrix once `hist` tree method can handle external memory so that we can make it
// default.
void GBTree::ConfigureWithKnownData(Args const& cfg, DMatrix* fmat) {
CHECK(this->configured_);
std::string updater_seq = tparam_.updater_seq;
CHECK(tparam_.GetInitialised());
tparam_.UpdateAllowUnknown(cfg);
this->PerformTreeMethodHeuristic(fmat);
this->ConfigureUpdaters();
// initialize the updaters only when needed.
if (updater_seq != tparam_.updater_seq) {
LOG(DEBUG) << "Using updaters: " << tparam_.updater_seq;
this->updaters_.clear();
this->InitUpdater(cfg);
}
}
void GBTree::PerformTreeMethodHeuristic(DMatrix* fmat) {
// `updater` parameter was manually specified
specified_updater_ =
std::any_of(cfg.cbegin(), cfg.cend(), [](auto const& arg) { return arg.first == "updater"; });
if (specified_updater_) {
// This method is disabled when `updater` parameter is explicitly
// set, since only experts are expected to do so.
return;
}
if (model_.learner_model_param->IsVectorLeaf()) {
CHECK(tparam_.tree_method == TreeMethod::kHist)
<< "Only the hist tree method is supported for building multi-target trees with vector "
"leaf.";
}
// tparam_ is set before calling this function.
if (tparam_.tree_method != TreeMethod::kAuto) {
return;
}
if (collective::IsDistributed()) {
LOG(INFO) << "Tree method is automatically selected to be 'approx' "
"for distributed training.";
tparam_.tree_method = TreeMethod::kApprox;
} else if (!fmat->SingleColBlock()) {
LOG(INFO) << "Tree method is automatically set to 'approx' "
"since external-memory data matrix is used.";
tparam_.tree_method = TreeMethod::kApprox;
} else if (fmat->Info().num_row_ >= (4UL << 20UL)) {
/* Choose tree_method='approx' automatically for large data matrix */
LOG(INFO) << "Tree method is automatically selected to be "
"'approx' for faster speed. To use old behavior "
"(exact greedy algorithm on single machine), "
"set tree_method to 'exact'.";
tparam_.tree_method = TreeMethod::kApprox;
} else {
tparam_.tree_method = TreeMethod::kExact;
error::WarnManualUpdater();
}
LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
}
void GBTree::ConfigureUpdaters() {
if (specified_updater_) {
return;
if (!specified_updater_) {
this->tparam_.updater_seq = MapTreeMethodToUpdaters(ctx_, tparam_.tree_method);
}
// `updater` parameter was manually specified
/* Choose updaters according to tree_method parameters */
switch (tparam_.tree_method) {
case TreeMethod::kAuto:
// Use heuristic to choose between 'exact' and 'approx' This
// choice is carried out in PerformTreeMethodHeuristic() before
// calling this function.
break;
case TreeMethod::kApprox:
tparam_.updater_seq = "grow_histmaker";
break;
case TreeMethod::kExact:
tparam_.updater_seq = "grow_colmaker,prune";
break;
case TreeMethod::kHist: {
LOG(INFO) << "Tree method is selected to be 'hist', which uses a single updater "
"grow_quantile_histmaker.";
tparam_.updater_seq = "grow_quantile_histmaker";
break;
auto up_names = common::Split(tparam_.updater_seq, ',');
if (!UpdatersMatched(up_names, updaters_)) {
updaters_.clear();
for (auto const& name : up_names) {
std::unique_ptr<TreeUpdater> up(
TreeUpdater::Create(name.c_str(), ctx_, &model_.learner_model_param->task));
updaters_.push_back(std::move(up));
}
case TreeMethod::kGPUHist: {
common::AssertGPUSupport();
tparam_.updater_seq = "grow_gpu_hist";
break;
}
default:
LOG(FATAL) << "Unknown tree_method ("
<< static_cast<int>(tparam_.tree_method) << ") detected";
}
for (auto& up : updaters_) {
up->Configure(cfg);
}
}
void GPUCopyGradient(HostDeviceVector<GradientPair> const*, bst_group_t, bst_group_t,
HostDeviceVector<GradientPair>*)
void GPUCopyGradient(Context const*, linalg::Matrix<GradientPair> const*, bst_group_t,
linalg::Matrix<GradientPair>*)
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
; // NOLINT
#else
@@ -211,16 +158,19 @@ void GPUCopyGradient(HostDeviceVector<GradientPair> const*, bst_group_t, bst_gro
}
#endif
void CopyGradient(HostDeviceVector<GradientPair> const* in_gpair, int32_t n_threads,
bst_group_t n_groups, bst_group_t group_id,
HostDeviceVector<GradientPair>* out_gpair) {
if (in_gpair->DeviceIdx() != Context::kCpuId) {
GPUCopyGradient(in_gpair, n_groups, group_id, out_gpair);
void CopyGradient(Context const* ctx, linalg::Matrix<GradientPair> const* in_gpair,
bst_group_t group_id, linalg::Matrix<GradientPair>* out_gpair) {
out_gpair->SetDevice(ctx->Device());
out_gpair->Reshape(in_gpair->Shape(0), 1);
if (ctx->IsCUDA()) {
GPUCopyGradient(ctx, in_gpair, group_id, out_gpair);
} else {
std::vector<GradientPair> &tmp_h = out_gpair->HostVector();
const auto& gpair_h = in_gpair->ConstHostVector();
common::ParallelFor(out_gpair->Size(), n_threads,
[&](auto i) { tmp_h[i] = gpair_h[i * n_groups + group_id]; });
auto const& in = *in_gpair;
auto target_gpair = in.Slice(linalg::All(), group_id);
auto h_tmp = out_gpair->HostView();
auto h_in = in.HostView().Slice(linalg::All(), group_id);
CHECK_EQ(h_tmp.Size(), h_in.Size());
common::ParallelFor(h_in.Size(), ctx->Threads(), [&](auto i) { h_tmp(i) = h_in(i); });
}
}
@@ -249,21 +199,22 @@ void GBTree::UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector<float> const
}
}
void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair,
PredictionCacheEntry* predt, ObjFunction const* obj) {
if (model_.learner_model_param->IsVectorLeaf()) {
CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto)
<< "Only the hist tree method is supported for building multi-target trees with vector "
"leaf.";
CHECK(ctx_->IsCPU()) << "GPU is not yet supported for vector leaf.";
}
TreesOneIter new_trees;
bst_target_t const n_groups = model_.learner_model_param->OutputLength();
ConfigureWithKnownData(this->cfg_, p_fmat);
monitor_.Start("BoostNewTrees");
// Weird case that tree method is cpu-based but gpu_id is set. Ideally we should let
// `gpu_id` be the single source of determining what algorithms to run, but that will
// break a lots of existing code.
auto device = tparam_.tree_method != TreeMethod::kGPUHist ? Context::kCpuId : ctx_->gpu_id;
auto out = linalg::MakeTensorView(
device,
device == Context::kCpuId ? predt->predictions.HostSpan() : predt->predictions.DeviceSpan(),
p_fmat->Info().num_row_, model_.learner_model_param->OutputLength());
predt->predictions.SetDevice(ctx_->Ordinal());
auto out = linalg::MakeTensorView(ctx_, &predt->predictions, p_fmat->Info().num_row_,
model_.learner_model_param->OutputLength());
CHECK_NE(n_groups, 0);
if (!p_fmat->SingleColBlock() && obj->Task().UpdateTreeLeaf()) {
@@ -296,12 +247,12 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
}
} else {
CHECK_EQ(in_gpair->Size() % n_groups, 0U) << "must have exactly ngroup * nrow gpairs";
HostDeviceVector<GradientPair> tmp(in_gpair->Size() / n_groups, GradientPair(),
in_gpair->DeviceIdx());
linalg::Matrix<GradientPair> tmp{{in_gpair->Shape(0), static_cast<std::size_t>(1ul)},
ctx_->Ordinal()};
bool update_predict = true;
for (bst_target_t gid = 0; gid < n_groups; ++gid) {
node_position.clear();
CopyGradient(in_gpair, ctx_->Threads(), n_groups, gid, &tmp);
CopyGradient(ctx_, in_gpair, gid, &tmp);
TreesOneGroup ret;
BoostNewTrees(&tmp, p_fmat, gid, &node_position, &ret);
UpdateTreeLeaf(p_fmat, predt->predictions, obj, gid, node_position, &ret);
@@ -322,48 +273,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
this->CommitModel(std::move(new_trees));
}
void GBTree::InitUpdater(Args const& cfg) {
std::string tval = tparam_.updater_seq;
std::vector<std::string> ups = common::Split(tval, ',');
if (updaters_.size() != 0) {
// Assert we have a valid set of updaters.
CHECK_EQ(ups.size(), updaters_.size());
for (auto const& up : updaters_) {
bool contains = std::any_of(ups.cbegin(), ups.cend(),
[&up](std::string const& name) {
return name == up->Name();
});
if (!contains) {
std::stringstream ss;
ss << "Internal Error: " << " mismatched updater sequence.\n";
ss << "Specified updaters: ";
std::for_each(ups.cbegin(), ups.cend(),
[&ss](std::string const& name){
ss << name << " ";
});
ss << "\n" << "Actual updaters: ";
std::for_each(updaters_.cbegin(), updaters_.cend(),
[&ss](std::unique_ptr<TreeUpdater> const& updater){
ss << updater->Name() << " ";
});
LOG(FATAL) << ss.str();
}
}
// Do not push new updater in.
return;
}
// create new updaters
for (const std::string& pstr : ups) {
std::unique_ptr<TreeUpdater> up(
TreeUpdater::Create(pstr.c_str(), ctx_, &model_.learner_model_param->task));
up->Configure(cfg);
updaters_.push_back(std::move(up));
}
}
void GBTree::BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat, int bst_group,
void GBTree::BoostNewTrees(linalg::Matrix<GradientPair>* gpair, DMatrix* p_fmat, int bst_group,
std::vector<HostDeviceVector<bst_node_t>>* out_position,
TreesOneGroup* ret) {
std::vector<RegTree*> new_trees;
@@ -371,6 +281,7 @@ void GBTree::BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fma
// create the trees
for (int i = 0; i < model_.param.num_parallel_tree; ++i) {
if (tparam_.process_type == TreeProcessType::kDefault) {
CHECK(!updaters_.empty());
CHECK(!updaters_.front()->CanModifyTree())
<< "Updater: `" << updaters_.front()->Name() << "` "
<< "can not be used to create new trees. "
@@ -436,12 +347,7 @@ void GBTree::LoadConfig(Json const& in) {
// This would cause all trees to be pushed to trees_to_update
// e.g. updating a model, then saving and loading it would result in an empty model
tparam_.process_type = TreeProcessType::kDefault;
int32_t const n_gpus = xgboost::common::AllVisibleGPUs();
if (n_gpus == 0 && tparam_.predictor == PredictorType::kGPUPredictor) {
LOG(WARNING) << "Loading from a raw memory buffer on CPU only machine. "
"Changing predictor to auto.";
tparam_.UpdateAllowUnknown(Args{{"predictor", "auto"}});
}
std::int32_t const n_gpus = xgboost::common::AllVisibleGPUs();
auto msg = StringView{
R"(
@@ -457,19 +363,32 @@ void GBTree::LoadConfig(Json const& in) {
LOG(WARNING) << msg << " Changing `tree_method` to `hist`.";
}
auto const& j_updaters = get<Object const>(in["updater"]);
std::vector<Json> updater_seq;
if (IsA<Object>(in["updater"])) {
// before 2.0
error::WarnOldSerialization();
for (auto const& kv : get<Object const>(in["updater"])) {
auto name = kv.first;
auto config = kv.second;
config["name"] = name;
updater_seq.push_back(config);
}
} else {
// after 2.0
auto const& j_updaters = get<Array const>(in["updater"]);
updater_seq = j_updaters;
}
updaters_.clear();
for (auto const& kv : j_updaters) {
auto name = kv.first;
for (auto const& config : updater_seq) {
auto name = get<String>(config["name"]);
if (n_gpus == 0 && name == "grow_gpu_hist") {
name = "grow_quantile_histmaker";
LOG(WARNING) << "Changing updater from `grow_gpu_hist` to `grow_quantile_histmaker`.";
}
std::unique_ptr<TreeUpdater> up{
TreeUpdater::Create(name, ctx_, &model_.learner_model_param->task)};
up->LoadConfig(kv.second);
updaters_.push_back(std::move(up));
updaters_.emplace_back(TreeUpdater::Create(name, ctx_, &model_.learner_model_param->task));
updaters_.back()->LoadConfig(config);
}
specified_updater_ = get<Boolean>(in["specified_updater"]);
@@ -491,13 +410,14 @@ void GBTree::SaveConfig(Json* p_out) const {
// language binding doesn't need to know about the forest size.
out["gbtree_model_param"] = ToJson(model_.param);
out["updater"] = Object();
out["updater"] = Array{};
auto& j_updaters = get<Array>(out["updater"]);
auto& j_updaters = out["updater"];
for (auto const& up : updaters_) {
j_updaters[up->Name()] = Object();
auto& j_up = j_updaters[up->Name()];
up->SaveConfig(&j_up);
for (auto const& up : this->updaters_) {
Json up_config{Object{}};
up_config["name"] = String{up->Name()};
up->SaveConfig(&up_config);
j_updaters.emplace_back(up_config);
}
out["specified_updater"] = Boolean{specified_updater_};
}
@@ -517,7 +437,6 @@ void GBTree::SaveModel(Json* p_out) const {
void GBTree::Slice(bst_layer_t begin, bst_layer_t end, bst_layer_t step, GradientBooster* out,
bool* out_of_bound) const {
CHECK(configured_);
CHECK(out);
auto p_gbtree = dynamic_cast<GBTree*>(out);
@@ -567,9 +486,8 @@ void GBTree::Slice(bst_layer_t begin, bst_layer_t end, bst_layer_t step, Gradien
out_model.param.num_parallel_tree = model_.param.num_parallel_tree;
}
void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool,
bst_layer_t layer_begin, bst_layer_t layer_end) {
CHECK(configured_);
void GBTree::PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training,
bst_layer_t layer_begin, bst_layer_t layer_end) const {
if (layer_end == 0) {
layer_end = this->BoostedRounds();
}
@@ -588,7 +506,7 @@ void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool
CHECK_EQ(out_preds->version, 0);
}
auto const& predictor = GetPredictor(&out_preds->predictions, p_fmat);
auto const& predictor = GetPredictor(is_training, &out_preds->predictions, p_fmat);
if (out_preds->version == 0) {
// out_preds->Size() can be non-zero as it's initialized here before any
// tree is built at the 0^th iterator.
@@ -608,52 +526,68 @@ void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool
}
}
std::unique_ptr<Predictor> const &
GBTree::GetPredictor(HostDeviceVector<float> const *out_pred,
DMatrix *f_dmat) const {
CHECK(configured_);
if (tparam_.predictor != PredictorType::kAuto) {
if (tparam_.predictor == PredictorType::kGPUPredictor) {
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost.";
void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training,
bst_layer_t layer_begin, bst_layer_t layer_end) {
// dispatch to const function.
this->PredictBatchImpl(p_fmat, out_preds, is_training, layer_begin, layer_end);
}
void GBTree::InplacePredict(std::shared_ptr<DMatrix> p_m, float missing,
PredictionCacheEntry* out_preds, bst_layer_t layer_begin,
bst_layer_t layer_end) const {
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees.";
if (p_m->Ctx()->Device() != this->ctx_->Device()) {
error::MismatchedDevices(this->ctx_, p_m->Ctx());
CHECK_EQ(out_preds->version, 0);
auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m);
CHECK(proxy) << error::InplacePredictProxy();
auto p_fmat = data::CreateDMatrixFromProxy(ctx_, proxy, missing);
this->PredictBatchImpl(p_fmat.get(), out_preds, false, layer_begin, layer_end);
return;
}
bool known_type = this->ctx_->DispatchDevice(
[&, begin = tree_begin, end = tree_end] {
return this->cpu_predictor_->InplacePredict(p_m, model_, missing, out_preds, begin, end);
},
[&, begin = tree_begin, end = tree_end] {
return this->gpu_predictor_->InplacePredict(p_m, model_, missing, out_preds, begin, end);
});
if (!known_type) {
auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m);
CHECK(proxy) << error::InplacePredictProxy();
LOG(FATAL) << "Unknown data type for inplace prediction:" << proxy->Adapter().type().name();
}
}
[[nodiscard]] std::unique_ptr<Predictor> const& GBTree::GetPredictor(
bool is_training, HostDeviceVector<float> const* out_pred, DMatrix* f_dmat) const {
// Data comes from SparsePageDMatrix. Since we are loading data in pages, no need to
// prevent data copy.
if (f_dmat && !f_dmat->SingleColBlock()) {
if (ctx_->IsCPU()) {
return cpu_predictor_;
} else {
common::AssertGPUSupport();
CHECK(gpu_predictor_);
return gpu_predictor_;
#else
common::AssertGPUSupport();
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
}
if (tparam_.predictor == PredictorType::kOneAPIPredictor) {
#if defined(XGBOOST_USE_ONEAPI)
CHECK(oneapi_predictor_);
return oneapi_predictor_;
#else
common::AssertOneAPISupport();
#endif // defined(XGBOOST_USE_ONEAPI)
}
CHECK(cpu_predictor_);
return cpu_predictor_;
}
// Data comes from Device DMatrix.
auto is_ellpack = f_dmat && f_dmat->PageExists<EllpackPage>() &&
!f_dmat->PageExists<SparsePage>();
auto is_ellpack =
f_dmat && f_dmat->PageExists<EllpackPage>() && !f_dmat->PageExists<SparsePage>();
// Data comes from device memory, like CuDF or CuPy.
auto is_from_device =
f_dmat && f_dmat->PageExists<SparsePage>() &&
(*(f_dmat->GetBatches<SparsePage>().begin())).data.DeviceCanRead();
auto is_from_device = f_dmat && f_dmat->PageExists<SparsePage>() &&
(*(f_dmat->GetBatches<SparsePage>().begin())).data.DeviceCanRead();
auto on_device = is_ellpack || is_from_device;
// Use GPU Predictor if data is already on device and gpu_id is set.
if (on_device && ctx_->gpu_id >= 0) {
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost.";
if (on_device && ctx_->IsCUDA()) {
common::AssertGPUSupport();
CHECK(gpu_predictor_);
return gpu_predictor_;
#else
LOG(FATAL) << "Data is on CUDA device, but XGBoost is not compiled with "
"CUDA/HIP support.";
return cpu_predictor_;
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
}
// GPU_Hist by default has prediction cache calculated from quantile values,
@@ -665,23 +599,19 @@ GBTree::GetPredictor(HostDeviceVector<float> const *out_pred,
if ((out_pred && out_pred->Size() == 0) && (model_.param.num_trees != 0) &&
// FIXME(trivialfis): Implement a better method for testing whether data
// is on device after DMatrix refactoring is done.
!on_device) {
!on_device && is_training) {
CHECK(cpu_predictor_);
return cpu_predictor_;
}
if (tparam_.tree_method == TreeMethod::kGPUHist) {
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost.";
if (ctx_->IsCPU()) {
return cpu_predictor_;
} else {
common::AssertGPUSupport();
CHECK(gpu_predictor_);
return gpu_predictor_;
#else
common::AssertGPUSupport();
return cpu_predictor_;
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
}
CHECK(cpu_predictor_);
return cpu_predictor_;
}
@@ -796,7 +726,7 @@ class Dart : public GBTree {
bool training, unsigned layer_begin,
unsigned layer_end) const {
CHECK(!this->model_.learner_model_param->IsVectorLeaf()) << "dart" << MTNotImplemented();
auto &predictor = this->GetPredictor(&p_out_preds->predictions, p_fmat);
auto& predictor = this->GetPredictor(training, &p_out_preds->predictions, p_fmat);
CHECK(predictor);
predictor->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
model_);
@@ -805,7 +735,7 @@ class Dart : public GBTree {
auto n_groups = model_.learner_model_param->num_output_group;
PredictionCacheEntry predts; // temporary storage for prediction
if (ctx_->gpu_id != Context::kCpuId) {
if (ctx_->IsCUDA()) {
predts.predictions.SetDevice(ctx_->gpu_id);
}
predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0);
@@ -860,15 +790,16 @@ class Dart : public GBTree {
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
auto n_groups = model_.learner_model_param->num_output_group;
std::vector<Predictor const*> predictors {
cpu_predictor_.get(),
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
gpu_predictor_.get()
#endif // defined(XGBOOST_USE_CUDA)
};
Predictor const* predictor{nullptr};
StringView msg{"Unsupported data type for inplace predict."};
if (ctx_->Device() != p_fmat->Ctx()->Device()) {
error::MismatchedDevices(ctx_, p_fmat->Ctx());
auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_fmat);
CHECK(proxy) << error::InplacePredictProxy();
auto p_fmat = data::CreateDMatrixFromProxy(ctx_, proxy, missing);
this->PredictBatchImpl(p_fmat.get(), p_out_preds, false, layer_begin, layer_end);
return;
}
StringView msg{"Unsupported data type for inplace predict."};
PredictionCacheEntry predts;
if (ctx_->gpu_id != Context::kCpuId) {
predts.predictions.SetDevice(ctx_->gpu_id);
@@ -877,32 +808,29 @@ class Dart : public GBTree {
auto predict_impl = [&](size_t i) {
predts.predictions.Fill(0);
if (tparam_.predictor == PredictorType::kAuto) {
// Try both predictor implementations
bool success = false;
for (auto const& p : predictors) {
if (p && p->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1)) {
success = true;
predictor = p;
break;
}
}
CHECK(success) << msg;
} else {
predictor = this->GetPredictor().get();
bool success = predictor->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1);
CHECK(success) << msg << std::endl
<< "Current Predictor: "
<< (tparam_.predictor == PredictorType::kCPUPredictor ? "cpu_predictor"
: "gpu_predictor");
}
bool success = this->ctx_->DispatchDevice(
[&] {
return cpu_predictor_->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1);
},
[&] {
return gpu_predictor_->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1);
});
CHECK(success) << msg;
};
// Inplace predict is not used for training, so no need to drop tree.
for (bst_tree_t i = tree_begin; i < tree_end; ++i) {
predict_impl(i);
if (i == tree_begin) {
predictor->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions, model_);
this->ctx_->DispatchDevice(
[&] {
this->cpu_predictor_->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
model_);
},
[&] {
this->gpu_predictor_->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
model_);
});
}
// Multiple the tree weight
auto w = this->weight_drop_.at(i);
@@ -912,12 +840,12 @@ class Dart : public GBTree {
size_t n_rows = p_fmat->Info().num_row_;
if (predts.predictions.DeviceIdx() != Context::kCpuId) {
p_out_preds->predictions.SetDevice(predts.predictions.DeviceIdx());
auto base_score = model_.learner_model_param->BaseScore(predts.predictions.DeviceIdx());
auto base_score = model_.learner_model_param->BaseScore(predts.predictions.Device());
GPUDartInplacePredictInc(p_out_preds->predictions.DeviceSpan(),
predts.predictions.DeviceSpan(), w, n_rows, base_score, n_groups,
group);
} else {
auto base_score = model_.learner_model_param->BaseScore(Context::kCpuId);
auto base_score = model_.learner_model_param->BaseScore(DeviceOrd::CPU());
auto& h_predts = predts.predictions.HostVector();
auto& h_out_predts = p_out_preds->predictions.HostVector();
common::ParallelFor(n_rows, ctx_->Threads(), [&](auto ridx) {
@@ -932,26 +860,23 @@ class Dart : public GBTree {
std::vector<bst_float> *out_preds,
unsigned layer_begin, unsigned layer_end) override {
DropTrees(false);
auto &predictor = this->GetPredictor();
auto &predictor = this->GetPredictor(false);
uint32_t _, tree_end;
std::tie(_, tree_end) = detail::LayerToTree(model_, layer_begin, layer_end);
predictor->PredictInstance(inst, out_preds, model_, tree_end);
}
void PredictContribution(DMatrix* p_fmat,
HostDeviceVector<bst_float>* out_contribs,
unsigned layer_begin, unsigned layer_end, bool approximate, int,
unsigned) override {
CHECK(configured_);
void PredictContribution(DMatrix* p_fmat, HostDeviceVector<bst_float>* out_contribs,
bst_layer_t layer_begin, bst_layer_t layer_end,
bool approximate) override {
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
cpu_predictor_->PredictContribution(p_fmat, out_contribs, model_, tree_end, &weight_drop_,
approximate);
}
void PredictInteractionContributions(
DMatrix *p_fmat, HostDeviceVector<bst_float> *out_contribs,
unsigned layer_begin, unsigned layer_end, bool approximate) override {
CHECK(configured_);
void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
bst_layer_t layer_begin, bst_layer_t layer_end,
bool approximate) override {
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
cpu_predictor_->PredictInteractionContributions(p_fmat, out_contribs, model_, tree_end,
&weight_drop_, approximate);

View File

@@ -1,26 +1,24 @@
/*!
* Copyright 2021 by Contributors
/**
* Copyright 2021-2023, XGBoost Contributors
*/
#include <thrust/iterator/counting_iterator.h> // for make_counting_iterator
#include "../common/device_helpers.cuh"
#include "xgboost/context.h"
#include "xgboost/linalg.h"
#include "xgboost/span.h"
#include "../common/cuda_context.cuh"
#include "../common/device_helpers.cuh" // for MakeTransformIterator
#include "xgboost/base.h" // for GradientPair
#include "xgboost/linalg.h" // for Matrix
namespace xgboost {
namespace gbm {
void GPUCopyGradient(HostDeviceVector<GradientPair> const *in_gpair,
bst_group_t n_groups, bst_group_t group_id,
HostDeviceVector<GradientPair> *out_gpair) {
auto mat = linalg::TensorView<GradientPair const, 2>(
in_gpair->ConstDeviceSpan(),
{in_gpair->Size() / n_groups, static_cast<size_t>(n_groups)},
in_gpair->DeviceIdx());
auto v_in = mat.Slice(linalg::All(), group_id);
out_gpair->Resize(v_in.Size());
auto d_out = out_gpair->DeviceSpan();
dh::LaunchN(v_in.Size(), [=] __device__(size_t i) { d_out[i] = v_in(i); });
namespace xgboost::gbm {
void GPUCopyGradient(Context const *ctx, linalg::Matrix<GradientPair> const *in_gpair,
bst_group_t group_id, linalg::Matrix<GradientPair> *out_gpair) {
auto v_in = in_gpair->View(ctx->Device()).Slice(linalg::All(), group_id);
out_gpair->SetDevice(ctx->Device());
out_gpair->Reshape(v_in.Size(), 1);
auto d_out = out_gpair->View(ctx->Device());
auto cuctx = ctx->CUDACtx();
auto it = dh::MakeTransformIterator<GradientPair>(
thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) { return v_in(i); });
thrust::copy(cuctx->CTP(), it, it + v_in.Size(), d_out.Values().data());
}
void GPUDartPredictInc(common::Span<float> out_predts,
@@ -42,5 +40,4 @@ void GPUDartInplacePredictInc(common::Span<float> out_predts, common::Span<float
out_predts[offset] += (predts[offset] - base_score(0)) * tree_w;
});
}
} // namespace gbm
} // namespace xgboost
} // namespace xgboost::gbm

View File

@@ -43,37 +43,23 @@ enum class TreeProcessType : int {
kDefault = 0,
kUpdate = 1
};
enum class PredictorType : int {
kAuto = 0,
kCPUPredictor,
kGPUPredictor,
kOneAPIPredictor
};
} // namespace xgboost
DECLARE_FIELD_ENUM_CLASS(xgboost::TreeMethod);
DECLARE_FIELD_ENUM_CLASS(xgboost::TreeProcessType);
DECLARE_FIELD_ENUM_CLASS(xgboost::PredictorType);
namespace xgboost {
namespace gbm {
namespace xgboost::gbm {
/*! \brief training parameters */
struct GBTreeTrainParam : public XGBoostParameter<GBTreeTrainParam> {
/*! \brief tree updater sequence */
std::string updater_seq;
/*! \brief type of boosting process to run */
TreeProcessType process_type;
// predictor type
PredictorType predictor;
// tree construction method
TreeMethod tree_method;
// declare parameters
DMLC_DECLARE_PARAMETER(GBTreeTrainParam) {
DMLC_DECLARE_FIELD(updater_seq)
.set_default("grow_colmaker,prune")
.describe("Tree updater sequence.");
DMLC_DECLARE_FIELD(updater_seq).describe("Tree updater sequence.").set_default("");
DMLC_DECLARE_FIELD(process_type)
.set_default(TreeProcessType::kDefault)
.add_enum("default", TreeProcessType::kDefault)
@@ -81,13 +67,6 @@ struct GBTreeTrainParam : public XGBoostParameter<GBTreeTrainParam> {
.describe("Whether to run the normal boosting process that creates new trees,"\
" or to update the trees in an existing model.");
DMLC_DECLARE_ALIAS(updater_seq, updater);
DMLC_DECLARE_FIELD(predictor)
.set_default(PredictorType::kAuto)
.add_enum("auto", PredictorType::kAuto)
.add_enum("cpu_predictor", PredictorType::kCPUPredictor)
.add_enum("gpu_predictor", PredictorType::kGPUPredictor)
.add_enum("oneapi_predictor", PredictorType::kOneAPIPredictor)
.describe("Predictor algorithm type");
DMLC_DECLARE_FIELD(tree_method)
.set_default(TreeMethod::kAuto)
.add_enum("auto", TreeMethod::kAuto)
@@ -189,44 +168,29 @@ bool SliceTrees(bst_layer_t begin, bst_layer_t end, bst_layer_t step, GBTreeMode
class GBTree : public GradientBooster {
public:
explicit GBTree(LearnerModelParam const* booster_config, Context const* ctx)
: GradientBooster{ctx}, model_(booster_config, ctx_) {}
void Configure(const Args& cfg) override;
// Revise `tree_method` and `updater` parameters after seeing the training
// data matrix, only useful when tree_method is auto.
void PerformTreeMethodHeuristic(DMatrix* fmat);
/*! \brief Map `tree_method` parameter to `updater` parameter */
void ConfigureUpdaters();
void ConfigureWithKnownData(Args const& cfg, DMatrix* fmat);
: GradientBooster{ctx}, model_(booster_config, ctx_) {
monitor_.Init(__func__);
}
void Configure(Args const& cfg) override;
/**
* \brief Optionally update the leaf value.
* @brief Optionally update the leaf value.
*/
void UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector<float> const& predictions,
ObjFunction const* obj,
std::int32_t group_idx,
ObjFunction const* obj, std::int32_t group_idx,
std::vector<HostDeviceVector<bst_node_t>> const& node_position,
std::vector<std::unique_ptr<RegTree>>* p_trees);
/**
* @brief Carry out one iteration of boosting.
*/
void DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair, PredictionCacheEntry* predt,
ObjFunction const* obj) override;
/*! \brief Carry out one iteration of boosting */
void DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
PredictionCacheEntry* predt, ObjFunction const* obj) override;
[[nodiscard]] bool UseGPU() const override { return tparam_.tree_method == TreeMethod::kGPUHist; }
bool UseGPU() const override {
return
tparam_.predictor == PredictorType::kGPUPredictor ||
tparam_.tree_method == TreeMethod::kGPUHist;
}
GBTreeTrainParam const& GetTrainParam() const {
return tparam_;
}
void Load(dmlc::Stream* fi) override {
model_.Load(fi);
this->cfg_.clear();
}
[[nodiscard]] GBTreeTrainParam const& GetTrainParam() const { return tparam_; }
void Load(dmlc::Stream* fi) override { model_.Load(fi); }
void Save(dmlc::Stream* fo) const override {
model_.Save(fo);
}
@@ -246,39 +210,14 @@ class GBTree : public GradientBooster {
return !model_.trees.empty() || !model_.trees_to_update.empty();
}
void PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training,
bst_layer_t layer_begin, bst_layer_t layer_end) const;
void PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool training,
bst_layer_t layer_begin, bst_layer_t layer_end) override;
void InplacePredict(std::shared_ptr<DMatrix> p_m, float missing, PredictionCacheEntry* out_preds,
bst_layer_t layer_begin, bst_layer_t layer_end) const override {
CHECK(configured_);
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees.";
std::vector<Predictor const *> predictors{
cpu_predictor_.get(),
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
gpu_predictor_.get()
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
};
StringView msg{"Unsupported data type for inplace predict."};
if (tparam_.predictor == PredictorType::kAuto) {
// Try both predictor implementations
for (auto const &p : predictors) {
if (p && p->InplacePredict(p_m, model_, missing, out_preds, tree_begin, tree_end)) {
return;
}
}
LOG(FATAL) << msg;
} else {
bool success = this->GetPredictor()->InplacePredict(p_m, model_, missing, out_preds,
tree_begin, tree_end);
CHECK(success) << msg << std::endl
<< "Current Predictor: "
<< (tparam_.predictor == PredictorType::kCPUPredictor
? "cpu_predictor"
: "gpu_predictor");
}
}
bst_layer_t layer_begin, bst_layer_t layer_end) const override;
void FeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
std::vector<bst_feature_t>* features,
@@ -347,7 +286,6 @@ class GBTree : public GradientBooster {
void PredictInstance(const SparsePage::Inst& inst, std::vector<bst_float>* out_preds,
uint32_t layer_begin, uint32_t layer_end) override {
CHECK(configured_);
std::uint32_t _, tree_end;
std::tie(_, tree_end) = detail::LayerToTree(model_, layer_begin, layer_end);
cpu_predictor_->PredictInstance(inst, out_preds, model_, tree_end);
@@ -359,32 +297,27 @@ class GBTree : public GradientBooster {
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
CHECK_EQ(tree_begin, 0) << "Predict leaf supports only iteration end: (0, "
"n_iteration), use model slicing instead.";
this->GetPredictor()->PredictLeaf(p_fmat, out_preds, model_, tree_end);
this->GetPredictor(false)->PredictLeaf(p_fmat, out_preds, model_, tree_end);
}
void PredictContribution(DMatrix* p_fmat,
HostDeviceVector<bst_float>* out_contribs,
uint32_t layer_begin, uint32_t layer_end, bool approximate,
int, unsigned) override {
CHECK(configured_);
void PredictContribution(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
bst_layer_t layer_begin, bst_layer_t layer_end,
bool approximate) override {
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
CHECK_EQ(tree_begin, 0)
<< "Predict contribution supports only iteration end: (0, "
"n_iteration), using model slicing instead.";
this->GetPredictor()->PredictContribution(
p_fmat, out_contribs, model_, tree_end, nullptr, approximate);
CHECK_EQ(tree_begin, 0) << "Predict contribution supports only iteration end: (0, "
"n_iteration), using model slicing instead.";
this->GetPredictor(false)->PredictContribution(p_fmat, out_contribs, model_, tree_end, nullptr,
approximate);
}
void PredictInteractionContributions(
DMatrix *p_fmat, HostDeviceVector<bst_float> *out_contribs,
uint32_t layer_begin, uint32_t layer_end, bool approximate) override {
CHECK(configured_);
void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
bst_layer_t layer_begin, bst_layer_t layer_end,
bool approximate) override {
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
CHECK_EQ(tree_begin, 0)
<< "Predict interaction contribution supports only iteration end: (0, "
"n_iteration), using model slicing instead.";
this->GetPredictor()->PredictInteractionContributions(
p_fmat, out_contribs, model_, tree_end, nullptr, approximate);
CHECK_EQ(tree_begin, 0) << "Predict interaction contribution supports only iteration end: (0, "
"n_iteration), using model slicing instead.";
this->GetPredictor(false)->PredictInteractionContributions(p_fmat, out_contribs, model_,
tree_end, nullptr, approximate);
}
[[nodiscard]] std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats,
@@ -393,15 +326,13 @@ class GBTree : public GradientBooster {
}
protected:
// initialize updater before using them
void InitUpdater(Args const& cfg);
void BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat, int bst_group,
void BoostNewTrees(linalg::Matrix<GradientPair>* gpair, DMatrix* p_fmat, int bst_group,
std::vector<HostDeviceVector<bst_node_t>>* out_position,
std::vector<std::unique_ptr<RegTree>>* ret);
std::unique_ptr<Predictor> const& GetPredictor(HostDeviceVector<float> const* out_pred = nullptr,
DMatrix* f_dmat = nullptr) const;
[[nodiscard]] std::unique_ptr<Predictor> const& GetPredictor(
bool is_training, HostDeviceVector<float> const* out_pred = nullptr,
DMatrix* f_dmat = nullptr) const;
// commit new trees all at once
virtual void CommitModel(TreesOneIter&& new_trees);
@@ -412,26 +343,18 @@ class GBTree : public GradientBooster {
GBTreeTrainParam tparam_;
// Tree training parameter
tree::TrainParam tree_param_;
// ----training fields----
bool showed_updater_warning_ {false};
bool specified_updater_ {false};
bool configured_ {false};
// configurations for tree
Args cfg_;
// the updaters that can be applied to each of tree
std::vector<std::unique_ptr<TreeUpdater>> updaters_;
// Predictors
std::unique_ptr<Predictor> cpu_predictor_;
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
std::unique_ptr<Predictor> gpu_predictor_;
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
std::unique_ptr<Predictor> gpu_predictor_{nullptr};
#if defined(XGBOOST_USE_ONEAPI)
std::unique_ptr<Predictor> oneapi_predictor_;
#endif // defined(XGBOOST_USE_ONEAPI)
common::Monitor monitor_;
};
} // namespace gbm
} // namespace xgboost
} // namespace xgboost::gbm
#endif // XGBOOST_GBM_GBTREE_H_

View File

@@ -40,6 +40,7 @@
#include "common/api_entry.h" // for XGBAPIThreadLocalEntry
#include "common/charconv.h" // for to_chars, to_chars_result, NumericLimits, from_...
#include "common/common.h" // for ToString, Split
#include "common/error_msg.h" // for MaxFeatureSize, WarnOldSerialization, ...
#include "common/io.h" // for PeekableInStream, ReadAll, FixedSizeStream, Mem...
#include "common/observer.h" // for TrainingObserver
#include "common/random.h" // for GlobalRandom
@@ -278,15 +279,15 @@ LearnerModelParam::LearnerModelParam(Context const* ctx, LearnerModelParamLegacy
// Make sure read access everywhere for thread-safe prediction.
std::as_const(base_score_).HostView();
if (!ctx->IsCPU()) {
std::as_const(base_score_).View(ctx->gpu_id);
std::as_const(base_score_).View(ctx->Device());
}
CHECK(std::as_const(base_score_).Data()->HostCanRead());
}
linalg::TensorView<float const, 1> LearnerModelParam::BaseScore(int32_t device) const {
linalg::TensorView<float const, 1> LearnerModelParam::BaseScore(DeviceOrd device) const {
// multi-class is not yet supported.
CHECK_EQ(base_score_.Size(), 1) << ModelNotFitted();
if (device == Context::kCpuId) {
if (device.IsCPU()) {
// Make sure that we won't run into race condition.
CHECK(base_score_.Data()->HostCanRead());
return base_score_.HostView();
@@ -299,7 +300,7 @@ linalg::TensorView<float const, 1> LearnerModelParam::BaseScore(int32_t device)
}
linalg::TensorView<float const, 1> LearnerModelParam::BaseScore(Context const* ctx) const {
return this->BaseScore(ctx->gpu_id);
return this->BaseScore(ctx->Device());
}
void LearnerModelParam::Copy(LearnerModelParam const& that) {
@@ -308,7 +309,7 @@ void LearnerModelParam::Copy(LearnerModelParam const& that) {
base_score_.Data()->Copy(*that.base_score_.Data());
std::as_const(base_score_).HostView();
if (that.base_score_.DeviceIdx() != Context::kCpuId) {
std::as_const(base_score_).View(that.base_score_.DeviceIdx());
std::as_const(base_score_).View(that.base_score_.Device());
}
CHECK_EQ(base_score_.Data()->DeviceCanRead(), that.base_score_.Data()->DeviceCanRead());
CHECK(base_score_.Data()->HostCanRead());
@@ -356,21 +357,6 @@ DMLC_REGISTER_PARAMETER(LearnerTrainParam);
using LearnerAPIThreadLocalStore =
dmlc::ThreadLocalStore<std::map<Learner const *, XGBAPIThreadLocalEntry>>;
namespace {
StringView ModelMsg() {
return StringView{
R"doc(
If you are loading a serialized model (like pickle in Python, RDS in R) generated by
older XGBoost, please export the model by calling `Booster.save_model` from that version
first, then load it back in current version. See:
https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
for more details about differences between saving model and serializing.
)doc"};
}
} // anonymous namespace
class LearnerConfiguration : public Learner {
private:
std::mutex config_lock_;
@@ -402,7 +388,7 @@ class LearnerConfiguration : public Learner {
this->ConfigureTargets();
auto task = UsePtr(obj_)->Task();
linalg::Tensor<float, 1> base_score({1}, Ctx()->gpu_id);
linalg::Tensor<float, 1> base_score({1}, Ctx()->Device());
auto h_base_score = base_score.HostView();
// transform to margin
@@ -438,7 +424,7 @@ class LearnerConfiguration : public Learner {
if (mparam_.boost_from_average && !UsePtr(gbm_)->ModelFitted()) {
if (p_fmat) {
auto const& info = p_fmat->Info();
info.Validate(Ctx()->gpu_id);
info.Validate(Ctx()->Ordinal());
// We estimate it from input data.
linalg::Tensor<float, 1> base_score;
InitEstimation(info, &base_score);
@@ -530,7 +516,7 @@ class LearnerConfiguration : public Learner {
}
if (!Version::Same(origin_version)) {
LOG(WARNING) << ModelMsg();
error::WarnOldSerialization();
return; // skip configuration if version is not matched
}
@@ -561,7 +547,7 @@ class LearnerConfiguration : public Learner {
for (size_t i = 0; i < n_metrics; ++i) {
auto old_serialization = IsA<String>(j_metrics[i]);
if (old_serialization) {
LOG(WARNING) << ModelMsg();
error::WarnOldSerialization();
metric_names_[i] = get<String>(j_metrics[i]);
} else {
metric_names_[i] = get<String>(j_metrics[i]["name"]);
@@ -597,8 +583,9 @@ class LearnerConfiguration : public Learner {
auto& objective_fn = learner_parameters["objective"];
obj_->SaveConfig(&objective_fn);
std::vector<Json> metrics(metrics_.size(), Json{Object{}});
std::vector<Json> metrics(metrics_.size());
for (size_t i = 0; i < metrics_.size(); ++i) {
metrics[i] = Object{};
metrics_[i]->SaveConfig(&metrics[i]);
}
learner_parameters["metrics"] = Array(std::move(metrics));
@@ -704,19 +691,20 @@ class LearnerConfiguration : public Learner {
stack.pop();
auto const &obj = get<Object const>(j_obj);
for (auto const &kv : obj) {
for (auto const& kv : obj) {
if (is_parameter(kv.first)) {
auto parameter = get<Object const>(kv.second);
std::transform(parameter.begin(), parameter.end(), std::back_inserter(keys),
[](std::pair<std::string const&, Json const&> const& kv) {
return kv.first;
});
std::transform(
parameter.begin(), parameter.end(), std::back_inserter(keys),
[](std::pair<std::string const&, Json const&> const& kv) { return kv.first; });
} else if (IsA<Object>(kv.second)) {
stack.push(kv.second);
} else if (kv.first == "metrics") {
} else if (IsA<Array>(kv.second)) {
auto const& array = get<Array const>(kv.second);
for (auto const& v : array) {
stack.push(v);
if (IsA<Object>(v) || IsA<Array>(v)) {
stack.push(v);
}
}
}
}
@@ -725,6 +713,7 @@ class LearnerConfiguration : public Learner {
// FIXME(trivialfis): Make eval_metric a training parameter.
keys.emplace_back(kEvalMetric);
keys.emplace_back("num_output_group");
keys.emplace_back("gpu_id"); // deprecated param.
std::sort(keys.begin(), keys.end());
@@ -763,9 +752,7 @@ class LearnerConfiguration : public Learner {
CHECK(matrix.first.ptr);
CHECK(!matrix.second.ref.expired());
const uint64_t num_col = matrix.first.ptr->Info().num_col_;
CHECK_LE(num_col, static_cast<uint64_t>(std::numeric_limits<unsigned>::max()))
<< "Unfortunately, XGBoost does not support data matrices with "
<< std::numeric_limits<unsigned>::max() << " features or greater";
error::MaxFeatureSize(num_col);
num_feature = std::max(num_feature, static_cast<uint32_t>(num_col));
}
@@ -810,7 +797,7 @@ class LearnerConfiguration : public Learner {
bool has_nc {cfg_.find("num_class") != cfg_.cend()};
// Inject num_class into configuration.
// FIXME(jiamingy): Remove the duplicated parameter in softmax
cfg_["num_class"] = common::ToString(mparam_.num_class);
cfg_["num_class"] = std::to_string(mparam_.num_class);
auto& args = *p_args;
args = {cfg_.cbegin(), cfg_.cend()}; // renew
obj_->Configure(args);
@@ -821,14 +808,13 @@ class LearnerConfiguration : public Learner {
void ConfigureMetrics(Args const& args) {
for (auto const& name : metric_names_) {
auto DupCheck = [&name](std::unique_ptr<Metric> const& m) {
return m->Name() != name;
};
auto DupCheck = [&name](std::unique_ptr<Metric> const& m) { return m->Name() != name; };
if (std::all_of(metrics_.begin(), metrics_.end(), DupCheck)) {
metrics_.emplace_back(std::unique_ptr<Metric>(Metric::Create(name, &ctx_)));
mparam_.contain_eval_metrics = 1;
}
}
for (auto& p_metric : metrics_) {
p_metric->Configure(args);
}
@@ -862,8 +848,7 @@ class LearnerConfiguration : public Learner {
void InitEstimation(MetaInfo const& info, linalg::Tensor<float, 1>* base_score) {
#ifndef XGBOOST_USE_HIP
base_score->Reshape(1);
collective::ApplyWithLabels(info, base_score->Data()->HostPointer(),
sizeof(bst_float) * base_score->Size(),
collective::ApplyWithLabels(info, base_score->Data(),
[&] { UsePtr(obj_)->InitEstimation(info, base_score); });
#else
if (info.IsVerticalFederated()) {
@@ -1101,7 +1086,7 @@ class LearnerIO : public LearnerConfiguration {
mparam_.major_version = std::get<0>(Version::Self());
mparam_.minor_version = std::get<1>(Version::Self());
cfg_["num_feature"] = common::ToString(mparam_.num_feature);
cfg_["num_feature"] = std::to_string(mparam_.num_feature);
auto n = tparam_.__DICT__();
cfg_.insert(n.cbegin(), n.cend());
@@ -1185,7 +1170,7 @@ class LearnerIO : public LearnerConfiguration {
Json memory_snapshot;
if (header[1] == '"') {
memory_snapshot = Json::Load(StringView{buffer});
LOG(WARNING) << ModelMsg();
error::WarnOldSerialization();
} else if (std::isalpha(header[1])) {
memory_snapshot = Json::Load(StringView{buffer}, std::ios::binary);
} else {
@@ -1204,7 +1189,7 @@ class LearnerIO : public LearnerConfiguration {
header.resize(serialisation_header_.size());
CHECK_EQ(fp.Read(&header[0], header.size()), serialisation_header_.size());
// Avoid printing the content in loaded header, which might be random binary code.
CHECK(header == serialisation_header_) << ModelMsg();
CHECK(header == serialisation_header_) << error::OldSerialization();
int64_t sz {-1};
CHECK_EQ(fp.Read(&sz, sizeof(sz)), sizeof(sz));
if (!DMLC_IO_NO_ENDIAN_SWAP) {
@@ -1307,14 +1292,14 @@ class LearnerImpl : public LearnerIO {
monitor_.Start("GetGradient");
GetGradient(predt.predictions, train->Info(), iter, &gpair_);
monitor_.Stop("GetGradient");
TrainingObserver::Instance().Observe(gpair_, "Gradients");
TrainingObserver::Instance().Observe(*gpair_.Data(), "Gradients");
gbm_->DoBoost(train.get(), &gpair_, &predt, obj_.get());
monitor_.Stop("UpdateOneIter");
}
void BoostOneIter(int iter, std::shared_ptr<DMatrix> train,
HostDeviceVector<GradientPair>* in_gpair) override {
linalg::Matrix<GradientPair>* in_gpair) override {
monitor_.Start("BoostOneIter");
this->Configure();
@@ -1324,6 +1309,9 @@ class LearnerImpl : public LearnerIO {
this->ValidateDMatrix(train.get(), true);
CHECK_EQ(this->learner_model_param_.OutputLength(), in_gpair->Shape(1))
<< "The number of columns in gradient should be equal to the number of targets/classes in "
"the model.";
auto& predt = prediction_container_.Cache(train, ctx_.gpu_id);
gbm_->DoBoost(train.get(), in_gpair, &predt, obj_.get());
monitor_.Stop("BoostOneIter");
@@ -1367,10 +1355,9 @@ class LearnerImpl : public LearnerIO {
}
void Predict(std::shared_ptr<DMatrix> data, bool output_margin,
HostDeviceVector<bst_float> *out_preds, unsigned layer_begin,
unsigned layer_end, bool training,
bool pred_leaf, bool pred_contribs, bool approx_contribs,
bool pred_interactions) override {
HostDeviceVector<bst_float>* out_preds, bst_layer_t layer_begin,
bst_layer_t layer_end, bool training, bool pred_leaf, bool pred_contribs,
bool approx_contribs, bool pred_interactions) override {
int multiple_predictions = static_cast<int>(pred_leaf) +
static_cast<int>(pred_interactions) +
static_cast<int>(pred_contribs);
@@ -1392,7 +1379,7 @@ class LearnerImpl : public LearnerIO {
auto& prediction = prediction_container_.Cache(data, ctx_.gpu_id);
this->PredictRaw(data.get(), &prediction, training, layer_begin, layer_end);
// Copy the prediction cache to output prediction. out_preds comes from C API
out_preds->SetDevice(ctx_.gpu_id);
out_preds->SetDevice(ctx_.Device());
out_preds->Resize(prediction.predictions.Size());
out_preds->Copy(prediction.predictions);
if (!output_margin) {
@@ -1418,13 +1405,16 @@ class LearnerImpl : public LearnerIO {
}
void InplacePredict(std::shared_ptr<DMatrix> p_m, PredictionType type, float missing,
HostDeviceVector<bst_float>** out_preds, uint32_t iteration_begin,
uint32_t iteration_end) override {
HostDeviceVector<float>** out_preds, bst_layer_t iteration_begin,
bst_layer_t iteration_end) override {
this->Configure();
this->CheckModelInitialized();
auto& out_predictions = this->GetThreadLocal().prediction_entry;
out_predictions.Reset();
this->gbm_->InplacePredict(p_m, missing, &out_predictions, iteration_begin, iteration_end);
if (type == PredictionType::kValue) {
obj_->PredTransform(&out_predictions.predictions);
} else if (type == PredictionType::kMargin) {
@@ -1479,26 +1469,25 @@ class LearnerImpl : public LearnerIO {
}
if (p_fmat->Info().num_row_ == 0) {
LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank();
error::WarnEmptyDataset();
}
}
private:
void GetGradient(HostDeviceVector<bst_float> const& preds, MetaInfo const& info, int iteration,
HostDeviceVector<GradientPair>* out_gpair) {
#ifndef XGBOOST_USE_HIP
out_gpair->Resize(preds.Size());
collective::ApplyWithLabels(info, out_gpair->HostPointer(),
out_gpair->Size() * sizeof(GradientPair),
[&] { obj_->GetGradient(preds, info, iteration, out_gpair); });
#else
void GetGradient(HostDeviceVector<bst_float> const& preds, MetaInfo const& info,
std::int32_t iter, linalg::Matrix<GradientPair>* out_gpair) {
#if defined(XGBOOST_USE_CUDA)
out_gpair->Reshape(info.num_row_, this->learner_model_param_.OutputLength());
collective::ApplyWithLabels(info, out_gpair->Data(),
[&] { obj_->GetGradient(preds, info, iter, out_gpair); });
#elif defined(XGBOOST_USE_HIP)
if (info.IsVerticalFederated()) {
out_gpair->Resize(preds.Size());
collective::ApplyWithLabels(info, out_gpair->HostPointer(),
out_gpair->Size() * sizeof(GradientPair),
[&] { obj_->GetGradient(preds, info, iteration, out_gpair); });
} else {
obj_->GetGradient(preds, info, iteration, out_gpair);
out_gpair->Reshape(info.num_row_, this->learner_model_param_.OutputLength());
collective::ApplyWithLabels(info, out_gpair->Data(),
[&] { obj_->GetGradient(preds, info, iter, out_gpair); });
}
else {
obj_->GetGradient(preds, info, iter, out_gpair);
}
#endif
}
@@ -1506,7 +1495,7 @@ class LearnerImpl : public LearnerIO {
/*! \brief random number transformation seed. */
static int32_t constexpr kRandSeedMagic = 127;
// gradient pairs
HostDeviceVector<GradientPair> gpair_;
linalg::Matrix<GradientPair> gpair_;
/*! \brief Temporary storage to prediction. Useful for storing data transformed by
* objective function */
PredictionContainer output_predictions_;

View File

@@ -9,8 +9,7 @@
#include "coordinate_common.h"
#include "xgboost/json.h"
namespace xgboost {
namespace linear {
namespace xgboost::linear {
DMLC_REGISTER_PARAMETER(CoordinateParam);
DMLC_REGISTRY_FILE_TAG(updater_coordinate);
@@ -39,36 +38,38 @@ class CoordinateUpdater : public LinearUpdater {
FromJson(config.at("linear_train_param"), &tparam_);
FromJson(config.at("coordinate_param"), &cparam_);
}
void SaveConfig(Json* p_out) const override {
auto& out = *p_out;
void SaveConfig(Json *p_out) const override {
LOG(DEBUG) << "Save config for CPU updater.";
auto &out = *p_out;
out["linear_train_param"] = ToJson(tparam_);
out["coordinate_param"] = ToJson(cparam_);
}
void Update(HostDeviceVector<GradientPair> *in_gpair, DMatrix *p_fmat,
gbm::GBLinearModel *model, double sum_instance_weight) override {
void Update(linalg::Matrix<GradientPair> *in_gpair, DMatrix *p_fmat, gbm::GBLinearModel *model,
double sum_instance_weight) override {
auto gpair = in_gpair->Data();
tparam_.DenormalizePenalties(sum_instance_weight);
const int ngroup = model->learner_model_param->num_output_group;
// update bias
for (int group_idx = 0; group_idx < ngroup; ++group_idx) {
auto grad = GetBiasGradientParallel(group_idx, ngroup, in_gpair->ConstHostVector(), p_fmat,
auto grad = GetBiasGradientParallel(group_idx, ngroup, gpair->ConstHostVector(), p_fmat,
ctx_->Threads());
auto dbias = static_cast<float>(tparam_.learning_rate *
CoordinateDeltaBias(grad.first, grad.second));
model->Bias()[group_idx] += dbias;
UpdateBiasResidualParallel(ctx_, group_idx, ngroup, dbias, &in_gpair->HostVector(), p_fmat);
UpdateBiasResidualParallel(ctx_, group_idx, ngroup, dbias, &gpair->HostVector(), p_fmat);
}
// prepare for updating the weights
selector_->Setup(ctx_, *model, in_gpair->ConstHostVector(), p_fmat, tparam_.reg_alpha_denorm,
selector_->Setup(ctx_, *model, gpair->ConstHostVector(), p_fmat, tparam_.reg_alpha_denorm,
tparam_.reg_lambda_denorm, cparam_.top_k);
// update weights
for (int group_idx = 0; group_idx < ngroup; ++group_idx) {
for (unsigned i = 0U; i < model->learner_model_param->num_feature; i++) {
int fidx =
selector_->NextFeature(ctx_, i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat,
selector_->NextFeature(ctx_, i, *model, group_idx, gpair->ConstHostVector(), p_fmat,
tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm);
if (fidx < 0) break;
this->UpdateFeature(fidx, group_idx, &in_gpair->HostVector(), p_fmat, model);
this->UpdateFeature(fidx, group_idx, &gpair->HostVector(), p_fmat, model);
}
}
monitor_.Stop("UpdateFeature");
@@ -99,5 +100,4 @@ class CoordinateUpdater : public LinearUpdater {
XGBOOST_REGISTER_LINEAR_UPDATER(CoordinateUpdater, "coord_descent")
.describe("Update linear model according to coordinate descent algorithm.")
.set_body([]() { return new CoordinateUpdater(); });
} // namespace linear
} // namespace xgboost
} // namespace xgboost::linear

View File

@@ -15,8 +15,7 @@
#include "../common/timer.h"
#include "./param.h"
namespace xgboost {
namespace linear {
namespace xgboost::linear {
DMLC_REGISTRY_FILE_TAG(updater_gpu_coordinate);
@@ -29,7 +28,7 @@ DMLC_REGISTRY_FILE_TAG(updater_gpu_coordinate);
class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
public:
// set training parameter
void Configure(Args const& args) override {
void Configure(Args const &args) override {
tparam_.UpdateAllowUnknown(args);
coord_param_.UpdateAllowUnknown(args);
selector_.reset(FeatureSelector::Create(tparam_.feature_selector));
@@ -41,8 +40,9 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
FromJson(config.at("linear_train_param"), &tparam_);
FromJson(config.at("coordinate_param"), &coord_param_);
}
void SaveConfig(Json* p_out) const override {
auto& out = *p_out;
void SaveConfig(Json *p_out) const override {
LOG(DEBUG) << "Save config for GPU updater.";
auto &out = *p_out;
out["linear_train_param"] = ToJson(tparam_);
out["coordinate_param"] = ToJson(coord_param_);
}
@@ -106,18 +106,18 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
}
}
void Update(HostDeviceVector<GradientPair> *in_gpair, DMatrix *p_fmat,
gbm::GBLinearModel *model, double sum_instance_weight) override {
void Update(linalg::Matrix<GradientPair> *in_gpair, DMatrix *p_fmat, gbm::GBLinearModel *model,
double sum_instance_weight) override {
tparam_.DenormalizePenalties(sum_instance_weight);
monitor_.Start("LazyInitDevice");
this->LazyInitDevice(p_fmat, *(model->learner_model_param));
monitor_.Stop("LazyInitDevice");
monitor_.Start("UpdateGpair");
auto &in_gpair_host = in_gpair->ConstHostVector();
// Update gpair
if (ctx_->gpu_id >= 0) {
this->UpdateGpair(in_gpair_host);
if (ctx_->IsCUDA()) {
this->UpdateGpair(in_gpair->Data()->ConstHostVector());
}
monitor_.Stop("UpdateGpair");
@@ -125,15 +125,15 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
this->UpdateBias(model);
monitor_.Stop("UpdateBias");
// prepare for updating the weights
selector_->Setup(ctx_, *model, in_gpair->ConstHostVector(), p_fmat, tparam_.reg_alpha_denorm,
tparam_.reg_lambda_denorm, coord_param_.top_k);
selector_->Setup(ctx_, *model, in_gpair->Data()->ConstHostVector(), p_fmat,
tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm, coord_param_.top_k);
monitor_.Start("UpdateFeature");
for (uint32_t group_idx = 0; group_idx < model->learner_model_param->num_output_group;
++group_idx) {
for (auto i = 0U; i < model->learner_model_param->num_feature; i++) {
auto fidx =
selector_->NextFeature(ctx_, i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat,
tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm);
selector_->NextFeature(ctx_, i, *model, group_idx, in_gpair->Data()->ConstHostVector(),
p_fmat, tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm);
if (fidx < 0) break;
this->UpdateFeature(fidx, group_idx, model);
}
@@ -279,5 +279,4 @@ XGBOOST_REGISTER_LINEAR_UPDATER(GPUCoordinateUpdater, "gpu_coord_descent")
"Update linear model according to coordinate descent algorithm. GPU "
"accelerated.")
.set_body([]() { return new GPUCoordinateUpdater(); });
} // namespace linear
} // namespace xgboost
} // namespace xgboost::linear

View File

@@ -6,8 +6,7 @@
#include <xgboost/linear_updater.h>
#include "coordinate_common.h"
namespace xgboost {
namespace linear {
namespace xgboost::linear {
DMLC_REGISTRY_FILE_TAG(updater_shotgun);
@@ -32,30 +31,31 @@ class ShotgunUpdater : public LinearUpdater {
out["linear_train_param"] = ToJson(param_);
}
void Update(HostDeviceVector<GradientPair> *in_gpair, DMatrix *p_fmat,
gbm::GBLinearModel *model, double sum_instance_weight) override {
auto &gpair = in_gpair->HostVector();
void Update(linalg::Matrix<GradientPair> *in_gpair, DMatrix *p_fmat, gbm::GBLinearModel *model,
double sum_instance_weight) override {
auto gpair = in_gpair->Data();
param_.DenormalizePenalties(sum_instance_weight);
const int ngroup = model->learner_model_param->num_output_group;
// update bias
for (int gid = 0; gid < ngroup; ++gid) {
auto grad = GetBiasGradientParallel(gid, ngroup, in_gpair->ConstHostVector(), p_fmat,
auto grad = GetBiasGradientParallel(gid, ngroup, gpair->ConstHostVector(), p_fmat,
ctx_->Threads());
auto dbias = static_cast<bst_float>(param_.learning_rate *
CoordinateDeltaBias(grad.first, grad.second));
model->Bias()[gid] += dbias;
UpdateBiasResidualParallel(ctx_, gid, ngroup, dbias, &in_gpair->HostVector(), p_fmat);
UpdateBiasResidualParallel(ctx_, gid, ngroup, dbias, &gpair->HostVector(), p_fmat);
}
// lock-free parallel updates of weights
selector_->Setup(ctx_, *model, in_gpair->ConstHostVector(), p_fmat, param_.reg_alpha_denorm,
selector_->Setup(ctx_, *model, gpair->ConstHostVector(), p_fmat, param_.reg_alpha_denorm,
param_.reg_lambda_denorm, 0);
auto &h_gpair = gpair->HostVector();
for (const auto &batch : p_fmat->GetBatches<CSCPage>(ctx_)) {
auto page = batch.GetView();
const auto nfeat = static_cast<bst_omp_uint>(batch.Size());
common::ParallelFor(nfeat, ctx_->Threads(), [&](auto i) {
int ii = selector_->NextFeature(ctx_, i, *model, 0, in_gpair->ConstHostVector(), p_fmat,
int ii = selector_->NextFeature(ctx_, i, *model, 0, gpair->ConstHostVector(), p_fmat,
param_.reg_alpha_denorm, param_.reg_lambda_denorm);
if (ii < 0) return;
const bst_uint fid = ii;
@@ -63,7 +63,7 @@ class ShotgunUpdater : public LinearUpdater {
for (int gid = 0; gid < ngroup; ++gid) {
double sum_grad = 0.0, sum_hess = 0.0;
for (auto &c : col) {
const GradientPair &p = gpair[c.index * ngroup + gid];
const GradientPair &p = h_gpair[c.index * ngroup + gid];
if (p.GetHess() < 0.0f) continue;
const bst_float v = c.fvalue;
sum_grad += p.GetGrad() * v;
@@ -77,7 +77,7 @@ class ShotgunUpdater : public LinearUpdater {
w += dw;
// update grad values
for (auto &c : col) {
GradientPair &p = gpair[c.index * ngroup + gid];
GradientPair &p = h_gpair[c.index * ngroup + gid];
if (p.GetHess() < 0.0f) continue;
p += GradientPair(p.GetHess() * c.fvalue * dw, 0);
}
@@ -98,5 +98,4 @@ XGBOOST_REGISTER_LINEAR_UPDATER(ShotgunUpdater, "shotgun")
"Update linear model according to shotgun coordinate descent "
"algorithm.")
.set_body([]() { return new ShotgunUpdater(); });
} // namespace linear
} // namespace xgboost
} // namespace xgboost::linear

View File

@@ -82,22 +82,19 @@ template <typename BinaryAUC>
double MultiClassOVR(Context const *ctx, common::Span<float const> predts, MetaInfo const &info,
size_t n_classes, int32_t n_threads, BinaryAUC &&binary_auc) {
CHECK_NE(n_classes, 0);
auto const labels = info.labels.View(Context::kCpuId);
auto const labels = info.labels.HostView();
if (labels.Shape(0) != 0) {
CHECK_EQ(labels.Shape(1), 1) << "AUC doesn't support multi-target model.";
}
std::vector<double> results_storage(n_classes * 3, 0);
linalg::TensorView<double, 2> results(results_storage, {n_classes, static_cast<size_t>(3)},
Context::kCpuId);
auto results = linalg::MakeTensorView(ctx, results_storage, n_classes, 3);
auto local_area = results.Slice(linalg::All(), 0);
auto tp = results.Slice(linalg::All(), 1);
auto auc = results.Slice(linalg::All(), 2);
auto weights = common::OptionalWeights{info.weights_.ConstHostSpan()};
auto predts_t = linalg::TensorView<float const, 2>(
predts, {static_cast<size_t>(info.num_row_), n_classes},
Context::kCpuId);
auto predts_t = linalg::MakeTensorView(ctx, predts, info.num_row_, n_classes);
if (info.labels.Size() != 0) {
common::ParallelFor(n_classes, n_threads, [&](auto c) {
@@ -108,8 +105,8 @@ double MultiClassOVR(Context const *ctx, common::Span<float const> predts, MetaI
response[i] = labels(i) == c ? 1.0f : 0.0;
}
double fp;
std::tie(fp, tp(c), auc(c)) =
binary_auc(ctx, proba, linalg::MakeVec(response.data(), response.size(), -1), weights);
std::tie(fp, tp(c), auc(c)) = binary_auc(
ctx, proba, linalg::MakeVec(response.data(), response.size(), ctx->Device()), weights);
local_area(c) = fp * tp(c);
});
}
@@ -220,7 +217,7 @@ std::pair<double, uint32_t> RankingAUC(Context const *ctx, std::vector<float> co
CHECK_GE(info.group_ptr_.size(), 2);
uint32_t n_groups = info.group_ptr_.size() - 1;
auto s_predts = common::Span<float const>{predts};
auto labels = info.labels.View(Context::kCpuId);
auto labels = info.labels.View(ctx->Device());
auto s_weights = info.weights_.ConstHostSpan();
std::atomic<uint32_t> invalid_groups{0};
@@ -363,8 +360,8 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
info.labels.HostView().Slice(linalg::All(), 0),
common::OptionalWeights{info.weights_.ConstHostSpan()});
} else {
std::tie(fp, tp, auc) = GPUBinaryROCAUC(predts.ConstDeviceSpan(), info,
ctx_->gpu_id, &this->d_cache_);
std::tie(fp, tp, auc) =
GPUBinaryROCAUC(predts.ConstDeviceSpan(), info, ctx_->Device(), &this->d_cache_);
}
return std::make_tuple(fp, tp, auc);
}
@@ -381,8 +378,7 @@ XGBOOST_REGISTER_METRIC(EvalAUC, "auc")
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
std::tuple<double, double, double> GPUBinaryROCAUC(common::Span<float const>, MetaInfo const &,
std::int32_t,
std::shared_ptr<DeviceAUCCache> *) {
DeviceOrd, std::shared_ptr<DeviceAUCCache> *) {
common::AssertGPUSupport();
return {};
}
@@ -414,8 +410,8 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0),
common::OptionalWeights{info.weights_.ConstHostSpan()});
} else {
std::tie(pr, re, auc) = GPUBinaryPRAUC(predts.ConstDeviceSpan(), info,
ctx_->gpu_id, &this->d_cache_);
std::tie(pr, re, auc) =
GPUBinaryPRAUC(predts.ConstDeviceSpan(), info, ctx_->Device(), &this->d_cache_);
}
return std::make_tuple(pr, re, auc);
}
@@ -459,7 +455,7 @@ XGBOOST_REGISTER_METRIC(AUCPR, "aucpr")
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const>, MetaInfo const &,
std::int32_t, std::shared_ptr<DeviceAUCCache> *) {
DeviceOrd, std::shared_ptr<DeviceAUCCache> *) {
common::AssertGPUSupport();
return {};
}

View File

@@ -91,15 +91,14 @@ void InitCacheOnce(common::Span<float const> predts, std::shared_ptr<DeviceAUCCa
template <typename Fn>
std::tuple<double, double, double>
GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
int32_t device, common::Span<size_t const> d_sorted_idx,
DeviceOrd device, common::Span<size_t const> d_sorted_idx,
Fn area_fn, std::shared_ptr<DeviceAUCCache> cache) {
auto labels = info.labels.View(device);
auto weights = info.weights_.ConstDeviceSpan();
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device));
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device.ordinal));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device.ordinal));
#endif
CHECK_NE(labels.Size(), 0);
@@ -194,7 +193,7 @@ GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
}
std::tuple<double, double, double> GPUBinaryROCAUC(common::Span<float const> predts,
MetaInfo const &info, std::int32_t device,
MetaInfo const &info, DeviceOrd device,
std::shared_ptr<DeviceAUCCache> *p_cache) {
auto &cache = *p_cache;
InitCacheOnce<false>(predts, p_cache);
@@ -350,14 +349,14 @@ void SegmentedReduceAUC(common::Span<size_t const> d_unique_idx,
* up each class in all kernels.
*/
template <bool scale, typename Fn>
double GPUMultiClassAUCOVR(MetaInfo const &info, int32_t device, common::Span<uint32_t> d_class_ptr,
size_t n_classes, std::shared_ptr<DeviceAUCCache> cache, Fn area_fn) {
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device));
double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device,
common::Span<uint32_t> d_class_ptr, size_t n_classes,
std::shared_ptr<DeviceAUCCache> cache, Fn area_fn) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device.ordinal));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device.ordinal));
#endif
/**
* Sorted idx
*/
@@ -528,11 +527,12 @@ double GPUMultiClassROCAUC(Context const *ctx, common::Span<float const> predts,
dh::TemporaryArray<uint32_t> class_ptr(n_classes + 1, 0);
MultiClassSortedIdx(ctx, predts, dh::ToSpan(class_ptr), cache);
auto fn = [] XGBOOST_DEVICE(double fp_prev, double fp, double tp_prev,
double tp, size_t /*class_id*/) {
auto fn = [] XGBOOST_DEVICE(double fp_prev, double fp, double tp_prev, double tp,
size_t /*class_id*/) {
return TrapezoidArea(fp_prev, fp, tp_prev, tp);
};
return GPUMultiClassAUCOVR<true>(info, ctx->gpu_id, dh::ToSpan(class_ptr), n_classes, cache, fn);
return GPUMultiClassAUCOVR<true>(info, ctx->Device(), dh::ToSpan(class_ptr), n_classes, cache,
fn);
}
namespace {
@@ -581,7 +581,7 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *ctx, common::Span<
/**
* Sort the labels
*/
auto d_labels = info.labels.View(ctx->gpu_id);
auto d_labels = info.labels.View(ctx->Device());
auto d_sorted_idx = dh::ToSpan(cache->sorted_idx);
common::SegmentedArgSort<false, false>(ctx, d_labels.Values(), d_group_ptr, d_sorted_idx);
@@ -679,7 +679,7 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *ctx, common::Span<
}
std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const> predts,
MetaInfo const &info, std::int32_t device,
MetaInfo const &info, DeviceOrd device,
std::shared_ptr<DeviceAUCCache> *p_cache) {
auto& cache = *p_cache;
InitCacheOnce<false>(predts, p_cache);
@@ -744,7 +744,7 @@ double GPUMultiClassPRAUC(Context const *ctx, common::Span<float const> predts,
/**
* Get total positive/negative
*/
auto labels = info.labels.View(ctx->gpu_id);
auto labels = info.labels.View(ctx->Device());
auto n_samples = info.num_row_;
dh::caching_device_vector<Pair> totals(n_classes);
auto key_it =
@@ -785,13 +785,13 @@ double GPUMultiClassPRAUC(Context const *ctx, common::Span<float const> predts,
return detail::CalcDeltaPRAUC(fp_prev, fp, tp_prev, tp,
d_totals[class_id].first);
};
return GPUMultiClassAUCOVR<false>(info, ctx->gpu_id, d_class_ptr, n_classes, cache, fn);
return GPUMultiClassAUCOVR<false>(info, ctx->Device(), d_class_ptr, n_classes, cache, fn);
}
template <typename Fn>
std::pair<double, uint32_t>
GPURankingPRAUCImpl(common::Span<float const> predts, MetaInfo const &info,
common::Span<uint32_t> d_group_ptr, int32_t device,
common::Span<uint32_t> d_group_ptr, DeviceOrd device,
std::shared_ptr<DeviceAUCCache> cache, Fn area_fn) {
/**
* Sorted idx
@@ -960,7 +960,7 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
common::SegmentedArgSort<false, false>(ctx, predts, d_group_ptr, d_sorted_idx);
dh::XGBDeviceAllocator<char> alloc;
auto labels = info.labels.View(ctx->gpu_id);
auto labels = info.labels.View(ctx->Device());
#if defined(XGBOOST_USE_HIP)
if (thrust::any_of(thrust::hip::par(alloc), dh::tbegin(labels.Values()),
@@ -1016,7 +1016,7 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
return detail::CalcDeltaPRAUC(fp_prev, fp, tp_prev, tp,
d_totals[group_id].first);
};
return GPURankingPRAUCImpl(predts, info, d_group_ptr, ctx->gpu_id, cache, fn);
return GPURankingPRAUCImpl(predts, info, d_group_ptr, ctx->Device(), cache, fn);
}
} // namespace metric
} // namespace xgboost

View File

@@ -30,7 +30,7 @@ XGBOOST_DEVICE inline double TrapezoidArea(double x0, double x1, double y0, doub
struct DeviceAUCCache;
std::tuple<double, double, double> GPUBinaryROCAUC(common::Span<float const> predts,
MetaInfo const &info, std::int32_t device,
MetaInfo const &info, DeviceOrd,
std::shared_ptr<DeviceAUCCache> *p_cache);
double GPUMultiClassROCAUC(Context const *ctx, common::Span<float const> predts,
@@ -45,7 +45,7 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *ctx, common::Span<
* PR AUC *
**********/
std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const> predts,
MetaInfo const &info, std::int32_t device,
MetaInfo const &info, DeviceOrd,
std::shared_ptr<DeviceAUCCache> *p_cache);
double GPUMultiClassPRAUC(Context const *ctx, common::Span<float const> predts,

View File

@@ -45,7 +45,7 @@ namespace {
template <typename Fn>
PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) {
PackedReduceResult result;
auto labels = info.labels.View(ctx->gpu_id);
auto labels = info.labels.View(ctx->Device());
if (ctx->IsCPU()) {
auto n_threads = ctx->Threads();
std::vector<double> score_tloc(n_threads, 0.0);
@@ -199,10 +199,10 @@ class PseudoErrorLoss : public MetricNoCache {
double Eval(const HostDeviceVector<bst_float>& preds, const MetaInfo& info) override {
CHECK_EQ(info.labels.Shape(0), info.num_row_);
auto labels = info.labels.View(ctx_->gpu_id);
preds.SetDevice(ctx_->gpu_id);
auto labels = info.labels.View(ctx_->Device());
preds.SetDevice(ctx_->Device());
auto predts = ctx_->IsCPU() ? preds.ConstHostSpan() : preds.ConstDeviceSpan();
info.weights_.SetDevice(ctx_->gpu_id);
info.weights_.SetDevice(ctx_->Device());
common::OptionalWeights weights(ctx_->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan());
float slope = this->param_.huber_slope;
@@ -365,11 +365,11 @@ struct EvalEWiseBase : public MetricNoCache {
if (info.labels.Size() != 0) {
CHECK_NE(info.labels.Shape(1), 0);
}
auto labels = info.labels.View(ctx_->gpu_id);
info.weights_.SetDevice(ctx_->gpu_id);
auto labels = info.labels.View(ctx_->Device());
info.weights_.SetDevice(ctx_->Device());
common::OptionalWeights weights(ctx_->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan());
preds.SetDevice(ctx_->gpu_id);
preds.SetDevice(ctx_->Device());
auto predts = ctx_->IsCPU() ? preds.ConstHostSpan() : preds.ConstDeviceSpan();
auto d_policy = policy_;
@@ -460,16 +460,16 @@ class QuantileError : public MetricNoCache {
}
auto const* ctx = ctx_;
auto y_true = info.labels.View(ctx->gpu_id);
preds.SetDevice(ctx->gpu_id);
alpha_.SetDevice(ctx->gpu_id);
auto y_true = info.labels.View(ctx->Device());
preds.SetDevice(ctx->Device());
alpha_.SetDevice(ctx->Device());
auto alpha = ctx->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan();
std::size_t n_targets = preds.Size() / info.num_row_ / alpha_.Size();
CHECK_NE(n_targets, 0);
auto y_predt = linalg::MakeTensorView(ctx, &preds, static_cast<std::size_t>(info.num_row_),
alpha_.Size(), n_targets);
info.weights_.SetDevice(ctx->gpu_id);
info.weights_.SetDevice(ctx->Device());
common::OptionalWeights weight{ctx->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan()};

View File

@@ -68,13 +68,14 @@ struct EvalAMS : public MetricNoCache {
const auto &h_preds = preds.ConstHostVector();
common::ParallelFor(ndata, ctx_->Threads(),
[&](bst_omp_uint i) { rec[i] = std::make_pair(h_preds[i], i); });
common::Sort(ctx_, rec.begin(), rec.end(), common::CmpFirst);
common::Sort(ctx_, rec.begin(), rec.end(),
[](auto const& l, auto const& r) { return l.first > r.first; });
auto ntop = static_cast<unsigned>(ratio_ * ndata);
if (ntop == 0) ntop = ndata;
const double br = 10.0;
unsigned thresindex = 0;
double s_tp = 0.0, b_fp = 0.0, tams = 0.0;
const auto& labels = info.labels.View(Context::kCpuId);
const auto& labels = info.labels.View(DeviceOrd::CPU());
for (unsigned i = 0; i < static_cast<unsigned>(ndata-1) && i < ntop; ++i) {
const unsigned ridx = rec[i].second;
const bst_float wt = info.GetWeight(ridx);
@@ -133,7 +134,7 @@ struct EvalRank : public MetricNoCache, public EvalRankConfig {
std::vector<double> sum_tloc(ctx_->Threads(), 0.0);
{
const auto& labels = info.labels.View(Context::kCpuId);
const auto& labels = info.labels.HostView();
const auto &h_preds = preds.ConstHostVector();
dmlc::OMPException exc;

View File

@@ -39,7 +39,7 @@ PackedReduceResult PreScore(Context const *ctx, MetaInfo const &info,
HostDeviceVector<float> const &predt,
std::shared_ptr<ltr::PreCache> p_cache) {
auto d_gptr = p_cache->DataGroupPtr(ctx);
auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
predt.SetDevice(ctx->gpu_id);
auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan());
@@ -95,7 +95,7 @@ PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
if (!d_weight.Empty()) {
CHECK_EQ(d_weight.weights.size(), p_cache->Groups());
}
auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
predt.SetDevice(ctx->gpu_id);
auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), predt.Size());
@@ -125,9 +125,9 @@ PackedReduceResult MAPScore(Context const *ctx, MetaInfo const &info,
HostDeviceVector<float> const &predt, bool minus,
std::shared_ptr<ltr::MAPCache> p_cache) {
auto d_group_ptr = p_cache->DataGroupPtr(ctx);
auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
predt.SetDevice(ctx->gpu_id);
predt.SetDevice(ctx->Device());
auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan());
auto key_it = dh::MakeTransformIterator<std::size_t>(
thrust::make_counting_iterator(0ul),

View File

@@ -11,6 +11,7 @@
#include <hipcub/hipcub.hpp> // NOLINT
#endif
#include "../collective/aggregator.h"
#include "../common/cuda_context.cuh" // CUDAContext
#include "../common/device_helpers.cuh"
#include "../common/stats.cuh"
@@ -30,11 +31,10 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
HostDeviceVector<bst_node_t>* p_nidx, RegTree const& tree) {
// copy position to buffer
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx->gpu_id));
dh::safe_cuda(hipSetDevice(ctx->Ordinal()));
#endif
auto cuctx = ctx->CUDACtx();
size_t n_samples = position.size();
dh::device_vector<bst_node_t> sorted_position(position.size());
@@ -115,11 +115,11 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
*/
auto& nidx = *p_nidx;
auto& nptr = *p_nptr;
nidx.SetDevice(ctx->gpu_id);
nidx.SetDevice(ctx->Device());
nidx.Resize(n_leaf);
auto d_node_idx = nidx.DeviceSpan();
nptr.SetDevice(ctx->gpu_id);
nptr.SetDevice(ctx->Device());
nptr.Resize(n_leaf + 1, 0);
auto d_node_ptr = nptr.DeviceSpan();
@@ -172,11 +172,10 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
std::int32_t group_idx, MetaInfo const& info, float learning_rate,
HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx->gpu_id));
dh::safe_cuda(hipSetDevice(ctx->Ordinal()));
#endif
dh::device_vector<size_t> ridx;
HostDeviceVector<size_t> nptr;
HostDeviceVector<bst_node_t> nidx;
@@ -188,38 +187,39 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
UpdateLeafValues(&quantiles, nidx.ConstHostVector(), info, learning_rate, p_tree);
}
HostDeviceVector<float> quantiles;
predt.SetDevice(ctx->gpu_id);
predt.SetDevice(ctx->Device());
auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), info.num_row_,
predt.Size() / info.num_row_);
CHECK_LT(group_idx, d_predt.Shape(1));
auto t_predt = d_predt.Slice(linalg::All(), group_idx);
auto d_labels = info.labels.View(ctx->gpu_id).Slice(linalg::All(), IdxY(info, group_idx));
auto d_row_index = dh::ToSpan(ridx);
auto seg_beg = nptr.DevicePointer();
auto seg_end = seg_beg + nptr.Size();
auto val_beg = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(size_t i) {
float p = t_predt(d_row_index[i]);
auto y = d_labels(d_row_index[i]);
return y - p;
});
CHECK_EQ(d_labels.Shape(0), position.size());
auto val_end = val_beg + d_labels.Shape(0);
CHECK_EQ(nidx.Size() + 1, nptr.Size());
if (info.weights_.Empty()) {
common::SegmentedQuantile(ctx, alpha, seg_beg, seg_end, val_beg, val_end, &quantiles);
} else {
info.weights_.SetDevice(ctx->gpu_id);
auto d_weights = info.weights_.ConstDeviceSpan();
CHECK_EQ(d_weights.size(), d_row_index.size());
auto w_it = thrust::make_permutation_iterator(dh::tcbegin(d_weights), dh::tcbegin(d_row_index));
common::SegmentedWeightedQuantile(ctx, alpha, seg_beg, seg_end, val_beg, val_end, w_it,
w_it + d_weights.size(), &quantiles);
}
HostDeviceVector<float> quantiles;
collective::ApplyWithLabels(info, &quantiles, [&] {
auto d_labels = info.labels.View(ctx->Device()).Slice(linalg::All(), IdxY(info, group_idx));
auto d_row_index = dh::ToSpan(ridx);
auto seg_beg = nptr.DevicePointer();
auto seg_end = seg_beg + nptr.Size();
auto val_beg = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(size_t i) {
float p = t_predt(d_row_index[i]);
auto y = d_labels(d_row_index[i]);
return y - p;
});
CHECK_EQ(d_labels.Shape(0), position.size());
auto val_end = val_beg + d_labels.Shape(0);
CHECK_EQ(nidx.Size() + 1, nptr.Size());
if (info.weights_.Empty()) {
common::SegmentedQuantile(ctx, alpha, seg_beg, seg_end, val_beg, val_end, &quantiles);
} else {
info.weights_.SetDevice(ctx->Device());
auto d_weights = info.weights_.ConstDeviceSpan();
CHECK_EQ(d_weights.size(), d_row_index.size());
auto w_it =
thrust::make_permutation_iterator(dh::tcbegin(d_weights), dh::tcbegin(d_row_index));
common::SegmentedWeightedQuantile(ctx, alpha, seg_beg, seg_end, val_beg, val_end, w_it,
w_it + d_weights.size(), &quantiles);
}
});
UpdateLeafValues(&quantiles.HostVector(), nidx.ConstHostVector(), info, learning_rate, p_tree);
}
} // namespace detail

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2019-2022 by Contributors
/**
* Copyright 2019-2023, XGBoost Contributors
* \file aft_obj.cu
* \brief Definition of AFT loss for survival analysis.
* \author Avinash Barnwal, Hyunsu Cho and Toby Hocking
@@ -41,11 +41,9 @@ class AFTObj : public ObjFunction {
ObjInfo Task() const override { return ObjInfo::kSurvival; }
template <typename Distribution>
void GetGradientImpl(const HostDeviceVector<bst_float> &preds,
const MetaInfo &info,
HostDeviceVector<GradientPair> *out_gpair,
size_t ndata, int device, bool is_null_weight,
float aft_loss_distribution_scale) {
void GetGradientImpl(const HostDeviceVector<bst_float>& preds, const MetaInfo& info,
linalg::Matrix<GradientPair>* out_gpair, size_t ndata, int device,
bool is_null_weight, float aft_loss_distribution_scale) {
common::Transform<>::Init(
[=] XGBOOST_DEVICE(size_t _idx,
common::Span<GradientPair> _out_gpair,
@@ -66,16 +64,17 @@ class AFTObj : public ObjFunction {
_out_gpair[_idx] = GradientPair(grad * w, hess * w);
},
common::Range{0, static_cast<int64_t>(ndata)}, this->ctx_->Threads(), device).Eval(
out_gpair, &preds, &info.labels_lower_bound_, &info.labels_upper_bound_,
out_gpair->Data(), &preds, &info.labels_lower_bound_, &info.labels_upper_bound_,
&info.weights_);
}
void GetGradient(const HostDeviceVector<bst_float>& preds, const MetaInfo& info, int /*iter*/,
HostDeviceVector<GradientPair>* out_gpair) override {
linalg::Matrix<GradientPair>* out_gpair) override {
const size_t ndata = preds.Size();
CHECK_EQ(info.labels_lower_bound_.Size(), ndata);
CHECK_EQ(info.labels_upper_bound_.Size(), ndata);
out_gpair->Resize(ndata);
out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(ndata, 1);
const int device = ctx_->gpu_id;
const float aft_loss_distribution_scale = param_.aft_loss_distribution_scale;
const bool is_null_weight = info.weights_.Size() == 0;

View File

@@ -27,8 +27,8 @@ class HingeObj : public ObjFunction {
void Configure(Args const&) override {}
ObjInfo Task() const override { return ObjInfo::kRegression; }
void GetGradient(const HostDeviceVector<bst_float> &preds, const MetaInfo &info, int /*iter*/,
HostDeviceVector<GradientPair> *out_gpair) override {
void GetGradient(const HostDeviceVector<bst_float> &preds, const MetaInfo &info,
std::int32_t /*iter*/, linalg::Matrix<GradientPair> *out_gpair) override {
CHECK_NE(info.labels.Size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds.Size(), info.labels.Size())
<< "labels are not correctly provided"
@@ -41,7 +41,8 @@ class HingeObj : public ObjFunction {
CHECK_EQ(info.weights_.Size(), ndata)
<< "Number of weights should be equal to number of data points.";
}
out_gpair->Resize(ndata);
CHECK_EQ(info.labels.Shape(1), 1) << "Multi-target for `binary:hinge` is not yet supported.";
out_gpair->Reshape(ndata, 1);
common::Transform<>::Init(
[=] XGBOOST_DEVICE(size_t _idx,
common::Span<GradientPair> _out_gpair,
@@ -63,7 +64,7 @@ class HingeObj : public ObjFunction {
},
common::Range{0, static_cast<int64_t>(ndata)}, this->ctx_->Threads(),
ctx_->gpu_id).Eval(
out_gpair, &preds, info.labels.Data(), &info.weights_);
out_gpair->Data(), &preds, info.labels.Data(), &info.weights_);
}
void PredTransform(HostDeviceVector<bst_float> *io_preds) const override {

View File

@@ -21,7 +21,7 @@ void FitIntercept::InitEstimation(MetaInfo const& info, linalg::Vector<float>* b
}
// Avoid altering any state in child objective.
HostDeviceVector<float> dummy_predt(info.labels.Size(), 0.0f, this->ctx_->gpu_id);
HostDeviceVector<GradientPair> gpair(info.labels.Size(), GradientPair{}, this->ctx_->gpu_id);
linalg::Matrix<GradientPair> gpair(info.labels.Shape(), this->ctx_->gpu_id);
Json config{Object{}};
this->SaveConfig(&config);

View File

@@ -109,12 +109,12 @@ class LambdaRankObj : public FitIntercept {
lj_.SetDevice(ctx_->gpu_id);
if (ctx_->IsCPU()) {
cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->gpu_id),
lj_full_.View(ctx_->gpu_id), &ti_plus_, &tj_minus_,
cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()),
lj_full_.View(ctx_->Device()), &ti_plus_, &tj_minus_,
&li_, &lj_, p_cache_);
} else {
cuda_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->gpu_id),
lj_full_.View(ctx_->gpu_id), &ti_plus_, &tj_minus_,
cuda_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()),
lj_full_.View(ctx_->Device()), &ti_plus_, &tj_minus_,
&li_, &lj_, p_cache_);
}
@@ -165,9 +165,8 @@ class LambdaRankObj : public FitIntercept {
void CalcLambdaForGroup(std::int32_t iter, common::Span<float const> g_predt,
linalg::VectorView<float const> g_label, float w,
common::Span<std::size_t const> g_rank, bst_group_t g, Delta delta,
common::Span<GradientPair> g_gpair) {
std::fill_n(g_gpair.data(), g_gpair.size(), GradientPair{});
auto p_gpair = g_gpair.data();
linalg::VectorView<GradientPair> g_gpair) {
std::fill_n(g_gpair.Values().data(), g_gpair.Size(), GradientPair{});
auto ti_plus = ti_plus_.HostView();
auto tj_minus = tj_minus_.HostView();
@@ -198,8 +197,8 @@ class LambdaRankObj : public FitIntercept {
std::size_t idx_high = g_rank[rank_high];
std::size_t idx_low = g_rank[rank_low];
p_gpair[idx_high] += pg;
p_gpair[idx_low] += ng;
g_gpair(idx_high) += pg;
g_gpair(idx_low) += ng;
if (unbiased) {
auto k = ti_plus.Size();
@@ -225,12 +224,13 @@ class LambdaRankObj : public FitIntercept {
MakePairs(ctx_, iter, p_cache_, g, g_label, g_rank, loop);
if (sum_lambda > 0.0) {
double norm = std::log2(1.0 + sum_lambda) / sum_lambda;
std::transform(g_gpair.data(), g_gpair.data() + g_gpair.size(), g_gpair.data(),
[norm](GradientPair const& g) { return g * norm; });
std::transform(g_gpair.Values().data(), g_gpair.Values().data() + g_gpair.Size(),
g_gpair.Values().data(), [norm](GradientPair const& g) { return g * norm; });
}
auto w_norm = p_cache_->WeightNorm();
std::transform(g_gpair.begin(), g_gpair.end(), g_gpair.begin(),
std::transform(g_gpair.Values().data(), g_gpair.Values().data() + g_gpair.Size(),
g_gpair.Values().data(),
[&](GradientPair const& gpair) { return gpair * w * w_norm; });
}
@@ -301,7 +301,7 @@ class LambdaRankObj : public FitIntercept {
}
void GetGradient(HostDeviceVector<float> const& predt, MetaInfo const& info, std::int32_t iter,
HostDeviceVector<GradientPair>* out_gpair) override {
linalg::Matrix<GradientPair>* out_gpair) override {
CHECK_EQ(info.labels.Size(), predt.Size()) << error::LabelScoreSize();
// init/renew cache
@@ -339,7 +339,7 @@ class LambdaRankNDCG : public LambdaRankObj<LambdaRankNDCG, ltr::NDCGCache> {
void CalcLambdaForGroupNDCG(std::int32_t iter, common::Span<float const> g_predt,
linalg::VectorView<float const> g_label, float w,
common::Span<std::size_t const> g_rank,
common::Span<GradientPair> g_gpair,
linalg::VectorView<GradientPair> g_gpair,
linalg::VectorView<double const> inv_IDCG,
common::Span<double const> discount, bst_group_t g) {
auto delta = [&](auto y_high, auto y_low, std::size_t rank_high, std::size_t rank_low,
@@ -351,20 +351,22 @@ class LambdaRankNDCG : public LambdaRankObj<LambdaRankNDCG, ltr::NDCGCache> {
}
void GetGradientImpl(std::int32_t iter, const HostDeviceVector<float>& predt,
const MetaInfo& info, HostDeviceVector<GradientPair>* out_gpair) {
const MetaInfo& info, linalg::Matrix<GradientPair>* out_gpair) {
if (ctx_->IsCUDA()) {
cuda_impl::LambdaRankGetGradientNDCG(
ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->gpu_id),
tj_minus_.View(ctx_->gpu_id), li_full_.View(ctx_->gpu_id), lj_full_.View(ctx_->gpu_id),
out_gpair);
ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->Device()),
tj_minus_.View(ctx_->Device()), li_full_.View(ctx_->Device()),
lj_full_.View(ctx_->Device()), out_gpair);
return;
}
bst_group_t n_groups = p_cache_->Groups();
auto gptr = p_cache_->DataGroupPtr(ctx_);
out_gpair->Resize(info.num_row_);
auto h_gpair = out_gpair->HostSpan();
out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(info.num_row_, 1);
auto h_gpair = out_gpair->HostView();
auto h_predt = predt.ConstHostSpan();
auto h_label = info.labels.HostView();
auto h_weight = common::MakeOptionalWeights(ctx_, info.weights_);
@@ -378,7 +380,8 @@ class LambdaRankNDCG : public LambdaRankObj<LambdaRankNDCG, ltr::NDCGCache> {
std::size_t cnt = gptr[g + 1] - gptr[g];
auto w = h_weight[g];
auto g_predt = h_predt.subspan(gptr[g], cnt);
auto g_gpair = h_gpair.subspan(gptr[g], cnt);
auto g_gpair =
h_gpair.Slice(linalg::Range(static_cast<std::size_t>(gptr[g]), gptr[g] + cnt), 0);
auto g_label = h_label.Slice(make_range(g), 0);
auto g_rank = rank_idx.subspan(gptr[g], cnt);
@@ -420,7 +423,7 @@ void LambdaRankGetGradientNDCG(Context const*, std::int32_t, HostDeviceVector<fl
linalg::VectorView<double const>, // input bias ratio
linalg::VectorView<double const>, // input bias ratio
linalg::VectorView<double>, linalg::VectorView<double>,
HostDeviceVector<GradientPair>*) {
linalg::Matrix<GradientPair>*) {
common::AssertGPUSupport();
}
@@ -470,20 +473,23 @@ void MAPStat(Context const* ctx, linalg::VectorView<float const> label,
class LambdaRankMAP : public LambdaRankObj<LambdaRankMAP, ltr::MAPCache> {
public:
void GetGradientImpl(std::int32_t iter, const HostDeviceVector<float>& predt,
const MetaInfo& info, HostDeviceVector<GradientPair>* out_gpair) {
const MetaInfo& info, linalg::Matrix<GradientPair>* out_gpair) {
CHECK(param_.ndcg_exp_gain) << "NDCG gain can not be set for the MAP objective.";
if (ctx_->IsCUDA()) {
return cuda_impl::LambdaRankGetGradientMAP(
ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->gpu_id),
tj_minus_.View(ctx_->gpu_id), li_full_.View(ctx_->gpu_id), lj_full_.View(ctx_->gpu_id),
out_gpair);
ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->Device()),
tj_minus_.View(ctx_->Device()), li_full_.View(ctx_->Device()),
lj_full_.View(ctx_->Device()), out_gpair);
}
auto gptr = p_cache_->DataGroupPtr(ctx_).data();
bst_group_t n_groups = p_cache_->Groups();
out_gpair->Resize(info.num_row_);
auto h_gpair = out_gpair->HostSpan();
CHECK_EQ(info.labels.Shape(1), 1) << "multi-target for learning to rank is not yet supported.";
out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(info.num_row_, this->Targets(info));
auto h_gpair = out_gpair->HostView();
auto h_label = info.labels.HostView().Slice(linalg::All(), 0);
auto h_predt = predt.ConstHostSpan();
auto rank_idx = p_cache_->SortedIdx(ctx_, h_predt);
@@ -514,7 +520,7 @@ class LambdaRankMAP : public LambdaRankObj<LambdaRankMAP, ltr::MAPCache> {
auto cnt = gptr[g + 1] - gptr[g];
auto w = h_weight[g];
auto g_predt = h_predt.subspan(gptr[g], cnt);
auto g_gpair = h_gpair.subspan(gptr[g], cnt);
auto g_gpair = h_gpair.Slice(linalg::Range(gptr[g], gptr[g] + cnt), 0);
auto g_label = h_label.Slice(make_range(g));
auto g_rank = rank_idx.subspan(gptr[g], cnt);
@@ -545,7 +551,7 @@ void LambdaRankGetGradientMAP(Context const*, std::int32_t, HostDeviceVector<flo
linalg::VectorView<double const>, // input bias ratio
linalg::VectorView<double const>, // input bias ratio
linalg::VectorView<double>, linalg::VectorView<double>,
HostDeviceVector<GradientPair>*) {
linalg::Matrix<GradientPair>*) {
common::AssertGPUSupport();
}
} // namespace cuda_impl
@@ -557,20 +563,22 @@ void LambdaRankGetGradientMAP(Context const*, std::int32_t, HostDeviceVector<flo
class LambdaRankPairwise : public LambdaRankObj<LambdaRankPairwise, ltr::RankingCache> {
public:
void GetGradientImpl(std::int32_t iter, const HostDeviceVector<float>& predt,
const MetaInfo& info, HostDeviceVector<GradientPair>* out_gpair) {
const MetaInfo& info, linalg::Matrix<GradientPair>* out_gpair) {
CHECK(param_.ndcg_exp_gain) << "NDCG gain can not be set for the pairwise objective.";
if (ctx_->IsCUDA()) {
return cuda_impl::LambdaRankGetGradientPairwise(
ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->gpu_id),
tj_minus_.View(ctx_->gpu_id), li_full_.View(ctx_->gpu_id), lj_full_.View(ctx_->gpu_id),
out_gpair);
ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->Device()),
tj_minus_.View(ctx_->Device()), li_full_.View(ctx_->Device()),
lj_full_.View(ctx_->Device()), out_gpair);
}
auto gptr = p_cache_->DataGroupPtr(ctx_);
bst_group_t n_groups = p_cache_->Groups();
out_gpair->Resize(info.num_row_);
auto h_gpair = out_gpair->HostSpan();
out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(info.num_row_, this->Targets(info));
auto h_gpair = out_gpair->HostView();
auto h_label = info.labels.HostView().Slice(linalg::All(), 0);
auto h_predt = predt.ConstHostSpan();
auto h_weight = common::MakeOptionalWeights(ctx_, info.weights_);
@@ -585,7 +593,7 @@ class LambdaRankPairwise : public LambdaRankObj<LambdaRankPairwise, ltr::Ranking
auto cnt = gptr[g + 1] - gptr[g];
auto w = h_weight[g];
auto g_predt = h_predt.subspan(gptr[g], cnt);
auto g_gpair = h_gpair.subspan(gptr[g], cnt);
auto g_gpair = h_gpair.Slice(linalg::Range(gptr[g], gptr[g] + cnt), 0);
auto g_label = h_label.Slice(make_range(g));
auto g_rank = rank_idx.subspan(gptr[g], cnt);
@@ -611,7 +619,7 @@ void LambdaRankGetGradientPairwise(Context const*, std::int32_t, HostDeviceVecto
linalg::VectorView<double const>, // input bias ratio
linalg::VectorView<double const>, // input bias ratio
linalg::VectorView<double>, linalg::VectorView<double>,
HostDeviceVector<GradientPair>*) {
linalg::Matrix<GradientPair>*) {
common::AssertGPUSupport();
}
} // namespace cuda_impl

Some files were not shown because too many files have changed in this diff Show More