enable ROCm on latest XGBoost

This commit is contained in:
Hui Liu
2023-10-23 11:07:08 -07:00
328 changed files with 8028 additions and 3642 deletions

View File

@@ -11,10 +11,10 @@ set_source_files_properties(
PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON)
target_sources(objxgboost PRIVATE ${RABIT_SOURCES})
if (USE_CUDA)
if(USE_CUDA)
file(GLOB_RECURSE CUDA_SOURCES *.cu *.cuh)
target_sources(objxgboost PRIVATE ${CUDA_SOURCES})
endif (USE_CUDA)
endif()
if (USE_HIP)
file(GLOB_RECURSE HIP_SOURCES *.hip *.hip.h)
@@ -27,9 +27,9 @@ target_include_directories(objxgboost
${xgboost_SOURCE_DIR}/dmlc-core/include
${xgboost_SOURCE_DIR}/rabit/include)
if (LOG_CAPI_INVOCATION)
if(LOG_CAPI_INVOCATION)
target_compile_definitions(objxgboost PRIVATE -DLOG_CAPI_INVOCATION=1)
endif (LOG_CAPI_INVOCATION)
endif()
# For MSVC: Call msvc_use_static_runtime() once again to completely
# replace /MD with /MT. See https://github.com/dmlc/xgboost/issues/4462

View File

@@ -271,8 +271,8 @@ XGB_DLL int XGDMatrixCreateFromDataIter(
if (cache_info != nullptr) {
scache = cache_info;
}
xgboost::data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext,
XGBoostBatchCSR> adapter(data_handle, callback);
xgboost::data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR> adapter(
data_handle, callback);
xgboost_CHECK_C_ARG_PTR(out);
*out = new std::shared_ptr<DMatrix> {
DMatrix::Create(
@@ -447,8 +447,11 @@ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr, char const *indices, char
auto config = Json::Load(StringView{c_json_config});
float missing = GetMissing(config);
auto n_threads = OptionalArg<Integer, int64_t>(config, "nthread", 0);
auto data_split_mode =
static_cast<DataSplitMode>(OptionalArg<Integer, int64_t>(config, "data_split_mode", 0));
xgboost_CHECK_C_ARG_PTR(out);
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, n_threads));
*out = new std::shared_ptr<DMatrix>(
DMatrix::Create(&adapter, missing, n_threads, "", data_split_mode));
API_END();
}
@@ -483,8 +486,11 @@ XGB_DLL int XGDMatrixCreateFromCSC(char const *indptr, char const *indices, char
auto config = Json::Load(StringView{c_json_config});
float missing = GetMissing(config);
auto n_threads = OptionalArg<Integer, int64_t>(config, "nthread", common::OmpGetNumThreads(0));
auto data_split_mode =
static_cast<DataSplitMode>(OptionalArg<Integer, int64_t>(config, "data_split_mode", 0));
xgboost_CHECK_C_ARG_PTR(out);
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, n_threads));
*out = new std::shared_ptr<DMatrix>(
DMatrix::Create(&adapter, missing, n_threads, "", data_split_mode));
API_END();
}
@@ -534,33 +540,8 @@ XGB_DLL int XGDMatrixCreateFromDT(void** data, const char** feature_stypes,
API_END();
}
XGB_DLL int XGImportArrowRecordBatch(DataIterHandle data_handle, void *ptr_array,
void *ptr_schema) {
API_BEGIN();
static_cast<data::RecordBatchesIterAdapter *>(data_handle)
->SetData(static_cast<struct ArrowArray *>(ptr_array),
static_cast<struct ArrowSchema *>(ptr_schema));
API_END();
}
XGB_DLL int XGDMatrixCreateFromArrowCallback(XGDMatrixCallbackNext *next, char const *config,
DMatrixHandle *out) {
API_BEGIN();
xgboost_CHECK_C_ARG_PTR(config);
auto jconfig = Json::Load(StringView{config});
auto missing = GetMissing(jconfig);
auto n_batches = RequiredArg<Integer>(jconfig, "nbatch", __func__);
auto n_threads = OptionalArg<Integer, std::int64_t>(jconfig, "nthread", 0);
data::RecordBatchesIterAdapter adapter(next, n_batches);
xgboost_CHECK_C_ARG_PTR(out);
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, n_threads));
API_END();
}
XGB_DLL int XGDMatrixSliceDMatrix(DMatrixHandle handle,
const int* idxset,
xgboost::bst_ulong len,
DMatrixHandle* out) {
XGB_DLL int XGDMatrixSliceDMatrix(DMatrixHandle handle, const int *idxset, xgboost::bst_ulong len,
DMatrixHandle *out) {
xgboost_CHECK_C_ARG_PTR(out);
return XGDMatrixSliceDMatrixEx(handle, idxset, len, out, 0);
}
@@ -749,6 +730,15 @@ XGB_DLL int XGDMatrixNumNonMissing(DMatrixHandle const handle, xgboost::bst_ulon
API_END();
}
// C API accessor: report the data split mode (row/column split) of a DMatrix.
// `out` receives the numeric value of the DMatrix's `data_split_mode` field.
XGB_DLL int XGDMatrixDataSplitMode(DMatrixHandle handle, bst_ulong *out) {
  API_BEGIN();
  CHECK_HANDLE();
  auto p_m = CastDMatrixHandle(handle);
  xgboost_CHECK_C_ARG_PTR(out);
  // The enum value is widened to bst_ulong for the C ABI.
  *out = static_cast<xgboost::bst_ulong>(p_m->Info().data_split_mode);
  API_END();
}
XGB_DLL int XGDMatrixGetDataAsCSR(DMatrixHandle const handle, char const *config,
xgboost::bst_ulong *out_indptr, unsigned *out_indices,
float *out_data) {
@@ -1375,29 +1365,6 @@ XGB_DLL int XGBoosterSaveModelToBuffer(BoosterHandle handle, char const *json_co
API_END();
}
XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle, xgboost::bst_ulong *out_len,
const char **out_dptr) {
API_BEGIN();
CHECK_HANDLE();
auto *learner = static_cast<Learner*>(handle);
std::string& raw_str = learner->GetThreadLocal().ret_str;
raw_str.resize(0);
common::MemoryBufferStream fo(&raw_str);
LOG(WARNING) << error::DeprecatedFunc(__func__, "1.6.0", "XGBoosterSaveModelToBuffer");
learner->Configure();
learner->SaveModel(&fo);
xgboost_CHECK_C_ARG_PTR(out_dptr);
xgboost_CHECK_C_ARG_PTR(out_len);
*out_dptr = dmlc::BeginPtr(raw_str);
*out_len = static_cast<xgboost::bst_ulong>(raw_str.length());
API_END();
}
// The following two functions are `Load` and `Save` for memory based
// serialization methods. E.g. Python pickle.
XGB_DLL int XGBoosterSerializeToBuffer(BoosterHandle handle, xgboost::bst_ulong *out_len,
@@ -1432,36 +1399,13 @@ XGB_DLL int XGBoosterUnserializeFromBuffer(BoosterHandle handle,
API_END();
}
XGB_DLL int XGBoosterLoadRabitCheckpoint(BoosterHandle handle,
int* version) {
API_BEGIN();
CHECK_HANDLE();
auto* bst = static_cast<Learner*>(handle);
xgboost_CHECK_C_ARG_PTR(version);
*version = rabit::LoadCheckPoint();
if (*version != 0) {
bst->Configure();
}
API_END();
}
XGB_DLL int XGBoosterSaveRabitCheckpoint(BoosterHandle handle) {
API_BEGIN();
CHECK_HANDLE();
auto *learner = static_cast<Learner *>(handle);
learner->Configure();
rabit::CheckPoint();
API_END();
}
XGB_DLL int XGBoosterSlice(BoosterHandle handle, int begin_layer,
int end_layer, int step,
XGB_DLL int XGBoosterSlice(BoosterHandle handle, int begin_layer, int end_layer, int step,
BoosterHandle *out) {
API_BEGIN();
CHECK_HANDLE();
xgboost_CHECK_C_ARG_PTR(out);
auto* learner = static_cast<Learner*>(handle);
auto *learner = static_cast<Learner *>(handle);
bool out_of_bound = false;
auto p_out = learner->Slice(begin_layer, end_layer, step, &out_of_bound);
if (out_of_bound) {
@@ -1797,7 +1741,7 @@ XGB_DLL int XGCommunicatorAllreduce(void *send_receive_buffer, size_t count, int
}
#if defined(XGBOOST_USE_FEDERATED)
XGB_DLL int XGBRunFederatedServer(int port, int world_size, char const *server_key_path,
XGB_DLL int XGBRunFederatedServer(int port, std::size_t world_size, char const *server_key_path,
char const *server_cert_path, char const *client_cert_path) {
API_BEGIN();
federated::RunServer(port, world_size, server_key_path, server_cert_path, client_cert_path);
@@ -1805,7 +1749,7 @@ XGB_DLL int XGBRunFederatedServer(int port, int world_size, char const *server_k
}
// Run a server without SSL for local testing.
XGB_DLL int XGBRunInsecureFederatedServer(int port, int world_size) {
XGB_DLL int XGBRunInsecureFederatedServer(int port, std::size_t world_size) {
API_BEGIN();
federated::RunInsecureServer(port, world_size);
API_END();

View File

@@ -75,7 +75,7 @@ void CopyGradientFromCUDAArrays(Context const *ctx, ArrayInterface<2, false> con
auto hess_dev = dh::CudaGetPointerDevice(hess.data);
CHECK_EQ(grad_dev, hess_dev) << "gradient and hessian should be on the same device.";
auto &gpair = *out_gpair;
gpair.SetDevice(grad_dev);
gpair.SetDevice(DeviceOrd::CUDA(grad_dev));
gpair.Reshape(grad.Shape(0), grad.Shape(1));
auto d_gpair = gpair.View(DeviceOrd::CUDA(grad_dev));
auto cuctx = ctx->CUDACtx();
@@ -153,7 +153,7 @@ int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface,
if (learner->Ctx()->IsCUDA()) {
CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead());
}
p_predt->SetDevice(proxy->DeviceIdx());
p_predt->SetDevice(proxy->Device());
auto &shape = learner->GetThreadLocal().prediction_shape;
size_t n_samples = p_m->Info().num_row_;

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2021-2023 by XGBoost Contributors
* Copyright 2021-2023, XGBoost Contributors
*/
#ifndef XGBOOST_C_API_C_API_UTILS_H_
#define XGBOOST_C_API_C_API_UTILS_H_
@@ -13,6 +13,7 @@
#include <utility> // for move
#include <vector>
#include "../common/json_utils.h" // for TypeCheck
#include "xgboost/c_api.h"
#include "xgboost/data.h" // DMatrix
#include "xgboost/feature_map.h" // for FeatureMap
@@ -254,28 +255,6 @@ inline void GenerateFeatureMap(Learner const *learner,
void XGBBuildInfoDevice(Json* p_info);
template <typename JT>
auto const &RequiredArg(Json const &in, StringView key, StringView func) {
auto const &obj = get<Object const>(in);
auto it = obj.find(key);
if (it == obj.cend() || IsA<Null>(it->second)) {
LOG(FATAL) << "Argument `" << key << "` is required for `" << func << "`.";
}
TypeCheck<JT>(it->second, StringView{key});
return get<std::remove_const_t<JT> const>(it->second);
}
template <typename JT, typename T>
auto const &OptionalArg(Json const &in, StringView key, T const &dft) {
auto const &obj = get<Object const>(in);
auto it = obj.find(key);
if (it != obj.cend() && !IsA<Null>(it->second)) {
TypeCheck<JT>(it->second, key);
return get<std::remove_const_t<JT> const>(it->second);
}
return dft;
}
/**
* \brief Get shared ptr from DMatrix C handle with additional checks.
*/

View File

@@ -15,8 +15,7 @@
#include "communicator-inl.cuh"
namespace xgboost {
namespace collective {
namespace xgboost::collective {
/**
* @brief Find the global sum of the given values across all workers.
@@ -31,10 +30,9 @@ namespace collective {
* @param size Number of values to sum.
*/
template <typename T>
void GlobalSum(MetaInfo const& info, int device, T* values, size_t size) {
void GlobalSum(MetaInfo const& info, DeviceOrd device, T* values, size_t size) {
if (info.IsRowSplit()) {
collective::AllReduce<collective::Operation::kSum>(device, values, size);
collective::AllReduce<collective::Operation::kSum>(device.ordinal, values, size);
}
}
} // namespace collective
} // namespace xgboost
} // namespace xgboost::collective

View File

@@ -0,0 +1,88 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#include "allgather.h"
#include <algorithm> // for min, copy_n
#include <cstddef> // for size_t
#include <cstdint> // for int8_t, int32_t, int64_t
#include <memory> // for shared_ptr
#include <numeric> // for partial_sum
#include <vector> // for vector
#include "comm.h" // for Comm, Channel
#include "xgboost/collective/result.h" // for Result
#include "xgboost/span.h" // for Span
namespace xgboost::collective::cpu_impl {
// Fixed-size ring allgather: in each round every worker forwards one segment to its
// ring successor while receiving the preceding segment from its ring predecessor.
// `data` holds world-many segments of `segment_size` bytes (the last one may be
// shorter); on return every worker holds all segments.
Result RingAllgather(Comm const& comm, common::Span<std::int8_t> data, std::size_t segment_size,
                     std::int32_t worker_off, std::shared_ptr<Channel> prev_ch,
                     std::shared_ptr<Channel> next_ch) {
  auto world = comm.World();
  auto rank = comm.Rank();
  CHECK_LT(worker_off, world);

  for (std::int32_t r = 0; r < world; ++r) {
    // Segment forwarded this round. `worker_off` shifts which segment each rank is
    // considered to own (the allreduce pass uses worker_off = 1, since after
    // reduce-scatter a rank holds the reduced segment one slot ahead of its own).
    auto send_rank = (rank + world - r + worker_off) % world;
    auto send_off = send_rank * segment_size;
    // Clamp offset and length so the trailing (possibly short) segment stays in bounds.
    send_off = std::min(send_off, data.size_bytes());
    auto send_seg = data.subspan(send_off, std::min(segment_size, data.size_bytes() - send_off));
    next_ch->SendAll(send_seg.data(), send_seg.size_bytes());

    // Receive the segment our predecessor forwards in the same round.
    auto recv_rank = (rank + world - r - 1 + worker_off) % world;
    auto recv_off = recv_rank * segment_size;
    recv_off = std::min(recv_off, data.size_bytes());
    auto recv_seg = data.subspan(recv_off, std::min(segment_size, data.size_bytes() - recv_off));
    prev_ch->RecvAll(recv_seg.data(), recv_seg.size_bytes());
    // Flush this round's queued send/recv before touching the buffers again.
    auto rc = prev_ch->Block();
    if (!rc.OK()) {
      return rc;
    }
  }
  return Success();
}
// Variable-length ring allgather: `sizes[r]` is the byte count contributed by rank r,
// `data` is this worker's contribution, and `erased_result` must be large enough to
// hold the concatenation of all contributions in rank order.
[[nodiscard]] Result RingAllgatherV(Comm const& comm, common::Span<std::int64_t const> sizes,
                                    common::Span<std::int8_t const> data,
                                    common::Span<std::int8_t> erased_result) {
  auto world = comm.World();
  auto rank = comm.Rank();
  auto prev = BootstrapPrev(rank, comm.World());
  auto next = BootstrapNext(rank, comm.World());
  auto prev_ch = comm.Chan(prev);
  auto next_ch = comm.Chan(next);

  // get worker offset
  // Exclusive prefix sum of `sizes` yields each rank's byte offset into the result.
  std::vector<std::int64_t> offset(world + 1, 0);
  std::partial_sum(sizes.cbegin(), sizes.cend(), offset.begin() + 1);
  CHECK_EQ(*offset.cbegin(), 0);

  // copy data
  // Seed the result with our own contribution before the ring rounds start.
  auto current = erased_result.subspan(offset[rank], data.size_bytes());
  auto erased_data = EraseType(data);
  std::copy_n(erased_data.data(), erased_data.size(), current.data());

  for (std::int32_t r = 0; r < world; ++r) {
    // Forward the chunk received last round (own chunk in round 0) to the successor,
    // receive the predecessor's chunk for this round.
    auto send_rank = (rank + world - r) % world;
    auto send_off = offset[send_rank];
    auto send_size = sizes[send_rank];
    auto send_seg = erased_result.subspan(send_off, send_size);
    next_ch->SendAll(send_seg);

    auto recv_rank = (rank + world - r - 1) % world;
    auto recv_off = offset[recv_rank];
    auto recv_size = sizes[recv_rank];
    auto recv_seg = erased_result.subspan(recv_off, recv_size);
    prev_ch->RecvAll(recv_seg.data(), recv_seg.size_bytes());
    // Wait for this round's traffic to complete before the next round reuses buffers.
    auto rc = prev_ch->Block();
    if (!rc.OK()) {
      return rc;
    }
  }
  return comm.Block();
}
} // namespace xgboost::collective::cpu_impl

View File

@@ -0,0 +1,72 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#pragma once
#include <cstddef> // for size_t
#include <cstdint> // for int32_t
#include <memory> // for shared_ptr
#include <numeric> // for accumulate
#include <type_traits> // for remove_cv_t
#include <vector> // for vector
#include "comm.h" // for Comm, Channel, EraseType
#include "xgboost/collective/result.h" // for Result
#include "xgboost/span.h" // for Span
namespace xgboost::collective {
namespace cpu_impl {
/**
* @param worker_off Segment offset. For example, if the rank 2 worker specifies worker_off
* = 1, then it owns the third segment.
*/
[[nodiscard]] Result RingAllgather(Comm const& comm, common::Span<std::int8_t> data,
std::size_t segment_size, std::int32_t worker_off,
std::shared_ptr<Channel> prev_ch,
std::shared_ptr<Channel> next_ch);
[[nodiscard]] Result RingAllgatherV(Comm const& comm, common::Span<std::int64_t const> sizes,
common::Span<std::int8_t const> data,
common::Span<std::int8_t> erased_result);
} // namespace cpu_impl
/**
 * @brief Typed front end to the byte-level ring allgather.
 *
 * @param data  Buffer holding world-many segments; each worker contributes the segment
 *              it owns and receives everyone else's.
 * @param size  Number of elements (of type T) per segment.
 */
template <typename T>
[[nodiscard]] Result RingAllgather(Comm const& comm, common::Span<T> data, std::size_t size) {
  auto const self = comm.Rank();
  auto const segment_bytes = sizeof(T) * size;
  // Ring neighbours of this worker.
  auto prev_ch = comm.Chan(BootstrapPrev(self, comm.World()));
  auto next_ch = comm.Chan(BootstrapNext(self, comm.World()));
  // Run the type-erased implementation, then drain any remaining channel work.
  if (auto rc = cpu_impl::RingAllgather(comm, EraseType(data), segment_bytes, 0, prev_ch, next_ch);
      !rc.OK()) {
    return rc;
  }
  return comm.Block();
}
/**
 * @brief Typed variable-length allgather: concatenate every worker's `data` into
 *        `*p_out` in rank order.
 *
 * First exchanges the per-worker byte counts so each worker can size the result and
 * compute offsets, then runs the byte-level variable-length ring allgather.
 */
template <typename T>
[[nodiscard]] Result RingAllgatherV(Comm const& comm, common::Span<T> data,
                                    std::vector<std::remove_cv_t<T>>* p_out) {
  auto world = comm.World();
  auto rank = comm.Rank();

  // Share each worker's contribution size (in bytes) with every other worker.
  std::vector<std::int64_t> sizes(world, 0);
  sizes[rank] = data.size_bytes();
  auto rc = RingAllgather(comm, common::Span{sizes.data(), sizes.size()}, 1);
  if (!rc.OK()) {
    return rc;
  }

  std::vector<T>& result = *p_out;
  // Use an int64_t init value: with the literal `0` the accumulator is `int`, which
  // silently truncates once the combined payload exceeds INT_MAX bytes.
  auto n_total_bytes = std::accumulate(sizes.cbegin(), sizes.cend(), std::int64_t{0});
  result.resize(n_total_bytes / sizeof(T));
  auto h_result = common::Span{result.data(), result.size()};
  auto erased_result = EraseType(h_result);
  auto erased_data = EraseType(data);
  return cpu_impl::RingAllgatherV(comm, sizes, erased_data, erased_result);
}
} // namespace xgboost::collective

View File

@@ -0,0 +1,90 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#include "allreduce.h"
#include <algorithm> // for min
#include <cstddef> // for size_t
#include <cstdint> // for int32_t, int8_t
#include <vector> // for vector
#include "../data/array_interface.h" // for Type, DispatchDType
#include "allgather.h" // for RingAllgather
#include "comm.h" // for Comm
#include "xgboost/collective/result.h" // for Result
#include "xgboost/span.h" // for Span
namespace xgboost::collective::cpu_impl {
// Reduce-scatter phase of the ring allreduce: after world - 1 rounds each worker holds
// the fully reduced values for exactly one segment of `data` (segment (rank + 1) % world).
// `op(lhs, out)` accumulates `lhs` into `out`.
template <typename T>
Result RingScatterReduceTyped(Comm const& comm, common::Span<std::int8_t> data,
                              std::size_t n_bytes_in_seg, Func const& op) {
  auto rank = comm.Rank();
  auto world = comm.World();
  auto dst_rank = BootstrapNext(rank, world);
  auto src_rank = BootstrapPrev(rank, world);
  auto next_ch = comm.Chan(dst_rank);
  auto prev_ch = comm.Chan(src_rank);

  // Scratch buffer for the incoming segment; the reduction reads from here and writes
  // into the corresponding slice of `data`.
  std::vector<std::int8_t> buffer(n_bytes_in_seg, 0);
  auto s_buf = common::Span{buffer.data(), buffer.size()};

  for (std::int32_t r = 0; r < world - 1; ++r) {
    // send to ring next
    // Offsets are clamped so the trailing (possibly short) segment stays in bounds.
    auto send_off = ((rank + world - r) % world) * n_bytes_in_seg;
    send_off = std::min(send_off, data.size_bytes());
    auto seg_nbytes = std::min(data.size_bytes() - send_off, n_bytes_in_seg);
    auto send_seg = data.subspan(send_off, seg_nbytes);
    next_ch->SendAll(send_seg);

    // receive from ring prev
    auto recv_off = ((rank + world - r - 1) % world) * n_bytes_in_seg;
    recv_off = std::min(recv_off, data.size_bytes());
    seg_nbytes = std::min(data.size_bytes() - recv_off, n_bytes_in_seg);
    // Partial elements cannot be reduced; segment boundaries must respect sizeof(T).
    CHECK_EQ(seg_nbytes % sizeof(T), 0);
    auto recv_seg = data.subspan(recv_off, seg_nbytes);
    auto seg = s_buf.subspan(0, recv_seg.size());
    prev_ch->RecvAll(seg);
    // Wait for this round's traffic before reducing / reusing the scratch buffer.
    auto rc = prev_ch->Block();
    if (!rc.OK()) {
      return rc;
    }
    // accumulate to recv_seg
    CHECK_EQ(seg.size(), recv_seg.size());
    op(seg, recv_seg);
  }
  return Success();
}
// Ring allreduce over a type-erased buffer: dispatch on the runtime dtype, run the
// reduce-scatter phase, then allgather the reduced segments around the ring.
Result RingAllreduce(Comm const& comm, common::Span<std::int8_t> data, Func const& op,
                     ArrayInterfaceHandler::Type type) {
  return DispatchDType(type, [&](auto t) {
    using T = decltype(t);
    // Divide the data into segments according to the number of workers.
    auto n_bytes_elem = sizeof(T);
    CHECK_EQ(data.size_bytes() % n_bytes_elem, 0);
    auto n = data.size_bytes() / n_bytes_elem;
    auto world = comm.World();
    // Round up so segment boundaries land on whole elements.
    auto n_bytes_in_seg = common::DivRoundUp(n, world) * sizeof(T);
    auto rc = RingScatterReduceTyped<T>(comm, data, n_bytes_in_seg, op);
    if (!rc.OK()) {
      return rc;
    }

    auto prev = BootstrapPrev(comm.Rank(), comm.World());
    auto next = BootstrapNext(comm.Rank(), comm.World());
    auto prev_ch = comm.Chan(prev);
    auto next_ch = comm.Chan(next);

    // After reduce-scatter, rank holds the reduced segment (rank + 1) % world, hence
    // worker_off = 1 for the allgather phase.
    rc = RingAllgather(comm, data, n_bytes_in_seg, 1, prev_ch, next_ch);
    if (!rc.OK()) {
      return rc;
    }
    return comm.Block();
  });
}
} // namespace xgboost::collective::cpu_impl

View File

@@ -0,0 +1,39 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#pragma once
#include <cstdint> // for int8_t
#include <functional> // for function
#include <type_traits> // for is_invocable_v
#include "../data/array_interface.h" // for ArrayInterfaceHandler
#include "comm.h" // for Comm, RestoreType
#include "xgboost/collective/result.h" // for Result
#include "xgboost/span.h" // for Span
namespace xgboost::collective {
namespace cpu_impl {
using Func =
std::function<void(common::Span<std::int8_t const> lhs, common::Span<std::int8_t> out)>;
Result RingAllreduce(Comm const& comm, common::Span<std::int8_t> data, Func const& op,
ArrayInterfaceHandler::Type type);
} // namespace cpu_impl
/**
 * @brief Typed allreduce: erase the element type, run the byte-level ring allreduce,
 *        and restore the type inside the reduction callback.
 *
 * @param redop  Reduction `redop(lhs, out)` accumulating `lhs` into `out`.
 */
template <typename T, typename Fn>
std::enable_if_t<std::is_invocable_v<Fn, common::Span<T const>, common::Span<T>>, Result> Allreduce(
    Comm const& comm, common::Span<T> data, Fn redop) {
  // Byte-level adaptor around the typed reduction.
  auto erased_op = [redop](common::Span<std::int8_t const> raw_lhs,
                           common::Span<std::int8_t> raw_out) {
    CHECK_EQ(raw_lhs.size(), raw_out.size()) << "Invalid input for reduction.";
    redop(RestoreType<T const>(raw_lhs), RestoreType<T>(raw_out));
  };
  return cpu_impl::RingAllreduce(comm, EraseType(data), erased_op, ToDType<T>::kType);
}
} // namespace xgboost::collective

View File

@@ -0,0 +1,84 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#include "broadcast.h"
#include <cmath> // for ceil, log2
#include <cstdint> // for int32_t, int8_t
#include <utility> // for move
#include "../common/bitfield.h" // for TrailingZeroBits, RBitField32
#include "comm.h" // for Comm
#include "xgboost/collective/result.h" // for Result
#include "xgboost/span.h" // for Span
namespace xgboost::collective::cpu_impl {
namespace {
// Parent of `shifted_rank` in the binomial broadcast tree (root shifted to rank 0):
// the parent is found by clearing the lowest set bit among the low `depth + 1` bits,
// i.e. shifted_parent = shifted_rank - 2^k where k is the lowest set-bit position.
std::int32_t ShiftedParentRank(std::int32_t shifted_rank, std::int32_t depth) {
  std::uint32_t mask{std::uint32_t{0} - 1};  // 0xff... (all bits set)
  RBitField32 maskbits{common::Span<std::uint32_t>{&mask, 1}};
  RBitField32 rankbits{
      common::Span<std::uint32_t>{reinterpret_cast<std::uint32_t*>(&shifted_rank), 1}};
  // prepare for counting trailing zeros.
  // Copy the low depth+1 bits of the rank into the mask; the remaining high bits stay
  // set so TrailingZeroBits never scans past them.
  for (std::int32_t i = 0; i < depth + 1; ++i) {
    if (rankbits.Check(i)) {
      maskbits.Set(i);
    } else {
      maskbits.Clear(i);
    }
  }
  CHECK_NE(mask, 0);
  auto k = TrailingZeroBits(mask);
  auto shifted_parent = shifted_rank - (1 << k);
  return shifted_parent;
}
// Map `rank` into the coordinate system where `root` becomes rank 0; `world` is added
// before the modulo to keep the result non-negative.
std::int32_t ShiftLeft(std::int32_t rank, std::int32_t world, std::int32_t root) {
  return (rank + world - root) % world;
}
// Inverse of ShiftLeft: map a shifted rank back to the original ring coordinates.
std::int32_t ShiftRight(std::int32_t rank, std::int32_t world, std::int32_t root) {
  return (rank + root) % world;
}
} // namespace
// Broadcast `data` from `root` to every worker using a binomial tree.
Result Broadcast(Comm const& comm, common::Span<std::int8_t> data, std::int32_t root) {
  // Binomial tree broadcast
  // * Wiki
  // https://en.wikipedia.org/wiki/Broadcast_(parallel_pattern)#Binomial_Tree_Broadcast
  // * Impl
  // https://people.mpi-inf.mpg.de/~mehlhorn/ftp/NewToolbox/collective.pdf
  auto rank = comm.Rank();
  auto world = comm.World();

  // shift root to rank 0
  auto shifted_rank = ShiftLeft(rank, world, root);
  // Tree depth minus one; for world == 1 this is -1 and both phases below are no-ops.
  std::int32_t depth = std::ceil(std::log2(static_cast<double>(world))) - 1;

  if (shifted_rank != 0) {  // not root
    // Every non-root worker first receives the payload from its tree parent.
    auto parent = ShiftRight(ShiftedParentRank(shifted_rank, depth), world, root);
    comm.Chan(parent)->RecvAll(data);
    auto rc = comm.Chan(parent)->Block();
    if (!rc.OK()) {
      return Fail("broadcast failed.", std::move(rc));
    }
  }

  // Then forward the payload down the subtree, halving the step each level.
  for (std::int32_t i = depth; i >= 0; --i) {
    CHECK_GE((i + 1), 0);  // weird clang-tidy error that i might be negative
    if (shifted_rank % (1 << (i + 1)) == 0 && shifted_rank + (1 << i) < world) {
      auto sft_peer = shifted_rank + (1 << i);
      auto peer = ShiftRight(sft_peer, world, root);
      CHECK_NE(peer, root);
      comm.Chan(peer)->SendAll(data);
    }
  }
  return comm.Block();
}
} // namespace xgboost::collective::cpu_impl

View File

@@ -0,0 +1,26 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#pragma once
#include <cstdint> // for int32_t, int8_t
#include "comm.h" // for Comm
#include "xgboost/collective/result.h" // for
#include "xgboost/span.h" // for Span
namespace xgboost::collective {
namespace cpu_impl {
Result Broadcast(Comm const& comm, common::Span<std::int8_t> data, std::int32_t root);
}
/**
* @brief binomial tree broadcast is used on CPU with the default implementation.
*/
/**
 * @brief binomial tree broadcast is used on CPU with the default implementation.
 */
template <typename T>
[[nodiscard]] Result Broadcast(Comm const& comm, common::Span<T> data, std::int32_t root) {
  // View the payload as raw bytes and delegate to the byte-level implementation.
  common::Span<std::int8_t> raw{reinterpret_cast<std::int8_t*>(data.data()), data.size_bytes()};
  return cpu_impl::Broadcast(comm, raw, root);
}
} // namespace xgboost::collective

304
src/collective/comm.cc Normal file
View File

@@ -0,0 +1,304 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#include "comm.h"
#include <algorithm> // for copy
#include <chrono> // for seconds
#include <memory> // for shared_ptr
#include <string> // for string
#include <utility> // for move, forward
#include "allgather.h"
#include "protocol.h" // for kMagic
#include "xgboost/base.h" // for XGBOOST_STRICT_R_MODE
#include "xgboost/collective/socket.h" // for TCPSocket
#include "xgboost/json.h" // for Json, Object
#include "xgboost/string_view.h" // for StringView
namespace xgboost::collective {
// Store tracker coordinates and retry/timeout policy; no network activity happens here.
Comm::Comm(std::string const& host, std::int32_t port, std::chrono::seconds timeout,
           std::int32_t retry, std::string task_id)
    : timeout_{timeout},
      retry_{retry},
      tracker_{host, port, -1},  // rank is unknown until bootstrap (-1)
      task_id_{std::move(task_id)},
      loop_{std::make_shared<Loop>(timeout)} {}
// Open a blocking connection to the tracker and run the initial handshake: magic-number
// verification followed by worker identification (world size, rank, task id).
Result ConnectTrackerImpl(proto::PeerInfo info, std::chrono::seconds timeout, std::int32_t retry,
                          std::string const& task_id, TCPSocket* out, std::int32_t rank,
                          std::int32_t world) {
  // get information from tracker
  CHECK(!info.host.empty());
  auto rc = Connect(info.host, info.port, retry, timeout, out);
  if (!rc.OK()) {
    return Fail("Failed to connect to the tracker.", std::move(rc));
  }

  TCPSocket& tracker = *out;
  // NOTE(review): the `<<` chain presumably stops at the first failing step — confirm
  // against Result::operator<< semantics.
  return std::move(rc)
      << [&] { return tracker.NonBlocking(false); }
      << [&] { return tracker.RecvTimeout(timeout); }
      << [&] { return proto::Magic{}.Verify(&tracker); }
      << [&] { return proto::Connect{}.WorkerSend(&tracker, world, rank, task_id); };
}
// Connect to the tracker using this communicator's stored coordinates and identity.
[[nodiscard]] Result Comm::ConnectTracker(TCPSocket* out) const {
  return ConnectTrackerImpl(this->TrackerInfo(), this->Timeout(), this->retry_, this->task_id_, out,
                            this->Rank(), this->World());
}
// Establish a direct TCP connection to every other worker. First the ring neighbours
// are connected (next via `ninfo`, prev via `listener`), then host names and ports are
// exchanged over the ring, and finally the full mesh is built: each worker dials every
// higher rank and accepts connections from every lower rank.
[[nodiscard]] Result ConnectWorkers(Comm const& comm, TCPSocket* listener, std::int32_t lport,
                                    proto::PeerInfo ninfo, std::chrono::seconds timeout,
                                    std::int32_t retry,
                                    std::vector<std::shared_ptr<TCPSocket>>* out_workers) {
  auto next = std::make_shared<TCPSocket>();
  auto prev = std::make_shared<TCPSocket>();

  // Dial the ring successor while accepting the ring predecessor's connection.
  auto rc = Success() << [&] {
    auto rc = Connect(ninfo.host, ninfo.port, retry, timeout, next.get());
    if (!rc.OK()) {
      return Fail("Bootstrap failed to connect to ring next.", std::move(rc));
    }
    return rc;
  } << [&] {
    return next->NonBlocking(true);
  } << [&] {
    SockAddrV4 addr;
    return listener->Accept(prev.get(), &addr);
  } << [&] { return prev->NonBlocking(true); };
  if (!rc.OK()) {
    return rc;
  }

  // exchange host name and port
  // Fixed-width slots of HOST_NAME_MAX bytes per worker; each worker writes the host of
  // its ring successor into its own slot before the allgather.
  std::vector<std::int8_t> buffer(HOST_NAME_MAX * comm.World(), 0);
  auto s_buffer = common::Span{buffer.data(), buffer.size()};
  auto next_host = s_buffer.subspan(HOST_NAME_MAX * comm.Rank(), HOST_NAME_MAX);
  if (next_host.size() < ninfo.host.size()) {
    return Fail("Got an invalid host name.");
  }
  std::copy(ninfo.host.cbegin(), ninfo.host.cend(), next_host.begin());

  auto prev_ch = std::make_shared<Channel>(comm, prev);
  auto next_ch = std::make_shared<Channel>(comm, next);
  // Flush pending traffic on both ring channels.
  auto block = [&] {
    for (auto ch : {prev_ch, next_ch}) {
      auto rc = ch->Block();
      if (!rc.OK()) {
        return rc;
      }
    }
    return Success();
  };

  rc = std::move(rc) << [&] {
    return cpu_impl::RingAllgather(comm, s_buffer, HOST_NAME_MAX, 0, prev_ch, next_ch);
  } << [&] { return block(); };
  if (!rc.OK()) {
    return Fail("Failed to get host names from peers.", std::move(rc));
  }

  // Same exchange for the listening ports (each worker contributes its successor's port).
  std::vector<std::int32_t> peers_port(comm.World(), -1);
  peers_port[comm.Rank()] = ninfo.port;
  rc = std::move(rc) << [&] {
    auto s_ports = common::Span{reinterpret_cast<std::int8_t*>(peers_port.data()),
                                peers_port.size() * sizeof(ninfo.port)};
    return cpu_impl::RingAllgather(comm, s_ports, sizeof(ninfo.port), 0, prev_ch, next_ch);
  } << [&] { return block(); };
  if (!rc.OK()) {
    return Fail("Failed to get the port from peers.", std::move(rc));
  }

  // Slot r holds the endpoint of rank r's ring successor, so the info in slot r belongs
  // to rank BootstrapNext(r).
  std::vector<proto::PeerInfo> peers(comm.World());
  for (auto r = 0; r < comm.World(); ++r) {
    auto nhost = s_buffer.subspan(HOST_NAME_MAX * r, HOST_NAME_MAX);
    auto nport = peers_port[r];
    auto nrank = BootstrapNext(r, comm.World());
    peers[nrank] = {std::string{reinterpret_cast<char const*>(nhost.data())}, nport, nrank};
  }
  // Sanity check: what the ring reports for us must match our own listening port.
  CHECK_EQ(peers[comm.Rank()].port, lport);
  for (auto const& p : peers) {
    CHECK_NE(p.port, -1);
  }

  std::vector<std::shared_ptr<TCPSocket>>& workers = *out_workers;
  workers.resize(comm.World());

  // Dial every higher rank, announcing our rank after connecting.
  for (std::int32_t r = (comm.Rank() + 1); r < comm.World(); ++r) {
    auto const& peer = peers[r];
    std::shared_ptr<TCPSocket> worker{TCPSocket::CreatePtr(comm.Domain())};
    rc = std::move(rc)
         << [&] { return Connect(peer.host, peer.port, retry, timeout, worker.get()); }
         << [&] { return worker->RecvTimeout(timeout); };
    if (!rc.OK()) {
      return rc;
    }
    auto rank = comm.Rank();
    auto n_bytes = worker->SendAll(&rank, sizeof(comm.Rank()));
    if (n_bytes != sizeof(comm.Rank())) {
      return Fail("Failed to send rank.");
    }
    workers[r] = std::move(worker);
  }

  // Accept a connection from every lower rank; the peer sends its rank first so we know
  // which slot the socket belongs in.
  for (std::int32_t r = 0; r < comm.Rank(); ++r) {
    SockAddrV4 addr;
    auto peer = std::shared_ptr<TCPSocket>(TCPSocket::CreatePtr(comm.Domain()));
    rc = std::move(rc) << [&] { return listener->Accept(peer.get(), &addr); }
                       << [&] { return peer->RecvTimeout(timeout); };
    if (!rc.OK()) {
      return rc;
    }
    std::int32_t rank{-1};
    auto n_bytes = peer->RecvAll(&rank, sizeof(rank));
    if (n_bytes != sizeof(comm.Rank())) {
      return Fail("Failed to recv rank.");
    }
    workers[rank] = std::move(peer);
  }

  // Every slot except our own must now hold a live socket.
  for (std::int32_t r = 0; r < comm.World(); ++r) {
    if (r == comm.Rank()) {
      continue;
    }
    CHECK(workers[r]);
  }

  return Success();
}
// Construct and eagerly bootstrap the ring topology; construction fails hard if the
// tracker or the ring neighbours cannot be reached.
RabitComm::RabitComm(std::string const& host, std::int32_t port, std::chrono::seconds timeout,
                     std::int32_t retry, std::string task_id)
    // `host` is a const reference, so the previous `std::move(host)` was a no-op copy
    // (clang-tidy: performance-move-const-arg); pass it through directly. `task_id` is
    // a by-value sink parameter and is genuinely moved.
    : Comm{host, port, timeout, retry, std::move(task_id)} {
  auto rc = this->Bootstrap(timeout_, retry_, task_id_);
  CHECK(rc.OK()) << rc.Report();
}
// Join the cluster: handshake with the tracker, set up an error-notification listener,
// learn the world size and ring neighbours, then build direct connections to all peers.
[[nodiscard]] Result RabitComm::Bootstrap(std::chrono::seconds timeout, std::int32_t retry,
                                          std::string task_id) {
  TCPSocket tracker;
  // World size is not known yet; -1 tells the tracker to fill it in.
  std::int32_t world{-1};
  auto rc = ConnectTrackerImpl(this->TrackerInfo(), timeout, retry, task_id, &tracker, this->Rank(),
                               world);
  if (!rc.OK()) {
    return Fail("Bootstrap failed.", std::move(rc));
  }

  this->domain_ = tracker.Domain();

  // Start command
  // Listener on an OS-chosen port; peers connect here during ConnectWorkers.
  TCPSocket listener = TCPSocket::Create(tracker.Domain());
  std::int32_t lport = listener.BindHost();
  listener.Listen();

  // create worker for listening to error notice.
  // The tracker connects to this socket when another worker fails; on notification we
  // shut down and terminate the process (exit, unless built for R).
  auto domain = tracker.Domain();
  std::shared_ptr<TCPSocket> error_sock{TCPSocket::CreatePtr(domain)};
  auto eport = error_sock->BindHost();
  error_sock->Listen();
  error_worker_ = std::thread{[this, error_sock = std::move(error_sock)] {
    auto conn = error_sock->Accept();
    // On Windows accept returns an invalid socket after network is shutdown.
    if (conn.IsClosed()) {
      return;
    }
    LOG(WARNING) << "Another worker is running into error.";
    std::string scmd;
    conn.Recv(&scmd);
    auto jcmd = Json::Load(scmd);

    auto rc = this->Shutdown();
    if (!rc.OK()) {
      LOG(WARNING) << "Fail to shutdown worker:" << rc.Report();
    }

#if !defined(XGBOOST_STRICT_R_MODE) || XGBOOST_STRICT_R_MODE == 0
    exit(-1);
#else
    LOG(FATAL) << rc.Report();
#endif
  }};
  error_worker_.detach();

  // Announce our listening/error ports and receive the world size.
  proto::Start start;
  rc = std::move(rc) << [&] { return start.WorkerSend(lport, &tracker, eport); }
                     << [&] { return start.WorkerRecv(&tracker, &world); };
  if (!rc.OK()) {
    return rc;
  }
  this->world_ = world;

  // get ring neighbors
  std::string snext;
  tracker.Recv(&snext);
  auto jnext = Json::Load(StringView{snext});
  proto::PeerInfo ninfo{jnext};

  // get the rank of this worker
  // The tracker sends our ring successor's info, so our rank is its predecessor.
  this->rank_ = BootstrapPrev(ninfo.rank, world);
  this->tracker_.rank = rank_;

  std::vector<std::shared_ptr<TCPSocket>> workers;
  rc = ConnectWorkers(*this, &listener, lport, ninfo, timeout, retry, &workers);
  if (!rc.OK()) {
    return rc;
  }

  // Wrap each peer socket in a non-blocking, no-delay channel; the slot for our own
  // rank is a null socket.
  CHECK(this->channels_.empty());
  for (auto& w : workers) {
    if (w) {
      w->SetNoDelay();
      rc = w->NonBlocking(true);
    }
    if (!rc.OK()) {
      return rc;
    }
    this->channels_.emplace_back(std::make_shared<Channel>(*this, w));
  }
  return rc;
}
// Best-effort teardown: notify the tracker on destruction. Failures are logged rather
// than thrown, even though the destructor is declared noexcept(false).
RabitComm::~RabitComm() noexcept(false) {
  // Single-process (non-distributed) communicators have nothing to tear down.
  if (!IsDistributed()) {
    return;
  }
  auto rc = this->Shutdown();
  if (!rc.OK()) {
    LOG(WARNING) << rc.Report();
  }
}
// Gracefully leave the cluster: reconnect to the tracker, flush pending collective
// work, then announce shutdown.
[[nodiscard]] Result RabitComm::Shutdown() {
  TCPSocket tracker;
  return Success() << [&] {
    return ConnectTrackerImpl(tracker_, timeout_, retry_, task_id_, &tracker, Rank(), World());
  } << [&] {
    // Drain queued channel work before telling the tracker we are gone.
    return this->Block();
  } << [&] {
    Json jcmd{Object{}};
    jcmd["cmd"] = Integer{static_cast<std::int32_t>(proto::CMD::kShutdown)};
    auto scmd = Json::Dump(jcmd);
    auto n_bytes = tracker.Send(scmd);
    if (n_bytes != scmd.size()) {
      // Fixed misspelled error message ("Faled" -> "Failed").
      return Fail("Failed to send cmd.");
    }
    return Success();
  };
}
// Send a log message to the tracker over a fresh, short-lived connection.
[[nodiscard]] Result RabitComm::LogTracker(std::string msg) const {
  TCPSocket out;
  proto::Print print;
  return Success() << [&] { return this->ConnectTracker(&out); }
                   << [&] { return print.WorkerSend(&out, msg); };
}
// Report a local failure to the tracker so it can notify the other workers.
[[nodiscard]] Result RabitComm::SignalError(Result const& res) {
  TCPSocket out;
  return Success() << [&] { return this->ConnectTracker(&out); }
                   << [&] { return proto::ErrorCMD{}.WorkerSend(&out, res); };
}
} // namespace xgboost::collective

156
src/collective/comm.h Normal file
View File

@@ -0,0 +1,156 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#pragma once
#include <chrono> // for seconds
#include <cstddef> // for size_t
#include <cstdint> // for int32_t
#include <memory> // for shared_ptr
#include <string> // for string
#include <thread> // for thread
#include <type_traits> // for remove_const_t
#include <utility> // for move
#include <vector> // for vector
#include "loop.h" // for Loop
#include "protocol.h" // for PeerInfo
#include "xgboost/collective/result.h" // for Result
#include "xgboost/collective/socket.h" // for TCPSocket
#include "xgboost/span.h" // for Span
namespace xgboost::collective {
// Default socket-operation timeout: 5 minutes.
inline constexpr std::int32_t DefaultTimeoutSec() { return 300; }
// Default number of connection retries.
inline constexpr std::int32_t DefaultRetry() { return 3; }
// indexing into the ring
// Next rank on the bootstrap ring; wraps back to 0 after the last worker.
inline std::int32_t BootstrapNext(std::int32_t r, std::int32_t world) {
  // `world` is added before the modulus to keep the left operand non-negative.
  return (r + world + 1) % world;
}
// Previous rank on the bootstrap ring; wraps to world - 1 below rank 0.
inline std::int32_t BootstrapPrev(std::int32_t r, std::int32_t world) {
  return (r + world - 1) % world;
}
class Channel;
/**
 * @brief Base communicator storing info about the tracker and other communicators.
 */
class Comm {
 protected:
  std::int32_t world_{1};  // Number of workers; 1 means non-distributed.
  std::int32_t rank_{0};   // Rank of this worker, in [0, world_).
  std::chrono::seconds timeout_{DefaultTimeoutSec()};  // Socket operation timeout.
  std::int32_t retry_{DefaultRetry()};                 // Connection retry budget.
  proto::PeerInfo tracker_;             // Host/port/rank of the tracker.
  SockDomain domain_{SockDomain::kV4};  // Socket address family.
  std::thread error_worker_;  // NOTE(review): not referenced in this header; presumably an
                              // error-monitoring thread — confirm usage in the .cc file.
  std::string task_id_;       // Identifier sent to the tracker for this task.
  std::vector<std::shared_ptr<Channel>> channels_;  // One channel per peer worker.
  std::shared_ptr<Loop> loop_{new Loop{std::chrono::seconds{
      DefaultTimeoutSec()}}};  // fixme: require federated comm to have a timeout

 public:
  Comm() = default;
  Comm(std::string const& host, std::int32_t port, std::chrono::seconds timeout, std::int32_t retry,
       std::string task_id);
  virtual ~Comm() noexcept(false) {}  // NOLINT
  // Non-copyable and non-movable: each Channel stores a `Comm const&` back to
  // this object, so relocation would dangle those references.
  Comm(Comm const& that) = delete;
  Comm& operator=(Comm const& that) = delete;
  Comm(Comm&& that) = delete;
  Comm& operator=(Comm&& that) = delete;

  [[nodiscard]] auto TrackerInfo() const { return tracker_; }
  [[nodiscard]] Result ConnectTracker(TCPSocket* out) const;
  [[nodiscard]] auto Domain() const { return domain_; }
  [[nodiscard]] auto Timeout() const { return timeout_; }

  [[nodiscard]] auto Rank() const { return rank_; }
  [[nodiscard]] auto World() const { return world_; }
  [[nodiscard]] bool IsDistributed() const { return World() > 1; }
  // Queue an asynchronous socket operation on the shared event loop.
  void Submit(Loop::Op op) const { loop_->Submit(op); }
  // Wait until all queued operations finish; returns the loop's status.
  [[nodiscard]] Result Block() const { return loop_->Block(); }

  // Channel connected to the given peer rank (throws if out of range).
  [[nodiscard]] virtual std::shared_ptr<Channel> Chan(std::int32_t rank) const {
    return channels_.at(rank);
  }
  [[nodiscard]] virtual bool IsFederated() const = 0;
  // Forward a log message to the tracker.
  [[nodiscard]] virtual Result LogTracker(std::string msg) const = 0;
  // Notify the tracker of a local error; default implementation is a no-op.
  [[nodiscard]] virtual Result SignalError(Result const&) { return Success(); }
};
/**
 * @brief Non-federated communicator implemented over plain TCP sockets.
 */
class RabitComm : public Comm {
  // Connect to the tracker and the ring neighbours to establish the topology.
  [[nodiscard]] Result Bootstrap(std::chrono::seconds timeout, std::int32_t retry,
                                 std::string task_id);
  // Notify the tracker that this worker is leaving.
  [[nodiscard]] Result Shutdown();

 public:
  // bootstrapping construction.
  RabitComm() = default;
  // ctor for testing where environment is known.
  RabitComm(std::string const& host, std::int32_t port, std::chrono::seconds timeout,
            std::int32_t retry, std::string task_id);
  // May throw if Shutdown fails during destruction; hence noexcept(false).
  ~RabitComm() noexcept(false) override;

  [[nodiscard]] bool IsFederated() const override { return false; }
  [[nodiscard]] Result LogTracker(std::string msg) const override;
  [[nodiscard]] Result SignalError(Result const&) override;
};
/**
 * @brief Communication channel between workers.
 *
 * Send/recv calls only *queue* operations on the communicator's event loop;
 * call Block() to wait for completion and obtain the status.
 */
class Channel {
  std::shared_ptr<TCPSocket> sock_{nullptr};  // Socket connected to the peer.
  Result rc_;  // NOTE(review): never written within this header — confirm intended use.
  Comm const& comm_;  // Owning communicator; provides the event loop.

 public:
  explicit Channel(Comm const& comm, std::shared_ptr<TCPSocket> sock)
      : sock_{std::move(sock)}, comm_{comm} {}

  // Queue an asynchronous send of n bytes starting at ptr.
  void SendAll(std::int8_t const* ptr, std::size_t n) {
    // const_cast is needed because Loop::Op stores a single mutable pointer
    // for both reads and writes; write ops only read from the buffer.
    Loop::Op op{Loop::Op::kWrite, comm_.Rank(), const_cast<std::int8_t*>(ptr), n, sock_.get(), 0};
    CHECK(sock_.get());
    comm_.Submit(std::move(op));
  }
  void SendAll(common::Span<std::int8_t const> data) {
    this->SendAll(data.data(), data.size_bytes());
  }

  // Queue an asynchronous receive of n bytes into ptr.
  void RecvAll(std::int8_t* ptr, std::size_t n) {
    Loop::Op op{Loop::Op::kRead, comm_.Rank(), ptr, n, sock_.get(), 0};
    CHECK(sock_.get());
    comm_.Submit(std::move(op));
  }
  void RecvAll(common::Span<std::int8_t> data) { this->RecvAll(data.data(), data.size_bytes()); }

  [[nodiscard]] auto Socket() const { return sock_; }
  // Wait for all submitted operations on the communicator's loop.
  [[nodiscard]] Result Block() { return comm_.Block(); }
};
// Reduce operations supported by allreduce.
enum class Op { kMax = 0, kMin = 1, kSum = 2, kBitwiseAND = 3, kBitwiseOR = 4, kBitwiseXOR = 5 };

/**
 * @brief View a typed span as raw bytes; const-ness of T is propagated to the
 *        byte type U.
 */
template <typename T, typename U = std::conditional_t<std::is_const_v<T>,
                                                      std::add_const_t<std::int8_t>, std::int8_t>>
common::Span<U> EraseType(common::Span<T> data) {
  auto n_total_bytes = data.size_bytes();
  auto erased = common::Span{reinterpret_cast<std::add_pointer_t<U>>(data.data()), n_total_bytes};
  return erased;
}

/**
 * @brief Reinterpret a byte span as a span of T.  Trailing bytes that do not
 *        fill a whole T are dropped by the integer division.
 */
template <typename T, typename U>
common::Span<T> RestoreType(common::Span<U> data) {
  // Only byte spans (as produced by EraseType) may be restored.
  static_assert(std::is_same_v<std::remove_const_t<U>, std::int8_t>);
  auto n_total_bytes = data.size_bytes();
  auto restored = common::Span{reinterpret_cast<T*>(data.data()), n_total_bytes / sizeof(T)};
  return restored;
}
} // namespace xgboost::collective

View File

@@ -57,9 +57,7 @@ namespace collective {
* - federated_client_key: Client key file path. Only needed for the SSL mode.
* - federated_client_cert: Client certificate file path. Only needed for the SSL mode.
*/
inline void Init(Json const& config) {
Communicator::Init(config);
}
inline void Init(Json const &config) { Communicator::Init(config); }
/*!
* \brief Finalize the collective communicator.
@@ -141,17 +139,89 @@ inline void Broadcast(std::string *sendrecv_data, int root) {
}
}
/**
* @brief Gathers a single value all processes and distributes the result to all processes.
*
* @param input The single value.
*/
template <typename T>
inline std::vector<T> Allgather(T const &input) {
std::string_view str_input{reinterpret_cast<char const *>(&input), sizeof(T)};
auto const output = Communicator::Get()->AllGather(str_input);
CHECK_EQ(output.size() % sizeof(T), 0);
std::vector<T> result(output.size() / sizeof(T));
std::memcpy(reinterpret_cast<void *>(result.data()), output.data(), output.size());
return result;
}
/**
* @brief Gathers data from all processes and distributes it to all processes.
*
* This assumes all ranks have the same size, and input data has been sliced into the
* corresponding position.
* This assumes all ranks have the same size.
*
* @param send_receive_buffer Buffer storing the data.
* @param size Size of the data in bytes.
* @param input Buffer storing the data.
*/
inline void Allgather(void *send_receive_buffer, std::size_t size) {
Communicator::Get()->AllGather(send_receive_buffer, size);
template <typename T>
inline std::vector<T> Allgather(std::vector<T> const &input) {
if (input.empty()) {
return input;
}
std::string_view str_input{reinterpret_cast<char const *>(input.data()),
input.size() * sizeof(T)};
auto const output = Communicator::Get()->AllGather(str_input);
CHECK_EQ(output.size() % sizeof(T), 0);
std::vector<T> result(output.size() / sizeof(T));
std::memcpy(reinterpret_cast<void *>(result.data()), output.data(), output.size());
return result;
}
/**
* @brief Gathers variable-length data from all processes and distributes it to all processes.
* @param input Buffer storing the data.
*/
template <typename T>
inline std::vector<T> AllgatherV(std::vector<T> const &input) {
std::string_view str_input{reinterpret_cast<char const *>(input.data()),
input.size() * sizeof(T)};
auto const output = Communicator::Get()->AllGatherV(str_input);
CHECK_EQ(output.size() % sizeof(T), 0);
std::vector<T> result(output.size() / sizeof(T));
if (!output.empty()) {
std::memcpy(reinterpret_cast<void *>(result.data()), output.data(), output.size());
}
return result;
}
/**
* @brief Gathers variable-length strings from all processes and distributes them to all processes.
* @param input Variable-length list of variable-length strings.
*/
inline std::vector<std::string> AllgatherStrings(std::vector<std::string> const &input) {
std::size_t total_size{0};
for (auto const &s : input) {
total_size += s.length() + 1; // +1 for null-terminators
}
std::string flat_string;
flat_string.reserve(total_size);
for (auto const &s : input) {
flat_string.append(s);
flat_string.push_back('\0'); // Append a null-terminator after each string
}
auto const output = Communicator::Get()->AllGatherV(flat_string);
std::vector<std::string> result;
std::size_t start_index = 0;
// Iterate through the output, find each null-terminated substring.
for (std::size_t i = 0; i < output.size(); i++) {
if (output[i] == '\0') {
// Construct a std::string from the char* substring
result.emplace_back(&output[start_index]);
// Move to the next substring
start_index = i + 1;
}
}
return result;
}
/*!
@@ -226,7 +296,7 @@ inline void Allreduce(double *send_receive_buffer, size_t count) {
}
template <typename T>
struct AllgatherVResult {
struct SpecialAllgatherVResult {
std::vector<std::size_t> offsets;
std::vector<std::size_t> sizes;
std::vector<T> result;
@@ -241,14 +311,10 @@ struct AllgatherVResult {
* @param sizes Sizes of each input.
*/
template <typename T>
inline AllgatherVResult<T> AllgatherV(std::vector<T> const &inputs,
std::vector<std::size_t> const &sizes) {
auto num_inputs = sizes.size();
inline SpecialAllgatherVResult<T> SpecialAllgatherV(std::vector<T> const &inputs,
std::vector<std::size_t> const &sizes) {
// Gather the sizes across all workers.
std::vector<std::size_t> all_sizes(num_inputs * GetWorldSize());
std::copy_n(sizes.cbegin(), sizes.size(), all_sizes.begin() + num_inputs * GetRank());
collective::Allgather(all_sizes.data(), all_sizes.size() * sizeof(std::size_t));
auto const all_sizes = Allgather(sizes);
// Calculate input offsets (std::exclusive_scan).
std::vector<std::size_t> offsets(all_sizes.size());
@@ -257,11 +323,7 @@ inline AllgatherVResult<T> AllgatherV(std::vector<T> const &inputs,
}
// Gather all the inputs.
auto total_input_size = offsets.back() + all_sizes.back();
std::vector<T> all_inputs(total_input_size);
std::copy_n(inputs.cbegin(), inputs.size(), all_inputs.begin() + offsets[num_inputs * GetRank()]);
// We cannot use allgather here, since each worker might have a different size.
Allreduce<Operation::kMax>(all_inputs.data(), all_inputs.size());
auto const all_inputs = AllgatherV(inputs);
return {offsets, all_sizes, all_inputs};
}

View File

@@ -11,9 +11,7 @@
#include "../../plugin/federated/federated_communicator.h"
#endif
namespace xgboost {
namespace collective {
namespace xgboost::collective {
thread_local std::unique_ptr<Communicator> Communicator::communicator_{new NoOpCommunicator()};
thread_local CommunicatorType Communicator::type_{};
@@ -57,6 +55,4 @@ void Communicator::Finalize() {
communicator_.reset(new NoOpCommunicator());
}
#endif
} // namespace collective
} // namespace xgboost
} // namespace xgboost::collective

View File

@@ -125,13 +125,17 @@ class Communicator {
/**
* @brief Gathers data from all processes and distributes it to all processes.
*
* This assumes all ranks have the same size, and input data has been sliced into the
* corresponding position.
* This assumes all ranks have the same size.
*
* @param send_receive_buffer Buffer storing the data.
* @param size Size of the data in bytes.
* @param input Buffer storing the data.
*/
virtual void AllGather(void *send_receive_buffer, std::size_t size) = 0;
virtual std::string AllGather(std::string_view input) = 0;
/**
* @brief Gathers variable-length data from all processes and distributes it to all processes.
* @param input Buffer storing the data.
*/
virtual std::string AllGatherV(std::string_view input) = 0;
/**
* @brief Combines values from all processes and distributes the result back to all processes.

View File

@@ -40,12 +40,10 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
}
dh::safe_cuda(cudaSetDevice(device_ordinal_));
host_buffer_.resize(send_size * world_size_);
dh::safe_cuda(cudaMemcpy(host_buffer_.data() + rank_ * send_size, send_buffer, send_size,
cudaMemcpyDefault));
Allgather(host_buffer_.data(), host_buffer_.size());
dh::safe_cuda(
cudaMemcpy(receive_buffer, host_buffer_.data(), host_buffer_.size(), cudaMemcpyDefault));
host_buffer_.resize(send_size);
dh::safe_cuda(cudaMemcpy(host_buffer_.data(), send_buffer, send_size, cudaMemcpyDefault));
auto const output = Allgather(host_buffer_);
dh::safe_cuda(cudaMemcpy(receive_buffer, output.data(), output.size(), cudaMemcpyDefault));
}
void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector<std::size_t> *segments,

View File

@@ -60,11 +60,16 @@ class InMemoryCommunicator : public Communicator {
bool IsDistributed() const override { return true; }
bool IsFederated() const override { return false; }
void AllGather(void* in_out, std::size_t size) override {
std::string AllGather(std::string_view input) override {
std::string output;
handler_.Allgather(static_cast<const char*>(in_out), size, &output, sequence_number_++,
GetRank());
output.copy(static_cast<char*>(in_out), size);
handler_.Allgather(input.data(), input.size(), &output, sequence_number_++, GetRank());
return output;
}
std::string AllGatherV(std::string_view input) override {
std::string output;
handler_.AllgatherV(input.data(), input.size(), &output, sequence_number_++, GetRank());
return output;
}
void AllReduce(void* in_out, std::size_t size, DataType data_type, Operation operation) override {

View File

@@ -16,23 +16,49 @@ class AllgatherFunctor {
public:
std::string const name{"Allgather"};
AllgatherFunctor(int world_size, int rank) : world_size_{world_size}, rank_{rank} {}
AllgatherFunctor(std::size_t world_size, std::size_t rank)
: world_size_{world_size}, rank_{rank} {}
void operator()(char const* input, std::size_t bytes, std::string* buffer) const {
if (buffer->empty()) {
// Copy the input if this is the first request.
buffer->assign(input, bytes);
} else {
// Splice the input into the common buffer.
auto const per_rank = bytes / world_size_;
auto const index = rank_ * per_rank;
buffer->replace(index, per_rank, input + index, per_rank);
// Resize the buffer if this is the first request.
buffer->resize(bytes * world_size_);
}
// Splice the input into the common buffer.
buffer->replace(rank_ * bytes, bytes, input, bytes);
}
private:
std::size_t world_size_;
std::size_t rank_;
};
/**
* @brief Functor for variable-length allgather.
*/
class AllgatherVFunctor {
public:
std::string const name{"AllgatherV"};
AllgatherVFunctor(std::size_t world_size, std::size_t rank,
std::map<std::size_t, std::string_view>* data)
: world_size_{world_size}, rank_{rank}, data_{data} {}
void operator()(char const* input, std::size_t bytes, std::string* buffer) const {
data_->emplace(rank_, std::string_view{input, bytes});
if (data_->size() == world_size_) {
for (auto const& kv : *data_) {
buffer->append(kv.second);
}
data_->clear();
}
}
private:
int world_size_;
int rank_;
std::size_t world_size_;
std::size_t rank_;
std::map<std::size_t, std::string_view>* data_;
};
/**
@@ -154,7 +180,7 @@ class BroadcastFunctor {
public:
std::string const name{"Broadcast"};
BroadcastFunctor(int rank, int root) : rank_{rank}, root_{root} {}
BroadcastFunctor(std::size_t rank, std::size_t root) : rank_{rank}, root_{root} {}
void operator()(char const* input, std::size_t bytes, std::string* buffer) const {
if (rank_ == root_) {
@@ -164,11 +190,11 @@ class BroadcastFunctor {
}
private:
int rank_;
int root_;
std::size_t rank_;
std::size_t root_;
};
void InMemoryHandler::Init(int world_size, int) {
void InMemoryHandler::Init(std::size_t world_size, std::size_t) {
CHECK(world_size_ < world_size) << "In memory handler already initialized.";
std::unique_lock<std::mutex> lock(mutex_);
@@ -178,7 +204,7 @@ void InMemoryHandler::Init(int world_size, int) {
cv_.notify_all();
}
void InMemoryHandler::Shutdown(uint64_t sequence_number, int) {
void InMemoryHandler::Shutdown(uint64_t sequence_number, std::size_t) {
CHECK(world_size_ > 0) << "In memory handler already shutdown.";
std::unique_lock<std::mutex> lock(mutex_);
@@ -194,24 +220,30 @@ void InMemoryHandler::Shutdown(uint64_t sequence_number, int) {
}
void InMemoryHandler::Allgather(char const* input, std::size_t bytes, std::string* output,
std::size_t sequence_number, int rank) {
std::size_t sequence_number, std::size_t rank) {
Handle(input, bytes, output, sequence_number, rank, AllgatherFunctor{world_size_, rank});
}
void InMemoryHandler::AllgatherV(char const* input, std::size_t bytes, std::string* output,
std::size_t sequence_number, std::size_t rank) {
Handle(input, bytes, output, sequence_number, rank, AllgatherVFunctor{world_size_, rank, &aux_});
}
void InMemoryHandler::Allreduce(char const* input, std::size_t bytes, std::string* output,
std::size_t sequence_number, int rank, DataType data_type,
std::size_t sequence_number, std::size_t rank, DataType data_type,
Operation op) {
Handle(input, bytes, output, sequence_number, rank, AllreduceFunctor{data_type, op});
}
void InMemoryHandler::Broadcast(char const* input, std::size_t bytes, std::string* output,
std::size_t sequence_number, int rank, int root) {
std::size_t sequence_number, std::size_t rank, std::size_t root) {
Handle(input, bytes, output, sequence_number, rank, BroadcastFunctor{rank, root});
}
template <class HandlerFunctor>
void InMemoryHandler::Handle(char const* input, std::size_t bytes, std::string* output,
std::size_t sequence_number, int rank, HandlerFunctor const& functor) {
std::size_t sequence_number, std::size_t rank,
HandlerFunctor const& functor) {
// Pass through if there is only 1 client.
if (world_size_ == 1) {
if (input != output->data()) {

View File

@@ -3,6 +3,7 @@
*/
#pragma once
#include <condition_variable>
#include <map>
#include <string>
#include "communicator.h"
@@ -31,7 +32,7 @@ class InMemoryHandler {
*
* This is used when the handler only needs to be initialized once with a known world size.
*/
explicit InMemoryHandler(int worldSize) : world_size_{worldSize} {}
explicit InMemoryHandler(std::size_t worldSize) : world_size_{worldSize} {}
/**
* @brief Initialize the handler with the world size and rank.
@@ -41,7 +42,7 @@ class InMemoryHandler {
* This is used when multiple objects/threads are accessing the same handler and need to
* initialize it collectively.
*/
void Init(int world_size, int rank);
void Init(std::size_t world_size, std::size_t rank);
/**
* @brief Shut down the handler.
@@ -51,7 +52,7 @@ class InMemoryHandler {
* This is used when multiple objects/threads are accessing the same handler and need to
* shut it down collectively.
*/
void Shutdown(uint64_t sequence_number, int rank);
void Shutdown(uint64_t sequence_number, std::size_t rank);
/**
* @brief Perform allgather.
@@ -62,7 +63,18 @@ class InMemoryHandler {
* @param rank Index of the worker.
*/
void Allgather(char const* input, std::size_t bytes, std::string* output,
std::size_t sequence_number, int rank);
std::size_t sequence_number, std::size_t rank);
/**
* @brief Perform variable-length allgather.
* @param input The input buffer.
* @param bytes Number of bytes in the input buffer.
* @param output The output buffer.
* @param sequence_number Call sequence number.
* @param rank Index of the worker.
*/
void AllgatherV(char const* input, std::size_t bytes, std::string* output,
std::size_t sequence_number, std::size_t rank);
/**
* @brief Perform allreduce.
@@ -75,7 +87,7 @@ class InMemoryHandler {
* @param op The reduce operation.
*/
void Allreduce(char const* input, std::size_t bytes, std::string* output,
std::size_t sequence_number, int rank, DataType data_type, Operation op);
std::size_t sequence_number, std::size_t rank, DataType data_type, Operation op);
/**
* @brief Perform broadcast.
@@ -87,7 +99,7 @@ class InMemoryHandler {
* @param root Index of the worker to broadcast from.
*/
void Broadcast(char const* input, std::size_t bytes, std::string* output,
std::size_t sequence_number, int rank, int root);
std::size_t sequence_number, std::size_t rank, std::size_t root);
private:
/**
@@ -102,15 +114,16 @@ class InMemoryHandler {
*/
template <class HandlerFunctor>
void Handle(char const* input, std::size_t size, std::string* output, std::size_t sequence_number,
int rank, HandlerFunctor const& functor);
std::size_t rank, HandlerFunctor const& functor);
int world_size_{}; /// Number of workers.
int received_{}; /// Number of calls received with the current sequence.
int sent_{}; /// Number of calls completed with the current sequence.
std::string buffer_{}; /// A shared common buffer.
uint64_t sequence_number_{}; /// Call sequence number.
mutable std::mutex mutex_; /// Lock.
mutable std::condition_variable cv_; /// Conditional variable to wait on.
std::size_t world_size_{}; /// Number of workers.
std::size_t received_{}; /// Number of calls received with the current sequence.
std::size_t sent_{}; /// Number of calls completed with the current sequence.
std::string buffer_{}; /// A shared common buffer.
std::map<std::size_t, std::string_view> aux_{}; /// A shared auxiliary map.
uint64_t sequence_number_{}; /// Call sequence number.
mutable std::mutex mutex_; /// Lock.
mutable std::condition_variable cv_; /// Conditional variable to wait on.
};
} // namespace collective

167
src/collective/loop.cc Normal file
View File

@@ -0,0 +1,167 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#include "loop.h"
#include <queue> // for queue
#include "rabit/internal/socket.h" // for PollHelper
#include "xgboost/collective/socket.h" // for FailWithCode
#include "xgboost/logging.h" // for CHECK
namespace xgboost::collective {
/**
 * @brief Drain the operation queue: poll all pending sockets, perform the
 *        ready reads/writes, and requeue partially-completed operations.
 *
 * Fix: previously, when a non-blocking Recv/Send returned -1 with
 * EAGAIN/EWOULDBLOCK, the -1 was added to op.off, moving the transfer cursor
 * backwards (and wrapping the unsigned offset when off == 0).  Such a result
 * now counts as zero progress.
 *
 * @return Success once the queue is empty, or the first failure encountered.
 */
Result Loop::EmptyQueue() {
  timer_.Start(__func__);
  // Any failure must set stop_ so the worker thread exits its loop.
  auto error = [this] {
    this->stop_ = true;
    timer_.Stop(__func__);
  };

  while (!queue_.empty() && !stop_) {
    std::queue<Op> qcopy;
    rabit::utils::PollHelper poll;
    // Register every pending op with the poll helper.
    while (!queue_.empty()) {
      auto op = queue_.front();
      queue_.pop();
      switch (op.code) {
        case Op::kRead: {
          poll.WatchRead(*op.sock);
          break;
        }
        case Op::kWrite: {
          poll.WatchWrite(*op.sock);
          break;
        }
        default: {
          error();
          return Fail("Invalid socket operation.");
        }
      }
      qcopy.push(op);
    }

    // poll, work on fds that are ready.
    timer_.Start("poll");
    auto rc = poll.Poll(timeout_);
    timer_.Stop("poll");
    if (!rc.OK()) {
      error();
      return rc;
    }
    // We wouldn't be here if the queue were empty.
    CHECK(!qcopy.empty());

    while (!qcopy.empty() && !stop_) {
      auto op = qcopy.front();
      qcopy.pop();
      std::int32_t n_bytes_done{0};
      CHECK(op.sock->NonBlocking());
      switch (op.code) {
        case Op::kRead: {
          if (poll.CheckRead(*op.sock)) {
            n_bytes_done = op.sock->Recv(op.ptr + op.off, op.n - op.off);
          }
          break;
        }
        case Op::kWrite: {
          if (poll.CheckWrite(*op.sock)) {
            n_bytes_done = op.sock->Send(op.ptr + op.off, op.n - op.off);
          }
          break;
        }
        default: {
          error();
          return Fail("Invalid socket operation.");
        }
      }

      if (n_bytes_done == -1) {
        if (!system::LastErrorWouldBlock()) {
          stop_ = true;
          auto rc = system::FailWithCode("Invalid socket output.");
          error();
          return rc;
        }
        // EAGAIN/EWOULDBLOCK: no progress this round.  Clamp to zero so the
        // offset arithmetic below cannot move the cursor backwards.
        n_bytes_done = 0;
      }
      op.off += n_bytes_done;
      CHECK_LE(op.off, op.n);
      if (op.off != op.n) {
        // Not yet finished; push back to the queue for the next round.
        queue_.push(op);
      }
    }
  }
  timer_.Stop(__func__);
  return Success();
}
// Consumer loop run on the worker thread: wake whenever the producer pushes
// work or requests a stop, drain the queue, then notify any waiter in Block().
void Loop::Process() {
  // consumer
  while (true) {
    std::unique_lock lock{mu_};
    cv_.wait(lock, [this] { return !this->queue_.empty() || stop_; });
    if (stop_) {
      break;
    }
    // NOTE(review): try_lock on a std::mutex already held by this thread is
    // undefined behaviour per the standard; this assertion is best-effort.
    CHECK(!mu_.try_lock());
    // EmptyQueue runs with the lock held, so Submit() blocks meanwhile.
    this->rc_ = this->EmptyQueue();
    if (!rc_.OK()) {
      stop_ = true;
      cv_.notify_one();
      break;
    }
    CHECK(queue_.empty());
    CHECK(!mu_.try_lock());
    // Wake a waiter in Block(); the queue is now empty.
    cv_.notify_one();
  }

  if (rc_.OK()) {
    CHECK(queue_.empty());
  }
}
// Request the worker thread to exit, wait for pending work to drain, and
// re-raise any exception captured on the worker thread.
Result Loop::Stop() {
  std::unique_lock lock{mu_};
  stop_ = true;
  lock.unlock();

  // Block() notifies the worker and waits; its OK-ness must match rc_.
  CHECK_EQ(this->Block().OK(), this->rc_.OK());

  if (curr_exce_) {
    // Propagate the worker-side exception to the caller's thread.
    std::rethrow_exception(curr_exce_);
  }

  return Success();
}
/**
 * @brief Start the event loop's worker thread.
 *
 * Exceptions thrown on the worker are captured (first one wins) and re-thrown
 * on the caller's thread by Stop().
 *
 * Fix: the two catch handlers duplicated the same body, and the caught
 * std::exception's message was discarded.  The capture logic is factored into
 * one helper and the message is now included in the stored Result.
 *
 * @param timeout Timeout applied to each socket poll.
 */
Loop::Loop(std::chrono::seconds timeout) : timeout_{timeout} {
  timer_.Init(__func__);
  worker_ = std::thread{[this] {
    // Record the in-flight exception (only the first) and wake any waiter.
    auto capture = [this](std::string msg) {
      std::lock_guard<std::mutex> guard{mu_};
      if (!curr_exce_) {
        curr_exce_ = std::current_exception();
        rc_ = Fail(std::move(msg));
      }
      stop_ = true;
      cv_.notify_all();
    };
    try {
      this->Process();
    } catch (std::exception const& e) {
      capture(std::string{"Exception was thrown: "} + e.what());
    } catch (...) {
      capture("Exception was thrown");
    }
  }};
}
} // namespace xgboost::collective

83
src/collective/loop.h Normal file
View File

@@ -0,0 +1,83 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#pragma once
#include <chrono> // for seconds
#include <condition_variable> // for condition_variable
#include <cstddef> // for size_t
#include <cstdint> // for int8_t, int32_t
#include <exception> // for exception_ptr
#include <mutex> // for unique_lock, mutex
#include <queue> // for queue
#include <thread> // for thread
#include <utility> // for move
#include "../common/timer.h" // for Monitor
#include "xgboost/collective/result.h" // for Result
#include "xgboost/collective/socket.h" // for TCPSocket
namespace xgboost::collective {
/**
 * @brief Single-threaded event loop that executes queued socket reads/writes
 *        asynchronously on a dedicated worker thread.
 */
class Loop {
 public:
  /**
   * @brief One queued socket operation (a read or a write of n bytes).
   */
  struct Op {
    enum Code : std::int8_t { kRead = 0, kWrite = 1 } code;
    std::int32_t rank{-1};      // Peer rank this op communicates with.
    std::int8_t* ptr{nullptr};  // Buffer; mutable even for writes (see Channel::SendAll).
    std::size_t n{0};           // Total number of bytes to transfer.
    TCPSocket* sock{nullptr};   // Non-owning socket pointer.
    std::size_t off{0};         // Bytes transferred so far.

    Op(Code c, std::int32_t rank, std::int8_t* ptr, std::size_t n, TCPSocket* sock, std::size_t off)
        : code{c}, rank{rank}, ptr{ptr}, n{n}, sock{sock}, off{off} {}
    Op(Op const&) = default;
    Op& operator=(Op const&) = default;
    Op(Op&&) = default;
    Op& operator=(Op&&) = default;
  };

 private:
  std::thread worker_;  // Consumer thread running Process().
  std::condition_variable cv_;
  std::mutex mu_;
  std::queue<Op> queue_;  // Pending operations, produced by Submit().
  std::chrono::seconds timeout_;  // Per-poll timeout.
  Result rc_;             // Status of the most recent queue drain.
  bool stop_{false};      // Set to request worker exit or on failure.
  std::exception_ptr curr_exce_{nullptr};  // First exception thrown on the worker.
  common::Monitor timer_;

  Result EmptyQueue();
  void Process();

 public:
  // Stop the worker and rethrow any captured worker-side exception.
  Result Stop();

  // Producer side: enqueue an op and wake the worker thread.
  void Submit(Op op) {
    // producer
    std::unique_lock lock{mu_};
    queue_.push(op);
    lock.unlock();
    cv_.notify_one();
  }

  // Wait until the queue is drained (or the loop stopped) and return status.
  // NOTE(review): rc_ is moved out here, so a second call after a failure
  // observes a default Result — confirm callers rely on first-call semantics.
  [[nodiscard]] Result Block() {
    {
      std::unique_lock lock{mu_};
      cv_.notify_all();
    }
    std::unique_lock lock{mu_};
    cv_.wait(lock, [this] { return this->queue_.empty() || stop_; });
    return std::move(rc_);
  }

  explicit Loop(std::chrono::seconds timeout);

  // Stop() may rethrow a worker exception; hence noexcept(false).
  ~Loop() noexcept(false) {
    this->Stop();

    if (worker_.joinable()) {
      worker_.join();
    }
  }
};
} // namespace xgboost::collective

View File

@@ -17,10 +17,11 @@ class NoOpCommunicator : public Communicator {
NoOpCommunicator() : Communicator(1, 0) {}
bool IsDistributed() const override { return false; }
bool IsFederated() const override { return false; }
void AllGather(void *, std::size_t) override {}
std::string AllGather(std::string_view) override { return {}; }
std::string AllGatherV(std::string_view) override { return {}; }
void AllReduce(void *, std::size_t, DataType, Operation) override {}
void Broadcast(void *, std::size_t, int) override {}
std::string GetProcessorName() override { return ""; }
std::string GetProcessorName() override { return {}; }
void Print(const std::string &message) override { LOG(CONSOLE) << message; }
protected:

214
src/collective/protocol.h Normal file
View File

@@ -0,0 +1,214 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#pragma once
#include <cstdint> // for int32_t
#include <string> // for string
#include <utility> // for move
#include "xgboost/collective/result.h" // for Result
#include "xgboost/collective/socket.h" // for TCPSocket
#include "xgboost/json.h" // for Json
namespace xgboost::collective::proto {
/**
 * @brief Host/port/rank triplet identifying a worker or the tracker.
 */
struct PeerInfo {
  std::string host;
  std::int32_t port{-1};  // -1 marks an uninitialized peer.
  std::int32_t rank{-1};

  PeerInfo() = default;
  PeerInfo(std::string host, std::int32_t port, std::int32_t rank)
      : host{std::move(host)}, port{port}, rank{rank} {}
  // Deserialize from a JSON object with "host", "port" and "rank" keys.
  explicit PeerInfo(Json const& peer)
      : host{get<String>(peer["host"])},
        port{static_cast<std::int32_t>(get<Integer const>(peer["port"]))},
        rank{static_cast<std::int32_t>(get<Integer const>(peer["rank"]))} {}
  // Serialize to the same JSON shape accepted by the Json constructor above.
  [[nodiscard]] Json ToJson() const {
    Json info{Object{}};
    info["rank"] = rank;
    info["host"] = String{host};
    info["port"] = Integer{port};
    return info;
  }
  // "host:port" string, e.g. for logging.
  [[nodiscard]] auto HostPort() const { return host + ":" + std::to_string(this->port); }
};
/**
 * @brief Connection handshake: exchange a fixed magic number with the peer to
 *        validate that both ends speak this protocol.
 */
struct Magic {
  static constexpr std::int32_t kMagic = 0xff99;

  // Send our magic number, then expect the same value back from the peer.
  [[nodiscard]] Result Verify(xgboost::collective::TCPSocket* p_sock) {
    std::int32_t magic{kMagic};
    auto n_bytes = p_sock->SendAll(&magic, sizeof(magic));
    if (n_bytes != sizeof(magic)) {
      return Fail("Failed to verify.");
    }
    // Reset before receiving so a short read cannot pass the check below.
    magic = 0;
    n_bytes = p_sock->RecvAll(&magic, sizeof(magic));
    if (n_bytes != sizeof(magic)) {
      return Fail("Failed to verify.");
    }

    if (magic != kMagic) {
      return xgboost::collective::Fail("Invalid verification number.");
    }
    return Success();
  }
};
// Commands a worker can send to the tracker.
enum class CMD : std::int32_t {
  kInvalid = 0,
  kStart = 1,     // Report listening ports; tracker replies with world size.
  kShutdown = 2,  // Worker is leaving.
  kError = 3,     // Worker hit an error; payload carries message and code.
  kPrint = 4,     // Forward a log message for centralized printing.
};
/**
 * @brief Initial connect message exchanged between a worker and the tracker.
 */
struct Connect {
  // Worker side: announce world size, rank and task id to the tracker.
  [[nodiscard]] Result WorkerSend(TCPSocket* tracker, std::int32_t world, std::int32_t rank,
                                  std::string task_id) const {
    Json jinit{Object{}};
    jinit["world_size"] = Integer{world};
    jinit["rank"] = Integer{rank};
    jinit["task_id"] = String{task_id};
    std::string msg;
    Json::Dump(jinit, &msg);
    auto n_bytes = tracker->Send(msg);
    if (n_bytes != msg.size()) {
      return Fail("Failed to send init command from worker.");
    }
    return Success();
  }
  // Tracker side: parse the worker's announcement.
  // NOTE(review): unlike the send path, the Recv result is not checked here;
  // a failed read would surface only as a JSON parse failure — confirm.
  [[nodiscard]] Result TrackerRecv(TCPSocket* sock, std::int32_t* world, std::int32_t* rank,
                                   std::string* task_id) const {
    std::string init;
    sock->Recv(&init);
    auto jinit = Json::Load(StringView{init});
    *world = get<Integer const>(jinit["world_size"]);
    *rank = get<Integer const>(jinit["rank"]);
    *task_id = get<String const>(jinit["task_id"]);
    return Success();
  }
};
/**
 * @brief Start command: the worker reports its listening ports, and the
 *        tracker acknowledges with the world size.
 */
class Start {
 private:
  // Tracker -> worker: reply with the world size.
  [[nodiscard]] Result TrackerSend(std::int32_t world, TCPSocket* worker) const {
    Json jcmd{Object{}};
    jcmd["world_size"] = Integer{world};
    auto scmd = Json::Dump(jcmd);
    auto n_bytes = worker->Send(scmd);
    if (n_bytes != scmd.size()) {
      return Fail("Failed to send init command from tracker.");
    }
    return Success();
  }

 public:
  // Worker -> tracker: report the data port and the error port.
  [[nodiscard]] Result WorkerSend(std::int32_t lport, TCPSocket* tracker,
                                  std::int32_t eport) const {
    Json jcmd{Object{}};
    jcmd["cmd"] = Integer{static_cast<std::int32_t>(CMD::kStart)};
    jcmd["port"] = Integer{lport};
    jcmd["error_port"] = Integer{eport};
    auto scmd = Json::Dump(jcmd);
    auto n_bytes = tracker->Send(scmd);
    if (n_bytes != scmd.size()) {
      return Fail("Failed to send init command from worker.");
    }
    return Success();
  }
  // Worker side: receive and validate the world size from the tracker.
  [[nodiscard]] Result WorkerRecv(TCPSocket* tracker, std::int32_t* p_world) const {
    std::string scmd;
    auto n_bytes = tracker->Recv(&scmd);
    if (n_bytes <= 0) {
      return Fail("Failed to recv init command from tracker.");
    }
    auto jcmd = Json::Load(scmd);
    auto world = get<Integer const>(jcmd["world_size"]);
    if (world <= 0) {
      return Fail("Invalid world size.");
    }
    *p_world = world;
    return Success();
  }
  // Tracker side: record the worker's ports and acknowledge with world size.
  // *recv_world must still be -1 (i.e. not yet initialized for this worker).
  [[nodiscard]] Result TrackerHandle(Json jcmd, std::int32_t* recv_world, std::int32_t world,
                                     std::int32_t* p_port, TCPSocket* p_sock,
                                     std::int32_t* eport) const {
    *p_port = get<Integer const>(jcmd["port"]);
    if (*p_port <= 0) {
      return Fail("Invalid port.");
    }
    if (*recv_world != -1) {
      return Fail("Invalid initialization sequence.");
    }
    *recv_world = world;

    *eport = get<Integer const>(jcmd["error_port"]);
    return TrackerSend(world, p_sock);
  }
};
/**
 * @brief Print command: forward a log message to the tracker for display.
 */
struct Print {
  // Worker -> tracker: wrap the message in a kPrint command.
  [[nodiscard]] Result WorkerSend(TCPSocket* tracker, std::string msg) const {
    Json jcmd{Object{}};
    jcmd["cmd"] = Integer{static_cast<std::int32_t>(CMD::kPrint)};
    jcmd["msg"] = String{std::move(msg)};
    auto scmd = Json::Dump(jcmd);
    auto n_bytes = tracker->Send(scmd);
    if (n_bytes != scmd.size()) {
      return Fail("Failed to send print command from worker.");
    }
    return Success();
  }
  // Tracker side: extract the message from a received kPrint command.
  [[nodiscard]] Result TrackerHandle(Json jcmd, std::string* p_msg) const {
    if (!IsA<String>(jcmd["msg"])) {
      return Fail("Invalid print command.");
    }
    auto msg = get<String const>(jcmd["msg"]);
    *p_msg = msg;
    return Success();
  }
};
/**
 * @brief Error command: report a worker-side failure (message + errno-style
 *        code) to the tracker.
 */
struct ErrorCMD {
  // Worker -> tracker: serialize the Result's report and error code.
  [[nodiscard]] Result WorkerSend(TCPSocket* tracker, Result const& res) const {
    auto msg = res.Report();
    auto code = res.Code().value();
    Json jcmd{Object{}};
    jcmd["msg"] = String{std::move(msg)};
    jcmd["code"] = Integer{code};
    jcmd["cmd"] = Integer{static_cast<std::int32_t>(CMD::kError)};
    auto scmd = Json::Dump(jcmd);
    auto n_bytes = tracker->Send(scmd);
    if (n_bytes != scmd.size()) {
      return Fail("Failed to send error command from worker.");
    }
    return Success();
  }
  // Tracker side: extract the message and code from a received kError command.
  [[nodiscard]] Result TrackerHandle(Json jcmd, std::string* p_msg, int* p_code) const {
    if (!IsA<String>(jcmd["msg"]) || !IsA<Integer>(jcmd["code"])) {
      return Fail("Invalid error command.");
    }
    auto msg = get<String const>(jcmd["msg"]);
    auto code = get<Integer const>(jcmd["code"]);
    *p_msg = msg;
    *p_code = code;
    return Success();
  }
};
// Protocol handler for the `shutdown` command, sent to tell a peer to stop.
struct ShutdownCMD {
  [[nodiscard]] Result Send(TCPSocket* peer) const {
    Json request{Object{}};
    request["cmd"] = Integer{static_cast<std::int32_t>(proto::CMD::kShutdown)};
    auto const payload = Json::Dump(request);
    if (peer->Send(payload) != payload.size()) {
      return Fail("Failed to send shutdown command from worker.");
    }
    return Success();
  }
};
} // namespace xgboost::collective::proto

View File

@@ -7,6 +7,7 @@
#include <string>
#include <vector>
#include "communicator-inl.h"
#include "communicator.h"
#include "xgboost/json.h"
@@ -55,10 +56,29 @@ class RabitCommunicator : public Communicator {
bool IsFederated() const override { return false; }
void AllGather(void *send_receive_buffer, std::size_t size) override {
auto const per_rank = size / GetWorldSize();
std::string AllGather(std::string_view input) override {
auto const per_rank = input.size();
auto const total_size = per_rank * GetWorldSize();
auto const index = per_rank * GetRank();
rabit::Allgather(static_cast<char *>(send_receive_buffer), size, index, per_rank, per_rank);
std::string result(total_size, '\0');
result.replace(index, per_rank, input);
rabit::Allgather(result.data(), total_size, index, per_rank, per_rank);
return result;
}
std::string AllGatherV(std::string_view input) override {
auto const size_node_slice = input.size();
auto const all_sizes = collective::Allgather(size_node_slice);
auto const total_size = std::accumulate(all_sizes.cbegin(), all_sizes.cend(), 0ul);
auto const begin_index =
std::accumulate(all_sizes.cbegin(), all_sizes.cbegin() + GetRank(), 0ul);
auto const size_prev_slice =
GetRank() == 0 ? all_sizes[GetWorldSize() - 1] : all_sizes[GetRank() - 1];
std::string result(total_size, '\0');
result.replace(begin_index, size_node_slice, input);
rabit::Allgather(result.data(), total_size, begin_index, size_node_slice, size_prev_slice);
return result;
}
void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,

View File

@@ -3,6 +3,7 @@
*/
#include "xgboost/collective/socket.h"
#include <array> // for array
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t
#include <cstring> // std::memcpy, std::memset
@@ -92,13 +93,18 @@ std::size_t TCPSocket::Recv(std::string *p_str) {
conn = TCPSocket::Create(addr.Domain());
CHECK_EQ(static_cast<std::int32_t>(conn.Domain()), static_cast<std::int32_t>(addr.Domain()));
conn.SetNonBlock(true);
auto non_blocking = conn.NonBlocking();
auto rc = conn.NonBlocking(true);
if (!rc.OK()) {
return Fail("Failed to set socket option.", std::move(rc));
}
Result last_error;
auto log_failure = [&host, &last_error](Result err, char const *file, std::int32_t line) {
auto log_failure = [&host, &last_error, port](Result err, char const *file, std::int32_t line) {
last_error = std::move(err);
LOG(WARNING) << std::filesystem::path{file}.filename().string() << "(" << line
<< "): Failed to connect to:" << host << " Error:" << last_error.Report();
<< "): Failed to connect to:" << host << ":" << port
<< " Error:" << last_error.Report();
};
for (std::int32_t attempt = 0; attempt < std::max(retry, 1); ++attempt) {
@@ -112,39 +118,42 @@ std::size_t TCPSocket::Recv(std::string *p_str) {
}
auto rc = connect(conn.Handle(), addr_handle, addr_len);
if (rc != 0) {
auto errcode = system::LastError();
if (!system::ErrorWouldBlock(errcode)) {
log_failure(Fail("connect failed.", std::error_code{errcode, std::system_category()}),
__FILE__, __LINE__);
continue;
}
rabit::utils::PollHelper poll;
poll.WatchWrite(conn);
auto result = poll.Poll(timeout);
if (!result.OK()) {
log_failure(std::move(result), __FILE__, __LINE__);
continue;
}
if (!poll.CheckWrite(conn)) {
log_failure(Fail("poll failed.", std::error_code{errcode, std::system_category()}),
__FILE__, __LINE__);
continue;
}
result = conn.GetSockError();
if (!result.OK()) {
log_failure(std::move(result), __FILE__, __LINE__);
continue;
}
conn.SetNonBlock(false);
return Success();
} else {
conn.SetNonBlock(false);
return Success();
if (rc == 0) {
return conn.NonBlocking(non_blocking);
}
auto errcode = system::LastError();
if (!system::ErrorWouldBlock(errcode)) {
log_failure(Fail("connect failed.", std::error_code{errcode, std::system_category()}),
__FILE__, __LINE__);
continue;
}
rabit::utils::PollHelper poll;
poll.WatchWrite(conn);
auto result = poll.Poll(timeout);
if (!result.OK()) {
// poll would fail if there's a socket error, we log the root cause instead of the
// poll failure.
auto sockerr = conn.GetSockError();
if (!sockerr.OK()) {
result = std::move(sockerr);
}
log_failure(std::move(result), __FILE__, __LINE__);
continue;
}
if (!poll.CheckWrite(conn)) {
log_failure(Fail("poll failed.", std::error_code{errcode, std::system_category()}), __FILE__,
__LINE__);
continue;
}
result = conn.GetSockError();
if (!result.OK()) {
log_failure(std::move(result), __FILE__, __LINE__);
continue;
}
return conn.NonBlocking(non_blocking);
}
std::stringstream ss;
@@ -152,4 +161,13 @@ std::size_t TCPSocket::Recv(std::string *p_str) {
conn.Close();
return Fail(ss.str(), std::move(last_error));
}
/**
 * @brief Obtain the name of the local host.
 *
 * POSIX leaves it unspecified whether the result of gethostname() is
 * null-terminated when the host name does not fit into the buffer, so the
 * buffer is zero-initialized and the last byte is forced to NUL to guarantee
 * that a valid C string is read back.
 *
 * @param p_out Output: the host name on success.
 */
[[nodiscard]] Result GetHostName(std::string *p_out) {
  std::array<char, HOST_NAME_MAX> buf{};  // zero-init: guards against unterminated result
  if (gethostname(buf.data(), HOST_NAME_MAX) != 0) {
    return system::FailWithCode("Failed to get host name.");
  }
  buf.back() = '\0';  // defend against truncation without termination
  *p_out = buf.data();
  return Success();
}
} // namespace xgboost::collective

296
src/collective/tracker.cc Normal file
View File

@@ -0,0 +1,296 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#if defined(__unix__) || defined(__APPLE__)
#include <netdb.h> // gethostbyname
#include <sys/socket.h> // socket, AF_INET6, AF_INET, connect, getsockname
#endif // defined(__unix__) || defined(__APPLE__)
#if !defined(NOMINMAX) && defined(_WIN32)
#define NOMINMAX
#endif // !defined(NOMINMAX)
#if defined(_WIN32)
#include <winsock2.h>
#include <ws2tcpip.h>
#endif // defined(_WIN32)
#include <algorithm> // for sort
#include <chrono> // for seconds
#include <cstdint> // for int32_t
#include <string> // for string
#include <utility> // for move, forward
#include "../common/json_utils.h"
#include "comm.h"
#include "protocol.h" // for kMagic, PeerInfo
#include "tracker.h"
#include "xgboost/collective/result.h" // for Result, Fail, Success
#include "xgboost/collective/socket.h" // for GetHostName, FailWithCode, MakeSockAddress, ...
#include "xgboost/json.h"
namespace xgboost::collective {
// Construct a tracker from a JSON configuration.
//
// Required key: `n_workers` -- number of workers expected to connect.
// Optional keys: `port` (default 0 -- presumably lets the OS pick a free port
// at bind time; confirm with TCPSocket::Bind) and `timeout` in seconds
// (defaults to collective::DefaultTimeoutSec()).
Tracker::Tracker(Json const& config)
: n_workers_{static_cast<std::int32_t>(
RequiredArg<Integer const>(config, "n_workers", __func__))},
port_{static_cast<std::int32_t>(OptionalArg<Integer const>(config, "port", Integer::Int{0}))},
timeout_{std::chrono::seconds{OptionalArg<Integer const>(
config, "timeout", static_cast<std::int64_t>(collective::DefaultTimeoutSec()))}} {}
// Accept-side handshake with a newly connected worker.
//
// Verifies the protocol magic, receives the connect information (world size,
// rank, task id), then reads exactly one command (start/print/error) from the
// worker and dispatches it to the matching protocol handler.  The outcome is
// stored in rc_; callers must check Status() before using the proxy.
RabitTracker::WorkerProxy::WorkerProxy(std::int32_t world, TCPSocket sock, SockAddrV4 addr)
: sock_{std::move(sock)} {
auto host = addr.Addr();
std::int32_t rank{0};
rc_ = Success()
<< [&] { return proto::Magic{}.Verify(&sock_); }
<< [&] { return proto::Connect{}.TrackerRecv(&sock_, &world_, &rank, &task_id_); };
if (!rc_.OK()) {
return;
}
std::string cmd;
// NOTE(review): the return value of Recv is ignored; a short or failed read
// would only surface as a JSON parse failure below -- confirm this is intended.
sock_.Recv(&cmd);
auto jcmd = Json::Load(StringView{cmd});
cmd_ = static_cast<proto::CMD>(get<Integer const>(jcmd["cmd"]));
std::int32_t port{0};
if (cmd_ == proto::CMD::kStart) {
proto::Start start;
rc_ = start.TrackerHandle(jcmd, &world_, world, &port, &sock_, &eport_);
} else if (cmd_ == proto::CMD::kPrint) {
proto::Print print;
rc_ = print.TrackerHandle(jcmd, &msg_);
} else if (cmd_ == proto::CMD::kError) {
proto::ErrorCMD error;
rc_ = error.TrackerHandle(jcmd, &msg_, &code_);
}
if (!rc_.OK()) {
return;
}
// Peer info advertised to the other workers during bootstrap.
info_ = proto::PeerInfo{host, port, rank};
}
/**
 * @brief Construct the tracker from a JSON configuration and start listening.
 *
 * Resolves the local host address (unless overridden by the optional `host`
 * key), binds the listening socket, and starts listening for workers.
 *
 * Fixes two defects in the previous version: the result of GetHostAddress was
 * silently discarded, and the resolved host was never stored in host_, so
 * WorkerArgs() would advertise an empty DMLC_TRACKER_URI.
 */
RabitTracker::RabitTracker(Json const& config) : Tracker{config} {
  std::string self;
  auto rc = collective::GetHostAddress(&self);
  // Fail fast: ignoring this error would surface later as a confusing
  // bind/connect failure.
  CHECK(rc.OK()) << rc.Report();
  // Store the host so that WorkerArgs() reports the address workers must use.
  host_ = OptionalArg<String>(config, "host", self);

  listener_ = TCPSocket::Create(SockDomain::kV4);
  rc = listener_.Bind(host_, &this->port_);
  CHECK(rc.OK()) << rc.Report();
  listener_.Listen();
}
// Share peer addresses among all connected workers.
//
// Workers are first sorted (WorkerCmp) so the assigned topology is
// deterministic across runs.  For each rank r, the address of its successor
// (BootstrapNext -- presumably a ring-style next-peer mapping; confirm with
// its definition) is serialized and sent on a dedicated thread.  The lambdas
// capture references into `workers`; this is safe because the vector is not
// resized while the threads run and all threads are joined before return.
Result RabitTracker::Bootstrap(std::vector<WorkerProxy>* p_workers) {
auto& workers = *p_workers;
std::sort(workers.begin(), workers.end(), WorkerCmp{});
std::vector<std::thread> bootstrap_threads;
for (std::int32_t r = 0; r < n_workers_; ++r) {
auto& worker = workers[r];
auto next = BootstrapNext(r, n_workers_);
auto const& next_w = workers[next];
bootstrap_threads.emplace_back([next, &worker, &next_w] {
auto jnext = proto::PeerInfo{next_w.Host(), next_w.Port(), next}.ToJson();
std::string str;
Json::Dump(jnext, &str);
worker.Send(StringView{str});
});
}
for (auto& t : bootstrap_threads) {
t.join();
}
// Remember each worker's error port so we can signal them if a peer fails.
for (auto const& w : workers) {
worker_error_handles_.emplace_back(w.Host(), w.ErrorPort());
}
return Success();
}
/**
 * @brief Main tracker loop, run asynchronously.
 *
 * Accepts worker connections and drives a small state machine:
 *  - `start`    : collect workers; once all have connected, bootstrap them.
 *  - `shutdown` : count workers going offline; the loop exits when all are
 *                 down and no restart is pending.
 *  - `error`    : a worker failed; notify all other workers (best effort) and
 *                 mark a restart as pending.
 *  - `print`    : echo a worker's log message on the tracker console.
 *
 * Fix over the previous version: the result of Connect() to a peer's error
 * port was discarded, so a dead peer caused Send() on an unconnected socket
 * and aborted the whole tracker.  Per the stated intent ("if they haven't
 * aborted already"), an unreachable peer is now simply skipped.
 *
 * @return A future resolving to the terminal status of the tracker.
 */
[[nodiscard]] std::future<Result> RabitTracker::Run() {
  // a state machine to keep track of consistency.
  struct State {
    std::int32_t const n_workers;
    std::int32_t n_shutdown{0};
    bool during_restart{false};
    std::vector<WorkerProxy> pending;  // workers waiting for bootstrap

    explicit State(std::int32_t world) : n_workers{world} {}
    State(State const& that) = delete;
    State& operator=(State&& that) = delete;

    // A worker sent `start`; queue it for bootstrap.
    void Start(WorkerProxy&& worker) {
      CHECK_LT(pending.size(), n_workers);
      CHECK_LE(n_shutdown, n_workers);

      pending.emplace_back(std::forward<WorkerProxy>(worker));

      CHECK_LE(pending.size(), n_workers);
    }
    // A worker went offline cleanly.
    void Shutdown() {
      CHECK_GE(n_shutdown, 0);
      CHECK_LT(n_shutdown, n_workers);

      ++n_shutdown;

      CHECK_LE(n_shutdown, n_workers);
    }
    // A worker reported an error; expect the job to restart.
    void Error() {
      CHECK_LE(pending.size(), n_workers);
      CHECK_LE(n_shutdown, n_workers);

      during_restart = true;
    }
    // All workers have connected and are ready for bootstrap.
    [[nodiscard]] bool Ready() const {
      CHECK_LE(pending.size(), n_workers);
      return static_cast<std::int32_t>(pending.size()) == n_workers;
    }
    // Reset counters after a successful bootstrap.
    void Bootstrap() {
      CHECK_EQ(pending.size(), n_workers);
      CHECK_LE(n_shutdown, n_workers);

      // A reset.
      n_shutdown = 0;
      during_restart = false;
      pending.clear();
    }
    [[nodiscard]] bool ShouldContinue() const {
      CHECK_LE(pending.size(), n_workers);
      CHECK_LE(n_shutdown, n_workers);

      // - Without error, we should shutdown after all workers are offline.
      // - With error, all workers are offline, and we have during_restart as true.
      return n_shutdown != n_workers || during_restart;
    }
  };

  return std::async(std::launch::async, [this] {
    State state{this->n_workers_};
    while (state.ShouldContinue()) {
      TCPSocket sock;
      SockAddrV4 addr;
      auto rc = listener_.Accept(&sock, &addr);
      if (!rc.OK()) {
        return Fail("Failed to accept connection.", std::move(rc));
      }

      auto worker = WorkerProxy{n_workers_, std::move(sock), std::move(addr)};
      if (!worker.Status().OK()) {
        return Fail("Failed to initialize worker proxy.", std::move(worker.Status()));
      }
      switch (worker.Command()) {
        case proto::CMD::kStart: {
          state.Start(std::move(worker));
          if (state.Ready()) {
            rc = this->Bootstrap(&state.pending);
            state.Bootstrap();
          }
          if (!rc.OK()) {
            return rc;
          }
          continue;
        }
        case proto::CMD::kShutdown: {
          state.Shutdown();
          continue;
        }
        case proto::CMD::kError: {
          if (state.during_restart) {
            // Already signaling a restart; swallow subsequent errors.
            continue;
          }
          state.Error();
          auto msg = worker.Msg();
          auto code = worker.Code();
          LOG(WARNING) << "Received error from [" << worker.Host() << ":" << worker.Rank()
                       << "]: " << msg << " code:" << code;
          auto host = worker.Host();
          // We signal all workers for the error, if they haven't aborted already.
          for (auto& w : worker_error_handles_) {
            if (w.first == host) {
              continue;  // the reporting worker already knows
            }
            TCPSocket out;
            // retry is set to 1, just let the worker timeout or error. Otherwise the
            // tracker and the worker might be waiting for each other.
            auto conn_rc = Connect(w.first, w.second, 1, timeout_, &out);
            if (!conn_rc.OK()) {
              // Best effort: the worker may have aborted already; skip it.
              continue;
            }
            // send signal to stop the worker.
            proto::ShutdownCMD shutdown;
            conn_rc = shutdown.Send(&out);
            if (!conn_rc.OK()) {
              return Fail("Failed to inform workers to stop.");
            }
          }
          continue;
        }
        case proto::CMD::kPrint: {
          LOG(CONSOLE) << worker.Msg();
          continue;
        }
        case proto::CMD::kInvalid:
        default: {
          return Fail("Invalid command received.");
        }
      }
    }
    return Success();
  });
}
/**
 * @brief Best-effort discovery of the host's public IP address.
 *
 * Resolves the host name first; if it maps to a loopback (127.x) address, a
 * UDP socket is "connected" to a reserved address to let the kernel pick the
 * outgoing interface, whose address is then read back with getsockname.
 *
 * Fixes over the previous version: the gethostbyname() result is checked for
 * nullptr before use, and the probe socket is closed on every error path
 * instead of leaking the descriptor.
 *
 * @param out In: buffer for the host name. Out: the selected IP address.
 */
[[nodiscard]] Result GetHostAddress(std::string* out) {
  auto rc = GetHostName(out);
  if (!rc.OK()) {
    return rc;
  }
  auto host = gethostbyname(out->c_str());
  if (host == nullptr) {
    // Resolution can legitimately fail (e.g. no DNS entry for the host name).
    return Fail("Failed to resolve the host name.");
  }
  // get ip address from host
  std::string ip;
  rc = INetNToP(host, &ip);
  if (!rc.OK()) {
    return rc;
  }

  if (!(ip.size() >= 4 && ip.substr(0, 4) == "127.")) {
    // return if this is a public IP address.
    // not entirely accurate, we have other reserved IPs
    *out = ip;
    return Success();
  }

  // Create an UDP socket to probe the public IP address, it's fine even if it's
  // unreachable.
  auto sock = socket(AF_INET, SOCK_DGRAM, 0);
  if (sock == -1) {
    return Fail("Failed to create socket.");
  }
  auto paddr = MakeSockAddress(StringView{"10.255.255.255"}, 1);
  sockaddr const* addr_handle = reinterpret_cast<const sockaddr*>(&paddr.V4().Handle());
  socklen_t addr_len{sizeof(paddr.V4().Handle())};
  auto err = connect(sock, addr_handle, addr_len);
  if (err != 0) {
    system::CloseSocket(sock);  // don't leak the probe socket on failure
    return system::FailWithCode("Failed to find IP address.");
  }

  // get the IP address from the socket descriptor
  struct sockaddr_in addr;
  socklen_t len = sizeof(addr);
  if (getsockname(sock, reinterpret_cast<struct sockaddr*>(&addr), &len) == -1) {
    system::CloseSocket(sock);  // don't leak the probe socket on failure
    return Fail("Failed to get sock name.");
  }
  ip = inet_ntoa(addr.sin_addr);

  err = system::CloseSocket(sock);
  if (err != 0) {
    return system::FailWithCode("Failed to close socket.");
  }

  *out = ip;
  return Success();
}
} // namespace xgboost::collective

141
src/collective/tracker.h Normal file
View File

@@ -0,0 +1,141 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#pragma once
#include <chrono> // for seconds
#include <cstdint> // for int32_t
#include <future> // for future
#include <string> // for string
#include <utility> // for pair
#include <vector> // for vector
#include "protocol.h"
#include "xgboost/collective/result.h" // for Result
#include "xgboost/collective/socket.h" // for TCPSocket
#include "xgboost/json.h" // for Json
namespace xgboost::collective {
/**
*
* @brief Implementation of RABIT tracker.
*
* * What is a tracker
*
 * The implementation of collective follows what RABIT did in the past. It requires a
 * tracker to coordinate initialization and error recovery of workers. The
 * original implementation attempted to attain error resilience inside the collective
 * module, which turned out to be too challenging due to the large amount of external
 * state. The new implementation here differs from RABIT in that neither state
 * recovery nor resilience is handled inside the collective module; it merely provides
 * the mechanism to signal errors to other workers through a centralized tracker.
*
 * There are three major functionalities provided by the tracker, namely:
* - Initialization. Share the node addresses among all workers.
* - Logging.
* - Signal error. If an exception is thrown in one (or many) of the workers, it can
* signal an error to the tracker and the tracker will notify other workers.
*/
class Tracker {
protected:
// Number of workers expected to join the job.
std::int32_t n_workers_{0};
// Listening port; -1 until configured/bound.
std::int32_t port_{-1};
// How long to wait on blocking operations before giving up.
std::chrono::seconds timeout_{0};
public:
// Construct from a JSON configuration (see the derived class for keys).
explicit Tracker(Json const& config);
Tracker(std::int32_t n_worders, std::int32_t port, std::chrono::seconds timeout)
: n_workers_{n_worders}, port_{port}, timeout_{timeout} {}
// noexcept(false): implementations may report fatal errors from cleanup.
virtual ~Tracker() noexcept(false){}; // NOLINT
// Start serving workers; the future resolves with the tracker's final status.
[[nodiscard]] virtual std::future<Result> Run() = 0;
// Arguments (e.g. tracker URI/port) each worker needs to connect.
[[nodiscard]] virtual Json WorkerArgs() const = 0;
[[nodiscard]] std::chrono::seconds Timeout() const { return timeout_; }
};
class RabitTracker : public Tracker {
// a wrapper for connected worker socket.
class WorkerProxy {
TCPSocket sock_;
// Peer info (host, port, rank) advertised to other workers.
proto::PeerInfo info_;
// Port the worker listens on for error/shutdown notifications.
std::int32_t eport_{0};
// World size reported by the worker; -1 until the start command is handled.
std::int32_t world_{-1};
std::string task_id_;
// The single command this connection delivered.
proto::CMD cmd_{proto::CMD::kInvalid};
// Message/code carried by print/error commands.
std::string msg_;
std::int32_t code_{0};
// Handshake outcome; check Status() before using any accessor.
Result rc_;
public:
explicit WorkerProxy(std::int32_t world, TCPSocket sock, SockAddrV4 addr);
WorkerProxy(WorkerProxy const& that) = delete;
WorkerProxy(WorkerProxy&& that) = default;
WorkerProxy& operator=(WorkerProxy const&) = delete;
WorkerProxy& operator=(WorkerProxy&&) = default;
[[nodiscard]] auto Host() const { return info_.host; }
[[nodiscard]] auto TaskID() const { return task_id_; }
[[nodiscard]] auto Port() const { return info_.port; }
[[nodiscard]] auto Rank() const { return info_.rank; }
[[nodiscard]] auto ErrorPort() const { return eport_; }
[[nodiscard]] auto Command() const { return cmd_; }
[[nodiscard]] auto Msg() const { return msg_; }
[[nodiscard]] auto Code() const { return code_; }
[[nodiscard]] Result const& Status() const { return rc_; }
[[nodiscard]] Result& Status() { return rc_; }
void Send(StringView value) { this->sock_.Send(value); }
};
// provide an ordering for workers, this helps us get deterministic topology.
struct WorkerCmp {
// Order by host first, then by task id to break ties.
[[nodiscard]] bool operator()(WorkerProxy const& lhs, WorkerProxy const& rhs) {
auto const& lh = lhs.Host();
auto const& rh = rhs.Host();
if (lh != rh) {
return lh < rh;
}
return lhs.TaskID() < rhs.TaskID();
}
};
private:
std::string host_;
// record for how to reach out to workers if error happens.
std::vector<std::pair<std::string, std::int32_t>> worker_error_handles_;
// listening socket for incoming workers.
TCPSocket listener_;
// Share peer addresses among all connected workers.
Result Bootstrap(std::vector<WorkerProxy>* p_workers);
public:
explicit RabitTracker(StringView host, std::int32_t n_worders, std::int32_t port,
std::chrono::seconds timeout)
: Tracker{n_worders, port, timeout}, host_{host.c_str(), host.size()} {
listener_ = TCPSocket::Create(SockDomain::kV4);
auto rc = listener_.Bind(host, &this->port_);
CHECK(rc.OK()) << rc.Report();
listener_.Listen();
}
explicit RabitTracker(Json const& config);
~RabitTracker() noexcept(false) override = default;
std::future<Result> Run() override;
// Port actually bound (useful when 0 was requested).
[[nodiscard]] std::int32_t Port() const { return port_; }
// Connection arguments each worker passes to the communicator.
[[nodiscard]] Json WorkerArgs() const override {
Json args{Object{}};
args["DMLC_TRACKER_URI"] = String{host_};
args["DMLC_TRACKER_PORT"] = this->Port();
return args;
}
};
// Probe the public IP address of the host; a better method is needed.
//
// This is directly translated from the previous Python implementation. We should find a
// more rigorous approach, which may require some expertise in network programming.
[[nodiscard]] Result GetHostAddress(std::string* out);
} // namespace xgboost::collective

View File

@@ -5,17 +5,16 @@
#ifndef XGBOOST_COMMON_BITFIELD_H_
#define XGBOOST_COMMON_BITFIELD_H_
#include <algorithm>
#include <bitset>
#include <cinttypes>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <algorithm> // for min
#include <bitset> // for bitset
#include <cstdint> // for uint32_t, uint64_t, uint8_t
#include <ostream> // for ostream
#include <type_traits> // for conditional_t, is_signed_v
#if defined(__CUDACC__)
#include <thrust/copy.h>
#include <thrust/device_ptr.h>
#include "device_helpers.cuh"
#elif defined(__HIP_PLATFORM_AMD__)
#include <thrust/copy.h>
@@ -23,8 +22,8 @@
#include "device_helpers.hip.h"
#endif // defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
#include "xgboost/span.h"
#include "common.h"
#include "xgboost/span.h" // for Span
namespace xgboost {
@@ -79,7 +78,7 @@ struct BitFieldContainer {
private:
value_type* bits_{nullptr};
size_type n_values_{0};
static_assert(!std::is_signed<VT>::value, "Must use an unsiged type as the underlying storage.");
static_assert(!std::is_signed_v<VT>, "Must use an unsiged type as the underlying storage.");
public:
XGBOOST_DEVICE static Pos ToBitPos(index_type pos) {
@@ -244,11 +243,39 @@ struct RBitsPolicy : public BitFieldContainer<VT, RBitsPolicy<VT>> {
// Format: <Const><Direction>BitField<size of underlying type in bits>, underlying type
// must be unsigned.
using LBitField64 = BitFieldContainer<uint64_t, LBitsPolicy<uint64_t>>;
using RBitField8 = BitFieldContainer<uint8_t, RBitsPolicy<unsigned char>>;
using LBitField64 = BitFieldContainer<std::uint64_t, LBitsPolicy<std::uint64_t>>;
using RBitField8 = BitFieldContainer<std::uint8_t, RBitsPolicy<unsigned char>>;
using LBitField32 = BitFieldContainer<uint32_t, LBitsPolicy<uint32_t>>;
using CLBitField32 = BitFieldContainer<uint32_t, LBitsPolicy<uint32_t, true>, true>;
using LBitField32 = BitFieldContainer<std::uint32_t, LBitsPolicy<std::uint32_t>>;
using CLBitField32 = BitFieldContainer<std::uint32_t, LBitsPolicy<std::uint32_t, true>, true>;
using RBitField32 = BitFieldContainer<std::uint32_t, RBitsPolicy<std::uint32_t>>;
namespace detail {
inline std::uint32_t TrailingZeroBitsImpl(std::uint32_t value) {
auto n = sizeof(value) * 8;
std::uint32_t cnt{0};
for (decltype(n) i = 0; i < n; i++) {
if ((value >> i) & 1) {
break;
}
cnt++;
}
return cnt;
}
} // namespace detail
inline std::uint32_t TrailingZeroBits(std::uint32_t value) {
if (value == 0) {
return sizeof(value) * 8;
}
#if defined(__GNUC__)
return __builtin_ctz(value);
#elif defined(_MSC_VER)
return _tzcnt_u32(value);
#else
return detail::TrailingZeroBitsImpl(value);
#endif // __GNUC__
}
} // namespace xgboost
#endif // XGBOOST_COMMON_BITFIELD_H_

View File

@@ -6,7 +6,6 @@
#ifndef XGBOOST_COMMON_COMMON_H_
#define XGBOOST_COMMON_COMMON_H_
#include <algorithm> // for max
#include <array> // for array
#include <cmath> // for ceil
#include <cstddef> // for size_t
@@ -203,7 +202,7 @@ inline void SetDevice(std::int32_t device) {
#endif
/**
* Last index of a group in a CSR style of index pointer.
* @brief Last index of a group in a CSR style of index pointer.
*/
template <typename Indexable>
XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) {

View File

@@ -135,7 +135,7 @@ void SortByWeight(dh::device_vector<float>* weights, dh::device_vector<Entry>* s
#endif
}
void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
dh::device_vector<Entry>* p_sorted_entries,
dh::device_vector<float>* p_sorted_weights,
dh::caching_device_vector<size_t>* p_column_sizes_scan) {
@@ -252,13 +252,13 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c
sorted_entries.data().get(), [] __device__(Entry const& e) -> data::COOTuple {
return {0, e.index, e.fvalue}; // row_idx is not needed for scaning column size.
});
detail::GetColumnSizesScan(ctx->Ordinal(), info.num_col_, num_cuts_per_feature,
detail::GetColumnSizesScan(ctx->Device(), info.num_col_, num_cuts_per_feature,
IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr,
&column_sizes_scan);
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
if (sketch_container->HasCategorical()) {
auto p_weight = entry_weight.empty() ? nullptr : &entry_weight;
detail::RemoveDuplicatedCategories(ctx->Ordinal(), info, d_cuts_ptr, &sorted_entries, p_weight,
detail::RemoveDuplicatedCategories(ctx->Device(), info, d_cuts_ptr, &sorted_entries, p_weight,
&column_sizes_scan);
}
@@ -359,7 +359,7 @@ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_b
HistogramCuts cuts;
SketchContainer sketch_container(info.feature_types, max_bin, info.num_col_, info.num_row_,
ctx->Ordinal());
ctx->Device());
CHECK_EQ(has_weight || !hessian.empty(), !d_weight.empty());
for (const auto& page : p_fmat->GetBatches<SparsePage>()) {
std::size_t page_nnz = page.data.Size();

View File

@@ -86,9 +86,9 @@ __global__ void GetColumnSizeSharedMemKernel(IterSpan<BatchIt> batch_iter,
}
template <std::uint32_t kBlockThreads, typename Kernel>
std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) {
std::uint32_t EstimateGridSize(DeviceOrd device, Kernel kernel, std::size_t shared_mem) {
int n_mps = 0;
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device.ordinal));
int n_blocks_per_mp = 0;
dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
kBlockThreads, shared_mem));
@@ -110,11 +110,11 @@ std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t s
* \param out_column_size Output buffer for the size of each column.
*/
template <typename BatchIt, bool force_use_global_memory = false, bool force_use_u64 = false>
void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan<BatchIt> batch_iter,
void LaunchGetColumnSizeKernel(DeviceOrd device, IterSpan<BatchIt> batch_iter,
data::IsValidFunctor is_valid, Span<std::size_t> out_column_size) {
thrust::fill_n(thrust::device, dh::tbegin(out_column_size), out_column_size.size(), 0);
std::size_t max_shared_memory = dh::MaxSharedMemory(device);
std::size_t max_shared_memory = dh::MaxSharedMemory(device.ordinal);
// Not strictly correct as we should use number of samples to determine the type of
// counter. However, the sample size is not known due to sliding window on number of
// elements.
@@ -158,7 +158,7 @@ void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan<BatchIt> batch_iter
}
template <typename BatchIt>
void GetColumnSizesScan(int device, size_t num_columns, std::size_t num_cuts_per_feature,
void GetColumnSizesScan(DeviceOrd device, size_t num_columns, std::size_t num_cuts_per_feature,
IterSpan<BatchIt> batch_iter, data::IsValidFunctor is_valid,
HostDeviceVector<SketchContainer::OffsetT>* cuts_ptr,
dh::caching_device_vector<size_t>* column_sizes_scan) {
@@ -228,7 +228,8 @@ size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz,
// Count the valid entries in each column and copy them out.
template <typename AdapterBatch, typename BatchIter>
void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Range1d range,
float missing, size_t columns, size_t cuts_per_feature, int device,
float missing, size_t columns, size_t cuts_per_feature,
DeviceOrd device,
HostDeviceVector<SketchContainer::OffsetT>* cut_sizes_scan,
dh::caching_device_vector<size_t>* column_sizes_scan,
dh::device_vector<Entry>* sorted_entries) {
@@ -252,7 +253,7 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Ran
void SortByWeight(dh::device_vector<float>* weights,
dh::device_vector<Entry>* sorted_entries);
void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
dh::device_vector<Entry>* p_sorted_entries,
dh::device_vector<float>* p_sorted_weights,
dh::caching_device_vector<size_t>* p_column_sizes_scan);
@@ -290,7 +291,7 @@ inline HistogramCuts DeviceSketch(Context const* ctx, DMatrix* p_fmat, bst_bin_t
template <typename AdapterBatch>
void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
int device, size_t columns, size_t begin, size_t end,
DeviceOrd device, size_t columns, size_t begin, size_t end,
float missing, SketchContainer *sketch_container,
int num_cuts) {
// Copy current subset of valid elements into temporary storage and sort
@@ -335,11 +336,11 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
template <typename Batch>
void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
int num_cuts_per_feature,
bool is_ranking, float missing, int device,
bool is_ranking, float missing, DeviceOrd device,
size_t columns, size_t begin, size_t end,
SketchContainer *sketch_container) {
dh::XGBCachingDeviceAllocator<char> alloc;
dh::safe_cuda(cudaSetDevice(device));
dh::safe_cuda(cudaSetDevice(device.ordinal));
info.weights_.SetDevice(device);
auto weights = info.weights_.ConstDeviceSpan();
@@ -451,14 +452,14 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
size_t num_rows = batch.NumRows();
size_t num_cols = batch.NumCols();
size_t num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(num_bins, num_rows);
int32_t device = sketch_container->DeviceIdx();
auto device = sketch_container->DeviceIdx();
bool weighted = !info.weights_.Empty();
if (weighted) {
sketch_batch_num_elements = detail::SketchBatchNumElements(
sketch_batch_num_elements,
num_rows, num_cols, std::numeric_limits<size_t>::max(),
device, num_cuts_per_feature, true);
device.ordinal, num_cuts_per_feature, true);
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
size_t end =
std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
@@ -471,7 +472,7 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
sketch_batch_num_elements = detail::SketchBatchNumElements(
sketch_batch_num_elements,
num_rows, num_cols, std::numeric_limits<size_t>::max(),
device, num_cuts_per_feature, false);
device.ordinal, num_cuts_per_feature, false);
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
size_t end =
std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));

View File

@@ -33,19 +33,19 @@ struct HostDeviceVectorImpl {
};
template <typename T>
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int)
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(size, v);
}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int)
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init);
}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int)
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd)
: impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init);
}
@@ -81,7 +81,7 @@ template <typename T>
size_t HostDeviceVector<T>::Size() const { return impl_->Vec().size(); }
template <typename T>
int HostDeviceVector<T>::DeviceIdx() const { return -1; }
DeviceOrd HostDeviceVector<T>::Device() const { return DeviceOrd::CPU(); }
template <typename T>
T* HostDeviceVector<T>::DevicePointer() { return nullptr; }
@@ -165,9 +165,6 @@ bool HostDeviceVector<T>::DeviceCanWrite() const {
return false;
}
template <typename T>
void HostDeviceVector<T>::SetDevice(int) const {}
template <typename T>
void HostDeviceVector<T>::SetDevice(DeviceOrd) const {}
@@ -178,6 +175,7 @@ template class HostDeviceVector<GradientPair>;
template class HostDeviceVector<GradientPairPrecise>;
template class HostDeviceVector<int32_t>; // bst_node_t
template class HostDeviceVector<uint8_t>;
template class HostDeviceVector<int8_t>;
template class HostDeviceVector<FeatureType>;
template class HostDeviceVector<Entry>;
template class HostDeviceVector<uint64_t>; // bst_row_t

View File

@@ -25,8 +25,8 @@ void SetCudaSetDeviceHandler(void (*handler)(int)) {
template <typename T>
class HostDeviceVectorImpl {
public:
HostDeviceVectorImpl(size_t size, T v, int device) : device_(device) {
if (device >= 0) {
HostDeviceVectorImpl(size_t size, T v, DeviceOrd device) : device_(device) {
if (device.IsCUDA()) {
gpu_access_ = GPUAccess::kWrite;
SetDevice();
data_d_->resize(size, v);
@@ -37,8 +37,8 @@ class HostDeviceVectorImpl {
// Initializer can be std::vector<T> or std::initializer_list<T>
template <class Initializer>
HostDeviceVectorImpl(const Initializer& init, int device) : device_(device) {
if (device >= 0) {
HostDeviceVectorImpl(const Initializer& init, DeviceOrd device) : device_(device) {
if (device.IsCUDA()) {
gpu_access_ = GPUAccess::kWrite;
LazyResizeDevice(init.size());
Copy(init);
@@ -54,16 +54,16 @@ class HostDeviceVectorImpl {
gpu_access_{that.gpu_access_} {}
~HostDeviceVectorImpl() {
if (device_ >= 0) {
if (device_.IsCUDA()) {
SetDevice();
}
}
size_t Size() const {
[[nodiscard]] size_t Size() const {
return HostCanRead() ? data_h_.size() : data_d_ ? data_d_->size() : 0;
}
int DeviceIdx() const { return device_; }
[[nodiscard]] DeviceOrd Device() const { return device_; }
T* DevicePointer() {
LazySyncDevice(GPUAccess::kWrite);
@@ -138,8 +138,7 @@ class HostDeviceVectorImpl {
} else {
auto ptr = other->ConstDevicePointer();
SetDevice();
CHECK_EQ(this->DeviceIdx(), other->DeviceIdx());
CHECK_EQ(this->Device(), other->Device());
dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size,
ptr,
other->Size() * sizeof(T),
@@ -157,24 +156,25 @@ class HostDeviceVectorImpl {
return data_h_;
}
void SetDevice(int device) {
void SetDevice(DeviceOrd device) {
if (device_ == device) { return; }
if (device_ >= 0) {
if (device_.IsCUDA()) {
LazySyncHost(GPUAccess::kNone);
}
if (device_ >= 0 && device >= 0) {
CHECK_EQ(device_, device) << "New device ordinal is different from previous one.";
if (device_.IsCUDA() && device.IsCUDA()) {
CHECK_EQ(device_.ordinal, device.ordinal)
<< "New device ordinal is different from previous one.";
}
device_ = device;
if (device_ >= 0) {
if (device_.IsCUDA()) {
LazyResizeDevice(data_h_.size());
}
}
void Resize(size_t new_size, T v) {
if (new_size == Size()) { return; }
if ((Size() == 0 && device_ >= 0) || (DeviceCanWrite() && device_ >= 0)) {
if ((Size() == 0 && device_.IsCUDA()) || (DeviceCanWrite() && device_.IsCUDA())) {
// fast on-device resize
gpu_access_ = GPUAccess::kWrite;
SetDevice();
@@ -221,16 +221,16 @@ class HostDeviceVectorImpl {
gpu_access_ = access;
}
bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; }
bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); }
bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); }
bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; }
bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); }
bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); }
GPUAccess Access() const { return gpu_access_; }
[[nodiscard]] bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; }
[[nodiscard]] bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); }
[[nodiscard]] bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); }
[[nodiscard]] bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; }
[[nodiscard]] bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); }
[[nodiscard]] bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); }
[[nodiscard]] GPUAccess Access() const { return gpu_access_; }
private:
int device_{-1};
DeviceOrd device_{DeviceOrd::CPU()};
std::vector<T> data_h_{};
std::unique_ptr<dh::device_vector<T>> data_d_{};
GPUAccess gpu_access_{GPUAccess::kNone};
@@ -264,11 +264,11 @@ class HostDeviceVectorImpl {
}
void SetDevice() {
CHECK_GE(device_, 0);
CHECK_GE(device_.ordinal, 0);
if (cudaSetDeviceHandler == nullptr) {
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
} else {
(*cudaSetDeviceHandler)(device_);
(*cudaSetDeviceHandler)(device_.ordinal);
}
if (!data_d_) {
@@ -278,15 +278,15 @@ class HostDeviceVectorImpl {
};
template<typename T>
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device)
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd device)
: impl_(new HostDeviceVectorImpl<T>(size, v, device)) {}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd device)
: impl_(new HostDeviceVectorImpl<T>(init, device)) {}
template <typename T>
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd device)
: impl_(new HostDeviceVectorImpl<T>(init, device)) {}
template <typename T>
@@ -314,7 +314,9 @@ template <typename T>
size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }
template <typename T>
int HostDeviceVector<T>::DeviceIdx() const { return impl_->DeviceIdx(); }
DeviceOrd HostDeviceVector<T>::Device() const {
return impl_->Device();
}
template <typename T>
T* HostDeviceVector<T>::DevicePointer() {
@@ -394,14 +396,9 @@ GPUAccess HostDeviceVector<T>::DeviceAccess() const {
return impl_->Access();
}
template <typename T>
void HostDeviceVector<T>::SetDevice(int device) const {
impl_->SetDevice(device);
}
template <typename T>
void HostDeviceVector<T>::SetDevice(DeviceOrd device) const {
impl_->SetDevice(device.ordinal);
impl_->SetDevice(device);
}
template <typename T>
@@ -416,6 +413,7 @@ template class HostDeviceVector<GradientPair>;
template class HostDeviceVector<GradientPairPrecise>;
template class HostDeviceVector<int32_t>; // bst_node_t
template class HostDeviceVector<uint8_t>;
template class HostDeviceVector<int8_t>;
template class HostDeviceVector<FeatureType>;
template class HostDeviceVector<Entry>;
template class HostDeviceVector<uint64_t>; // bst_row_t

View File

@@ -8,7 +8,7 @@
#define XGBOOST_COMMON_IO_H_
#include <dmlc/io.h>
#include <rabit/rabit.h>
#include <rabit/internal/io.h> // for MemoryFixSizeBuffer, MemoryBufferStream
#include <algorithm> // for min, fill_n, copy_n
#include <array> // for array
@@ -382,7 +382,8 @@ class PrivateMmapConstStream : public AlignedResourceReadStream {
* @param length See the `length` parameter of `mmap` for details.
*/
explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length)
: AlignedResourceReadStream{std::make_shared<MmapResource>(path, offset, length)} {}
: AlignedResourceReadStream{std::shared_ptr<MmapResource>{ // NOLINT
new MmapResource{std::move(path), offset, length}}} {}
~PrivateMmapConstStream() noexcept(false) override;
};

74
src/common/json_utils.h Normal file
View File

@@ -0,0 +1,74 @@
/**
* Copyright 2023, XGBoost Contributors
*
* @brief Utils tailored for XGBoost.
*/
#pragma once
#include <string> // for string
#include <type_traits> // for enable_if_t, remove_const_t
#include "xgboost/json.h"
#include "xgboost/string_view.h" // for StringView
namespace xgboost {
namespace detail {
// Base case: does the JSON value match the single candidate type `Head`?
template <typename Head>
bool TypeCheckImpl(Json const &value) {
  return IsA<Head>(value);
}

// Recursive case: true if the value matches `Head` or any of the remaining
// candidate types in `JT...` (short-circuits on the first match).
template <typename Head, typename... JT>
std::enable_if_t<sizeof...(JT) != 0, bool> TypeCheckImpl(Json const &value) {
  return IsA<Head>(value) || TypeCheckImpl<JT...>(value);
}
// Base case: render the last candidate type as "`TypeName`" for error messages.
// Uses a default-constructed `Head{}` only to reach its TypeStr() method.
template <typename Head>
std::string TypeCheckError() {
  return "`" + Head{}.TypeStr() + "`";
}

// Recursive case: comma-joined list of all candidate type names,
// e.g. "`Number`, `Integer`". Mirrors the recursion in TypeCheckImpl.
template <typename Head, typename... JT>
std::enable_if_t<sizeof...(JT) != 0, std::string> TypeCheckError() {
  return "`" + Head{}.TypeStr() + "`, " + TypeCheckError<JT...>();
}
} // namespace detail
/**
 * @brief Type check for JSON-based parameters
 *
 * Aborts via LOG(FATAL) when `value` matches none of the expected types,
 * listing the accepted types and the actual type in the error message.
 *
 * @tparam JT    Expected JSON types.
 * @param value  Value to be checked.
 * @param name   Parameter name, used only for the error message.
 */
template <typename... JT>
void TypeCheck(Json const &value, StringView name) {
  if (!detail::TypeCheckImpl<JT...>(value)) {
    LOG(FATAL) << "Invalid type for: `" << name << "`, expecting one of the: {`"
               << detail::TypeCheckError<JT...>() << "}, got: `" << value.GetValue().TypeStr()
               << "`";
  }
}
template <typename JT>
auto const &RequiredArg(Json const &in, StringView key, StringView func) {
auto const &obj = get<Object const>(in);
auto it = obj.find(key);
if (it == obj.cend() || IsA<Null>(it->second)) {
LOG(FATAL) << "Argument `" << key << "` is required for `" << func << "`.";
}
TypeCheck<JT>(it->second, StringView{key});
return get<std::remove_const_t<JT> const>(it->second);
}
/**
 * @brief Fetch an optional argument from a JSON object, with a default.
 *
 * When the key is absent or null the caller-supplied default is returned;
 * otherwise the value is type-checked against `JT` and returned.
 *
 * @tparam JT   Expected JSON type of the argument.
 * @tparam T    Type of the default value.
 * @param in    JSON object holding the arguments.
 * @param key   Argument name to look up.
 * @param dft   Default returned when the argument is missing. Note: a
 *              reference to `dft` itself is returned in that case, so the
 *              argument must outlive the use of the result.
 * @return Const reference to the stored value or to `dft`.
 */
template <typename JT, typename T>
auto const &OptionalArg(Json const &in, StringView key, T const &dft) {
  auto const &dict = get<Object const>(in);
  auto pos = dict.find(key);
  // Guard clause: fall back to the default when the argument is not supplied.
  if (pos == dict.cend() || IsA<Null>(pos->second)) {
    return dft;
  }
  TypeCheck<JT>(pos->second, key);
  return get<std::remove_const_t<JT> const>(pos->second);
}
} // namespace xgboost

View File

@@ -44,7 +44,7 @@ void ElementWiseTransformDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_
template <typename T, int32_t D, typename Fn>
void ElementWiseKernel(Context const* ctx, linalg::TensorView<T, D> t, Fn&& fn) {
ctx->IsCPU() ? ElementWiseKernelHost(t, ctx->Threads(), fn) : ElementWiseKernelDevice(t, fn);
ctx->IsCUDA() ? ElementWiseKernelDevice(t, fn) : ElementWiseKernelHost(t, ctx->Threads(), fn);
}
} // namespace linalg
} // namespace xgboost

View File

@@ -55,7 +55,7 @@ void ElementWiseTransformDevice(linalg::TensorView<T, D>, Fn&&, void* = nullptr)
template <typename T, int32_t D, typename Fn>
void ElementWiseKernel(Context const* ctx, linalg::TensorView<T, D> t, Fn&& fn) {
if (!ctx->IsCPU()) {
if (ctx->IsCUDA()) {
common::AssertGPUSupport();
}
ElementWiseKernelHost(t, ctx->Threads(), fn);

View File

@@ -11,13 +11,14 @@
namespace xgboost {
namespace common {
double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
if (ctx->IsCPU()) {
if (ctx->IsCUDA()) {
return cuda_impl::Reduce(ctx, values);
} else {
auto const& h_values = values.ConstHostVector();
auto result = cpu_impl::Reduce(ctx, h_values.cbegin(), h_values.cend(), 0.0);
static_assert(std::is_same<decltype(result), double>::value);
return result;
}
return cuda_impl::Reduce(ctx, values);
}
} // namespace common
} // namespace xgboost

View File

@@ -8,11 +8,9 @@
#include "xgboost/context.h" // Context
#include "xgboost/host_device_vector.h" // HostDeviceVector
namespace xgboost {
namespace common {
namespace cuda_impl {
namespace xgboost::common::cuda_impl {
double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
values.SetDevice(ctx->gpu_id);
values.SetDevice(ctx->Device());
auto const d_values = values.ConstDeviceSpan();
dh::XGBCachingDeviceAllocator<char> alloc;
@@ -24,6 +22,4 @@ double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
thrust::plus<float>{});
#endif
}
} // namespace cuda_impl
} // namespace common
} // namespace xgboost
} // namespace xgboost::common::cuda_impl

View File

@@ -24,9 +24,9 @@ struct OptionalWeights {
inline OptionalWeights MakeOptionalWeights(Context const* ctx,
HostDeviceVector<float> const& weights) {
if (ctx->IsCUDA()) {
weights.SetDevice(ctx->gpu_id);
weights.SetDevice(ctx->Device());
}
return OptionalWeights{ctx->IsCPU() ? weights.ConstHostSpan() : weights.ConstDeviceSpan()};
return OptionalWeights{ctx->IsCUDA() ? weights.ConstDeviceSpan() : weights.ConstHostSpan()};
}
} // namespace xgboost::common
#endif // XGBOOST_COMMON_OPTIONAL_WEIGHT_H_

View File

@@ -242,11 +242,10 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
// summary does the output element come from) result by definition of merged rank. So we
// run it in 2 passes to obtain the merge path and then customize the standard merge
// algorithm.
void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
void MergeImpl(DeviceOrd device, Span<SketchEntry const> const &d_x,
Span<bst_row_t const> const &x_ptr, Span<SketchEntry const> const &d_y,
Span<bst_row_t const> const &y_ptr, Span<SketchEntry> out, Span<bst_row_t> out_ptr) {
dh::safe_cuda(cudaSetDevice(device));
dh::safe_cuda(cudaSetDevice(device.ordinal));
CHECK_EQ(d_x.size() + d_y.size(), out.size());
CHECK_EQ(x_ptr.size(), out_ptr.size());
CHECK_EQ(y_ptr.size(), out_ptr.size());
@@ -344,8 +343,7 @@ void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
void SketchContainer::Push(Span<Entry const> entries, Span<size_t> columns_ptr,
common::Span<OffsetT> cuts_ptr,
size_t total_cuts, Span<float> weights) {
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
Span<SketchEntry> out;
dh::device_vector<SketchEntry> cuts;
bool first_window = this->Current().empty();
@@ -404,7 +402,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
* pruning or merging. We preserve the first type and remove the second type.
*/
timer_.Start(__func__);
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1);
dh::XGBCachingDeviceAllocator<char> alloc;
@@ -461,7 +459,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
void SketchContainer::Prune(size_t to) {
timer_.Start(__func__);
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
OffsetT to_total = 0;
auto& h_columns_ptr = columns_ptr_b_.HostVector();
@@ -496,8 +494,7 @@ void SketchContainer::Prune(size_t to) {
void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
Span<SketchEntry const> that) {
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
timer_.Start(__func__);
if (this->Current().size() == 0) {
CHECK_EQ(this->columns_ptr_.HostVector().back(), 0);
@@ -532,8 +529,7 @@ void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
}
void SketchContainer::FixError() {
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan();
auto in = dh::ToSpan(this->Current());
dh::LaunchN(in.size(), [=] __device__(size_t idx) {
@@ -558,7 +554,7 @@ void SketchContainer::FixError() {
}
void SketchContainer::AllReduce(bool is_column_split) {
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
auto world = collective::GetWorldSize();
if (world == 1 || is_column_split) {
return;
@@ -585,15 +581,15 @@ void SketchContainer::AllReduce(bool is_column_split) {
auto offset = rank * d_columns_ptr.size();
thrust::copy(thrust::device, d_columns_ptr.data(), d_columns_ptr.data() + d_columns_ptr.size(),
gathered_ptrs.begin() + offset);
collective::AllReduce<collective::Operation::kSum>(device_, gathered_ptrs.data().get(),
collective::AllReduce<collective::Operation::kSum>(device_.ordinal, gathered_ptrs.data().get(),
gathered_ptrs.size());
// Get the data from all workers.
std::vector<size_t> recv_lengths;
dh::caching_device_vector<char> recvbuf;
collective::AllGatherV(device_, this->Current().data().get(),
collective::AllGatherV(device_.ordinal, this->Current().data().get(),
dh::ToSpan(this->Current()).size_bytes(), &recv_lengths, &recvbuf);
collective::Synchronize(device_);
collective::Synchronize(device_.ordinal);
// Segment the received data.
auto s_recvbuf = dh::ToSpan(recvbuf);
@@ -640,7 +636,7 @@ struct InvalidCatOp {
void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
timer_.Start(__func__);
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
p_cuts->min_vals_.Resize(num_columns_);
// Sync between workers.
@@ -690,21 +686,41 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
});
CHECK_EQ(num_columns_, d_in_columns_ptr.size() - 1);
max_values.resize(d_in_columns_ptr.size() - 1);
// In some cases (e.g. column-wise data split), we may have empty columns, so we need to keep
// track of the unique keys (feature indices) after the thrust::reduce_by_key` call.
dh::caching_device_vector<size_t> d_max_keys(d_in_columns_ptr.size() - 1);
dh::caching_device_vector<SketchEntry> d_max_values(d_in_columns_ptr.size() - 1);
#if defined(XGBOOST_USE_CUDA)
thrust::reduce_by_key(thrust::cuda::par(alloc), key_it, key_it + in_cut_values.size(), val_it,
thrust::make_discard_iterator(), d_max_values.begin(),
thrust::equal_to<bst_feature_t>{},
[] __device__(auto l, auto r) { return l.value > r.value ? l : r; });
#elif defined(XGBOOST_USE_HIP)
thrust::reduce_by_key(thrust::hip::par(alloc), key_it, key_it + in_cut_values.size(), val_it,
thrust::make_discard_iterator(), d_max_values.begin(),
thrust::equal_to<bst_feature_t>{},
[] __device__(auto l, auto r) { return l.value > r.value ? l : r; });
#endif
auto new_end = thrust::reduce_by_key(
thrust::cuda::par(alloc), key_it, key_it + in_cut_values.size(), val_it, d_max_keys.begin(),
d_max_values.begin(), thrust::equal_to<bst_feature_t>{},
[] __device__(auto l, auto r) { return l.value > r.value ? l : r; });
d_max_keys.erase(new_end.first, d_max_keys.end());
d_max_values.erase(new_end.second, d_max_values.end());
dh::CopyDeviceSpanToVector(&max_values, dh::ToSpan(d_max_values));
// The device vector needs to be initialized explicitly since we may have some missing columns.
SketchEntry default_entry{};
dh::caching_device_vector<SketchEntry> d_max_results(d_in_columns_ptr.size() - 1,
default_entry);
thrust::scatter(thrust::cuda::par(alloc), d_max_values.begin(), d_max_values.end(),
d_max_keys.begin(), d_max_results.begin());
#elif defined(XGBOOST_USE_HIP)
auto new_end = thrust::reduce_by_key(
thrust::hip::par(alloc), key_it, key_it + in_cut_values.size(), val_it, d_max_keys.begin(),
d_max_values.begin(), thrust::equal_to<bst_feature_t>{},
[] __device__(auto l, auto r) { return l.value > r.value ? l : r; });
d_max_keys.erase(new_end.first, d_max_keys.end());
d_max_values.erase(new_end.second, d_max_values.end());
// The device vector needs to be initialized explicitly since we may have some missing columns.
SketchEntry default_entry{};
dh::caching_device_vector<SketchEntry> d_max_results(d_in_columns_ptr.size() - 1,
default_entry);
thrust::scatter(thrust::hip::par(alloc), d_max_values.begin(), d_max_values.end(),
d_max_keys.begin(), d_max_results.begin());
#endif
dh::CopyDeviceSpanToVector(&max_values, dh::ToSpan(d_max_results));
auto max_it = MakeIndexTransformIter([&](auto i) {
if (IsCat(h_feature_types, i)) {
return max_values[i].value;

View File

@@ -41,7 +41,7 @@ class SketchContainer {
bst_row_t num_rows_;
bst_feature_t num_columns_;
int32_t num_bins_;
int32_t device_;
DeviceOrd device_;
// Double buffer as neither prune nor merge can be performed inplace.
dh::device_vector<SketchEntry> entries_a_;
@@ -93,35 +93,32 @@ class SketchContainer {
* \param num_rows Total number of rows in known dataset (typically the rows in current worker).
* \param device GPU ID.
*/
SketchContainer(HostDeviceVector<FeatureType> const &feature_types,
int32_t max_bin, bst_feature_t num_columns,
bst_row_t num_rows, int32_t device)
: num_rows_{num_rows},
num_columns_{num_columns}, num_bins_{max_bin}, device_{device} {
CHECK_GE(device, 0);
// Initialize Sketches for this dmatrix
this->columns_ptr_.SetDevice(device_);
this->columns_ptr_.Resize(num_columns + 1);
this->columns_ptr_b_.SetDevice(device_);
this->columns_ptr_b_.Resize(num_columns + 1);
SketchContainer(HostDeviceVector<FeatureType> const& feature_types, int32_t max_bin,
bst_feature_t num_columns, bst_row_t num_rows, DeviceOrd device)
: num_rows_{num_rows}, num_columns_{num_columns}, num_bins_{max_bin}, device_{device} {
CHECK(device.IsCUDA());
// Initialize Sketches for this dmatrix
this->columns_ptr_.SetDevice(device_);
this->columns_ptr_.Resize(num_columns + 1);
this->columns_ptr_b_.SetDevice(device_);
this->columns_ptr_b_.Resize(num_columns + 1);
this->feature_types_.Resize(feature_types.Size());
this->feature_types_.Copy(feature_types);
// Pull to device.
this->feature_types_.SetDevice(device);
this->feature_types_.ConstDeviceSpan();
this->feature_types_.ConstHostSpan();
this->feature_types_.Resize(feature_types.Size());
this->feature_types_.Copy(feature_types);
// Pull to device.
this->feature_types_.SetDevice(device);
this->feature_types_.ConstDeviceSpan();
this->feature_types_.ConstHostSpan();
auto d_feature_types = feature_types_.ConstDeviceSpan();
has_categorical_ =
!d_feature_types.empty() &&
thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types),
common::IsCatOp{});
auto d_feature_types = feature_types_.ConstDeviceSpan();
has_categorical_ =
!d_feature_types.empty() &&
thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types), common::IsCatOp{});
timer_.Init(__func__);
}
timer_.Init(__func__);
}
/* \brief Return GPU ID for this container. */
int32_t DeviceIdx() const { return device_; }
[[nodiscard]] DeviceOrd DeviceIdx() const { return device_; }
/* \brief Whether the predictor matrix contains categorical features. */
bool HasCategorical() const { return has_categorical_; }
/* \brief Accumulate weights of duplicated entries in input. */
@@ -175,9 +172,7 @@ class SketchContainer {
template <typename KeyComp = thrust::equal_to<size_t>>
size_t Unique(KeyComp key_comp = thrust::equal_to<size_t>{}) {
timer_.Start(__func__);
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
this->columns_ptr_.SetDevice(device_);
Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan();
CHECK_EQ(d_column_scan.size(), num_columns_ + 1);
@@ -195,7 +190,7 @@ class SketchContainer {
d_column_scan.data() + d_column_scan.size(), entries.data(),
entries.data() + entries.size(), scan_out.DevicePointer(),
entries.data(), detail::SketchUnique{}, key_comp);
#else
#elif defined(XGBOOST_USE_CUDA)
size_t n_uniques = dh::SegmentedUnique(
thrust::cuda::par(alloc), d_column_scan.data(),
d_column_scan.data() + d_column_scan.size(), entries.data(),

View File

@@ -35,13 +35,13 @@ struct WQSummary {
/*! \brief an entry in the sketch summary */
struct Entry {
/*! \brief minimum rank */
RType rmin;
RType rmin{};
/*! \brief maximum rank */
RType rmax;
RType rmax{};
/*! \brief maximum weight */
RType wmin;
RType wmin{};
/*! \brief the value of data */
DType value;
DType value{};
// constructor
XGBOOST_DEVICE Entry() {} // NOLINT
// constructor

View File

@@ -1,19 +1,19 @@
/**
* Copyright 2023 by XGBoost contributors
* Copyright 2023, XGBoost contributors
*/
#include "quantile_loss_utils.h"
#include <cctype> // std::isspace
#include <istream> // std::istream
#include <ostream> // std::ostream
#include <string> // std::string
#include <vector> // std::vector
#include <cctype> // for isspace
#include <istream> // for istream
#include <ostream> // for ostream
#include <string> // for string
#include <vector> // for vector
#include "xgboost/json.h" // F32Array,TypeCheck,get,Number
#include "xgboost/json_io.h" // JsonWriter
#include "../common/json_utils.h" // for TypeCheck
#include "xgboost/json.h" // for F32Array, get, Number
#include "xgboost/json_io.h" // for JsonWriter
namespace xgboost {
namespace common {
namespace xgboost::common {
std::ostream& operator<<(std::ostream& os, const ParamFloatArray& array) {
auto const& t = array.Get();
xgboost::F32Array arr{t.size()};
@@ -70,5 +70,4 @@ std::istream& operator>>(std::istream& is, ParamFloatArray& array) {
}
DMLC_REGISTER_PARAMETER(QuantileLossParam);
} // namespace common
} // namespace xgboost
} // namespace xgboost::common

View File

@@ -197,10 +197,10 @@ class RankingCache {
CHECK_EQ(info.group_ptr_.back(), info.labels.Size())
<< error::GroupSize() << "the size of label.";
}
if (ctx->IsCPU()) {
this->InitOnCPU(ctx, info);
} else {
if (ctx->IsCUDA()) {
this->InitOnCUDA(ctx, info);
} else {
this->InitOnCPU(ctx, info);
}
if (!info.weights_.Empty()) {
CHECK_EQ(Groups(), info.weights_.Size()) << error::GroupWeight();
@@ -218,7 +218,7 @@ class RankingCache {
// Constructed as [1, n_samples] if group ptr is not supplied by the user
common::Span<bst_group_t const> DataGroupPtr(Context const* ctx) const {
group_ptr_.SetDevice(ctx->Device());
return ctx->IsCPU() ? group_ptr_.ConstHostSpan() : group_ptr_.ConstDeviceSpan();
return ctx->IsCUDA() ? group_ptr_.ConstDeviceSpan() : group_ptr_.ConstHostSpan();
}
[[nodiscard]] auto const& Param() const { return param_; }
@@ -231,10 +231,10 @@ class RankingCache {
sorted_idx_cache_.SetDevice(ctx->Device());
sorted_idx_cache_.Resize(predt.size());
}
if (ctx->IsCPU()) {
return this->MakeRankOnCPU(ctx, predt);
} else {
if (ctx->IsCUDA()) {
return this->MakeRankOnCUDA(ctx, predt);
} else {
return this->MakeRankOnCPU(ctx, predt);
}
}
// The function simply returns a uninitialized buffer as this is only used by the
@@ -307,10 +307,10 @@ class NDCGCache : public RankingCache {
public:
NDCGCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
: RankingCache{ctx, info, p} {
if (ctx->IsCPU()) {
this->InitOnCPU(ctx, info);
} else {
if (ctx->IsCUDA()) {
this->InitOnCUDA(ctx, info);
} else {
this->InitOnCPU(ctx, info);
}
}
@@ -318,7 +318,7 @@ class NDCGCache : public RankingCache {
return inv_idcg_.View(ctx->Device());
}
common::Span<double const> Discount(Context const* ctx) const {
return ctx->IsCPU() ? discounts_.ConstHostSpan() : discounts_.ConstDeviceSpan();
return ctx->IsCUDA() ? discounts_.ConstDeviceSpan() : discounts_.ConstHostSpan();
}
linalg::VectorView<double> Dcg(Context const* ctx) {
if (dcg_.Size() == 0) {
@@ -387,10 +387,10 @@ class PreCache : public RankingCache {
public:
PreCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
: RankingCache{ctx, info, p} {
if (ctx->IsCPU()) {
this->InitOnCPU(ctx, info);
} else {
if (ctx->IsCUDA()) {
this->InitOnCUDA(ctx, info);
} else {
this->InitOnCPU(ctx, info);
}
}
@@ -399,7 +399,7 @@ class PreCache : public RankingCache {
pre_.SetDevice(ctx->Device());
pre_.Resize(this->Groups());
}
return ctx->IsCPU() ? pre_.HostSpan() : pre_.DeviceSpan();
return ctx->IsCUDA() ? pre_.DeviceSpan() : pre_.HostSpan();
}
};
@@ -418,10 +418,10 @@ class MAPCache : public RankingCache {
public:
MAPCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
: RankingCache{ctx, info, p}, n_samples_{static_cast<std::size_t>(info.num_row_)} {
if (ctx->IsCPU()) {
this->InitOnCPU(ctx, info);
} else {
if (ctx->IsCUDA()) {
this->InitOnCUDA(ctx, info);
} else {
this->InitOnCPU(ctx, info);
}
}
@@ -430,21 +430,21 @@ class MAPCache : public RankingCache {
n_rel_.SetDevice(ctx->Device());
n_rel_.Resize(n_samples_);
}
return ctx->IsCPU() ? n_rel_.HostSpan() : n_rel_.DeviceSpan();
return ctx->IsCUDA() ? n_rel_.DeviceSpan() : n_rel_.HostSpan();
}
common::Span<double> Acc(Context const* ctx) {
if (acc_.Empty()) {
acc_.SetDevice(ctx->Device());
acc_.Resize(n_samples_);
}
return ctx->IsCPU() ? acc_.HostSpan() : acc_.DeviceSpan();
return ctx->IsCUDA() ? acc_.DeviceSpan() : acc_.HostSpan();
}
common::Span<double> Map(Context const* ctx) {
if (map_.Empty()) {
map_.SetDevice(ctx->Device());
map_.Resize(this->Groups());
}
return ctx->IsCPU() ? map_.HostSpan() : map_.DeviceSpan();
return ctx->IsCUDA() ? map_.DeviceSpan() : map_.HostSpan();
}
};

View File

@@ -76,7 +76,7 @@ class RefResourceView {
[[nodiscard]] size_type size() const { return size_; } // NOLINT
[[nodiscard]] size_type size_bytes() const { // NOLINT
return Span{data(), size()}.size_bytes();
return Span<const value_type>{data(), size()}.size_bytes();
}
[[nodiscard]] value_type* data() { return ptr_; }; // NOLINT
[[nodiscard]] value_type const* data() const { return ptr_; }; // NOLINT

View File

@@ -15,8 +15,7 @@
#include "xgboost/linalg.h" // Tensor, UnravelIndex, Apply
#include "xgboost/logging.h" // CHECK_EQ
namespace xgboost {
namespace common {
namespace xgboost::common {
void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
HostDeviceVector<float> const& weights, linalg::Tensor<float, 1>* out) {
if (!ctx->IsCPU()) {
@@ -46,11 +45,13 @@ void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
}
void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<float>* out) {
v.SetDevice(ctx->gpu_id);
out->SetDevice(ctx->gpu_id);
v.SetDevice(ctx->Device());
out->SetDevice(ctx->Device());
out->Reshape(1);
if (ctx->IsCPU()) {
if (ctx->IsCUDA()) {
cuda_impl::Mean(ctx, v.View(ctx->Device()), out->View(ctx->Device()));
} else {
auto h_v = v.HostView();
float n = v.Size();
MemStackAllocator<float, DefaultMaxThreads()> tloc(ctx->Threads(), 0.0f);
@@ -58,9 +59,6 @@ void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<flo
[&](auto i) { tloc[omp_get_thread_num()] += h_v(i) / n; });
auto ret = std::accumulate(tloc.cbegin(), tloc.cend(), .0f);
out->HostView()(0) = ret;
} else {
cuda_impl::Mean(ctx, v.View(ctx->Device()), out->View(ctx->Device()));
}
}
} // namespace common
} // namespace xgboost
} // namespace xgboost::common

View File

@@ -15,19 +15,16 @@
#include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/linalg.h" // linalg::TensorView, UnravelIndex, Apply
namespace xgboost {
namespace common {
namespace cuda_impl {
#if defined(XGBOOST_USE_HIP)
namespace cub = hipcub;
#endif
namespace xgboost::common::cuda_impl {
void Median(Context const* ctx, linalg::TensorView<float const, 2> t,
common::OptionalWeights weights, linalg::Tensor<float, 1>* out) {
CHECK_GE(t.Shape(1), 1);
HostDeviceVector<std::size_t> segments(t.Shape(1) + 1, 0);
segments.SetDevice(ctx->gpu_id);
segments.SetDevice(ctx->Device());
auto d_segments = segments.DeviceSpan();
dh::LaunchN(d_segments.size(), ctx->CUDACtx()->Stream(),
[=] XGBOOST_DEVICE(std::size_t i) { d_segments[i] = t.Shape(0) * i; });
@@ -36,7 +33,7 @@ void Median(Context const* ctx, linalg::TensorView<float const, 2> t,
return linalg::detail::Apply(t, linalg::UnravelIndex(i, t.Shape()));
});
out->SetDevice(ctx->gpu_id);
out->SetDevice(ctx->Device());
out->Reshape(t.Shape(1));
if (weights.Empty()) {
common::SegmentedQuantile(ctx, 0.5, dh::tcbegin(d_segments), dh::tcend(d_segments), val_it,
@@ -65,6 +62,4 @@ void Mean(Context const* ctx, linalg::VectorView<float const> v, linalg::VectorV
dh::TemporaryArray<char> temp{bytes};
cub::DeviceReduce::Sum(temp.data().get(), bytes, it, out.Values().data(), v.Size(), s);
}
} // namespace cuda_impl
} // namespace common
} // namespace xgboost
} // namespace xgboost::common::cuda_impl

View File

@@ -160,7 +160,7 @@ void SegmentedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_begin, Se
auto d_sorted_idx = dh::ToSpan(sorted_idx);
auto val = thrust::make_permutation_iterator(val_begin, dh::tcbegin(d_sorted_idx));
quantiles->SetDevice(ctx->gpu_id);
quantiles->SetDevice(ctx->Device());
quantiles->Resize(n_segments);
auto d_results = quantiles->DeviceSpan();
@@ -226,7 +226,7 @@ void SegmentedWeightedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_b
#endif
auto n_segments = std::distance(seg_beg, seg_end) - 1;
quantiles->SetDevice(ctx->gpu_id);
quantiles->SetDevice(ctx->Device());
quantiles->Resize(n_segments);
auto d_results = quantiles->DeviceSpan();
auto d_weight_cdf = dh::ToSpan(weights_cdf);

View File

@@ -3,14 +3,23 @@
*/
#include "threading_utils.h"
#include <fstream>
#include <string>
#include <algorithm> // for max
#include <exception> // for exception
#include <filesystem> // for path, exists
#include <fstream> // for ifstream
#include <string> // for string
#include "xgboost/logging.h"
#include "common.h" // for DivRoundUp
namespace xgboost {
namespace common {
int32_t GetCfsCPUCount() noexcept {
namespace xgboost::common {
/**
* Modified from
* github.com/psiha/sweater/blob/master/include/boost/sweater/hardware_concurrency.hpp
*
* MIT License: Copyright (c) 2016 Domagoj Šarić
*/
std::int32_t GetCGroupV1Count(std::filesystem::path const& quota_path,
std::filesystem::path const& peroid_path) {
#if defined(__linux__)
// https://bugs.openjdk.java.net/browse/JDK-8146115
// http://hg.openjdk.java.net/jdk/hs/rev/7f22774a5f42
@@ -31,8 +40,8 @@ int32_t GetCfsCPUCount() noexcept {
}
};
// complete fair scheduler from Linux
auto const cfs_quota(read_int("/sys/fs/cgroup/cpu/cpu.cfs_quota_us"));
auto const cfs_period(read_int("/sys/fs/cgroup/cpu/cpu.cfs_period_us"));
auto const cfs_quota(read_int(quota_path.c_str()));
auto const cfs_period(read_int(peroid_path.c_str()));
if ((cfs_quota > 0) && (cfs_period > 0)) {
return std::max(cfs_quota / cfs_period, 1);
}
@@ -40,6 +49,47 @@ int32_t GetCfsCPUCount() noexcept {
return -1;
}
/**
 * @brief Read the CPU quota from a cgroup v2 bandwidth file (`cpu.max`).
 *
 * The file holds two fields: `$MAX $PERIOD`. When the quota is unrestricted
 * the first field is the literal string "max"; integer extraction then fails
 * and leaves `a` at 0 (stream extraction zeroes the target on failure), so
 * the function falls through to return -1.
 *
 * @param bandwidth_path Path to the cgroup v2 `cpu.max` interface file.
 * @return Effective CPU count (at least 1), or -1 when no quota applies or
 *         the platform is not Linux.
 */
std::int32_t GetCGroupV2Count(std::filesystem::path const& bandwidth_path) noexcept(true) {
  std::int32_t cnt{-1};
#if defined(__linux__)
  std::int32_t a{0}, b{0};

  auto warn = [] { LOG(WARNING) << "Invalid cgroupv2 file."; };
  try {
    std::ifstream fin{bandwidth_path, std::ios::in};
    fin >> a;
    fin >> b;
  } catch (std::exception const&) {
    warn();
    return cnt;
  }
  // Round up so a fractional quota (e.g. 0.5 CPU) still yields one thread.
  if (a > 0 && b > 0) {
    cnt = std::max(common::DivRoundUp(a, b), 1);
  }
#endif  // defined(__linux__)
  return cnt;
}
/**
 * @brief Obtain the CPU count allowed by the completely fair scheduler (CFS).
 *
 * Prefers the cgroup v2 interface (`cpu.max`) and falls back to the two
 * cgroup v1 files when v2 is not mounted.
 *
 * @return Effective CPU count, or -1 when no CFS limit can be determined.
 */
std::int32_t GetCfsCPUCount() noexcept {
  namespace fs = std::filesystem;
  // cgroup v2 exposes quota and period through a single bandwidth file.
  if (fs::path const v2_bandwidth{"/sys/fs/cgroup/cpu.max"}; fs::exists(v2_bandwidth)) {
    return GetCGroupV2Count(v2_bandwidth);
  }
  // cgroup v1 splits the same information across two files.
  fs::path const v1_quota{"/sys/fs/cgroup/cpu/cpu.cfs_quota_us"};
  fs::path const v1_period{"/sys/fs/cgroup/cpu/cpu.cfs_period_us"};
  if (fs::exists(v1_quota) && fs::exists(v1_period)) {
    return GetCGroupV1Count(v1_quota, v1_period);
  }
  return -1;
}
std::int32_t OmpGetNumThreads(std::int32_t n_threads) {
// Don't use parallel if we are in a parallel region.
if (omp_in_parallel()) {
@@ -54,5 +104,4 @@ std::int32_t OmpGetNumThreads(std::int32_t n_threads) {
n_threads = std::max(n_threads, 1);
return n_threads;
}
} // namespace common
} // namespace xgboost
} // namespace xgboost::common

View File

@@ -253,11 +253,6 @@ inline std::int32_t OmpGetThreadLimit() {
* \brief Get thread limit from CFS.
*
* This function has non-trivial overhead and should not be called repeatedly.
*
* Modified from
* github.com/psiha/sweater/blob/master/include/boost/sweater/hardware_concurrency.hpp
*
* MIT License: Copyright (c) 2016 Domagoj Šarić
*/
std::int32_t GetCfsCPUCount() noexcept;

View File

@@ -62,8 +62,8 @@ class Transform {
template <typename Functor>
struct Evaluator {
public:
Evaluator(Functor func, Range range, int32_t n_threads, int32_t device_idx)
: func_(func), range_{std::move(range)}, n_threads_{n_threads}, device_{device_idx} {}
Evaluator(Functor func, Range range, int32_t n_threads, DeviceOrd device)
: func_(func), range_{std::move(range)}, n_threads_{n_threads}, device_{device} {}
/*!
* \brief Evaluate the functor with input pointers to HostDeviceVector.
@@ -73,7 +73,7 @@ class Transform {
*/
template <typename... HDV>
void Eval(HDV... vectors) const {
bool on_device = device_ >= 0;
bool on_device = device_.IsCUDA();
if (on_device) {
LaunchCUDA(func_, vectors...);
@@ -118,11 +118,11 @@ class Transform {
}
// Recursive unpack for Shard.
template <typename T>
void UnpackShard(int device, const HostDeviceVector<T> *vector) const {
void UnpackShard(DeviceOrd device, const HostDeviceVector<T> *vector) const {
vector->SetDevice(device);
}
template <typename Head, typename... Rest>
void UnpackShard(int device,
void UnpackShard(DeviceOrd device,
const HostDeviceVector<Head> *_vector,
const HostDeviceVector<Rest> *... _vectors) const {
_vector->SetDevice(device);
@@ -142,13 +142,7 @@ class Transform {
// granularity is used in data vector.
size_t shard_size = range_size;
Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_));
#elif defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_));
#endif
dh::safe_cuda(cudaSetDevice(device_.ordinal));
const int kGrids =
static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
if (kGrids == 0) {
@@ -182,7 +176,7 @@ class Transform {
/*! \brief Range object specifying parallel threads index range. */
Range range_;
int32_t n_threads_;
int32_t device_;
DeviceOrd device_;
};
public:
@@ -200,8 +194,8 @@ class Transform {
*/
template <typename Functor>
static Evaluator<Functor> Init(Functor func, Range const range, int32_t n_threads,
int32_t device_idx) {
return Evaluator<Functor>{func, std::move(range), n_threads, device_idx};
DeviceOrd device) {
return Evaluator<Functor>{func, std::move(range), n_threads, device};
}
};

View File

@@ -20,7 +20,6 @@ namespace xgboost {
DMLC_REGISTER_PARAMETER(Context);
bst_d_ordinal_t constexpr Context::kCpuId;
std::int64_t constexpr Context::kDefaultSeed;
Context::Context() : cfs_cpu_count_{common::GetCfsCPUCount()} {}
@@ -82,7 +81,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
return std::nullopt;
}
std::int32_t parsed_id{Context::kCpuId};
std::int32_t parsed_id{DeviceOrd::CPUOrdinal()};
auto res = std::from_chars(ordinal.c_str(), ordinal.c_str() + ordinal.size(), parsed_id);
if (res.ec != std::errc()) {
return std::nullopt;
@@ -119,7 +118,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
auto split_it = std::find(s_device.cbegin(), s_device.cend(), ':');
DeviceOrd device;
device.ordinal = Context::InvalidOrdinal(); // mark it invalid for check.
device.ordinal = DeviceOrd::InvalidOrdinal(); // mark it invalid for check.
if (split_it == s_device.cend()) {
// no ordinal.
if (s_device == DeviceSym::CPU()) {
@@ -147,7 +146,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
device = DeviceOrd::CUDA(opt_id.value());
}
if (device.ordinal < Context::kCpuId) {
if (device.ordinal < DeviceOrd::CPUOrdinal()) {
fatal();
}
device = CUDAOrdinal(device, fail_on_invalid_gpu_id);
@@ -156,6 +155,28 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
}
} // namespace
// Stream insertion for a device ordinal; prints the device's name.
std::ostream& operator<<(std::ostream& os, DeviceOrd ord) {
  return os << ord.Name();
}
// Initialize the context from keyword arguments; any key not recognized by
// the parameter definition is reported as a fatal internal error.
void Context::Init(Args const& kwargs) {
  auto unknown = this->UpdateAllowUnknown(kwargs);
  if (unknown.empty()) {
    return;
  }
  // Collect every unrecognized key into a single message before aborting.
  std::stringstream ss;
  ss << "[Internal Error] Unknown parameters passed to the Context {";
  bool first = true;
  for (auto const& [k, _] : unknown) {
    if (!first) {
      ss << ", ";
    }
    first = false;
    ss << '"' << k << '"';
  }
  ss << "}\n";
  LOG(FATAL) << ss.str();
}
void Context::ConfigureGpuId(bool require_gpu) {
if (this->IsCPU() && require_gpu) {
this->UpdateAllowUnknown(Args{{kDevice, DeviceSym::CUDA()}});
@@ -178,7 +199,7 @@ void Context::SetDeviceOrdinal(Args const& kwargs) {
error::WarnDeprecatedGPUId();
auto opt_id = ParseInt(StringView{gpu_id_it->second});
CHECK(opt_id.has_value()) << "Invalid value for `gpu_id`. Got:" << gpu_id_it->second;
if (opt_id.value() > Context::kCpuId) {
if (opt_id.value() > DeviceOrd::CPUOrdinal()) {
this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CUDA(opt_id.value()).Name()}});
} else {
this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CPU().Name()}});
@@ -194,9 +215,9 @@ void Context::SetDeviceOrdinal(Args const& kwargs) {
this->SetDevice(new_d);
if (this->IsCPU()) {
CHECK_EQ(this->device_.ordinal, kCpuId);
CHECK_EQ(this->device_.ordinal, DeviceOrd::CPUOrdinal());
} else {
CHECK_GT(this->device_.ordinal, kCpuId);
CHECK_GT(this->device_.ordinal, DeviceOrd::CPUOrdinal());
}
}

28
src/data/adapter.cc Normal file
View File

@@ -0,0 +1,28 @@
/**
* Copyright 2019-2023, XGBoost Contributors
*/
#include "adapter.h"
#include "../c_api/c_api_error.h" // for API_BEGIN, API_END
#include "xgboost/c_api.h"
namespace xgboost::data {
template <typename DataIterHandle, typename XGBCallbackDataIterNext, typename XGBoostBatchCSR>
bool IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>::Next() {
  // Advance the external (e.g. JVM-provided) iterator by one CSR batch.
  // `next_callback_` invokes the lambda below with the next batch and returns
  // non-zero while more data is available.
  if ((*next_callback_)(
          data_handle_,
          [](void *handle, XGBoostBatchCSR batch) -> int {
            // C-API boundary: translate any exception into an error code.
            API_BEGIN();
            static_cast<IteratorAdapter *>(handle)->SetData(batch);
            API_END();
          },
          this) != 0) {
    at_first_ = false;  // at least one batch consumed; BeforeFirst() is now illegal
    return true;
  } else {
    return false;
  }
}

// Explicit instantiation for the concrete callback types used by the C API.
template class IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>;
} // namespace xgboost::data

View File

@@ -1,5 +1,5 @@
/*!
* Copyright (c) 2019~2021 by Contributors
/**
* Copyright 2019-2023, XGBoost Contributors
* \file adapter.h
*/
#ifndef XGBOOST_DATA_ADAPTER_H_
@@ -16,11 +16,9 @@
#include <utility> // std::move
#include <vector>
#include "../c_api/c_api_error.h"
#include "../common/error_msg.h" // for MaxFeatureSize
#include "../common/math.h"
#include "array_interface.h"
#include "arrow-cdi.h"
#include "xgboost/base.h"
#include "xgboost/data.h"
#include "xgboost/logging.h"
@@ -743,8 +741,10 @@ class FileAdapter : dmlc::DataIter<FileAdapterBatch> {
dmlc::Parser<uint32_t>* parser_;
};
/*! \brief Data iterator that takes callback to return data, used in JVM package for
* accepting data iterator. */
/**
* @brief Data iterator that takes callback to return data, used in JVM package for accepting data
* iterator.
*/
template <typename DataIterHandle, typename XGBCallbackDataIterNext, typename XGBoostBatchCSR>
class IteratorAdapter : public dmlc::DataIter<FileAdapterBatch> {
public:
@@ -758,23 +758,9 @@ class IteratorAdapter : public dmlc::DataIter<FileAdapterBatch> {
CHECK(at_first_) << "Cannot reset IteratorAdapter";
}
bool Next() override {
if ((*next_callback_)(
data_handle_,
[](void *handle, XGBoostBatchCSR batch) -> int {
API_BEGIN();
static_cast<IteratorAdapter *>(handle)->SetData(batch);
API_END();
},
this) != 0) {
at_first_ = false;
return true;
} else {
return false;
}
}
[[nodiscard]] bool Next() override;
FileAdapterBatch const& Value() const override {
[[nodiscard]] FileAdapterBatch const& Value() const override {
return *batch_.get();
}
@@ -822,12 +808,12 @@ class IteratorAdapter : public dmlc::DataIter<FileAdapterBatch> {
block_.index = dmlc::BeginPtr(index_);
block_.value = dmlc::BeginPtr(value_);
batch_.reset(new FileAdapterBatch(&block_, row_offset_));
batch_ = std::make_unique<FileAdapterBatch>(&block_, row_offset_);
row_offset_ += offset_.size() - 1;
}
size_t NumColumns() const { return columns_; }
size_t NumRows() const { return kAdapterUnknownSize; }
[[nodiscard]] std::size_t NumColumns() const { return columns_; }
[[nodiscard]] std::size_t NumRows() const { return kAdapterUnknownSize; }
private:
std::vector<size_t> offset_;
@@ -849,356 +835,6 @@ class IteratorAdapter : public dmlc::DataIter<FileAdapterBatch> {
std::unique_ptr<FileAdapterBatch> batch_;
};
// Primitive element types an Arrow column may hold; values are mapped from
// Arrow format strings by ArrowSchemaImporter::FormatMap.
enum ColumnDType : uint8_t {
  kUnknown,
  kInt8,
  kUInt8,
  kInt16,
  kUInt16,
  kInt32,
  kUInt32,
  kInt64,
  kUInt64,
  kFloat,
  kDouble
};
// Abstract base for a single Arrow column: stores the column's index, length,
// null count, and validity bitmap.  Typed element access is implemented by
// the PrimitiveColumn<T> subclasses.
class Column {
 public:
  Column() = default;
  Column(size_t col_idx, size_t length, size_t null_count, const uint8_t* bitmap)
      : col_idx_{col_idx}, length_{length}, null_count_{null_count}, bitmap_{bitmap} {}

  virtual ~Column() = default;

  // Columns reference externally-owned Arrow buffers; copying/moving is
  // disallowed to avoid aliasing those buffers.
  Column(const Column&) = delete;
  Column& operator=(const Column&) = delete;
  Column(Column&&) = delete;
  Column& operator=(Column&&) = delete;

  // whether the valid bit is set for this element; a null bitmap means every
  // element is treated as valid.
  bool IsValid(size_t row_idx) const {
    return (!bitmap_ || (bitmap_[row_idx/8] & (1 << (row_idx%8))));
  }

  // Typed accessors implemented per concrete value type.
  virtual COOTuple GetElement(size_t row_idx) const = 0;
  virtual bool IsValidElement(size_t row_idx) const = 0;
  virtual std::vector<float> AsFloatVector() const = 0;
  virtual std::vector<uint64_t> AsUint64Vector() const = 0;

  // Number of rows in this column.
  size_t Length() const { return length_; }

 protected:
  size_t col_idx_;
  size_t length_;
  size_t null_count_;
  const uint8_t* bitmap_;  // validity bitmap, 1 bit per row; may be null
};
// Only columns of primitive types are supported. An ArrowColumnarBatch is a
// collection of std::shared_ptr<PrimitiveColumn>. These columns can be of different data types.
// Hence, PrimitiveColumn is a class template; and all concrete PrimitiveColumns
// derive from the abstract class Column.
// A column whose elements are a single primitive type T.  Invalid or missing
// elements are surfaced as NaN through GetElement.
template <typename T>
class PrimitiveColumn : public Column {
  static constexpr float kNaN = std::numeric_limits<float>::quiet_NaN();

 public:
  PrimitiveColumn(size_t idx, size_t length, size_t null_count,
                  const uint8_t* bitmap, const T* data, float missing)
      : Column{idx, length, null_count, bitmap}, data_{data}, missing_{missing} {}

  COOTuple GetElement(size_t row_idx) const override {
    CHECK(data_ && row_idx < length_) << "Column is empty or out-of-bound index of the column";
    float value = IsValidElement(row_idx) ? static_cast<float>(data_[row_idx]) : kNaN;
    return {row_idx, col_idx_, value};
  }

  bool IsValidElement(size_t row_idx) const override {
    if (!IsValid(row_idx)) {
      return false;
    }
    T const raw = data_[row_idx];
    // std::isfinite needs to cast to double to prevent msvc report error
    return std::isfinite(static_cast<double>(raw)) && static_cast<float>(raw) != missing_;
  }

  std::vector<float> AsFloatVector() const override {
    CHECK(data_) << "Column is empty";
    std::vector<float> converted;
    converted.reserve(length_);
    for (size_t i = 0; i < length_; ++i) {
      converted.push_back(static_cast<float>(data_[i]));
    }
    return converted;
  }

  std::vector<uint64_t> AsUint64Vector() const override {
    CHECK(data_) << "Column is empty";
    std::vector<uint64_t> converted;
    converted.reserve(length_);
    for (size_t i = 0; i < length_; ++i) {
      converted.push_back(static_cast<uint64_t>(data_[i]));
    }
    return converted;
  }

 private:
  const T* data_;
  float missing_;  // user specified missing value
};
// Per-column metadata gathered while importing an Arrow schema.
struct ColumnarMetaInfo {
  // data type of the column
  ColumnDType type{ColumnDType::kUnknown};
  // location of the column in an Arrow record batch; -1 means not present
  // (CreateColumn skips such columns).
  int64_t loc{-1};
};
// Imports an Arrow schema (Arrow C data interface) once and records, for each
// child column, its element type and position in subsequent record batches.
struct ArrowSchemaImporter {
  std::vector<ColumnarMetaInfo> columns;

  // map Arrow format strings to types; only the leading character matters for
  // the primitive formats supported here.
  static ColumnDType FormatMap(char const* format_str) {
    CHECK(format_str) << "Format string cannot be empty";
    switch (format_str[0]) {
      case 'c':
        return ColumnDType::kInt8;
      case 'C':
        return ColumnDType::kUInt8;
      case 's':
        return ColumnDType::kInt16;
      case 'S':
        return ColumnDType::kUInt16;
      case 'i':
        return ColumnDType::kInt32;
      case 'I':
        return ColumnDType::kUInt32;
      case 'l':
        return ColumnDType::kInt64;
      case 'L':
        return ColumnDType::kUInt64;
      case 'f':
        return ColumnDType::kFloat;
      case 'g':
        return ColumnDType::kDouble;
      default:
        CHECK(false) << "Column data type not supported by XGBoost";
        // Unreachable after CHECK(false); keeps compilers happy.
        return ColumnDType::kUnknown;
    }
  }

  // Import the top-level schema (no-op when `schema` is null).  Releases the
  // exported schema after copying what is needed, per the Arrow C data
  // interface ownership contract.
  void Import(struct ArrowSchema *schema) {
    if (schema) {
      // "+s" is Arrow's format string for a struct type, i.e. a record batch
      // of named child columns.
      CHECK(std::string(schema->format) == "+s");  // NOLINT
      CHECK(columns.empty());
      for (auto i = 0; i < schema->n_children; ++i) {
        std::string name{schema->children[i]->name};
        ColumnDType type = FormatMap(schema->children[i]->format);
        ColumnarMetaInfo col_info{type, i};
        columns.push_back(col_info);
      }
      if (schema->release) {
        schema->release(schema);
      }
    }
  }
};
// Wraps one Arrow record batch (Arrow C data interface) and materializes its
// children as typed PrimitiveColumn objects.  Owns the release callback of
// the imported ArrowArray (released exactly once, in the destructor).
class ArrowColumnarBatch {
 public:
  ArrowColumnarBatch(struct ArrowArray *rb, struct ArrowSchemaImporter* schema)
      : rb_{rb}, schema_{schema} {
    CHECK(rb_) << "Cannot import non-existent record batch";
    CHECK(!schema_->columns.empty()) << "Cannot import record batch without a schema";
  }

  // Build typed columns and the per-row element offsets; returns the number
  // of valid elements in the batch.
  size_t Import(float missing) {
    auto& infov = schema_->columns;
    for (size_t i = 0; i < infov.size(); ++i) {
      columns_.push_back(CreateColumn(i, infov[i], missing));
    }

    // Compute the starting location for every row in this batch
    auto batch_size = rb_->length;
    auto num_columns = columns_.size();
    row_offsets_.resize(batch_size + 1, 0);
    for (auto i = 0; i < batch_size; ++i) {
      row_offsets_[i+1] = row_offsets_[i];
      for (size_t j = 0; j < num_columns; ++j) {
        // Only valid (non-null, finite, non-missing) elements are counted.
        if (GetColumn(j).IsValidElement(i)) {
          row_offsets_[i+1]++;
        }
      }
    }
    // return number of elements in the batch
    return row_offsets_.back();
  }

  // Non-copyable/non-movable: the destructor must release rb_ exactly once.
  ArrowColumnarBatch(const ArrowColumnarBatch&) = delete;
  ArrowColumnarBatch& operator=(const ArrowColumnarBatch&) = delete;
  ArrowColumnarBatch(ArrowColumnarBatch&&) = delete;
  ArrowColumnarBatch& operator=(ArrowColumnarBatch&&) = delete;

  virtual ~ArrowColumnarBatch() {
    // Honor the Arrow C data interface: invoke the producer's release
    // callback and null the pointer so it can't be released twice.
    if (rb_ && rb_->release) {
      rb_->release(rb_);
      rb_ = nullptr;
    }
    columns_.clear();
  }

  // Number of rows in the batch.
  size_t Size() const { return rb_ ? rb_->length : 0; }
  size_t NumColumns() const { return columns_.size(); }
  // Number of valid elements; only meaningful after Import().
  size_t NumElements() const { return row_offsets_.back(); }

  const Column& GetColumn(size_t col_idx) const {
    return *columns_[col_idx];
  }

  // Offset this batch's row offsets by the element count of preceding batches.
  void ShiftRowOffsets(size_t batch_offset) {
    std::transform(row_offsets_.begin(), row_offsets_.end(), row_offsets_.begin(),
                   [=](size_t c) { return c + batch_offset; });
  }

  const std::vector<size_t>& RowOffsets() const { return row_offsets_; }

 private:
  // Construct a typed column view over child `info.loc` of the record batch.
  // Returns nullptr for absent columns (loc < 0) or unsupported types.
  std::shared_ptr<Column> CreateColumn(size_t idx,
                                       ColumnarMetaInfo info,
                                       float missing) const {
    if (info.loc < 0) {
      return nullptr;
    }
    auto loc_in_batch = info.loc;
    auto length = rb_->length;
    auto null_count = rb_->null_count;
    // Arrow primitive layout: buffer 0 is the validity bitmap, buffer 1 the
    // values; either may be null.
    auto buffers0 = rb_->children[loc_in_batch]->buffers[0];
    auto buffers1 = rb_->children[loc_in_batch]->buffers[1];
    const uint8_t* bitmap = buffers0 ? reinterpret_cast<const uint8_t*>(buffers0) : nullptr;
    const uint8_t* data = buffers1 ? reinterpret_cast<const uint8_t*>(buffers1) : nullptr;

    // if null_count is not computed (Arrow uses -1 for "unknown"), compute it
    // here by counting cleared validity bits.
    if (null_count < 0) {
      if (!bitmap) {
        null_count = 0;
      } else {
        null_count = length;
        for (auto i = 0; i < length; ++i) {
          if (bitmap[i/8] & (1 << (i%8))) {
            null_count--;
          }
        }
      }
    }

    switch (info.type) {
      case ColumnDType::kInt8:
        return std::make_shared<PrimitiveColumn<int8_t>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const int8_t*>(data), missing);
      case ColumnDType::kUInt8:
        return std::make_shared<PrimitiveColumn<uint8_t>>(
            idx, length, null_count, bitmap, data, missing);
      case ColumnDType::kInt16:
        return std::make_shared<PrimitiveColumn<int16_t>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const int16_t*>(data), missing);
      case ColumnDType::kUInt16:
        return std::make_shared<PrimitiveColumn<uint16_t>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const uint16_t*>(data), missing);
      case ColumnDType::kInt32:
        return std::make_shared<PrimitiveColumn<int32_t>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const int32_t*>(data), missing);
      case ColumnDType::kUInt32:
        return std::make_shared<PrimitiveColumn<uint32_t>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const uint32_t*>(data), missing);
      case ColumnDType::kInt64:
        return std::make_shared<PrimitiveColumn<int64_t>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const int64_t*>(data), missing);
      case ColumnDType::kUInt64:
        return std::make_shared<PrimitiveColumn<uint64_t>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const uint64_t*>(data), missing);
      case ColumnDType::kFloat:
        return std::make_shared<PrimitiveColumn<float>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const float*>(data), missing);
      case ColumnDType::kDouble:
        return std::make_shared<PrimitiveColumn<double>>(
            idx, length, null_count, bitmap,
            reinterpret_cast<const double*>(data), missing);
      default:
        return nullptr;
    }
  }

  struct ArrowArray* rb_;            // imported record batch; owned until release
  struct ArrowSchemaImporter* schema_;  // non-owning; shared by all batches
  std::vector<std::shared_ptr<Column>> columns_;
  std::vector<size_t> row_offsets_;  // cumulative valid-element count per row
};
using ArrowColumnarBatchVec = std::vector<std::unique_ptr<ArrowColumnarBatch>>;

// Adapter that pulls Arrow record batches through an XGDMatrixCallbackNext
// callback, accumulating up to `nbatches_` of them per call to Next().
class RecordBatchesIterAdapter: public dmlc::DataIter<ArrowColumnarBatchVec> {
 public:
  RecordBatchesIterAdapter(XGDMatrixCallbackNext* next_callback, int nbatch)
      : next_callback_{next_callback}, nbatches_{nbatch} {}

  // Single-pass iterator: rewinding after the first batch is an error.
  void BeforeFirst() override {
    CHECK(at_first_) << "Cannot reset RecordBatchesIterAdapter";
  }

  // Collect up to nbatches_ record batches; returns false once the producer
  // is exhausted and nothing was collected.
  bool Next() override {
    batches_.clear();
    while (batches_.size() < static_cast<size_t>(nbatches_) && (*next_callback_)(this) != 0) {
      at_first_ = false;
    }
    if (batches_.size() > 0) {
      return true;
    } else {
      return false;
    }
  }

  // Called back by the producer with the next record batch and (possibly) a schema.
  void SetData(struct ArrowArray* rb, struct ArrowSchema* schema) {
    // Schema is only imported once at the beginning, regardless of how many
    // batches are coming.
    // But even if the schema is not imported, we still need to release its C
    // data exported from Arrow.
    if (at_first_ && schema) {
      schema_.Import(schema);
    } else {
      if (schema && schema->release) {
        schema->release(schema);
      }
    }
    if (rb) {
      batches_.push_back(std::make_unique<ArrowColumnarBatch>(rb, &schema_));
    }
  }

  const ArrowColumnarBatchVec& Value() const override {
    return batches_;
  }

  size_t NumColumns() const { return schema_.columns.size(); }
  // Total row count is unknown until the stream is fully consumed.
  size_t NumRows() const { return kAdapterUnknownSize; }

 private:
  XGDMatrixCallbackNext *next_callback_;
  bool at_first_{true};  // still before the first batch; guards schema import and reset
  int nbatches_;
  struct ArrowSchemaImporter schema_;
  ArrowColumnarBatchVec batches_;
};
class SparsePageAdapterBatch {
HostSparsePageView page_;

View File

@@ -16,7 +16,7 @@
#include <utility>
#include <vector>
#include "../common/bitfield.h"
#include "../common/bitfield.h" // for RBitField8
#include "../common/common.h"
#include "../common/error_msg.h" // for NoF128
#include "xgboost/base.h"
@@ -106,7 +106,20 @@ struct ArrayInterfaceErrors {
*/
class ArrayInterfaceHandler {
public:
enum Type : std::int8_t { kF2, kF4, kF8, kF16, kI1, kI2, kI4, kI8, kU1, kU2, kU4, kU8 };
enum Type : std::int8_t {
kF2 = 0,
kF4 = 1,
kF8 = 2,
kF16 = 3,
kI1 = 4,
kI2 = 5,
kI4 = 6,
kI8 = 7,
kU1 = 8,
kU2 = 9,
kU4 = 10,
kU8 = 11,
};
template <typename PtrType>
static PtrType GetPtrFromArrayData(Object::Map const &obj) {
@@ -589,6 +602,57 @@ class ArrayInterface {
ArrayInterfaceHandler::Type type{ArrayInterfaceHandler::kF16};
};
/**
 * @brief Invoke `dispatch` with a default-constructed value of the C++ type
 *        matching a runtime array dtype.
 *
 * @param dtype    Runtime type tag from the array interface.
 * @param dispatch Callable invocable with any of the supported element types.
 * @return Whatever `dispatch` returns; a value-initialized result for the
 *         unreachable fall-through path.
 */
template <typename Fn>
auto DispatchDType(ArrayInterfaceHandler::Type dtype, Fn dispatch) {
  switch (dtype) {
    case ArrayInterfaceHandler::kF2: {
#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
      return dispatch(__half{});
#else
      // No host-side half type available without a GPU toolchain.
      LOG(FATAL) << "half type is only supported for CUDA input.";
      break;
#endif
    }
    case ArrayInterfaceHandler::kF4: {
      return dispatch(float{});
    }
    case ArrayInterfaceHandler::kF8: {
      return dispatch(double{});
    }
    case ArrayInterfaceHandler::kF16: {
      using T = long double;
      // long double must actually be 128-bit to represent f16-widened data.
      CHECK(sizeof(T) == 16) << error::NoF128();
      return dispatch(T{});
    }
    case ArrayInterfaceHandler::kI1: {
      return dispatch(std::int8_t{});
    }
    case ArrayInterfaceHandler::kI2: {
      return dispatch(std::int16_t{});
    }
    case ArrayInterfaceHandler::kI4: {
      return dispatch(std::int32_t{});
    }
    case ArrayInterfaceHandler::kI8: {
      return dispatch(std::int64_t{});
    }
    case ArrayInterfaceHandler::kU1: {
      return dispatch(std::uint8_t{});
    }
    case ArrayInterfaceHandler::kU2: {
      return dispatch(std::uint16_t{});
    }
    case ArrayInterfaceHandler::kU4: {
      return dispatch(std::uint32_t{});
    }
    case ArrayInterfaceHandler::kU8: {
      return dispatch(std::uint64_t{});
    }
  }
  // Unreachable for valid enum values.  std::result_of_t is deprecated in
  // C++17 and removed in C++20; use std::invoke_result_t instead.
  return std::invoke_result_t<Fn, std::int8_t>();
}
template <std::int32_t D, typename Fn>
void DispatchDType(ArrayInterface<D> const array, DeviceOrd device, Fn fn) {
// Only used for cuDF at the moment.
@@ -604,60 +668,7 @@ void DispatchDType(ArrayInterface<D> const array, DeviceOrd device, Fn fn) {
std::numeric_limits<std::size_t>::max()},
array.shape, array.strides, device});
};
switch (array.type) {
case ArrayInterfaceHandler::kF2: {
#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
dispatch(__half{});
#endif
break;
}
case ArrayInterfaceHandler::kF4: {
dispatch(float{});
break;
}
case ArrayInterfaceHandler::kF8: {
dispatch(double{});
break;
}
case ArrayInterfaceHandler::kF16: {
using T = long double;
CHECK(sizeof(long double) == 16) << error::NoF128();
dispatch(T{});
break;
}
case ArrayInterfaceHandler::kI1: {
dispatch(std::int8_t{});
break;
}
case ArrayInterfaceHandler::kI2: {
dispatch(std::int16_t{});
break;
}
case ArrayInterfaceHandler::kI4: {
dispatch(std::int32_t{});
break;
}
case ArrayInterfaceHandler::kI8: {
dispatch(std::int64_t{});
break;
}
case ArrayInterfaceHandler::kU1: {
dispatch(std::uint8_t{});
break;
}
case ArrayInterfaceHandler::kU2: {
dispatch(std::uint16_t{});
break;
}
case ArrayInterfaceHandler::kU4: {
dispatch(std::uint32_t{});
break;
}
case ArrayInterfaceHandler::kU8: {
dispatch(std::uint64_t{});
break;
}
}
DispatchDType(array.type, dispatch);
}
/**

View File

@@ -1,66 +0,0 @@
/* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#pragma once
#include <cstdint>
#ifdef __cplusplus
extern "C" {
#endif
#define ARROW_FLAG_DICTIONARY_ORDERED 1
#define ARROW_FLAG_NULLABLE 2
#define ARROW_FLAG_MAP_KEYS_SORTED 4
// Arrow C data interface ABI struct.  The field layout is fixed by the Arrow
// specification and must not be altered.
struct ArrowSchema {
  // Array type description
  const char* format;
  const char* name;
  const char* metadata;
  int64_t flags;
  int64_t n_children;
  struct ArrowSchema** children;
  struct ArrowSchema* dictionary;
  // Release callback; the consumer must call it exactly once when done.
  void (*release)(struct ArrowSchema*);
  // Opaque producer-specific data
  void* private_data;
};
// Arrow C data interface ABI struct for array data.  Layout is fixed by the
// Arrow specification and must not be altered.
struct ArrowArray {
  // Array data description
  int64_t length;
  // null_count may be -1 meaning "not computed yet".
  int64_t null_count;
  int64_t offset;
  int64_t n_buffers;
  int64_t n_children;
  const void** buffers;
  struct ArrowArray** children;
  struct ArrowArray* dictionary;
  // Release callback; the consumer must call it exactly once when done.
  void (*release)(struct ArrowArray*);
  // Opaque producer-specific data
  void* private_data;
};
#ifdef __cplusplus
}
#endif

View File

@@ -635,22 +635,39 @@ void MetaInfo::GetInfo(char const* key, bst_ulong* out_len, DataType dtype,
}
void MetaInfo::SetFeatureInfo(const char* key, const char **info, const bst_ulong size) {
if (size != 0 && this->num_col_ != 0) {
if (size != 0 && this->num_col_ != 0 && !IsColumnSplit()) {
CHECK_EQ(size, this->num_col_) << "Length of " << key << " must be equal to number of columns.";
CHECK(info);
}
if (!std::strcmp(key, "feature_type")) {
feature_type_names.clear();
auto& h_feature_types = feature_types.HostVector();
for (size_t i = 0; i < size; ++i) {
auto elem = info[i];
feature_type_names.emplace_back(elem);
}
if (IsColumnSplit()) {
feature_type_names = collective::AllgatherStrings(feature_type_names);
CHECK_EQ(feature_type_names.size(), num_col_)
<< "Length of " << key << " must be equal to number of columns.";
}
auto& h_feature_types = feature_types.HostVector();
LoadFeatureType(feature_type_names, &h_feature_types);
} else if (!std::strcmp(key, "feature_name")) {
feature_names.clear();
for (size_t i = 0; i < size; ++i) {
feature_names.emplace_back(info[i]);
if (IsColumnSplit()) {
std::vector<std::string> local_feature_names{};
auto const rank = collective::GetRank();
for (std::size_t i = 0; i < size; ++i) {
auto elem = std::to_string(rank) + "." + info[i];
local_feature_names.emplace_back(elem);
}
feature_names = collective::AllgatherStrings(local_feature_names);
CHECK_EQ(feature_names.size(), num_col_)
<< "Length of " << key << " must be equal to number of columns.";
} else {
feature_names.clear();
for (size_t i = 0; i < size; ++i) {
feature_names.emplace_back(info[i]);
}
}
} else {
LOG(FATAL) << "Unknown feature info name: " << key;
@@ -687,13 +704,13 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
linalg::Stack(&this->labels, that.labels);
this->weights_.SetDevice(that.weights_.DeviceIdx());
this->weights_.SetDevice(that.weights_.Device());
this->weights_.Extend(that.weights_);
this->labels_lower_bound_.SetDevice(that.labels_lower_bound_.DeviceIdx());
this->labels_lower_bound_.SetDevice(that.labels_lower_bound_.Device());
this->labels_lower_bound_.Extend(that.labels_lower_bound_);
this->labels_upper_bound_.SetDevice(that.labels_upper_bound_.DeviceIdx());
this->labels_upper_bound_.SetDevice(that.labels_upper_bound_.Device());
this->labels_upper_bound_.Extend(that.labels_upper_bound_);
linalg::Stack(&this->base_margin_, that.base_margin_);
@@ -723,13 +740,13 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
}
if (!that.feature_weights.Empty()) {
this->feature_weights.Resize(that.feature_weights.Size());
this->feature_weights.SetDevice(that.feature_weights.DeviceIdx());
this->feature_weights.SetDevice(that.feature_weights.Device());
this->feature_weights.Copy(that.feature_weights);
}
}
void MetaInfo::SynchronizeNumberOfColumns() {
if (IsVerticalFederated()) {
if (IsColumnSplit()) {
collective::Allreduce<collective::Operation::kSum>(&num_col_, 1);
} else {
collective::Allreduce<collective::Operation::kMax>(&num_col_, 1);
@@ -738,22 +755,22 @@ void MetaInfo::SynchronizeNumberOfColumns() {
namespace {
template <typename T>
void CheckDevice(std::int32_t device, HostDeviceVector<T> const& v) {
bool valid = v.Device().IsCPU() || device == Context::kCpuId || v.DeviceIdx() == device;
void CheckDevice(DeviceOrd device, HostDeviceVector<T> const& v) {
bool valid = v.Device().IsCPU() || device.IsCPU() || v.Device() == device;
if (!valid) {
LOG(FATAL) << "Invalid device ordinal. Data is associated with a different device ordinal than "
"the booster. The device ordinal of the data is: "
<< v.DeviceIdx() << "; the device ordinal of the Booster is: " << device;
<< v.Device() << "; the device ordinal of the Booster is: " << device;
}
}
template <typename T, std::int32_t D>
void CheckDevice(std::int32_t device, linalg::Tensor<T, D> const& v) {
void CheckDevice(DeviceOrd device, linalg::Tensor<T, D> const& v) {
CheckDevice(device, *v.Data());
}
} // anonymous namespace
void MetaInfo::Validate(std::int32_t device) const {
void MetaInfo::Validate(DeviceOrd device) const {
if (group_ptr_.size() != 0 && weights_.Size() != 0) {
CHECK_EQ(group_ptr_.size(), weights_.Size() + 1) << error::GroupWeight();
return;
@@ -850,14 +867,6 @@ DMatrix* TryLoadBinary(std::string fname, bool silent) {
} // namespace
DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_split_mode) {
auto need_split = false;
if (collective::IsFederated()) {
LOG(CONSOLE) << "XGBoost federated mode detected, not splitting data among workers";
} else if (collective::IsDistributed()) {
LOG(CONSOLE) << "XGBoost distributed mode detected, will split data among workers";
need_split = true;
}
std::string fname, cache_file;
auto dlm_pos = uri.find('#');
if (dlm_pos != std::string::npos) {
@@ -865,24 +874,6 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
fname = uri.substr(0, dlm_pos);
CHECK_EQ(cache_file.find('#'), std::string::npos)
<< "Only one `#` is allowed in file path for cache file specification.";
if (need_split && data_split_mode == DataSplitMode::kRow) {
std::ostringstream os;
std::vector<std::string> cache_shards = common::Split(cache_file, ':');
for (size_t i = 0; i < cache_shards.size(); ++i) {
size_t pos = cache_shards[i].rfind('.');
if (pos == std::string::npos) {
os << cache_shards[i] << ".r" << collective::GetRank() << "-"
<< collective::GetWorldSize();
} else {
os << cache_shards[i].substr(0, pos) << ".r" << collective::GetRank() << "-"
<< collective::GetWorldSize() << cache_shards[i].substr(pos, cache_shards[i].length());
}
if (i + 1 != cache_shards.size()) {
os << ':';
}
}
cache_file = os.str();
}
} else {
fname = uri;
}
@@ -894,19 +885,7 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
}
int partid = 0, npart = 1;
if (need_split && data_split_mode == DataSplitMode::kRow) {
partid = collective::GetRank();
npart = collective::GetWorldSize();
} else {
// test option to load in part
npart = 1;
}
if (npart != 1) {
LOG(CONSOLE) << "Load part of data " << partid << " of " << npart << " parts";
}
DMatrix* dmat{nullptr};
DMatrix* dmat{};
if (cache_file.empty()) {
fname = data::ValidateFileFormat(fname);
@@ -916,6 +895,8 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
dmat = DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), Context{}.Threads(),
cache_file, data_split_mode);
} else {
CHECK(data_split_mode != DataSplitMode::kCol)
<< "Column-wise data split is not supported for external memory.";
data::FileIterator iter{fname, static_cast<uint32_t>(partid), static_cast<uint32_t>(npart)};
dmat = new data::SparsePageDMatrix{&iter,
iter.Proxy(),
@@ -926,17 +907,7 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
cache_file};
}
if (need_split && data_split_mode == DataSplitMode::kCol) {
if (!cache_file.empty()) {
LOG(FATAL) << "Column-wise data split is not support for external memory.";
}
LOG(CONSOLE) << "Splitting data by column";
auto* sliced = dmat->SliceCol(npart, partid);
delete dmat;
return sliced;
} else {
return dmat;
}
return dmat;
}
template <typename DataIterHandle, typename DMatrixHandle, typename DataIterResetCallback,
@@ -1011,9 +982,6 @@ template DMatrix* DMatrix::Create<data::CSCArrayAdapter>(data::CSCArrayAdapter*
template DMatrix* DMatrix::Create(
data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>* adapter,
float missing, int nthread, const std::string& cache_prefix, DataSplitMode data_split_mode);
template DMatrix* DMatrix::Create<data::RecordBatchesIterAdapter>(
data::RecordBatchesIterAdapter* adapter, float missing, int nthread, const std::string&,
DataSplitMode data_split_mode);
SparsePage SparsePage::GetTranspose(int num_columns, int32_t n_threads) const {
SparsePage transpose;

View File

@@ -33,13 +33,13 @@ template <typename T, int32_t D>
void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tensor<T, D>* p_out) {
ArrayInterface<D> array(arr_interface);
if (array.n == 0) {
p_out->SetDevice(0);
p_out->SetDevice(DeviceOrd::CUDA(0));
p_out->Reshape(array.shape);
return;
}
CHECK_EQ(array.valid.Capacity(), 0)
<< "Meta info like label or weight can not have missing value.";
auto ptr_device = SetDeviceToPtr(array.data);
auto ptr_device = DeviceOrd::CUDA(SetDeviceToPtr(array.data));
p_out->SetDevice(ptr_device);
if (array.is_contiguous && array.type == ToDType<T>::kType) {
@@ -55,7 +55,7 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
return;
}
p_out->Reshape(array.shape);
auto t = p_out->View(DeviceOrd::CUDA(ptr_device));
auto t = p_out->View(ptr_device);
linalg::ElementWiseTransformDevice(
t,
[=] __device__(size_t i, T) {
@@ -91,7 +91,7 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector<bst_group_t>* p_
});
dh::caching_device_vector<bool> flag(1);
auto d_flag = dh::ToSpan(flag);
auto d = SetDeviceToPtr(array_interface.data);
auto d = DeviceOrd::CUDA(SetDeviceToPtr(array_interface.data));
dh::LaunchN(1, [=] __device__(size_t) { d_flag[0] = true; });
dh::LaunchN(array_interface.Shape(0) - 1, [=] __device__(size_t i) {
auto typed = TypedIndex<uint32_t, 1>{array_interface};

View File

@@ -28,8 +28,8 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
CudfAdapterBatch(common::Span<ArrayInterface<1>> columns, size_t num_rows)
: columns_(columns),
num_rows_(num_rows) {}
size_t Size() const { return num_rows_ * columns_.size(); }
__device__ __forceinline__ COOTuple GetElement(size_t idx) const {
[[nodiscard]] std::size_t Size() const { return num_rows_ * columns_.size(); }
[[nodiscard]] __device__ __forceinline__ COOTuple GetElement(size_t idx) const {
size_t column_idx = idx % columns_.size();
size_t row_idx = idx / columns_.size();
auto const& column = columns_[column_idx];
@@ -39,7 +39,7 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
return {row_idx, column_idx, value};
}
__device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
[[nodiscard]] __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
auto const& column = columns_[fidx];
float value = column.valid.Data() == nullptr || column.valid.Check(ridx)
? column(ridx)
@@ -47,8 +47,8 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
return value;
}
XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; }
XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); }
[[nodiscard]] XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; }
[[nodiscard]] XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); }
private:
common::Span<ArrayInterface<1>> columns_;
@@ -120,16 +120,14 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
return;
}
device_idx_ = dh::CudaGetPointerDevice(first_column.data);
CHECK_NE(device_idx_, Context::kCpuId);
dh::safe_cuda(cudaSetDevice(device_idx_));
device_ = DeviceOrd::CUDA(dh::CudaGetPointerDevice(first_column.data));
CHECK(device_.IsCUDA());
dh::safe_cuda(cudaSetDevice(device_.ordinal));
for (auto& json_col : json_columns) {
auto column = ArrayInterface<1>(get<Object const>(json_col));
columns.push_back(column);
num_rows_ = std::max(num_rows_, column.Shape(0));
CHECK_EQ(device_idx_, dh::CudaGetPointerDevice(column.data))
CHECK_EQ(device_.ordinal, dh::CudaGetPointerDevice(column.data))
<< "All columns should use the same device.";
CHECK_EQ(num_rows_, column.Shape(0))
<< "All columns should have same number of rows.";
@@ -145,15 +143,15 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
return batch_;
}
size_t NumRows() const { return num_rows_; }
size_t NumColumns() const { return columns_.size(); }
int32_t DeviceIdx() const { return device_idx_; }
[[nodiscard]] std::size_t NumRows() const { return num_rows_; }
[[nodiscard]] std::size_t NumColumns() const { return columns_.size(); }
[[nodiscard]] DeviceOrd Device() const { return device_; }
private:
CudfAdapterBatch batch_;
dh::device_vector<ArrayInterface<1>> columns_;
size_t num_rows_{0};
int32_t device_idx_{Context::kCpuId};
DeviceOrd device_{DeviceOrd::CPU()};
};
class CupyAdapterBatch : public detail::NoMetaInfo {
@@ -161,22 +159,22 @@ class CupyAdapterBatch : public detail::NoMetaInfo {
CupyAdapterBatch() = default;
explicit CupyAdapterBatch(ArrayInterface<2> array_interface)
: array_interface_(std::move(array_interface)) {}
size_t Size() const {
[[nodiscard]] std::size_t Size() const {
return array_interface_.Shape(0) * array_interface_.Shape(1);
}
__device__ COOTuple GetElement(size_t idx) const {
[[nodiscard]]__device__ COOTuple GetElement(size_t idx) const {
size_t column_idx = idx % array_interface_.Shape(1);
size_t row_idx = idx / array_interface_.Shape(1);
float value = array_interface_(row_idx, column_idx);
return {row_idx, column_idx, value};
}
__device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
[[nodiscard]] __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
float value = array_interface_(ridx, fidx);
return value;
}
XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); }
XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); }
[[nodiscard]] XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); }
[[nodiscard]] XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); }
private:
ArrayInterface<2> array_interface_;
@@ -191,29 +189,28 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {
if (array_interface_.Shape(0) == 0) {
return;
}
device_idx_ = dh::CudaGetPointerDevice(array_interface_.data);
CHECK_NE(device_idx_, Context::kCpuId);
device_ = DeviceOrd::CUDA(dh::CudaGetPointerDevice(array_interface_.data));
CHECK(device_.IsCUDA());
}
explicit CupyAdapter(std::string cuda_interface_str)
: CupyAdapter{StringView{cuda_interface_str}} {}
const CupyAdapterBatch& Value() const override { return batch_; }
[[nodiscard]] const CupyAdapterBatch& Value() const override { return batch_; }
size_t NumRows() const { return array_interface_.Shape(0); }
size_t NumColumns() const { return array_interface_.Shape(1); }
int32_t DeviceIdx() const { return device_idx_; }
[[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape(0); }
[[nodiscard]] std::size_t NumColumns() const { return array_interface_.Shape(1); }
[[nodiscard]] DeviceOrd Device() const { return device_; }
private:
ArrayInterface<2> array_interface_;
CupyAdapterBatch batch_;
int32_t device_idx_ {Context::kCpuId};
DeviceOrd device_{DeviceOrd::CPU()};
};
// Returns maximum row length
template <typename AdapterBatchT>
std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, int device_idx,
std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, DeviceOrd device,
float missing) {
dh::safe_cuda(cudaSetDevice(device_idx));
dh::safe_cuda(cudaSetDevice(device.ordinal));
IsValidFunctor is_valid(missing);
dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));

View File

@@ -98,23 +98,18 @@ __global__ void CompressBinEllpackKernel(
}
// Construct an ELLPACK matrix with the given number of empty rows.
EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
bool is_dense, size_t row_stride,
size_t n_rows)
: is_dense(is_dense),
cuts_(std::move(cuts)),
row_stride(row_stride),
n_rows(n_rows) {
EllpackPageImpl::EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, bool is_dense,
size_t row_stride, size_t n_rows)
: is_dense(is_dense), cuts_(std::move(cuts)), row_stride(row_stride), n_rows(n_rows) {
monitor_.Init("ellpack_page");
dh::safe_cuda(cudaSetDevice(device));
dh::safe_cuda(cudaSetDevice(device.ordinal));
monitor_.Start("InitCompressedData");
InitCompressedData(device);
monitor_.Stop("InitCompressedData");
}
EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
EllpackPageImpl::EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts,
const SparsePage &page, bool is_dense,
size_t row_stride,
common::Span<FeatureType const> feature_types)
@@ -128,7 +123,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& param)
: is_dense(dmat->IsDense()) {
monitor_.Init("ellpack_page");
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
n_rows = dmat->Info().num_row_;
@@ -143,15 +138,15 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchP
monitor_.Stop("Quantiles");
monitor_.Start("InitCompressedData");
this->InitCompressedData(ctx->gpu_id);
this->InitCompressedData(ctx->Device());
monitor_.Stop("InitCompressedData");
dmat->Info().feature_types.SetDevice(ctx->gpu_id);
dmat->Info().feature_types.SetDevice(ctx->Device());
auto ft = dmat->Info().feature_types.ConstDeviceSpan();
monitor_.Start("BinningCompression");
CHECK(dmat->SingleColBlock());
for (const auto& batch : dmat->GetBatches<SparsePage>()) {
CreateHistIndices(ctx->gpu_id, batch, ft);
CreateHistIndices(ctx->Device(), batch, ft);
}
monitor_.Stop("BinningCompression");
}
@@ -214,7 +209,7 @@ struct TupleScanOp {
// to remove missing data
template <typename AdapterBatchT>
void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType const> feature_types,
EllpackPageImpl* dst, int device_idx, float missing) {
EllpackPageImpl* dst, DeviceOrd device, float missing) {
// Some witchcraft happens here
// The goal is to copy valid elements out of the input to an ELLPACK matrix
// with a given row stride, using no extra working memory Standard stream
@@ -246,7 +241,7 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
// Tuple[2] = The index in the input data
using Tuple = thrust::tuple<size_t, size_t, size_t>;
auto device_accessor = dst->GetDeviceAccessor(device_idx);
auto device_accessor = dst->GetDeviceAccessor(device);
common::CompressedBufferWriter writer(device_accessor.NumSymbols());
auto d_compressed_buffer = dst->gidx_buffer.DevicePointer();
@@ -298,10 +293,9 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
#endif
}
void WriteNullValues(EllpackPageImpl* dst, int device_idx,
common::Span<size_t> row_counts) {
void WriteNullValues(EllpackPageImpl* dst, DeviceOrd device, common::Span<size_t> row_counts) {
// Write the null values
auto device_accessor = dst->GetDeviceAccessor(device_idx);
auto device_accessor = dst->GetDeviceAccessor(device);
common::CompressedBufferWriter writer(device_accessor.NumSymbols());
auto d_compressed_buffer = dst->gidx_buffer.DevicePointer();
auto row_stride = dst->row_stride;
@@ -318,11 +312,11 @@ void WriteNullValues(EllpackPageImpl* dst, int device_idx,
}
template <typename AdapterBatch>
EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device, bool is_dense,
EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense,
common::Span<size_t> row_counts_span,
common::Span<FeatureType const> feature_types, size_t row_stride,
size_t n_rows, common::HistogramCuts const& cuts) {
dh::safe_cuda(cudaSetDevice(device));
dh::safe_cuda(cudaSetDevice(device.ordinal));
*this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows);
CopyDataToEllpack(batch, feature_types, this, device, missing);
@@ -331,7 +325,7 @@ EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device,
#define ELLPACK_BATCH_SPECIALIZE(__BATCH_T) \
template EllpackPageImpl::EllpackPageImpl( \
__BATCH_T batch, float missing, int device, bool is_dense, \
__BATCH_T batch, float missing, DeviceOrd device, bool is_dense, \
common::Span<size_t> row_counts_span, common::Span<FeatureType const> feature_types, \
size_t row_stride, size_t n_rows, common::HistogramCuts const& cuts);
@@ -388,9 +382,9 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
[&](size_t i) { return page.row_ptr[i + 1] - page.row_ptr[i]; });
row_stride = *std::max_element(it, it + page.Size());
CHECK_GE(ctx->gpu_id, 0);
CHECK(ctx->IsCUDA());
monitor_.Start("InitCompressedData");
InitCompressedData(ctx->gpu_id);
InitCompressedData(ctx->Device());
monitor_.Stop("InitCompressedData");
// copy gidx
@@ -400,7 +394,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft);
auto accessor = this->GetDeviceAccessor(ctx->Device(), ft);
auto null = accessor.NullValue();
CopyGHistToEllpack(page, d_row_ptr, row_stride, d_compressed_buffer, null);
}
@@ -425,8 +419,7 @@ struct CopyPage {
};
// Copy the data from the given EllpackPage to the current page.
size_t EllpackPageImpl::Copy(int device, EllpackPageImpl const *page,
size_t offset) {
size_t EllpackPageImpl::Copy(DeviceOrd device, EllpackPageImpl const* page, size_t offset) {
monitor_.Start("Copy");
size_t num_elements = page->n_rows * page->row_stride;
CHECK_EQ(row_stride, page->row_stride);
@@ -486,7 +479,7 @@ struct CompactPage {
};
// Compacts the data from the given EllpackPage into the current page.
void EllpackPageImpl::Compact(int device, EllpackPageImpl const* page,
void EllpackPageImpl::Compact(DeviceOrd device, EllpackPageImpl const* page,
common::Span<size_t> row_indexes) {
monitor_.Start("Compact");
CHECK_EQ(row_stride, page->row_stride);
@@ -499,13 +492,12 @@ void EllpackPageImpl::Compact(int device, EllpackPageImpl const* page,
}
// Initialize the buffer to stored compressed features.
void EllpackPageImpl::InitCompressedData(int device) {
void EllpackPageImpl::InitCompressedData(DeviceOrd device) {
size_t num_symbols = NumSymbols();
// Required buffer size for storing data matrix in ELLPack format.
size_t compressed_size_bytes =
common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows,
num_symbols);
common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, num_symbols);
gidx_buffer.SetDevice(device);
// Don't call fill unnecessarily
if (gidx_buffer.Size() == 0) {
@@ -517,7 +509,7 @@ void EllpackPageImpl::InitCompressedData(int device) {
}
// Compress a CSR page into ELLPACK.
void EllpackPageImpl::CreateHistIndices(int device,
void EllpackPageImpl::CreateHistIndices(DeviceOrd device,
const SparsePage& row_batch,
common::Span<FeatureType const> feature_types) {
if (row_batch.Size() == 0) return;
@@ -527,7 +519,7 @@ void EllpackPageImpl::CreateHistIndices(int device,
// bin and compress entries in batches of rows
size_t gpu_batch_nrows =
std::min(dh::TotalMemory(device) / (16 * row_stride * sizeof(Entry)),
std::min(dh::TotalMemory(device.ordinal) / (16 * row_stride * sizeof(Entry)),
static_cast<size_t>(row_batch.Size()));
size_t gpu_nbatches = common::DivRoundUp(row_batch.Size(), gpu_batch_nrows);
@@ -592,7 +584,7 @@ size_t EllpackPageImpl::MemCostBytes(size_t num_rows, size_t row_stride,
}
EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(
int device, common::Span<FeatureType const> feature_types) const {
DeviceOrd device, common::Span<FeatureType const> feature_types) const {
gidx_buffer.SetDevice(device);
return {device,
cuts_,
@@ -606,7 +598,7 @@ EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(
}
EllpackDeviceAccessor EllpackPageImpl::GetHostAccessor(
common::Span<FeatureType const> feature_types) const {
return {Context::kCpuId,
return {DeviceOrd::CPU(),
cuts_,
is_dense,
row_stride,

View File

@@ -35,16 +35,17 @@ struct EllpackDeviceAccessor {
common::Span<const FeatureType> feature_types;
EllpackDeviceAccessor(int device, const common::HistogramCuts& cuts,
bool is_dense, size_t row_stride, size_t base_rowid,
size_t n_rows,common::CompressedIterator<uint32_t> gidx_iter,
EllpackDeviceAccessor(DeviceOrd device, const common::HistogramCuts& cuts, bool is_dense,
size_t row_stride, size_t base_rowid, size_t n_rows,
common::CompressedIterator<uint32_t> gidx_iter,
common::Span<FeatureType const> feature_types)
: is_dense(is_dense),
row_stride(row_stride),
base_rowid(base_rowid),
n_rows(n_rows) ,gidx_iter(gidx_iter),
n_rows(n_rows),
gidx_iter(gidx_iter),
feature_types{feature_types} {
if (device == Context::kCpuId) {
if (device.IsCPU()) {
gidx_fvalue_map = cuts.cut_values_.ConstHostSpan();
feature_segments = cuts.cut_ptrs_.ConstHostSpan();
min_fvalue = cuts.min_vals_.ConstHostSpan();
@@ -59,7 +60,7 @@ struct EllpackDeviceAccessor {
}
// Get a matrix element, uses binary search for look up Return NaN if missing
// Given a row index and a feature index, returns the corresponding cut value
__device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const {
[[nodiscard]] __device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const {
ridx -= base_rowid;
auto row_begin = row_stride * ridx;
auto row_end = row_begin + row_stride;
@@ -77,7 +78,7 @@ struct EllpackDeviceAccessor {
}
template <bool is_cat>
__device__ uint32_t SearchBin(float value, size_t column_id) const {
[[nodiscard]] __device__ uint32_t SearchBin(float value, size_t column_id) const {
auto beg = feature_segments[column_id];
auto end = feature_segments[column_id + 1];
uint32_t idx = 0;
@@ -99,7 +100,7 @@ struct EllpackDeviceAccessor {
return idx;
}
__device__ bst_float GetFvalue(size_t ridx, size_t fidx) const {
[[nodiscard]] __device__ bst_float GetFvalue(size_t ridx, size_t fidx) const {
auto gidx = GetBinIndex(ridx, fidx);
if (gidx == -1) {
return nan("");
@@ -108,18 +109,18 @@ struct EllpackDeviceAccessor {
}
// Check if the row id is within range of the current batch.
__device__ bool IsInRange(size_t row_id) const {
[[nodiscard]] __device__ bool IsInRange(size_t row_id) const {
return row_id >= base_rowid && row_id < base_rowid + n_rows;
}
/*! \brief Return the total number of symbols (total number of bins plus 1 for
* not found). */
XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; }
[[nodiscard]] XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; }
XGBOOST_DEVICE size_t NullValue() const { return gidx_fvalue_map.size(); }
[[nodiscard]] XGBOOST_DEVICE size_t NullValue() const { return gidx_fvalue_map.size(); }
XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); }
[[nodiscard]] XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); }
XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); }
[[nodiscard]] XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); }
};
@@ -141,14 +142,13 @@ class EllpackPageImpl {
* This is used in the sampling case. The ELLPACK page is constructed from an existing EllpackInfo
* and the given number of rows.
*/
EllpackPageImpl(int device, common::HistogramCuts cuts, bool is_dense,
size_t row_stride, size_t n_rows);
EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, bool is_dense, size_t row_stride,
size_t n_rows);
/*!
* \brief Constructor used for external memory.
*/
EllpackPageImpl(int device, common::HistogramCuts cuts,
const SparsePage &page, bool is_dense, size_t row_stride,
common::Span<FeatureType const> feature_types);
EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, const SparsePage& page,
bool is_dense, size_t row_stride, common::Span<FeatureType const> feature_types);
/*!
* \brief Constructor from an existing DMatrix.
@@ -159,7 +159,7 @@ class EllpackPageImpl {
explicit EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& parm);
template <typename AdapterBatch>
explicit EllpackPageImpl(AdapterBatch batch, float missing, int device, bool is_dense,
explicit EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense,
common::Span<size_t> row_counts_span,
common::Span<FeatureType const> feature_types, size_t row_stride,
size_t n_rows, common::HistogramCuts const& cuts);
@@ -176,7 +176,7 @@ class EllpackPageImpl {
* @param offset The number of elements to skip before copying.
* @returns The number of elements copied.
*/
size_t Copy(int device, EllpackPageImpl const *page, size_t offset);
size_t Copy(DeviceOrd device, EllpackPageImpl const *page, size_t offset);
/*! \brief Compact the given ELLPACK page into the current page.
*
@@ -184,11 +184,10 @@ class EllpackPageImpl {
* @param page The ELLPACK page to compact from.
* @param row_indexes Row indexes for the compacted page.
*/
void Compact(int device, EllpackPageImpl const* page, common::Span<size_t> row_indexes);
void Compact(DeviceOrd device, EllpackPageImpl const* page, common::Span<size_t> row_indexes);
/*! \return Number of instances in the page. */
size_t Size() const;
[[nodiscard]] size_t Size() const;
/*! \brief Set the base row id for this page. */
void SetBaseRowId(std::size_t row_id) {
@@ -204,12 +203,12 @@ class EllpackPageImpl {
/*! \brief Return the total number of symbols (total number of bins plus 1 for
* not found). */
size_t NumSymbols() const { return cuts_.TotalBins() + 1; }
[[nodiscard]] std::size_t NumSymbols() const { return cuts_.TotalBins() + 1; }
EllpackDeviceAccessor
GetDeviceAccessor(int device,
common::Span<FeatureType const> feature_types = {}) const;
EllpackDeviceAccessor GetHostAccessor(common::Span<FeatureType const> feature_types = {}) const;
[[nodiscard]] EllpackDeviceAccessor GetDeviceAccessor(
DeviceOrd device, common::Span<FeatureType const> feature_types = {}) const;
[[nodiscard]] EllpackDeviceAccessor GetHostAccessor(
common::Span<FeatureType const> feature_types = {}) const;
private:
/*!
@@ -218,13 +217,13 @@ class EllpackPageImpl {
* @param device The GPU device to use.
* @param row_batch The CSR page.
*/
void CreateHistIndices(int device,
void CreateHistIndices(DeviceOrd device,
const SparsePage& row_batch,
common::Span<FeatureType const> feature_types);
/*!
* \brief Initialize the buffer to store compressed features.
*/
void InitCompressedData(int device);
void InitCompressedData(DeviceOrd device);
public:

View File

@@ -10,7 +10,7 @@
namespace xgboost::data {
void EllpackPageSource::Fetch() {
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
if (!this->ReadCache()) {
if (count_ != 0 && !sync_) {
// source is initialized to be the 0th page during construction, so when count_ is 0

View File

@@ -23,14 +23,14 @@ class EllpackPageSource : public PageSourceIncMixIn<EllpackPage> {
BatchParam param_;
common::Span<FeatureType const> feature_types_;
std::unique_ptr<common::HistogramCuts> cuts_;
std::int32_t device_;
DeviceOrd device_;
public:
EllpackPageSource(float missing, int nthreads, bst_feature_t n_features, size_t n_batches,
std::shared_ptr<Cache> cache, BatchParam param,
std::unique_ptr<common::HistogramCuts> cuts, bool is_dense, size_t row_stride,
common::Span<FeatureType const> feature_types,
std::shared_ptr<SparsePageSource> source, std::int32_t device)
std::shared_ptr<SparsePageSource> source, DeviceOrd device)
: PageSourceIncMixIn(missing, nthreads, n_features, n_batches, cache, false),
is_dense_{is_dense},
row_stride_{row_stride},
@@ -50,6 +50,7 @@ inline void EllpackPageSource::Fetch() {
// silent the warning about unused variables.
(void)(row_stride_);
(void)(is_dense_);
(void)(device_);
common::AssertGPUSupport();
}
#endif // !defined(XGBOOST_USE_CUDA)

View File

@@ -36,8 +36,7 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro
auto pctx = MakeProxy(proxy_)->Ctx();
Context ctx;
ctx.UpdateAllowUnknown(
Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}});
ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}});
// hardcoded parameter.
BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()};
@@ -139,7 +138,7 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p,
return HostAdapterDispatch(proxy, [&](auto const& value) {
size_t n_threads = ctx->Threads();
size_t n_features = column_sizes.size();
linalg::Tensor<std::size_t, 2> column_sizes_tloc({n_threads, n_features}, Context::kCpuId);
linalg::Tensor<std::size_t, 2> column_sizes_tloc({n_threads, n_features}, DeviceOrd::CPU());
column_sizes_tloc.Data()->Fill(0ul);
auto view = column_sizes_tloc.HostView();
common::ParallelFor(value.Size(), n_threads, common::Sched::Static(256), [&](auto i) {

View File

@@ -48,10 +48,9 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
int32_t current_device;
dh::safe_cuda(cudaGetDevice(&current_device));
auto get_device = [&]() -> int32_t {
std::int32_t d = (ctx->gpu_id == Context::kCpuId) ? current_device : ctx->gpu_id;
CHECK_NE(d, Context::kCpuId);
auto get_device = [&]() {
auto d = (ctx->IsCPU()) ? DeviceOrd::CUDA(current_device) : ctx->Device();
CHECK(!d.IsCPU());
return d;
};
@@ -61,11 +60,8 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
common::HistogramCuts cuts;
do {
// We use do while here as the first batch is fetched in ctor
// ctx_.gpu_id = proxy->DeviceIdx();
CHECK_LT(ctx->gpu_id, common::AllVisibleGPUs());
dh::safe_cuda(cudaSetDevice(get_device()));
CHECK_LT(ctx->Ordinal(), common::AllVisibleGPUs());
dh::safe_cuda(cudaSetDevice(get_device().ordinal));
if (cols == 0) {
cols = num_cols();
collective::Allreduce<collective::Operation::kMax>(&cols, 1);
@@ -103,8 +99,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
auto n_features = cols;
CHECK_GE(n_features, 1) << "Data must have at least 1 column.";
dh::safe_cuda(cudaSetDevice(get_device()));
dh::safe_cuda(cudaSetDevice(get_device().ordinal));
if (!ref) {
HostDeviceVector<FeatureType> ft;
common::SketchContainer final_sketch(
@@ -143,9 +138,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
size_t n_batches_for_verification = 0;
while (iter.Next()) {
init_page();
dh::safe_cuda(cudaSetDevice(get_device()));
dh::safe_cuda(cudaSetDevice(get_device().ordinal));
auto rows = num_rows();
dh::device_vector<size_t> row_counts(rows + 1, 0);
common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
@@ -197,18 +190,18 @@ BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(Context const* ctx,
if (!ellpack_) {
ellpack_.reset(new EllpackPage());
if (ctx->IsCUDA()) {
this->Info().feature_types.SetDevice(ctx->gpu_id);
this->Info().feature_types.SetDevice(ctx->Device());
*ellpack_->Impl() =
EllpackPageImpl(ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
} else if (fmat_ctx_.IsCUDA()) {
this->Info().feature_types.SetDevice(fmat_ctx_.gpu_id);
this->Info().feature_types.SetDevice(fmat_ctx_.Device());
*ellpack_->Impl() =
EllpackPageImpl(&fmat_ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
} else {
// Can happen when QDM is initialized on CPU, but a GPU version is queried by a different QDM
// for cut reference.
auto cuda_ctx = ctx->MakeCUDA();
this->Info().feature_types.SetDevice(cuda_ctx.gpu_id);
this->Info().feature_types.SetDevice(cuda_ctx.Device());
*ellpack_->Impl() =
EllpackPageImpl(&cuda_ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
}

View File

@@ -11,18 +11,18 @@ void DMatrixProxy::SetArrayData(StringView interface_str) {
this->batch_ = adapter;
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
this->ctx_.gpu_id = Context::kCpuId;
this->ctx_.Init(Args{{"device", "cpu"}});
}
void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices,
char const *c_values, bst_feature_t n_features, bool on_host) {
void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices, char const *c_values,
bst_feature_t n_features, bool on_host) {
CHECK(on_host) << "Not implemented on device.";
std::shared_ptr<CSRArrayAdapter> adapter{new CSRArrayAdapter(
StringView{c_indptr}, StringView{c_indices}, StringView{c_values}, n_features)};
this->batch_ = adapter;
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
this->ctx_.gpu_id = Context::kCpuId;
this->ctx_.Init(Args{{"device", "cpu"}});
}
namespace cuda_impl {

View File

@@ -11,13 +11,13 @@ void DMatrixProxy::FromCudaColumnar(StringView interface_str) {
this->batch_ = adapter;
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
if (adapter->DeviceIdx() < 0) {
if (adapter->Device().IsCPU()) {
// empty data
CHECK_EQ(this->Info().num_row_, 0);
ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
return;
}
ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx());
ctx_ = ctx_.MakeCUDA(adapter->Device().ordinal);
}
void DMatrixProxy::FromCudaArray(StringView interface_str) {
@@ -25,13 +25,13 @@ void DMatrixProxy::FromCudaArray(StringView interface_str) {
this->batch_ = adapter;
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
if (adapter->DeviceIdx() < 0) {
if (adapter->Device().IsCPU()) {
// empty data
CHECK_EQ(this->Info().num_row_, 0);
ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
return;
}
ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx());
ctx_ = ctx_.MakeCUDA(adapter->Device().ordinal);
}
namespace cuda_impl {

View File

@@ -46,7 +46,7 @@ class DMatrixProxy : public DMatrix {
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
public:
int DeviceIdx() const { return ctx_.gpu_id; }
DeviceOrd Device() const { return ctx_.Device(); }
void SetCUDAArray(char const* c_interface) {
common::AssertGPUSupport();

View File

@@ -75,11 +75,9 @@ DMatrix* SimpleDMatrix::SliceCol(int num_slices, int slice_id) {
}
void SimpleDMatrix::ReindexFeatures(Context const* ctx) {
if (info_.IsVerticalFederated()) {
std::vector<uint64_t> buffer(collective::GetWorldSize());
buffer[collective::GetRank()] = info_.num_col_;
collective::Allgather(buffer.data(), buffer.size() * sizeof(uint64_t));
auto offset = std::accumulate(buffer.cbegin(), buffer.cbegin() + collective::GetRank(), 0);
if (info_.IsColumnSplit() && collective::GetWorldSize() > 1) {
auto const cols = collective::Allgather(info_.num_col_);
auto const offset = std::accumulate(cols.cbegin(), cols.cbegin() + collective::GetRank(), 0ul);
if (offset == 0) {
return;
}
@@ -253,7 +251,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
}
if (batch.BaseMargin() != nullptr) {
info_.base_margin_ = decltype(info_.base_margin_){
batch.BaseMargin(), batch.BaseMargin() + batch.Size(), {batch.Size()}, Context::kCpuId};
batch.BaseMargin(), batch.BaseMargin() + batch.Size(), {batch.Size()}, DeviceOrd::CPU()};
}
if (batch.Qid() != nullptr) {
qids.insert(qids.end(), batch.Qid(), batch.Qid() + batch.Size());
@@ -361,78 +359,4 @@ template SimpleDMatrix::SimpleDMatrix(FileAdapter* adapter, float missing, int n
template SimpleDMatrix::SimpleDMatrix(
IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>* adapter,
float missing, int nthread, DataSplitMode data_split_mode);
// Construct an in-memory SimpleDMatrix from an iterator over groups of Arrow
// RecordBatches.  Each call to adapter->Next() yields a vector of batches that
// are imported in parallel and appended to the single SparsePage owned by this
// DMatrix.  `missing` marks values to be dropped; `nthread` bounds the OpenMP
// thread count; `data_split_mode` is recorded in the MetaInfo for distributed
// training.
template <>
SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, int nthread,
                             DataSplitMode data_split_mode) {
  Context ctx;
  ctx.nthread = nthread;

  auto& offset_vec = sparse_page_->offset.HostVector();
  auto& data_vec = sparse_page_->data.HostVector();
  // Running totals across all adapter iterations: number of rows and number of
  // (kept) elements accumulated so far.
  uint64_t total_batch_size = 0;
  uint64_t total_elements = 0;

  adapter->BeforeFirst();
  // Iterate over batches of input data
  while (adapter->Next()) {
    auto& batches = adapter->Value();
    size_t num_elements = 0;
    size_t num_rows = 0;
    // Import Arrow RecordBatches.  Import() presumably materializes the batch
    // and returns its element count after filtering `missing` -- each batch is
    // independent, so this loop is a simple parallel reduction.
#pragma omp parallel for reduction(+ : num_elements, num_rows) num_threads(ctx.Threads())
    for (int i = 0; i < static_cast<int>(batches.size()); ++i) {  // NOLINT
      num_elements += batches[i]->Import(missing);
      num_rows += batches[i]->Size();
    }
    total_elements += num_elements;
    total_batch_size += num_rows;
    // Compute global offset for every row and starting row for every batch.
    // batch_offsets[i] is the global row index where batch i begins;
    // ShiftRowOffsets rebases each batch's local row offsets so they index
    // into the global data_vec.
    std::vector<uint64_t> batch_offsets(batches.size());
    for (size_t i = 0; i < batches.size(); ++i) {
      if (i == 0) {
        // First batch of this iteration starts where the previous iteration
        // ended (totals were already advanced above, hence the subtraction).
        batch_offsets[i] = total_batch_size - num_rows;
        batches[i]->ShiftRowOffsets(total_elements - num_elements);
      } else {
        batch_offsets[i] = batch_offsets[i - 1] + batches[i - 1]->Size();
        batches[i]->ShiftRowOffsets(batches[i - 1]->RowOffsets().back());
      }
    }
    // Pre-allocate DMatrix memory.  Growing to the running totals each
    // iteration keeps previously copied data intact.
    data_vec.resize(total_elements);
    offset_vec.resize(total_batch_size + 1);
    // Copy data into DMatrix.  The two `omp for nowait` loops are independent:
    // one fills entries, the other fills the CSR row-offset array, each batch
    // writing only its own disjoint region.
#pragma omp parallel num_threads(ctx.Threads())
    {
#pragma omp for nowait
      for (int i = 0; i < static_cast<int>(batches.size()); ++i) {  // NOLINT
        // RowOffsets() was rebased above, so [0] is this batch's first global
        // element index.
        size_t begin = batches[i]->RowOffsets()[0];
        for (size_t k = 0; k < batches[i]->Size(); ++k) {
          for (size_t j = 0; j < batches[i]->NumColumns(); ++j) {
            auto element = batches[i]->GetColumn(j).GetElement(k);
            // NaN encodes a missing/filtered value; skip it so data_vec stays
            // dense within each row.
            if (!std::isnan(element.value)) {
              data_vec[begin++] = Entry(element.column_idx, element.value);
            }
          }
        }
      }
#pragma omp for nowait
      for (int i = 0; i < static_cast<int>(batches.size()); ++i) {
        auto& offsets = batches[i]->RowOffsets();
        // Skip offsets[0]: the first entry of the global offset_vec is 0 from
        // value-initialization, and each batch writes only rows it owns.
        std::copy(offsets.begin() + 1, offsets.end(), offset_vec.begin() + batch_offsets[i] + 1);
      }
    }
  }
  // Synchronise worker columns
  info_.num_col_ = adapter->NumColumns();
  info_.data_split_mode = data_split_mode;
  // Rebase local feature indices to the global view (needed for column-wise
  // splits / vertical federated learning) before syncing column counts.
  ReindexFeatures(&ctx);
  info_.SynchronizeNumberOfColumns();

  info_.num_row_ = total_batch_size;
  info_.num_nonzero_ = data_vec.size();
  CHECK_EQ(offset_vec.back(), info_.num_nonzero_);
  fmat_ctx_ = ctx;
}
} // namespace xgboost::data

View File

@@ -10,9 +10,7 @@
#include "xgboost/context.h" // for Context
#include "xgboost/data.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
// Does not currently support metainfo as no on-device data source contains this
// Current implementation assumes a single batch. More batches can
// be supported in future. Does not currently support inferring row/column size
@@ -21,14 +19,14 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr
DataSplitMode data_split_mode) {
CHECK(data_split_mode != DataSplitMode::kCol)
<< "Column-wise data split is currently not supported on the GPU.";
auto device = (adapter->DeviceIdx() < 0 || adapter->NumRows() == 0) ? dh::CurrentDevice()
: adapter->DeviceIdx();
CHECK_GE(device, 0);
dh::safe_cuda(cudaSetDevice(device));
auto device = (adapter->Device().IsCPU() || adapter->NumRows() == 0)
? DeviceOrd::CUDA(dh::CurrentDevice())
: adapter->Device();
CHECK(device.IsCUDA());
dh::safe_cuda(cudaSetDevice(device.ordinal));
Context ctx;
ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}});
ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", device.Name()}});
CHECK(adapter->NumRows() != kAdapterUnknownSize);
CHECK(adapter->NumColumns() != kAdapterUnknownSize);
@@ -53,5 +51,4 @@ template SimpleDMatrix::SimpleDMatrix(CudfAdapter* adapter, float missing,
int nthread, DataSplitMode data_split_mode);
template SimpleDMatrix::SimpleDMatrix(CupyAdapter* adapter, float missing,
int nthread, DataSplitMode data_split_mode);
} // namespace data
} // namespace xgboost
} // namespace xgboost::data

View File

@@ -54,11 +54,9 @@ void CopyDataToDMatrix(AdapterBatchT batch, common::Span<Entry> data,
}
template <typename AdapterBatchT>
void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
int device_idx, float missing) {
dh::safe_cuda(cudaSetDevice(device_idx));
void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset, DeviceOrd device,
float missing) {
dh::safe_cuda(cudaSetDevice(device.ordinal));
IsValidFunctor is_valid(missing);
// Count elements per row
dh::LaunchN(batch.Size(), [=] __device__(size_t idx) {
@@ -71,22 +69,19 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
});
dh::XGBCachingDeviceAllocator<char> alloc;
#if defined(XGBOOST_USE_HIP)
thrust::exclusive_scan(thrust::hip::par(alloc),
thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data() + offset.size()),
thrust::device_pointer_cast(offset.data()));
#elif defined(XGBOOST_USE_CUDA)
thrust::exclusive_scan(thrust::cuda::par(alloc),
thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data() + offset.size()),
thrust::device_pointer_cast(offset.data()));
#if defined(XGBOOST_USE_CUDA)
thrust::exclusive_scan(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data() + offset.size()),
thrust::device_pointer_cast(offset.data()));
#elif defined(XGBOOST_USE_HIP)
thrust::exclusive_scan(thrust::hip::par(alloc), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data() + offset.size()),
thrust::device_pointer_cast(offset.data()));
#endif
}
template <typename AdapterBatchT>
size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missing,
size_t CopyToSparsePage(AdapterBatchT const& batch, DeviceOrd device, float missing,
SparsePage* page) {
bool valid = NoInfInData(batch, IsValidFunctor{missing});
CHECK(valid) << error::InfInData();

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2015-2022 by XGBoost Contributors
/**
* Copyright 2015-2023, XGBoost Contributors
* \file simple_dmatrix.h
* \brief In-memory version of DMatrix.
* \author Tianqi Chen
@@ -15,8 +15,7 @@
#include "gradient_index.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
// Used for single batch data.
class SimpleDMatrix : public DMatrix {
public:
@@ -65,9 +64,10 @@ class SimpleDMatrix : public DMatrix {
/**
* \brief Reindex the features based on a global view.
*
* In some cases (e.g. vertical federated learning), features are loaded locally with indices
* starting from 0. However, all the algorithms assume the features are globally indexed, so we
* reindex the features based on the offset needed to obtain the global view.
* In some cases (e.g. column-wise data split and vertical federated learning), features are
* loaded locally with indices starting from 0. However, all the algorithms assume the features
* are globally indexed, so we reindex the features based on the offset needed to obtain the
* global view.
*/
void ReindexFeatures(Context const* ctx);
@@ -75,6 +75,5 @@ class SimpleDMatrix : public DMatrix {
// Context used only for DMatrix initialization.
Context fmat_ctx_;
};
} // namespace data
} // namespace xgboost
} // namespace xgboost::data
#endif // XGBOOST_DATA_SIMPLE_DMATRIX_H_

View File

@@ -45,7 +45,8 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
ellpack_page_source_.reset(); // make sure resource is released before making new ones.
ellpack_page_source_ = std::make_shared<EllpackPageSource>(
this->missing_, ctx->Threads(), this->Info().num_col_, this->n_batches_, cache_info_.at(id),
param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_, ctx->gpu_id);
param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_,
ctx->Device());
} else {
CHECK(sparse_page_source_);
ellpack_page_source_->Reset();

View File

@@ -19,11 +19,11 @@ std::size_t NFeaturesDevice(DMatrixProxy *proxy) {
} // namespace detail
void DevicePush(DMatrixProxy *proxy, float missing, SparsePage *page) {
auto device = proxy->DeviceIdx();
if (device < 0) {
device = dh::CurrentDevice();
auto device = proxy->Device();
if (device.IsCPU()) {
device = DeviceOrd::CUDA(dh::CurrentDevice());
}
CHECK_GE(device, 0);
CHECK(device.IsCUDA());
cuda_impl::Dispatch(proxy,
[&](auto const &value) { CopyToSparsePage(value, device, missing, page); });

View File

@@ -177,15 +177,15 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
}
// An heuristic for number of pre-fetched batches. We can make it part of BatchParam
// to let user adjust number of pre-fetched batches when needed.
uint32_t constexpr kPreFetch = 3;
size_t n_prefetch_batches = std::min(kPreFetch, n_batches_);
std::int32_t n_prefetches = std::max(nthreads_, 3);
std::int32_t n_prefetch_batches =
std::min(static_cast<std::uint32_t>(n_prefetches), n_batches_);
CHECK_GT(n_prefetch_batches, 0) << "total batches:" << n_batches_;
std::size_t fetch_it = count_;
exce_.Rethrow();
for (std::size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
for (std::int32_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
fetch_it %= n_batches_; // ring
if (ring_->at(fetch_it).valid()) {
continue;

View File

@@ -212,7 +212,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair,
bst_target_t const n_groups = model_.learner_model_param->OutputLength();
monitor_.Start("BoostNewTrees");
predt->predictions.SetDevice(ctx_->Ordinal());
predt->predictions.SetDevice(ctx_->Device());
auto out = linalg::MakeTensorView(ctx_, &predt->predictions, p_fmat->Info().num_row_,
model_.learner_model_param->OutputLength());
CHECK_NE(n_groups, 0);
@@ -248,7 +248,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair,
} else {
CHECK_EQ(in_gpair->Size() % n_groups, 0U) << "must have exactly ngroup * nrow gpairs";
linalg::Matrix<GradientPair> tmp{{in_gpair->Shape(0), static_cast<std::size_t>(1ul)},
ctx_->Ordinal()};
ctx_->Device()};
bool update_predict = true;
for (bst_target_t gid = 0; gid < n_groups; ++gid) {
node_position.clear();
@@ -736,7 +736,7 @@ class Dart : public GBTree {
PredictionCacheEntry predts; // temporary storage for prediction
if (ctx_->IsCUDA()) {
predts.predictions.SetDevice(ctx_->gpu_id);
predts.predictions.SetDevice(ctx_->Device());
}
predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0);
// multi-target is not yet supported.
@@ -761,8 +761,8 @@ class Dart : public GBTree {
CHECK_EQ(p_out_preds->predictions.Size(), predts.predictions.Size());
size_t n_rows = p_fmat->Info().num_row_;
if (predts.predictions.DeviceIdx() != Context::kCpuId) {
p_out_preds->predictions.SetDevice(predts.predictions.DeviceIdx());
if (predts.predictions.Device().IsCUDA()) {
p_out_preds->predictions.SetDevice(predts.predictions.Device());
GPUDartPredictInc(p_out_preds->predictions.DeviceSpan(),
predts.predictions.DeviceSpan(), w, n_rows, n_groups,
group);
@@ -801,8 +801,8 @@ class Dart : public GBTree {
StringView msg{"Unsupported data type for inplace predict."};
PredictionCacheEntry predts;
if (ctx_->gpu_id != Context::kCpuId) {
predts.predictions.SetDevice(ctx_->gpu_id);
if (ctx_->IsCUDA()) {
predts.predictions.SetDevice(ctx_->Device());
}
predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0);
@@ -838,8 +838,8 @@ class Dart : public GBTree {
CHECK_EQ(predts.predictions.Size(), p_out_preds->predictions.Size());
size_t n_rows = p_fmat->Info().num_row_;
if (predts.predictions.DeviceIdx() != Context::kCpuId) {
p_out_preds->predictions.SetDevice(predts.predictions.DeviceIdx());
if (predts.predictions.Device().IsCUDA()) {
p_out_preds->predictions.SetDevice(predts.predictions.Device());
auto base_score = model_.learner_model_param->BaseScore(predts.predictions.Device());
GPUDartInplacePredictInc(p_out_preds->predictions.DeviceSpan(),
predts.predictions.DeviceSpan(), w, n_rows, base_score, n_groups,

View File

@@ -106,14 +106,30 @@ void GBTreeModel::Load(dmlc::Stream* fi) {
Validate(*this);
}
namespace {
// Number of threads to use while serializing/unserializing tree models.
//
// nthread is a booster parameter that only takes effect after booster
// initialization, so the context might not yet carry the right value during
// model I/O.  CRAN additionally checks the CPU-time to user-time ratio of R
// examples (threshold 2.5), so strict R builds cap the count at 2 threads.
std::int32_t IOThreads(Context const* ctx) {
  CHECK(ctx);
  std::int32_t n = ctx->Threads();
#if defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
  n = std::min<std::int32_t>(n, 2);
#endif
  return n;
}
}  // namespace
void GBTreeModel::SaveModel(Json* p_out) const {
auto& out = *p_out;
CHECK_EQ(param.num_trees, static_cast<int>(trees.size()));
out["gbtree_model_param"] = ToJson(param);
std::vector<Json> trees_json(trees.size());
CHECK(ctx_);
common::ParallelFor(trees.size(), ctx_->Threads(), [&](auto t) {
common::ParallelFor(trees.size(), IOThreads(ctx_), [&](auto t) {
auto const& tree = trees[t];
Json jtree{Object{}};
tree->SaveModel(&jtree);
@@ -151,9 +167,7 @@ void GBTreeModel::LoadModel(Json const& in) {
CHECK_EQ(tree_info_json.size(), param.num_trees);
tree_info.resize(param.num_trees);
CHECK(ctx_);
common::ParallelFor(param.num_trees, ctx_->Threads(), [&](auto t) {
common::ParallelFor(param.num_trees, IOThreads(ctx_), [&](auto t) {
auto tree_id = get<Integer const>(trees_json[t]["id"]);
trees.at(tree_id).reset(new RegTree{});
trees[tree_id]->LoadModel(trees_json[t]);

View File

@@ -278,7 +278,7 @@ LearnerModelParam::LearnerModelParam(Context const* ctx, LearnerModelParamLegacy
std::swap(base_score_, base_margin);
// Make sure read access everywhere for thread-safe prediction.
std::as_const(base_score_).HostView();
if (!ctx->IsCPU()) {
if (ctx->IsCUDA()) {
std::as_const(base_score_).View(ctx->Device());
}
CHECK(std::as_const(base_score_).Data()->HostCanRead());
@@ -287,7 +287,7 @@ LearnerModelParam::LearnerModelParam(Context const* ctx, LearnerModelParamLegacy
linalg::TensorView<float const, 1> LearnerModelParam::BaseScore(DeviceOrd device) const {
// multi-class is not yet supported.
CHECK_EQ(base_score_.Size(), 1) << ModelNotFitted();
if (device.IsCPU()) {
if (!device.IsCUDA()) {
// Make sure that we won't run into race condition.
CHECK(base_score_.Data()->HostCanRead());
return base_score_.HostView();
@@ -305,10 +305,10 @@ linalg::TensorView<float const, 1> LearnerModelParam::BaseScore(Context const* c
void LearnerModelParam::Copy(LearnerModelParam const& that) {
base_score_.Reshape(that.base_score_.Shape());
base_score_.Data()->SetDevice(that.base_score_.DeviceIdx());
base_score_.Data()->SetDevice(that.base_score_.Device());
base_score_.Data()->Copy(*that.base_score_.Data());
std::as_const(base_score_).HostView();
if (that.base_score_.DeviceIdx() != Context::kCpuId) {
if (!that.base_score_.Device().IsCPU()) {
std::as_const(base_score_).View(that.base_score_.Device());
}
CHECK_EQ(base_score_.Data()->DeviceCanRead(), that.base_score_.Data()->DeviceCanRead());
@@ -424,7 +424,7 @@ class LearnerConfiguration : public Learner {
if (mparam_.boost_from_average && !UsePtr(gbm_)->ModelFitted()) {
if (p_fmat) {
auto const& info = p_fmat->Info();
info.Validate(Ctx()->Ordinal());
info.Validate(Ctx()->Device());
// We estimate it from input data.
linalg::Tensor<float, 1> base_score;
InitEstimation(info, &base_score);
@@ -446,7 +446,7 @@ class LearnerConfiguration : public Learner {
monitor_.Init("Learner");
for (std::shared_ptr<DMatrix> const& d : cache) {
if (d) {
prediction_container_.Cache(d, Context::kCpuId);
prediction_container_.Cache(d, DeviceOrd::CPU());
}
}
}
@@ -1057,7 +1057,7 @@ class LearnerIO : public LearnerConfiguration {
? std::numeric_limits<float>::quiet_NaN()
: obj_->ProbToMargin(mparam_.base_score)},
{1},
Context::kCpuId},
DeviceOrd::CPU()},
obj_->Task(), tparam_.multi_strategy);
if (attributes_.find("objective") != attributes_.cend()) {
@@ -1282,7 +1282,7 @@ class LearnerImpl : public LearnerIO {
this->ValidateDMatrix(train.get(), true);
auto& predt = prediction_container_.Cache(train, ctx_.gpu_id);
auto& predt = prediction_container_.Cache(train, ctx_.Device());
monitor_.Start("PredictRaw");
this->PredictRaw(train.get(), &predt, true, 0, 0);
@@ -1312,7 +1312,7 @@ class LearnerImpl : public LearnerIO {
CHECK_EQ(this->learner_model_param_.OutputLength(), in_gpair->Shape(1))
<< "The number of columns in gradient should be equal to the number of targets/classes in "
"the model.";
auto& predt = prediction_container_.Cache(train, ctx_.gpu_id);
auto& predt = prediction_container_.Cache(train, ctx_.Device());
gbm_->DoBoost(train.get(), in_gpair, &predt, obj_.get());
monitor_.Stop("BoostOneIter");
}
@@ -1330,17 +1330,19 @@ class LearnerImpl : public LearnerIO {
if (metrics_.empty() && tparam_.disable_default_eval_metric <= 0) {
metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric(), &ctx_));
auto config = obj_->DefaultMetricConfig();
metrics_.back()->LoadConfig(config);
if (!IsA<Null>(config)) {
metrics_.back()->LoadConfig(config);
}
metrics_.back()->Configure({cfg_.begin(), cfg_.end()});
}
for (size_t i = 0; i < data_sets.size(); ++i) {
std::shared_ptr<DMatrix> m = data_sets[i];
auto &predt = prediction_container_.Cache(m, ctx_.gpu_id);
auto &predt = prediction_container_.Cache(m, ctx_.Device());
this->ValidateDMatrix(m.get(), false);
this->PredictRaw(m.get(), &predt, false, 0, 0);
auto &out = output_predictions_.Cache(m, ctx_.gpu_id).predictions;
auto &out = output_predictions_.Cache(m, ctx_.Device()).predictions;
out.Resize(predt.predictions.Size());
out.Copy(predt.predictions);
@@ -1376,7 +1378,7 @@ class LearnerImpl : public LearnerIO {
} else if (pred_leaf) {
gbm_->PredictLeaf(data.get(), out_preds, layer_begin, layer_end);
} else {
auto& prediction = prediction_container_.Cache(data, ctx_.gpu_id);
auto& prediction = prediction_container_.Cache(data, ctx_.Device());
this->PredictRaw(data.get(), &prediction, training, layer_begin, layer_end);
// Copy the prediction cache to output prediction. out_preds comes from C API
out_preds->SetDevice(ctx_.Device());
@@ -1456,7 +1458,7 @@ class LearnerImpl : public LearnerIO {
void ValidateDMatrix(DMatrix* p_fmat, bool is_training) const {
MetaInfo const& info = p_fmat->Info();
info.Validate(ctx_.gpu_id);
info.Validate(ctx_.Device());
if (is_training) {
CHECK_EQ(learner_model_param_.num_feature, p_fmat->Info().num_col_)

View File

@@ -48,7 +48,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
}
void LazyInitDevice(DMatrix *p_fmat, const LearnerModelParam &model_param) {
if (ctx_->gpu_id < 0) return;
if (ctx_->IsCPU()) return;
num_row_ = static_cast<size_t>(p_fmat->Info().num_row_);
@@ -60,8 +60,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
return;
}
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
// The begin and end indices for the section of each column associated with
// this device
std::vector<std::pair<bst_uint, bst_uint>> column_segments;
@@ -135,7 +134,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
++group_idx) {
// Get gradient
auto grad = GradientPair(0, 0);
if (ctx_->gpu_id >= 0) {
if (ctx_->IsCUDA()) {
grad = GetBiasGradient(group_idx, model->learner_model_param->num_output_group);
}
auto dbias = static_cast<float>(
@@ -144,7 +143,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
model->Bias()[group_idx] += dbias;
// Update residual
if (ctx_->gpu_id >= 0) {
if (ctx_->IsCUDA()) {
UpdateBiasResidual(dbias, group_idx, model->learner_model_param->num_output_group);
}
}
@@ -155,7 +154,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
bst_float &w = (*model)[fidx][group_idx];
// Get gradient
auto grad = GradientPair(0, 0);
if (ctx_->gpu_id >= 0) {
if (ctx_->IsCUDA()) {
grad = GetGradient(group_idx, model->learner_model_param->num_output_group, fidx);
}
auto dw = static_cast<float>(tparam_.learning_rate *
@@ -164,15 +163,14 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
tparam_.reg_lambda_denorm));
w += dw;
if (ctx_->gpu_id >= 0) {
if (ctx_->IsCUDA()) {
UpdateResidual(dw, group_idx, model->learner_model_param->num_output_group, fidx);
}
}
// This needs to be public because of the __device__ lambda.
GradientPair GetBiasGradient(int group_idx, int num_group) {
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
auto counting = thrust::make_counting_iterator(0ull);
auto f = [=] __device__(size_t idx) {
return idx * num_group + group_idx;
@@ -196,8 +194,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
// This needs to be public because of the __device__ lambda.
GradientPair GetGradient(int group_idx, int num_group, int fidx) {
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
common::Span<xgboost::Entry> d_col = dh::ToSpan(data_).subspan(row_ptr_[fidx]);
size_t col_size = row_ptr_[fidx + 1] - row_ptr_[fidx];
common::Span<GradientPair> d_gpair = dh::ToSpan(gpair_);

View File

@@ -23,8 +23,7 @@
#include "xgboost/linalg.h"
#include "xgboost/metric.h"
namespace xgboost {
namespace metric {
namespace xgboost::metric {
// tag the this file, used by force static link later.
DMLC_REGISTRY_FILE_TAG(auc);
/**
@@ -257,10 +256,10 @@ template <typename Curve>
class EvalAUC : public MetricNoCache {
double Eval(const HostDeviceVector<bst_float> &preds, const MetaInfo &info) override {
double auc {0};
if (ctx_->gpu_id != Context::kCpuId) {
preds.SetDevice(ctx_->gpu_id);
info.labels.SetDevice(ctx_->gpu_id);
info.weights_.SetDevice(ctx_->gpu_id);
if (ctx_->Device().IsCUDA()) {
preds.SetDevice(ctx_->Device());
info.labels.SetDevice(ctx_->Device());
info.weights_.SetDevice(ctx_->Device());
}
// We use the global size to handle empty dataset.
std::array<size_t, 2> meta{info.labels.Size(), preds.Size()};
@@ -329,7 +328,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
double auc{0};
uint32_t valid_groups = 0;
auto n_threads = ctx_->Threads();
if (ctx_->gpu_id == Context::kCpuId) {
if (ctx_->IsCPU()) {
std::tie(auc, valid_groups) =
RankingAUC<true>(ctx_, predts.ConstHostVector(), info, n_threads);
} else {
@@ -344,7 +343,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
double auc{0};
auto n_threads = ctx_->Threads();
CHECK_NE(n_classes, 0);
if (ctx_->gpu_id == Context::kCpuId) {
if (ctx_->IsCPU()) {
auc = MultiClassOVR(ctx_, predts.ConstHostVector(), info, n_classes, n_threads, BinaryROCAUC);
} else {
auc = GPUMultiClassROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_, n_classes);
@@ -355,7 +354,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
std::tuple<double, double, double>
EvalBinary(HostDeviceVector<float> const &predts, MetaInfo const &info) {
double fp, tp, auc;
if (ctx_->gpu_id == Context::kCpuId) {
if (ctx_->IsCPU()) {
std::tie(fp, tp, auc) = BinaryROCAUC(ctx_, predts.ConstHostVector(),
info.labels.HostView().Slice(linalg::All(), 0),
common::OptionalWeights{info.weights_.ConstHostSpan()});
@@ -367,7 +366,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
}
public:
char const* Name() const override {
[[nodiscard]] char const* Name() const override {
return "auc";
}
};
@@ -405,7 +404,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
std::tuple<double, double, double>
EvalBinary(HostDeviceVector<float> const &predts, MetaInfo const &info) {
double pr, re, auc;
if (ctx_->gpu_id == Context::kCpuId) {
if (ctx_->IsCPU()) {
std::tie(pr, re, auc) =
BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0),
common::OptionalWeights{info.weights_.ConstHostSpan()});
@@ -418,7 +417,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
double EvalMultiClass(HostDeviceVector<float> const &predts, MetaInfo const &info,
size_t n_classes) {
if (ctx_->gpu_id == Context::kCpuId) {
if (ctx_->IsCPU()) {
auto n_threads = this->ctx_->Threads();
return MultiClassOVR(ctx_, predts.ConstHostSpan(), info, n_classes, n_threads, BinaryPRAUC);
} else {
@@ -431,7 +430,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
double auc{0};
uint32_t valid_groups = 0;
auto n_threads = ctx_->Threads();
if (ctx_->gpu_id == Context::kCpuId) {
if (ctx_->IsCPU()) {
auto labels = info.labels.Data()->ConstHostSpan();
if (std::any_of(labels.cbegin(), labels.cend(), PRAUCLabelInvalid{})) {
InvalidLabels();
@@ -446,7 +445,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
}
public:
const char *Name() const override { return "aucpr"; }
[[nodiscard]] const char *Name() const override { return "aucpr"; }
};
XGBOOST_REGISTER_METRIC(AUCPR, "aucpr")
@@ -473,5 +472,4 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *, common::Span<f
return {};
}
#endif
} // namespace metric
} // namespace xgboost
} // namespace xgboost::metric

View File

@@ -926,8 +926,7 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
common::Span<float const> predts,
MetaInfo const &info,
std::shared_ptr<DeviceAUCCache> *p_cache) {
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
if (predts.empty()) {
return std::make_pair(0.0, static_cast<uint32_t>(0));
}

View File

@@ -46,7 +46,26 @@ template <typename Fn>
PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) {
PackedReduceResult result;
auto labels = info.labels.View(ctx->Device());
if (ctx->IsCPU()) {
if (ctx->IsCUDA()) {
#if defined(XGBOOST_USE_CUDA)
dh::XGBCachingDeviceAllocator<char> alloc;
thrust::counting_iterator<size_t> begin(0);
thrust::counting_iterator<size_t> end = begin + labels.Size();
result = thrust::transform_reduce(
thrust::cuda::par(alloc), begin, end,
[=] XGBOOST_DEVICE(size_t i) {
auto idx = linalg::UnravelIndex(i, labels.Shape());
auto sample_id = std::get<0>(idx);
auto target_id = std::get<1>(idx);
auto res = loss(i, sample_id, target_id);
float v{std::get<0>(res)}, wt{std::get<1>(res)};
return PackedReduceResult{v, wt};
},
PackedReduceResult{}, thrust::plus<PackedReduceResult>());
#else
common::AssertGPUSupport();
#endif // defined(XGBOOST_USE_CUDA)
} else {
auto n_threads = ctx->Threads();
std::vector<double> score_tloc(n_threads, 0.0);
std::vector<double> weight_tloc(n_threads, 0.0);
@@ -69,41 +88,6 @@ PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) {
double residue_sum = std::accumulate(score_tloc.cbegin(), score_tloc.cend(), 0.0);
double weights_sum = std::accumulate(weight_tloc.cbegin(), weight_tloc.cend(), 0.0);
result = PackedReduceResult{residue_sum, weights_sum};
} else {
#if defined(XGBOOST_USE_CUDA)
dh::XGBCachingDeviceAllocator<char> alloc;
thrust::counting_iterator<size_t> begin(0);
thrust::counting_iterator<size_t> end = begin + labels.Size();
result = thrust::transform_reduce(
thrust::cuda::par(alloc), begin, end,
[=] XGBOOST_DEVICE(size_t i) {
auto idx = linalg::UnravelIndex(i, labels.Shape());
auto sample_id = std::get<0>(idx);
auto target_id = std::get<1>(idx);
auto res = loss(i, sample_id, target_id);
float v{std::get<0>(res)}, wt{std::get<1>(res)};
return PackedReduceResult{v, wt};
},
PackedReduceResult{}, thrust::plus<PackedReduceResult>());
#elif defined(XGBOOST_USE_HIP)
dh::XGBCachingDeviceAllocator<char> alloc;
thrust::counting_iterator<size_t> begin(0);
thrust::counting_iterator<size_t> end = begin + labels.Size();
result = thrust::transform_reduce(
thrust::hip::par(alloc), begin, end,
[=] XGBOOST_DEVICE(size_t i) {
auto idx = linalg::UnravelIndex(i, labels.Shape());
auto sample_id = std::get<0>(idx);
auto target_id = std::get<1>(idx);
auto res = loss(i, sample_id, target_id);
float v{std::get<0>(res)}, wt{std::get<1>(res)};
return PackedReduceResult{v, wt};
},
PackedReduceResult{}, thrust::plus<PackedReduceResult>());
#else
common::AssertGPUSupport();
#endif // defined(XGBOOST_USE_CUDA)
}
return result;
}
@@ -201,10 +185,10 @@ class PseudoErrorLoss : public MetricNoCache {
CHECK_EQ(info.labels.Shape(0), info.num_row_);
auto labels = info.labels.View(ctx_->Device());
preds.SetDevice(ctx_->Device());
auto predts = ctx_->IsCPU() ? preds.ConstHostSpan() : preds.ConstDeviceSpan();
auto predts = ctx_->IsCUDA() ? preds.ConstDeviceSpan() : preds.ConstHostSpan();
info.weights_.SetDevice(ctx_->Device());
common::OptionalWeights weights(ctx_->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan());
common::OptionalWeights weights(ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
: info.weights_.ConstHostSpan());
float slope = this->param_.huber_slope;
CHECK_NE(slope, 0.0) << "slope for pseudo huber cannot be 0.";
PackedReduceResult result =
@@ -367,10 +351,10 @@ struct EvalEWiseBase : public MetricNoCache {
}
auto labels = info.labels.View(ctx_->Device());
info.weights_.SetDevice(ctx_->Device());
common::OptionalWeights weights(ctx_->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan());
common::OptionalWeights weights(ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
: info.weights_.ConstHostSpan());
preds.SetDevice(ctx_->Device());
auto predts = ctx_->IsCPU() ? preds.ConstHostSpan() : preds.ConstDeviceSpan();
auto predts = ctx_->IsCUDA() ? preds.ConstDeviceSpan() : preds.ConstHostSpan();
auto d_policy = policy_;
auto result =

View File

@@ -149,24 +149,24 @@ class MultiClassMetricsReduction {
#endif // XGBOOST_USE_CUDA || defined(XGBOOST_USE_HIP)
PackedReduceResult Reduce(const Context& tparam, int device, size_t n_class,
PackedReduceResult Reduce(const Context& ctx, DeviceOrd device, size_t n_class,
const HostDeviceVector<bst_float>& weights,
const HostDeviceVector<bst_float>& labels,
const HostDeviceVector<bst_float>& preds) {
PackedReduceResult result;
if (device < 0) {
if (device.IsCPU()) {
result =
CpuReduceMetrics(weights, labels, preds, n_class, tparam.Threads());
CpuReduceMetrics(weights, labels, preds, n_class, ctx.Threads());
}
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
else { // NOLINT
device_ = tparam.gpu_id;
device_ = ctx.Device();
preds.SetDevice(device_);
labels.SetDevice(device_);
weights.SetDevice(device_);
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
result = DeviceReduceMetrics(weights, labels, preds, n_class);
}
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
@@ -176,8 +176,8 @@ class MultiClassMetricsReduction {
private:
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
dh::PinnedMemory label_error_;
int device_{-1};
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
DeviceOrd device_{DeviceOrd::CPU()};
#endif // defined(XGBOOST_USE_CUDA)
};
/*!
@@ -198,7 +198,7 @@ struct EvalMClassBase : public MetricNoCache {
CHECK_GE(nclass, 1U)
<< "mlogloss and merror are only used for multi-class classification,"
<< " use logloss for binary classification";
int device = ctx_->gpu_id;
auto device = ctx_->Device();
auto result =
reducer_.Reduce(*ctx_, device, nclass, info.weights_, *info.labels.Data(), preds);
dat[0] = result.Residue();

View File

@@ -41,7 +41,7 @@ PackedReduceResult PreScore(Context const *ctx, MetaInfo const &info,
auto d_gptr = p_cache->DataGroupPtr(ctx);
auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
predt.SetDevice(ctx->gpu_id);
predt.SetDevice(ctx->Device());
auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan());
auto topk = p_cache->Param().TopK();
auto d_weight = common::MakeOptionalWeights(ctx, info.weights_);
@@ -96,7 +96,7 @@ PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
CHECK_EQ(d_weight.weights.size(), p_cache->Groups());
}
auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
predt.SetDevice(ctx->gpu_id);
predt.SetDevice(ctx->Device());
auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), predt.Size());
auto d_group_ptr = p_cache->DataGroupPtr(ctx);

View File

@@ -148,19 +148,18 @@ class ElementWiseSurvivalMetricsReduction {
const HostDeviceVector<bst_float>& preds) {
PackedReduceResult result;
if (ctx.gpu_id < 0) {
if (ctx.IsCPU()) {
result = CpuReduceMetrics(weights, labels_lower_bound, labels_upper_bound,
preds, ctx.Threads());
}
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
else { // NOLINT
preds.SetDevice(ctx.gpu_id);
labels_lower_bound.SetDevice(ctx.gpu_id);
labels_upper_bound.SetDevice(ctx.gpu_id);
weights.SetDevice(ctx.gpu_id);
dh::safe_cuda(cudaSetDevice(ctx.gpu_id));
preds.SetDevice(ctx.Device());
labels_lower_bound.SetDevice(ctx.Device());
labels_upper_bound.SetDevice(ctx.Device());
weights.SetDevice(ctx.Device());
dh::safe_cuda(cudaSetDevice(ctx.Ordinal()));
result = DeviceReduceMetrics(weights, labels_lower_bound, labels_upper_bound, preds);
}
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)

View File

@@ -96,13 +96,13 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
inline void UpdateTreeLeaf(Context const* ctx, HostDeviceVector<bst_node_t> const& position,
std::int32_t group_idx, MetaInfo const& info, float learning_rate,
HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree) {
if (ctx->IsCPU()) {
detail::UpdateTreeLeafHost(ctx, position.ConstHostVector(), group_idx, info, learning_rate,
predt, alpha, p_tree);
} else {
position.SetDevice(ctx->gpu_id);
if (ctx->IsCUDA()) {
position.SetDevice(ctx->Device());
detail::UpdateTreeLeafDevice(ctx, position.ConstDeviceSpan(), group_idx, info, learning_rate,
predt, alpha, p_tree);
} else {
detail::UpdateTreeLeafHost(ctx, position.ConstHostVector(), group_idx, info, learning_rate,
predt, alpha, p_tree);
}
}
} // namespace obj

View File

@@ -42,7 +42,7 @@ class AFTObj : public ObjFunction {
template <typename Distribution>
void GetGradientImpl(const HostDeviceVector<bst_float>& preds, const MetaInfo& info,
linalg::Matrix<GradientPair>* out_gpair, size_t ndata, int device,
linalg::Matrix<GradientPair>* out_gpair, size_t ndata, DeviceOrd device,
bool is_null_weight, float aft_loss_distribution_scale) {
common::Transform<>::Init(
[=] XGBOOST_DEVICE(size_t _idx,
@@ -75,7 +75,7 @@ class AFTObj : public ObjFunction {
CHECK_EQ(info.labels_upper_bound_.Size(), ndata);
out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(ndata, 1);
const int device = ctx_->gpu_id;
const auto device = ctx_->Device();
const float aft_loss_distribution_scale = param_.aft_loss_distribution_scale;
const bool is_null_weight = info.weights_.Size() == 0;
if (!is_null_weight) {
@@ -108,7 +108,7 @@ class AFTObj : public ObjFunction {
_preds[_idx] = exp(_preds[_idx]);
},
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
io_preds->DeviceIdx())
io_preds->Device())
.Eval(io_preds);
}

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2018-2022 by XGBoost Contributors
/**
* Copyright 2018-2023, XGBoost Contributors
* \file hinge.cc
* \brief Provides an implementation of the hinge loss function
* \author Henry Gouk
@@ -13,8 +13,7 @@
#include "../common/transform.h"
#include "../common/common.h"
namespace xgboost {
namespace obj {
namespace xgboost::obj {
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
DMLC_REGISTRY_FILE_TAG(hinge_obj_gpu);
@@ -63,7 +62,7 @@ class HingeObj : public ObjFunction {
_out_gpair[_idx] = GradientPair(g, h);
},
common::Range{0, static_cast<int64_t>(ndata)}, this->ctx_->Threads(),
ctx_->gpu_id).Eval(
ctx_->Device()).Eval(
out_gpair->Data(), &preds, info.labels.Data(), &info.weights_);
}
@@ -73,11 +72,11 @@ class HingeObj : public ObjFunction {
_preds[_idx] = _preds[_idx] > 0.0 ? 1.0 : 0.0;
},
common::Range{0, static_cast<int64_t>(io_preds->Size()), 1}, this->ctx_->Threads(),
io_preds->DeviceIdx())
io_preds->Device())
.Eval(io_preds);
}
const char* DefaultEvalMetric() const override {
[[nodiscard]] const char* DefaultEvalMetric() const override {
return "error";
}
@@ -93,5 +92,4 @@ XGBOOST_REGISTER_OBJECTIVE(HingeObj, "binary:hinge")
.describe("Hinge loss. Expects labels to be in [0,1f]")
.set_body([]() { return new HingeObj(); });
} // namespace obj
} // namespace xgboost
} // namespace xgboost::obj

View File

@@ -20,8 +20,8 @@ void FitIntercept::InitEstimation(MetaInfo const& info, linalg::Vector<float>* b
CheckInitInputs(info);
}
// Avoid altering any state in child objective.
HostDeviceVector<float> dummy_predt(info.labels.Size(), 0.0f, this->ctx_->gpu_id);
linalg::Matrix<GradientPair> gpair(info.labels.Shape(), this->ctx_->gpu_id);
HostDeviceVector<float> dummy_predt(info.labels.Size(), 0.0f, this->ctx_->Device());
linalg::Matrix<GradientPair> gpair(info.labels.Shape(), this->ctx_->Device());
Json config{Object{}};
this->SaveConfig(&config);

View File

@@ -103,19 +103,19 @@ class LambdaRankObj : public FitIntercept {
// Update position biased for unbiased click data
void UpdatePositionBias() {
li_full_.SetDevice(ctx_->gpu_id);
lj_full_.SetDevice(ctx_->gpu_id);
li_.SetDevice(ctx_->gpu_id);
lj_.SetDevice(ctx_->gpu_id);
li_full_.SetDevice(ctx_->Device());
lj_full_.SetDevice(ctx_->Device());
li_.SetDevice(ctx_->Device());
lj_.SetDevice(ctx_->Device());
if (ctx_->IsCPU()) {
cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()),
lj_full_.View(ctx_->Device()), &ti_plus_, &tj_minus_,
&li_, &lj_, p_cache_);
} else {
if (ctx_->IsCUDA()) {
cuda_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()),
lj_full_.View(ctx_->Device()), &ti_plus_, &tj_minus_,
&li_, &lj_, p_cache_);
} else {
cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()),
lj_full_.View(ctx_->Device()), &ti_plus_, &tj_minus_,
&li_, &lj_, p_cache_);
}
li_full_.Data()->Fill(0.0);

View File

@@ -296,12 +296,12 @@ void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector<float> const
linalg::VectorView<double> li, linalg::VectorView<double> lj,
linalg::Matrix<GradientPair>* out_gpair) {
// boilerplate
std::int32_t device_id = ctx->gpu_id;
dh::safe_cuda(cudaSetDevice(device_id));
auto device = ctx->Device();
dh::safe_cuda(cudaSetDevice(device.ordinal));
auto n_groups = p_cache->Groups();
info.labels.SetDevice(device_id);
preds.SetDevice(device_id);
info.labels.SetDevice(device);
preds.SetDevice(device);
out_gpair->SetDevice(ctx->Device());
out_gpair->Reshape(preds.Size(), 1);

View File

@@ -63,7 +63,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
const int nclass = param_.num_class;
const auto ndata = static_cast<int64_t>(preds.Size() / nclass);
auto device = ctx_->gpu_id;
auto device = ctx_->Device();
out_gpair->SetDevice(device);
info.labels.SetDevice(device);
info.weights_.SetDevice(device);
@@ -133,7 +133,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
const int nclass = param_.num_class;
const auto ndata = static_cast<int64_t>(io_preds->Size() / nclass);
auto device = io_preds->DeviceIdx();
auto device = io_preds->Device();
if (prob) {
common::Transform<>::Init(
[=] XGBOOST_DEVICE(size_t _idx, common::Span<bst_float> _preds) {

View File

@@ -70,16 +70,16 @@ class QuantileRegression : public ObjFunction {
out_gpair->Reshape(info.num_row_, n_targets);
auto gpair = out_gpair->View(ctx_->Device());
info.weights_.SetDevice(ctx_->gpu_id);
common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan()};
info.weights_.SetDevice(ctx_->Device());
common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
: info.weights_.ConstHostSpan()};
preds.SetDevice(ctx_->gpu_id);
preds.SetDevice(ctx_->Device());
auto predt = linalg::MakeVec(&preds);
auto n_samples = info.num_row_;
alpha_.SetDevice(ctx_->gpu_id);
auto alpha = ctx_->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan();
alpha_.SetDevice(ctx_->Device());
auto alpha = ctx_->IsCUDA() ? alpha_.ConstDeviceSpan() : alpha_.ConstHostSpan();
linalg::ElementWiseKernel(
ctx_, gpair, [=] XGBOOST_DEVICE(std::size_t i, GradientPair const&) mutable {
@@ -103,11 +103,48 @@ class QuantileRegression : public ObjFunction {
CHECK(!alpha_.Empty());
auto n_targets = this->Targets(info);
base_score->SetDevice(ctx_->gpu_id);
base_score->SetDevice(ctx_->Device());
base_score->Reshape(n_targets);
double sw{0};
if (ctx_->IsCPU()) {
if (ctx_->IsCUDA()) {
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
alpha_.SetDevice(ctx_->Device());
auto d_alpha = alpha_.ConstDeviceSpan();
auto d_labels = info.labels.View(ctx_->Device());
auto seg_it = dh::MakeTransformIterator<std::size_t>(
thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) { return i * d_labels.Shape(0); });
CHECK_EQ(d_labels.Shape(1), 1);
auto val_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) {
auto sample_idx = i % d_labels.Shape(0);
return d_labels(sample_idx, 0);
});
auto n = d_labels.Size() * d_alpha.size();
CHECK_EQ(base_score->Size(), d_alpha.size());
if (info.weights_.Empty()) {
common::SegmentedQuantile(ctx_, d_alpha.data(), seg_it, seg_it + d_alpha.size() + 1, val_it,
val_it + n, base_score->Data());
sw = info.num_row_;
} else {
info.weights_.SetDevice(ctx_->Device());
auto d_weights = info.weights_.ConstDeviceSpan();
auto weight_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) {
auto sample_idx = i % d_labels.Shape(0);
return d_weights[sample_idx];
});
common::SegmentedWeightedQuantile(ctx_, d_alpha.data(), seg_it, seg_it + d_alpha.size() + 1,
val_it, val_it + n, weight_it, weight_it + n,
base_score->Data());
sw = dh::Reduce(ctx_->CUDACtx()->CTP(), dh::tcbegin(d_weights), dh::tcend(d_weights), 0.0,
thrust::plus<double>{});
}
#else
common::AssertGPUSupport();
#endif // defined(XGBOOST_USE_CUDA)
} else {
auto quantiles = base_score->HostView();
auto h_weights = info.weights_.ConstHostVector();
if (info.weights_.Empty()) {
@@ -127,43 +164,6 @@ class QuantileRegression : public ObjFunction {
linalg::cend(h_labels), std::cbegin(h_weights));
}
}
} else {
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
alpha_.SetDevice(ctx_->gpu_id);
auto d_alpha = alpha_.ConstDeviceSpan();
auto d_labels = info.labels.View(ctx_->Device());
auto seg_it = dh::MakeTransformIterator<std::size_t>(
thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) { return i * d_labels.Shape(0); });
CHECK_EQ(d_labels.Shape(1), 1);
auto val_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) {
auto sample_idx = i % d_labels.Shape(0);
return d_labels(sample_idx, 0);
});
auto n = d_labels.Size() * d_alpha.size();
CHECK_EQ(base_score->Size(), d_alpha.size());
if (info.weights_.Empty()) {
common::SegmentedQuantile(ctx_, d_alpha.data(), seg_it, seg_it + d_alpha.size() + 1, val_it,
val_it + n, base_score->Data());
sw = info.num_row_;
} else {
info.weights_.SetDevice(ctx_->gpu_id);
auto d_weights = info.weights_.ConstDeviceSpan();
auto weight_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) {
auto sample_idx = i % d_labels.Shape(0);
return d_weights[sample_idx];
});
common::SegmentedWeightedQuantile(ctx_, d_alpha.data(), seg_it, seg_it + d_alpha.size() + 1,
val_it, val_it + n, weight_it, weight_it + n,
base_score->Data());
sw = dh::Reduce(ctx_->CUDACtx()->CTP(), dh::tcbegin(d_weights), dh::tcend(d_weights), 0.0,
thrust::plus<double>{});
}
#else
common::AssertGPUSupport();
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
}
// For multiple quantiles, we should extend the base score to a vector instead of

View File

@@ -116,7 +116,7 @@ class RegLossObj : public FitIntercept {
size_t const ndata = preds.Size();
out_gpair->SetDevice(ctx_->Device());
auto device = ctx_->gpu_id;
auto device = ctx_->Device();
bool is_null_weight = info.weights_.Size() == 0;
auto scale_pos_weight = param_.scale_pos_weight;
@@ -124,7 +124,7 @@ class RegLossObj : public FitIntercept {
additional_input_.HostVector().begin()[1] = is_null_weight;
const size_t nthreads = ctx_->Threads();
bool on_device = device >= 0;
bool on_device = device.IsCUDA();
// On CPU we run the transformation each thread processing a contigious block of data
// for better performance.
const size_t n_data_blocks = std::max(static_cast<size_t>(1), (on_device ? ndata : nthreads));
@@ -175,7 +175,7 @@ class RegLossObj : public FitIntercept {
_preds[_idx] = Loss::PredTransform(_preds[_idx]);
},
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
io_preds->DeviceIdx())
io_preds->Device())
.Eval(io_preds);
}
@@ -246,16 +246,16 @@ class PseudoHuberRegression : public FitIntercept {
CHECK_NE(slope, 0.0) << "slope for pseudo huber cannot be 0.";
auto labels = info.labels.View(ctx_->Device());
out_gpair->SetDevice(ctx_->gpu_id);
out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(info.num_row_, this->Targets(info));
auto gpair = out_gpair->View(ctx_->Device());
preds.SetDevice(ctx_->gpu_id);
preds.SetDevice(ctx_->Device());
auto predt = linalg::MakeVec(&preds);
info.weights_.SetDevice(ctx_->gpu_id);
common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan()};
info.weights_.SetDevice(ctx_->Device());
common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
: info.weights_.ConstHostSpan()};
linalg::ElementWiseKernel(ctx_, labels, [=] XGBOOST_DEVICE(size_t i, float const y) mutable {
auto sample_id = std::get<0>(linalg::UnravelIndex(i, labels.Shape()));
@@ -287,6 +287,13 @@ class PseudoHuberRegression : public FitIntercept {
}
FromJson(in["pseudo_huber_param"], &param_);
}
[[nodiscard]] Json DefaultMetricConfig() const override {
CHECK(param_.GetInitialised());
Json config{Object{}};
config["name"] = String{this->DefaultEvalMetric()};
config["pseudo_huber_param"] = ToJson(param_);
return config;
}
};
XGBOOST_REGISTER_OBJECTIVE(PseudoHuberRegression, "reg:pseudohubererror")
@@ -320,7 +327,7 @@ class PoissonRegression : public FitIntercept {
size_t const ndata = preds.Size();
out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(info.num_row_, this->Targets(info));
auto device = ctx_->gpu_id;
auto device = ctx_->Device();
label_correct_.Resize(1);
label_correct_.Fill(1);
@@ -362,7 +369,7 @@ class PoissonRegression : public FitIntercept {
_preds[_idx] = expf(_preds[_idx]);
},
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
io_preds->DeviceIdx())
io_preds->Device())
.Eval(io_preds);
}
void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
@@ -505,7 +512,7 @@ class GammaRegression : public FitIntercept {
CHECK_NE(info.labels.Size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds.Size(), info.labels.Size()) << "labels are not correctly provided";
const size_t ndata = preds.Size();
auto device = ctx_->gpu_id;
auto device = ctx_->Device();
out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(info.num_row_, this->Targets(info));
label_correct_.Resize(1);
@@ -548,7 +555,7 @@ class GammaRegression : public FitIntercept {
_preds[_idx] = expf(_preds[_idx]);
},
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
io_preds->DeviceIdx())
io_preds->Device())
.Eval(io_preds);
}
void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
@@ -606,7 +613,7 @@ class TweedieRegression : public FitIntercept {
out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(info.num_row_, this->Targets(info));
auto device = ctx_->gpu_id;
auto device = ctx_->Device();
label_correct_.Resize(1);
label_correct_.Fill(1);
@@ -653,7 +660,7 @@ class TweedieRegression : public FitIntercept {
_preds[_idx] = expf(_preds[_idx]);
},
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
io_preds->DeviceIdx())
io_preds->Device())
.Eval(io_preds);
}
@@ -704,11 +711,11 @@ class MeanAbsoluteError : public ObjFunction {
out_gpair->Reshape(info.num_row_, this->Targets(info));
auto gpair = out_gpair->View(ctx_->Device());
preds.SetDevice(ctx_->gpu_id);
preds.SetDevice(ctx_->Device());
auto predt = linalg::MakeVec(&preds);
info.weights_.SetDevice(ctx_->gpu_id);
common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan()};
info.weights_.SetDevice(ctx_->Device());
common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
: info.weights_.ConstHostSpan()};
linalg::ElementWiseKernel(ctx_, labels, [=] XGBOOST_DEVICE(std::size_t i, float y) mutable {
auto sign = [](auto x) {

View File

@@ -180,33 +180,30 @@ struct DeviceAdapterLoader {
XGBOOST_DEV_INLINE DeviceAdapterLoader(Batch const batch, bool use_shared,
bst_feature_t num_features, bst_row_t num_rows,
size_t entry_start, float missing) :
batch{batch},
columns{num_features},
use_shared{use_shared},
is_valid{missing} {
extern __shared__ float _smem[];
smem = _smem;
if (use_shared) {
uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x;
size_t shared_elements = blockDim.x * num_features;
dh::BlockFill(smem, shared_elements, nanf(""));
__syncthreads();
if (global_idx < num_rows) {
auto beg = global_idx * columns;
auto end = (global_idx + 1) * columns;
for (size_t i = beg; i < end; ++i) {
auto value = batch.GetElement(i).value;
if (is_valid(value)) {
smem[threadIdx.x * num_features + (i - beg)] = value;
}
size_t entry_start, float missing)
: batch{batch}, columns{num_features}, use_shared{use_shared}, is_valid{missing} {
extern __shared__ float _smem[];
smem = _smem;
if (use_shared) {
uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x;
size_t shared_elements = blockDim.x * num_features;
dh::BlockFill(smem, shared_elements, nanf(""));
__syncthreads();
if (global_idx < num_rows) {
auto beg = global_idx * columns;
auto end = (global_idx + 1) * columns;
for (size_t i = beg; i < end; ++i) {
auto value = batch.GetElement(i).value;
if (is_valid(value)) {
smem[threadIdx.x * num_features + (i - beg)] = value;
}
}
}
__syncthreads();
}
__syncthreads();
}
XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const {
[[nodiscard]] XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const {
if (use_shared) {
return smem[threadIdx.x * columns + fidx];
}
@@ -340,11 +337,11 @@ class DeviceModel {
size_t tree_end_; // NOLINT
int num_group;
void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) {
dh::safe_cuda(cudaSetDevice(gpu_id));
void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, DeviceOrd device) {
dh::safe_cuda(cudaSetDevice(device.ordinal));
// Copy decision trees to device
tree_segments = HostDeviceVector<size_t>({}, gpu_id);
tree_segments = HostDeviceVector<size_t>({}, device);
auto& h_tree_segments = tree_segments.HostVector();
h_tree_segments.reserve((tree_end - tree_begin) + 1);
size_t sum = 0;
@@ -354,8 +351,8 @@ class DeviceModel {
h_tree_segments.push_back(sum);
}
nodes = HostDeviceVector<RegTree::Node>(h_tree_segments.back(), RegTree::Node(), gpu_id);
stats = HostDeviceVector<RTreeNodeStat>(h_tree_segments.back(), RTreeNodeStat(), gpu_id);
nodes = HostDeviceVector<RegTree::Node>(h_tree_segments.back(), RegTree::Node(), device);
stats = HostDeviceVector<RTreeNodeStat>(h_tree_segments.back(), RTreeNodeStat(), device);
auto d_nodes = nodes.DevicePointer();
auto d_stats = stats.DevicePointer();
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
@@ -370,12 +367,12 @@ class DeviceModel {
sizeof(RTreeNodeStat) * src_stats.size(), cudaMemcpyDefault));
}
tree_group = HostDeviceVector<int>(model.tree_info.size(), 0, gpu_id);
tree_group = HostDeviceVector<int>(model.tree_info.size(), 0, device);
auto& h_tree_group = tree_group.HostVector();
std::memcpy(h_tree_group.data(), model.tree_info.data(), sizeof(int) * model.tree_info.size());
// Initialize categorical splits.
split_types.SetDevice(gpu_id);
split_types.SetDevice(device);
std::vector<FeatureType>& h_split_types = split_types.HostVector();
h_split_types.resize(h_tree_segments.back());
for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
@@ -384,8 +381,8 @@ class DeviceModel {
h_split_types.begin() + h_tree_segments[tree_idx - tree_begin]);
}
categories = HostDeviceVector<uint32_t>({}, gpu_id);
categories_tree_segments = HostDeviceVector<uint32_t>(1, 0, gpu_id);
categories = HostDeviceVector<uint32_t>({}, device);
categories_tree_segments = HostDeviceVector<uint32_t>(1, 0, device);
std::vector<uint32_t> &h_categories = categories.HostVector();
std::vector<uint32_t> &h_split_cat_segments = categories_tree_segments.HostVector();
for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
@@ -398,7 +395,7 @@ class DeviceModel {
}
categories_node_segments = HostDeviceVector<RegTree::CategoricalSplitMatrix::Segment>(
h_tree_segments.back(), {}, gpu_id);
h_tree_segments.back(), {}, device);
std::vector<RegTree::CategoricalSplitMatrix::Segment>& h_categories_node_segments =
categories_node_segments.HostVector();
for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
@@ -490,8 +487,8 @@ struct PathInfo {
void ExtractPaths(
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>> *paths,
DeviceModel *model, dh::device_vector<uint32_t> *path_categories,
int gpu_id) {
dh::safe_cuda(cudaSetDevice(gpu_id));
DeviceOrd device) {
dh::safe_cuda(cudaSetDevice(device.ordinal));
auto& device_model = *model;
dh::caching_device_vector<PathInfo> info(device_model.nodes.Size());
@@ -654,11 +651,12 @@ __global__ void MaskBitVectorKernel(
common::Span<std::uint32_t const> d_categories, BitVector decision_bits, BitVector missing_bits,
std::size_t tree_begin, std::size_t tree_end, std::size_t num_features, std::size_t num_rows,
std::size_t entry_start, std::size_t num_nodes, bool use_shared, float missing) {
// This needs to be always instantiated since the data is loaded cooperatively by all threads.
SparsePageLoader loader(data, use_shared, num_features, num_rows, entry_start, missing);
auto const row_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (row_idx >= num_rows) {
return;
}
SparsePageLoader loader(data, use_shared, num_features, num_rows, entry_start, missing);
std::size_t tree_offset = 0;
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
@@ -689,10 +687,10 @@ __global__ void MaskBitVectorKernel(
}
}
__device__ float GetLeafWeightByBitVector(bst_row_t ridx, TreeView const& tree,
BitVector const& decision_bits,
BitVector const& missing_bits, std::size_t num_nodes,
std::size_t tree_offset) {
__device__ bst_node_t GetLeafIndexByBitVector(bst_row_t ridx, TreeView const& tree,
BitVector const& decision_bits,
BitVector const& missing_bits, std::size_t num_nodes,
std::size_t tree_offset) {
bst_node_t nidx = 0;
RegTree::Node n = tree.d_tree[nidx];
while (!n.IsLeaf()) {
@@ -704,9 +702,19 @@ __device__ float GetLeafWeightByBitVector(bst_row_t ridx, TreeView const& tree,
}
n = tree.d_tree[nidx];
}
return nidx;
}
__device__ float GetLeafWeightByBitVector(bst_row_t ridx, TreeView const& tree,
BitVector const& decision_bits,
BitVector const& missing_bits, std::size_t num_nodes,
std::size_t tree_offset) {
auto const nidx =
GetLeafIndexByBitVector(ridx, tree, decision_bits, missing_bits, num_nodes, tree_offset);
return tree.d_tree[nidx].LeafValue();
}
template <bool predict_leaf>
__global__ void PredictByBitVectorKernel(
common::Span<RegTree::Node const> d_nodes, common::Span<float> d_out_predictions,
common::Span<std::size_t const> d_tree_segments, common::Span<int const> d_tree_group,
@@ -722,27 +730,39 @@ __global__ void PredictByBitVectorKernel(
}
std::size_t tree_offset = 0;
if (num_group == 1) {
float sum = 0;
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
if constexpr (predict_leaf) {
for (size_t tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
TreeView d_tree{tree_begin, tree_idx, d_nodes,
d_tree_segments, d_tree_split_types, d_cat_tree_segments,
d_cat_node_segments, d_categories};
sum += GetLeafWeightByBitVector(row_idx, d_tree, decision_bits, missing_bits, num_nodes,
tree_offset);
auto const leaf = GetLeafIndexByBitVector(row_idx, d_tree, decision_bits, missing_bits,
num_nodes, tree_offset);
d_out_predictions[row_idx * (tree_end - tree_begin) + tree_idx] = static_cast<float>(leaf);
tree_offset += d_tree.d_tree.size();
}
d_out_predictions[row_idx] += sum;
} else {
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
auto const tree_group = d_tree_group[tree_idx];
TreeView d_tree{tree_begin, tree_idx, d_nodes,
d_tree_segments, d_tree_split_types, d_cat_tree_segments,
d_cat_node_segments, d_categories};
bst_uint out_prediction_idx = row_idx * num_group + tree_group;
d_out_predictions[out_prediction_idx] += GetLeafWeightByBitVector(
row_idx, d_tree, decision_bits, missing_bits, num_nodes, tree_offset);
tree_offset += d_tree.d_tree.size();
if (num_group == 1) {
float sum = 0;
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
TreeView d_tree{tree_begin, tree_idx, d_nodes,
d_tree_segments, d_tree_split_types, d_cat_tree_segments,
d_cat_node_segments, d_categories};
sum += GetLeafWeightByBitVector(row_idx, d_tree, decision_bits, missing_bits, num_nodes,
tree_offset);
tree_offset += d_tree.d_tree.size();
}
d_out_predictions[row_idx] += sum;
} else {
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
auto const tree_group = d_tree_group[tree_idx];
TreeView d_tree{tree_begin, tree_idx, d_nodes,
d_tree_segments, d_tree_split_types, d_cat_tree_segments,
d_cat_node_segments, d_categories};
bst_uint out_prediction_idx = row_idx * num_group + tree_group;
d_out_predictions[out_prediction_idx] += GetLeafWeightByBitVector(
row_idx, d_tree, decision_bits, missing_bits, num_nodes, tree_offset);
tree_offset += d_tree.d_tree.size();
}
}
}
}
@@ -754,21 +774,29 @@ class ColumnSplitHelper {
void PredictBatch(DMatrix* dmat, HostDeviceVector<float>* out_preds,
gbm::GBTreeModel const& model, DeviceModel const& d_model) const {
CHECK(dmat->PageExists<SparsePage>()) << "Column split for external memory is not support.";
PredictDMatrix(dmat, out_preds, d_model, model.learner_model_param->num_feature,
model.learner_model_param->num_output_group);
PredictDMatrix<false>(dmat, out_preds, d_model, model.learner_model_param->num_feature,
model.learner_model_param->num_output_group);
}
void PredictLeaf(DMatrix* dmat, HostDeviceVector<float>* out_preds, gbm::GBTreeModel const& model,
DeviceModel const& d_model) const {
CHECK(dmat->PageExists<SparsePage>()) << "Column split for external memory is not support.";
PredictDMatrix<true>(dmat, out_preds, d_model, model.learner_model_param->num_feature,
model.learner_model_param->num_output_group);
}
private:
using BitType = BitVector::value_type;
template <bool predict_leaf>
void PredictDMatrix(DMatrix* dmat, HostDeviceVector<float>* out_preds, DeviceModel const& model,
bst_feature_t num_features, std::uint32_t num_group) const {
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
dh::caching_device_vector<BitType> decision_storage{};
dh::caching_device_vector<BitType> missing_storage{};
auto constexpr kBlockThreads = 128;
auto const max_shared_memory_bytes = dh::MaxSharedMemory(ctx_->gpu_id);
auto const max_shared_memory_bytes = dh::MaxSharedMemory(ctx_->Ordinal());
auto const shared_memory_bytes =
SharedMemoryBytes<kBlockThreads>(num_features, max_shared_memory_bytes);
auto const use_shared = shared_memory_bytes != 0;
@@ -781,8 +809,8 @@ class ColumnSplitHelper {
BitVector decision_bits{dh::ToSpan(decision_storage)};
BitVector missing_bits{dh::ToSpan(missing_storage)};
batch.offset.SetDevice(ctx_->gpu_id);
batch.data.SetDevice(ctx_->gpu_id);
batch.offset.SetDevice(ctx_->Device());
batch.data.SetDevice(ctx_->Device());
std::size_t entry_start = 0;
SparsePageView data(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), num_features);
@@ -798,7 +826,7 @@ class ColumnSplitHelper {
AllReduceBitVectors(&decision_storage, &missing_storage);
dh::LaunchKernel {grid, kBlockThreads, 0, ctx_->CUDACtx()->Stream()} (
PredictByBitVectorKernel, model.nodes.ConstDeviceSpan(),
PredictByBitVectorKernel<predict_leaf>, model.nodes.ConstDeviceSpan(),
out_preds->DeviceSpan().subspan(batch_offset), model.tree_segments.ConstDeviceSpan(),
model.tree_group.ConstDeviceSpan(), model.split_types.ConstDeviceSpan(),
model.categories_tree_segments.ConstDeviceSpan(),
@@ -813,15 +841,14 @@ class ColumnSplitHelper {
void AllReduceBitVectors(dh::caching_device_vector<BitType>* decision_storage,
dh::caching_device_vector<BitType>* missing_storage) const {
collective::AllReduce<collective::Operation::kBitwiseOR>(
ctx_->gpu_id, decision_storage->data().get(), decision_storage->size());
ctx_->Ordinal(), decision_storage->data().get(), decision_storage->size());
collective::AllReduce<collective::Operation::kBitwiseAND>(
ctx_->gpu_id, missing_storage->data().get(), missing_storage->size());
collective::Synchronize(ctx_->gpu_id);
ctx_->Ordinal(), missing_storage->data().get(), missing_storage->size());
}
void ResizeBitVectors(dh::caching_device_vector<BitType>* decision_storage,
dh::caching_device_vector<BitType>* missing_storage,
std::size_t total_bits) const {
dh::caching_device_vector<BitType>* missing_storage,
std::size_t total_bits) const {
auto const size = BitVector::ComputeStorageSize(total_bits);
if (decision_storage->size() < size) {
decision_storage->resize(size);
@@ -844,12 +871,12 @@ class GPUPredictor : public xgboost::Predictor {
size_t num_features,
HostDeviceVector<bst_float>* predictions,
size_t batch_offset, bool is_dense) const {
batch.offset.SetDevice(ctx_->gpu_id);
batch.data.SetDevice(ctx_->gpu_id);
batch.offset.SetDevice(ctx_->Device());
batch.data.SetDevice(ctx_->Device());
const uint32_t BLOCK_THREADS = 128;
size_t num_rows = batch.Size();
auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(num_rows, BLOCK_THREADS));
auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id);
auto max_shared_memory_bytes = ConfigureDevice(ctx_->Device());
size_t shared_memory_bytes =
SharedMemoryBytes<BLOCK_THREADS>(num_features, max_shared_memory_bytes);
bool use_shared = shared_memory_bytes != 0;
@@ -905,12 +932,12 @@ class GPUPredictor : public xgboost::Predictor {
if (tree_end - tree_begin == 0) {
return;
}
out_preds->SetDevice(ctx_->gpu_id);
out_preds->SetDevice(ctx_->Device());
auto const& info = dmat->Info();
DeviceModel d_model;
d_model.Init(model, tree_begin, tree_end, ctx_->gpu_id);
d_model.Init(model, tree_begin, tree_end, ctx_->Device());
if (dmat->Info().IsColumnSplit()) {
if (info.IsColumnSplit()) {
column_split_helper_.PredictBatch(dmat, out_preds, model, d_model);
return;
}
@@ -925,10 +952,10 @@ class GPUPredictor : public xgboost::Predictor {
} else {
size_t batch_offset = 0;
for (auto const& page : dmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
dmat->Info().feature_types.SetDevice(ctx_->gpu_id);
dmat->Info().feature_types.SetDevice(ctx_->Device());
auto feature_types = dmat->Info().feature_types.ConstDeviceSpan();
this->PredictInternal(
page.Impl()->GetDeviceAccessor(ctx_->gpu_id, feature_types),
page.Impl()->GetDeviceAccessor(ctx_->Device(), feature_types),
d_model,
out_preds,
batch_offset);
@@ -942,16 +969,15 @@ class GPUPredictor : public xgboost::Predictor {
: Predictor::Predictor{ctx}, column_split_helper_{ctx} {}
~GPUPredictor() override {
if (ctx_->gpu_id >= 0 && ctx_->gpu_id < common::AllVisibleGPUs()) {
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
if (ctx_->IsCUDA() && ctx_->Ordinal() < common::AllVisibleGPUs()) {
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
}
}
void PredictBatch(DMatrix* dmat, PredictionCacheEntry* predts,
const gbm::GBTreeModel& model, uint32_t tree_begin,
uint32_t tree_end = 0) const override {
int device = ctx_->gpu_id;
CHECK_GE(device, 0) << "Set `gpu_id' to positive value for processing GPU data.";
CHECK(ctx_->Device().IsCUDA()) << "Set `device' to `cuda` for processing GPU data.";
auto* out_preds = &predts->predictions;
if (tree_end == 0) {
tree_end = model.trees.size();
@@ -969,9 +995,9 @@ class GPUPredictor : public xgboost::Predictor {
auto m = std::any_cast<std::shared_ptr<Adapter>>(x);
CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature)
<< "Number of columns in data must equal to trained model.";
CHECK_EQ(dh::CurrentDevice(), m->DeviceIdx())
<< "XGBoost is running on device: " << this->ctx_->gpu_id << ", "
<< "but data is on: " << m->DeviceIdx();
CHECK_EQ(dh::CurrentDevice(), m->Device().ordinal)
<< "XGBoost is running on device: " << this->ctx_->Device().Name() << ", "
<< "but data is on: " << m->Device().Name();
if (p_m) {
p_m->Info().num_row_ = m->NumRows();
this->InitOutPredictions(p_m->Info(), &(out_preds->predictions), model);
@@ -980,16 +1006,16 @@ class GPUPredictor : public xgboost::Predictor {
info.num_row_ = m->NumRows();
this->InitOutPredictions(info, &(out_preds->predictions), model);
}
out_preds->predictions.SetDevice(m->DeviceIdx());
out_preds->predictions.SetDevice(m->Device());
const uint32_t BLOCK_THREADS = 128;
auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(m->NumRows(), BLOCK_THREADS));
auto max_shared_memory_bytes = dh::MaxSharedMemory(m->DeviceIdx());
auto max_shared_memory_bytes = dh::MaxSharedMemory(m->Device().ordinal);
size_t shared_memory_bytes =
SharedMemoryBytes<BLOCK_THREADS>(m->NumColumns(), max_shared_memory_bytes);
DeviceModel d_model;
d_model.Init(model, tree_begin, tree_end, m->DeviceIdx());
d_model.Init(model, tree_begin, tree_end, m->Device());
bool use_shared = shared_memory_bytes != 0;
size_t entry_start = 0;
@@ -1039,10 +1065,10 @@ class GPUPredictor : public xgboost::Predictor {
if (tree_weights != nullptr) {
LOG(FATAL) << "Dart booster feature " << not_implemented;
}
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
out_contribs->SetDevice(ctx_->gpu_id);
CHECK(!p_fmat->Info().IsColumnSplit())
<< "Predict contribution support for column-wise data split is not yet implemented.";
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
out_contribs->SetDevice(ctx_->Device());
if (tree_end == 0 || tree_end > model.trees.size()) {
tree_end = static_cast<uint32_t>(model.trees.size());
}
@@ -1060,12 +1086,12 @@ class GPUPredictor : public xgboost::Predictor {
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>>
device_paths;
DeviceModel d_model;
d_model.Init(model, 0, tree_end, ctx_->gpu_id);
d_model.Init(model, 0, tree_end, ctx_->Device());
dh::device_vector<uint32_t> categories;
ExtractPaths(&device_paths, &d_model, &categories, ctx_->gpu_id);
ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
batch.data.SetDevice(ctx_->gpu_id);
batch.offset.SetDevice(ctx_->gpu_id);
batch.data.SetDevice(ctx_->Device());
batch.offset.SetDevice(ctx_->Device());
SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
model.learner_model_param->num_feature);
auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns;
@@ -1074,7 +1100,7 @@ class GPUPredictor : public xgboost::Predictor {
dh::tend(phis));
}
// Add the base margin term to last column
p_fmat->Info().base_margin_.SetDevice(ctx_->gpu_id);
p_fmat->Info().base_margin_.SetDevice(ctx_->Device());
const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan();
auto base_score = model.learner_model_param->BaseScore(ctx_);
@@ -1099,10 +1125,8 @@ class GPUPredictor : public xgboost::Predictor {
if (tree_weights != nullptr) {
LOG(FATAL) << "Dart booster feature " << not_implemented;
}
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
out_contribs->SetDevice(ctx_->gpu_id);
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
out_contribs->SetDevice(ctx_->Device());
if (tree_end == 0 || tree_end > model.trees.size()) {
tree_end = static_cast<uint32_t>(model.trees.size());
}
@@ -1121,12 +1145,12 @@ class GPUPredictor : public xgboost::Predictor {
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>>
device_paths;
DeviceModel d_model;
d_model.Init(model, 0, tree_end, ctx_->gpu_id);
d_model.Init(model, 0, tree_end, ctx_->Device());
dh::device_vector<uint32_t> categories;
ExtractPaths(&device_paths, &d_model, &categories, ctx_->gpu_id);
ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
batch.data.SetDevice(ctx_->gpu_id);
batch.offset.SetDevice(ctx_->gpu_id);
batch.data.SetDevice(ctx_->Device());
batch.offset.SetDevice(ctx_->Device());
SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
model.learner_model_param->num_feature);
auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns;
@@ -1135,7 +1159,7 @@ class GPUPredictor : public xgboost::Predictor {
dh::tend(phis));
}
// Add the base margin term to last column
p_fmat->Info().base_margin_.SetDevice(ctx_->gpu_id);
p_fmat->Info().base_margin_.SetDevice(ctx_->Device());
const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan();
auto base_score = model.learner_model_param->BaseScore(ctx_);
@@ -1160,30 +1184,35 @@ class GPUPredictor : public xgboost::Predictor {
void PredictLeaf(DMatrix *p_fmat, HostDeviceVector<bst_float> *predictions,
const gbm::GBTreeModel &model,
unsigned tree_end) const override {
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id);
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
auto max_shared_memory_bytes = ConfigureDevice(ctx_->Device());
const MetaInfo& info = p_fmat->Info();
bst_row_t num_rows = info.num_row_;
if (tree_end == 0 || tree_end > model.trees.size()) {
tree_end = static_cast<uint32_t>(model.trees.size());
}
predictions->SetDevice(ctx_->Device());
predictions->Resize(num_rows * tree_end);
DeviceModel d_model;
d_model.Init(model, 0, tree_end, this->ctx_->Device());
if (info.IsColumnSplit()) {
column_split_helper_.PredictLeaf(p_fmat, predictions, model, d_model);
return;
}
constexpr uint32_t kBlockThreads = 128;
size_t shared_memory_bytes = SharedMemoryBytes<kBlockThreads>(
info.num_col_, max_shared_memory_bytes);
bool use_shared = shared_memory_bytes != 0;
bst_feature_t num_features = info.num_col_;
bst_row_t num_rows = info.num_row_;
size_t entry_start = 0;
if (tree_end == 0 || tree_end > model.trees.size()) {
tree_end = static_cast<uint32_t>(model.trees.size());
}
predictions->SetDevice(ctx_->gpu_id);
predictions->Resize(num_rows * tree_end);
DeviceModel d_model;
d_model.Init(model, 0, tree_end, this->ctx_->gpu_id);
if (p_fmat->PageExists<SparsePage>()) {
for (auto const& batch : p_fmat->GetBatches<SparsePage>()) {
batch.data.SetDevice(ctx_->gpu_id);
batch.offset.SetDevice(ctx_->gpu_id);
batch.data.SetDevice(ctx_->Device());
batch.offset.SetDevice(ctx_->Device());
bst_row_t batch_offset = 0;
SparsePageView data{batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
model.learner_model_param->num_feature};
@@ -1208,7 +1237,7 @@ class GPUPredictor : public xgboost::Predictor {
} else {
for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
bst_row_t batch_offset = 0;
EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->gpu_id)};
EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->Device())};
size_t num_rows = batch.Size();
auto grid =
static_cast<uint32_t>(common::DivRoundUp(num_rows, kBlockThreads));
@@ -1236,9 +1265,9 @@ class GPUPredictor : public xgboost::Predictor {
private:
/*! \brief Reconfigure the device when GPU is changed. */
static size_t ConfigureDevice(int device) {
if (device >= 0) {
return dh::MaxSharedMemory(device);
static size_t ConfigureDevice(DeviceOrd device) {
if (device.IsCUDA()) {
return dh::MaxSharedMemory(device.ordinal);
}
return 0;
}

View File

@@ -49,8 +49,8 @@ void Predictor::InitOutPredictions(const MetaInfo& info, HostDeviceVector<bst_fl
std::size_t n{model.learner_model_param->OutputLength() * info.num_row_};
const HostDeviceVector<bst_float>* base_margin = info.base_margin_.Data();
if (ctx_->gpu_id >= 0) {
out_preds->SetDevice(ctx_->gpu_id);
if (ctx_->Device().IsCUDA()) {
out_preds->SetDevice(ctx_->Device());
}
if (!base_margin->Empty()) {
out_preds->Resize(n);

View File

@@ -19,8 +19,7 @@
#include "xgboost/linalg.h" // TensorView, Tensor, Constant
#include "xgboost/logging.h" // CHECK_EQ
namespace xgboost {
namespace tree {
namespace xgboost::tree {
namespace cpu_impl {
void FitStump(Context const* ctx, MetaInfo const& info,
linalg::TensorView<GradientPair const, 2> gpair,
@@ -68,13 +67,12 @@ inline void FitStump(Context const*, MetaInfo const&, linalg::TensorView<Gradien
void FitStump(Context const* ctx, MetaInfo const& info, linalg::Matrix<GradientPair> const& gpair,
bst_target_t n_targets, linalg::Vector<float>* out) {
out->SetDevice(ctx->gpu_id);
out->SetDevice(ctx->Device());
out->Reshape(n_targets);
gpair.SetDevice(ctx->Device());
auto gpair_t = gpair.View(ctx->Device());
ctx->IsCPU() ? cpu_impl::FitStump(ctx, info, gpair_t, out->HostView())
: cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device()));
ctx->IsCUDA() ? cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device()))
: cpu_impl::FitStump(ctx, info, gpair_t, out->HostView());
}
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree

View File

@@ -21,9 +21,7 @@
#include "xgboost/logging.h" // CHECK_EQ
#include "xgboost/span.h" // span
namespace xgboost {
namespace tree {
namespace cuda_impl {
namespace xgboost::tree::cuda_impl {
void FitStump(Context const* ctx, MetaInfo const& info,
linalg::TensorView<GradientPair const, 2> gpair, linalg::VectorView<float> out) {
auto n_targets = out.Size();
@@ -56,7 +54,7 @@ void FitStump(Context const* ctx, MetaInfo const& info,
thrust::reduce_by_key(policy, key_it, key_it + gpair.Size(), grad_it,
thrust::make_discard_iterator(), dh::tbegin(d_sum.Values()));
collective::GlobalSum(info, ctx->gpu_id, reinterpret_cast<double*>(d_sum.Values().data()),
collective::GlobalSum(info, ctx->Device(), reinterpret_cast<double*>(d_sum.Values().data()),
d_sum.Size() * 2);
thrust::for_each_n(policy, thrust::make_counting_iterator(0ul), n_targets,
@@ -65,6 +63,4 @@ void FitStump(Context const* ctx, MetaInfo const& info,
CalcUnregularizedWeight(d_sum(i).GetGrad(), d_sum(i).GetHess()));
});
}
} // namespace cuda_impl
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree::cuda_impl

Some files were not shown because too many files have changed in this diff Show More