enable ROCm on latest XGBoost
This commit is contained in:
@@ -11,10 +11,10 @@ set_source_files_properties(
|
||||
PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON)
|
||||
target_sources(objxgboost PRIVATE ${RABIT_SOURCES})
|
||||
|
||||
if (USE_CUDA)
|
||||
if(USE_CUDA)
|
||||
file(GLOB_RECURSE CUDA_SOURCES *.cu *.cuh)
|
||||
target_sources(objxgboost PRIVATE ${CUDA_SOURCES})
|
||||
endif (USE_CUDA)
|
||||
endif()
|
||||
|
||||
if (USE_HIP)
|
||||
file(GLOB_RECURSE HIP_SOURCES *.hip *.hip.h)
|
||||
@@ -27,9 +27,9 @@ target_include_directories(objxgboost
|
||||
${xgboost_SOURCE_DIR}/dmlc-core/include
|
||||
${xgboost_SOURCE_DIR}/rabit/include)
|
||||
|
||||
if (LOG_CAPI_INVOCATION)
|
||||
if(LOG_CAPI_INVOCATION)
|
||||
target_compile_definitions(objxgboost PRIVATE -DLOG_CAPI_INVOCATION=1)
|
||||
endif (LOG_CAPI_INVOCATION)
|
||||
endif()
|
||||
|
||||
# For MSVC: Call msvc_use_static_runtime() once again to completely
|
||||
# replace /MD with /MT. See https://github.com/dmlc/xgboost/issues/4462
|
||||
|
||||
@@ -271,8 +271,8 @@ XGB_DLL int XGDMatrixCreateFromDataIter(
|
||||
if (cache_info != nullptr) {
|
||||
scache = cache_info;
|
||||
}
|
||||
xgboost::data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext,
|
||||
XGBoostBatchCSR> adapter(data_handle, callback);
|
||||
xgboost::data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR> adapter(
|
||||
data_handle, callback);
|
||||
xgboost_CHECK_C_ARG_PTR(out);
|
||||
*out = new std::shared_ptr<DMatrix> {
|
||||
DMatrix::Create(
|
||||
@@ -447,8 +447,11 @@ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr, char const *indices, char
|
||||
auto config = Json::Load(StringView{c_json_config});
|
||||
float missing = GetMissing(config);
|
||||
auto n_threads = OptionalArg<Integer, int64_t>(config, "nthread", 0);
|
||||
auto data_split_mode =
|
||||
static_cast<DataSplitMode>(OptionalArg<Integer, int64_t>(config, "data_split_mode", 0));
|
||||
xgboost_CHECK_C_ARG_PTR(out);
|
||||
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, n_threads));
|
||||
*out = new std::shared_ptr<DMatrix>(
|
||||
DMatrix::Create(&adapter, missing, n_threads, "", data_split_mode));
|
||||
API_END();
|
||||
}
|
||||
|
||||
@@ -483,8 +486,11 @@ XGB_DLL int XGDMatrixCreateFromCSC(char const *indptr, char const *indices, char
|
||||
auto config = Json::Load(StringView{c_json_config});
|
||||
float missing = GetMissing(config);
|
||||
auto n_threads = OptionalArg<Integer, int64_t>(config, "nthread", common::OmpGetNumThreads(0));
|
||||
auto data_split_mode =
|
||||
static_cast<DataSplitMode>(OptionalArg<Integer, int64_t>(config, "data_split_mode", 0));
|
||||
xgboost_CHECK_C_ARG_PTR(out);
|
||||
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, n_threads));
|
||||
*out = new std::shared_ptr<DMatrix>(
|
||||
DMatrix::Create(&adapter, missing, n_threads, "", data_split_mode));
|
||||
|
||||
API_END();
|
||||
}
|
||||
@@ -534,33 +540,8 @@ XGB_DLL int XGDMatrixCreateFromDT(void** data, const char** feature_stypes,
|
||||
API_END();
|
||||
}
|
||||
|
||||
/**
 * Forward one Arrow record batch to the record-batch iterator adapter behind `data_handle`.
 *
 * @param data_handle Opaque handle that is cast to `data::RecordBatchesIterAdapter*`.
 * @param ptr_array   Opaque pointer that is cast to `struct ArrowArray*`.
 * @param ptr_schema  Opaque pointer that is cast to `struct ArrowSchema*`.
 *
 * @return The status code produced by the API_BEGIN/API_END wrapper.
 */
XGB_DLL int XGImportArrowRecordBatch(DataIterHandle data_handle, void *ptr_array,
                                     void *ptr_schema) {
  API_BEGIN();
  // A null handle would otherwise be dereferenced by the SetData call below. Reject it
  // with the same pointer check the rest of this file applies to its pointer arguments.
  xgboost_CHECK_C_ARG_PTR(data_handle);
  auto *adapter = static_cast<data::RecordBatchesIterAdapter *>(data_handle);
  adapter->SetData(static_cast<struct ArrowArray *>(ptr_array),
                   static_cast<struct ArrowSchema *>(ptr_schema));
  API_END();
}
|
||||
|
||||
/**
 * Build a DMatrix from Arrow record batches delivered through the `next` callback.
 *
 * @param next   Callback handed to the record-batch iterator adapter.
 * @param config JSON document. `nbatch` is required; `nthread` is optional and defaults
 *               to 0; the missing value is read through GetMissing().
 * @param out    Receives a newly allocated `std::shared_ptr<DMatrix>` handle.
 */
XGB_DLL int XGDMatrixCreateFromArrowCallback(XGDMatrixCallbackNext *next, char const *config,
                                             DMatrixHandle *out) {
  API_BEGIN();
  xgboost_CHECK_C_ARG_PTR(config);

  auto parsed = Json::Load(StringView{config});
  auto missing_value = GetMissing(parsed);
  auto batch_count = RequiredArg<Integer>(parsed, "nbatch", __func__);
  auto thread_count = OptionalArg<Integer, std::int64_t>(parsed, "nthread", 0);

  data::RecordBatchesIterAdapter batches(next, batch_count);

  // The output pointer is validated only after the configuration has been parsed.
  xgboost_CHECK_C_ARG_PTR(out);
  *out = new std::shared_ptr<DMatrix>(DMatrix::Create(&batches, missing_value, thread_count));
  API_END();
}
|
||||
|
||||
XGB_DLL int XGDMatrixSliceDMatrix(DMatrixHandle handle,
|
||||
const int* idxset,
|
||||
xgboost::bst_ulong len,
|
||||
DMatrixHandle* out) {
|
||||
XGB_DLL int XGDMatrixSliceDMatrix(DMatrixHandle handle, const int *idxset, xgboost::bst_ulong len,
|
||||
DMatrixHandle *out) {
|
||||
xgboost_CHECK_C_ARG_PTR(out);
|
||||
return XGDMatrixSliceDMatrixEx(handle, idxset, len, out, 0);
|
||||
}
|
||||
@@ -749,6 +730,15 @@ XGB_DLL int XGDMatrixNumNonMissing(DMatrixHandle const handle, xgboost::bst_ulon
|
||||
API_END();
|
||||
}
|
||||
|
||||
/**
 * Report the data split mode stored in the DMatrix's meta info.
 *
 * @param handle DMatrix handle.
 * @param out    Receives the split mode converted to `bst_ulong`.
 */
XGB_DLL int XGDMatrixDataSplitMode(DMatrixHandle handle, bst_ulong *out) {
  API_BEGIN();
  CHECK_HANDLE();
  auto dmat = CastDMatrixHandle(handle);
  xgboost_CHECK_C_ARG_PTR(out);
  auto const &meta = dmat->Info();
  *out = static_cast<xgboost::bst_ulong>(meta.data_split_mode);
  API_END();
}
|
||||
|
||||
XGB_DLL int XGDMatrixGetDataAsCSR(DMatrixHandle const handle, char const *config,
|
||||
xgboost::bst_ulong *out_indptr, unsigned *out_indices,
|
||||
float *out_data) {
|
||||
@@ -1375,29 +1365,6 @@ XGB_DLL int XGBoosterSaveModelToBuffer(BoosterHandle handle, char const *json_co
|
||||
API_END();
|
||||
}
|
||||
|
||||
/**
 * Serialize the booster's model into the learner's thread-local string and expose it to
 * the caller. Logs a deprecation warning pointing at XGBoosterSaveModelToBuffer.
 *
 * @param handle   Booster handle (a `Learner*`).
 * @param out_len  Receives the number of bytes written.
 * @param out_dptr Receives a pointer into the thread-local buffer.
 */
XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle, xgboost::bst_ulong *out_len,
                                 const char **out_dptr) {
  API_BEGIN();
  CHECK_HANDLE();

  auto *booster = static_cast<Learner *>(handle);
  // Reuse the per-thread return buffer; drop whatever an earlier call left in it.
  std::string &buffer = booster->GetThreadLocal().ret_str;
  buffer.clear();

  common::MemoryBufferStream sink(&buffer);
  LOG(WARNING) << error::DeprecatedFunc(__func__, "1.6.0", "XGBoosterSaveModelToBuffer");

  booster->Configure();
  booster->SaveModel(&sink);

  // Output pointers are validated after the model has been written to the buffer.
  xgboost_CHECK_C_ARG_PTR(out_dptr);
  xgboost_CHECK_C_ARG_PTR(out_len);

  *out_dptr = dmlc::BeginPtr(buffer);
  *out_len = static_cast<xgboost::bst_ulong>(buffer.size());
  API_END();
}
|
||||
|
||||
// The following two functions are `Load` and `Save` for memory based
|
||||
// serialization methods. E.g. Python pickle.
|
||||
XGB_DLL int XGBoosterSerializeToBuffer(BoosterHandle handle, xgboost::bst_ulong *out_len,
|
||||
@@ -1432,36 +1399,13 @@ XGB_DLL int XGBoosterUnserializeFromBuffer(BoosterHandle handle,
|
||||
API_END();
|
||||
}
|
||||
|
||||
/**
 * Load the rabit checkpoint and report its version.
 *
 * @param handle  Booster handle (a `Learner*`).
 * @param version Receives the value returned by rabit::LoadCheckPoint(). The learner is
 *                configured only when that value is non-zero.
 */
XGB_DLL int XGBoosterLoadRabitCheckpoint(BoosterHandle handle, int *version) {
  API_BEGIN();
  CHECK_HANDLE();
  auto *learner = static_cast<Learner *>(handle);
  xgboost_CHECK_C_ARG_PTR(version);

  *version = rabit::LoadCheckPoint();
  bool const has_checkpoint = (*version != 0);
  if (has_checkpoint) {
    learner->Configure();
  }
  API_END();
}
|
||||
|
||||
/**
 * Configure the learner and then take a rabit checkpoint.
 *
 * @param handle Booster handle (a `Learner*`).
 */
XGB_DLL int XGBoosterSaveRabitCheckpoint(BoosterHandle handle) {
  API_BEGIN();
  CHECK_HANDLE();
  // Configuration is brought up to date before the checkpoint is taken.
  static_cast<Learner *>(handle)->Configure();
  rabit::CheckPoint();
  API_END();
}
|
||||
|
||||
XGB_DLL int XGBoosterSlice(BoosterHandle handle, int begin_layer,
|
||||
int end_layer, int step,
|
||||
XGB_DLL int XGBoosterSlice(BoosterHandle handle, int begin_layer, int end_layer, int step,
|
||||
BoosterHandle *out) {
|
||||
API_BEGIN();
|
||||
CHECK_HANDLE();
|
||||
xgboost_CHECK_C_ARG_PTR(out);
|
||||
|
||||
auto* learner = static_cast<Learner*>(handle);
|
||||
auto *learner = static_cast<Learner *>(handle);
|
||||
bool out_of_bound = false;
|
||||
auto p_out = learner->Slice(begin_layer, end_layer, step, &out_of_bound);
|
||||
if (out_of_bound) {
|
||||
@@ -1797,7 +1741,7 @@ XGB_DLL int XGCommunicatorAllreduce(void *send_receive_buffer, size_t count, int
|
||||
}
|
||||
|
||||
#if defined(XGBOOST_USE_FEDERATED)
|
||||
XGB_DLL int XGBRunFederatedServer(int port, int world_size, char const *server_key_path,
|
||||
XGB_DLL int XGBRunFederatedServer(int port, std::size_t world_size, char const *server_key_path,
|
||||
char const *server_cert_path, char const *client_cert_path) {
|
||||
API_BEGIN();
|
||||
federated::RunServer(port, world_size, server_key_path, server_cert_path, client_cert_path);
|
||||
@@ -1805,7 +1749,7 @@ XGB_DLL int XGBRunFederatedServer(int port, int world_size, char const *server_k
|
||||
}
|
||||
|
||||
// Run a server without SSL for local testing.
|
||||
XGB_DLL int XGBRunInsecureFederatedServer(int port, int world_size) {
|
||||
XGB_DLL int XGBRunInsecureFederatedServer(int port, std::size_t world_size) {
|
||||
API_BEGIN();
|
||||
federated::RunInsecureServer(port, world_size);
|
||||
API_END();
|
||||
|
||||
@@ -75,7 +75,7 @@ void CopyGradientFromCUDAArrays(Context const *ctx, ArrayInterface<2, false> con
|
||||
auto hess_dev = dh::CudaGetPointerDevice(hess.data);
|
||||
CHECK_EQ(grad_dev, hess_dev) << "gradient and hessian should be on the same device.";
|
||||
auto &gpair = *out_gpair;
|
||||
gpair.SetDevice(grad_dev);
|
||||
gpair.SetDevice(DeviceOrd::CUDA(grad_dev));
|
||||
gpair.Reshape(grad.Shape(0), grad.Shape(1));
|
||||
auto d_gpair = gpair.View(DeviceOrd::CUDA(grad_dev));
|
||||
auto cuctx = ctx->CUDACtx();
|
||||
@@ -153,7 +153,7 @@ int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface,
|
||||
if (learner->Ctx()->IsCUDA()) {
|
||||
CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead());
|
||||
}
|
||||
p_predt->SetDevice(proxy->DeviceIdx());
|
||||
p_predt->SetDevice(proxy->Device());
|
||||
|
||||
auto &shape = learner->GetThreadLocal().prediction_shape;
|
||||
size_t n_samples = p_m->Info().num_row_;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2021-2023 by XGBoost Contributors
|
||||
* Copyright 2021-2023, XGBoost Contributors
|
||||
*/
|
||||
#ifndef XGBOOST_C_API_C_API_UTILS_H_
|
||||
#define XGBOOST_C_API_C_API_UTILS_H_
|
||||
@@ -13,6 +13,7 @@
|
||||
#include <utility> // for move
|
||||
#include <vector>
|
||||
|
||||
#include "../common/json_utils.h" // for TypeCheck
|
||||
#include "xgboost/c_api.h"
|
||||
#include "xgboost/data.h" // DMatrix
|
||||
#include "xgboost/feature_map.h" // for FeatureMap
|
||||
@@ -254,28 +255,6 @@ inline void GenerateFeatureMap(Learner const *learner,
|
||||
|
||||
void XGBBuildInfoDevice(Json* p_info);
|
||||
|
||||
template <typename JT>
|
||||
auto const &RequiredArg(Json const &in, StringView key, StringView func) {
|
||||
auto const &obj = get<Object const>(in);
|
||||
auto it = obj.find(key);
|
||||
if (it == obj.cend() || IsA<Null>(it->second)) {
|
||||
LOG(FATAL) << "Argument `" << key << "` is required for `" << func << "`.";
|
||||
}
|
||||
TypeCheck<JT>(it->second, StringView{key});
|
||||
return get<std::remove_const_t<JT> const>(it->second);
|
||||
}
|
||||
|
||||
template <typename JT, typename T>
|
||||
auto const &OptionalArg(Json const &in, StringView key, T const &dft) {
|
||||
auto const &obj = get<Object const>(in);
|
||||
auto it = obj.find(key);
|
||||
if (it != obj.cend() && !IsA<Null>(it->second)) {
|
||||
TypeCheck<JT>(it->second, key);
|
||||
return get<std::remove_const_t<JT> const>(it->second);
|
||||
}
|
||||
return dft;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get shared ptr from DMatrix C handle with additional checks.
|
||||
*/
|
||||
|
||||
@@ -15,8 +15,7 @@
|
||||
|
||||
#include "communicator-inl.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace collective {
|
||||
namespace xgboost::collective {
|
||||
|
||||
/**
|
||||
* @brief Find the global sum of the given values across all workers.
|
||||
@@ -31,10 +30,9 @@ namespace collective {
|
||||
* @param size Number of values to sum.
|
||||
*/
|
||||
template <typename T>
|
||||
void GlobalSum(MetaInfo const& info, int device, T* values, size_t size) {
|
||||
void GlobalSum(MetaInfo const& info, DeviceOrd device, T* values, size_t size) {
|
||||
if (info.IsRowSplit()) {
|
||||
collective::AllReduce<collective::Operation::kSum>(device, values, size);
|
||||
collective::AllReduce<collective::Operation::kSum>(device.ordinal, values, size);
|
||||
}
|
||||
}
|
||||
} // namespace collective
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::collective
|
||||
|
||||
88
src/collective/allgather.cc
Normal file
88
src/collective/allgather.cc
Normal file
@@ -0,0 +1,88 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
*/
|
||||
#include "allgather.h"
|
||||
|
||||
#include <algorithm> // for min, copy_n
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstdint> // for int8_t, int32_t, int64_t
|
||||
#include <memory> // for shared_ptr
|
||||
#include <numeric> // for partial_sum
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "comm.h" // for Comm, Channel
|
||||
#include "xgboost/collective/result.h" // for Result
|
||||
#include "xgboost/span.h" // for Span
|
||||
|
||||
namespace xgboost::collective::cpu_impl {
|
||||
/**
 * Ring all-gather over raw bytes.
 *
 * `data` is treated as consecutive segments of `segment_size` bytes. In every step this
 * worker sends one segment to `next_ch` and receives the preceding segment from
 * `prev_ch`, waiting on `prev_ch` before the next step.
 *
 * @param comm         Communicator supplying rank and world size.
 * @param data         Buffer holding all segments; trailing segments may be short or empty.
 * @param segment_size Size of one segment in bytes.
 * @param worker_off   Offset added to the rank when choosing the segment to send first.
 * @param prev_ch      Channel to receive from.
 * @param next_ch      Channel to send to.
 */
Result RingAllgather(Comm const& comm, common::Span<std::int8_t> data, std::size_t segment_size,
                     std::int32_t worker_off, std::shared_ptr<Channel> prev_ch,
                     std::shared_ptr<Channel> next_ch) {
  auto n_workers = comm.World();
  auto self = comm.Rank();
  CHECK_LT(worker_off, n_workers);

  // View of the segment with the given index, clamped to the bounds of `data`.
  auto segment_of = [&](auto index) {
    auto begin = index * segment_size;
    begin = std::min(begin, data.size_bytes());
    return data.subspan(begin, std::min(segment_size, data.size_bytes() - begin));
  };

  for (std::int32_t step = 0; step < n_workers; ++step) {
    auto outgoing = segment_of((self + n_workers - step + worker_off) % n_workers);
    next_ch->SendAll(outgoing.data(), outgoing.size_bytes());

    auto incoming = segment_of((self + n_workers - step - 1 + worker_off) % n_workers);
    prev_ch->RecvAll(incoming.data(), incoming.size_bytes());

    auto status = prev_ch->Block();
    if (!status.OK()) {
      return status;
    }
  }

  return Success();
}
|
||||
|
||||
/**
 * Ring all-gather for inputs whose size differs between workers, over raw bytes.
 *
 * @param comm          Communicator supplying rank, world size and peer channels.
 * @param sizes         Byte count contributed by each worker, indexed by rank.
 * @param data          This worker's contribution.
 * @param erased_result Output buffer; worker i's bytes start at the sum of
 *                      `sizes[0..i)`.
 */
[[nodiscard]] Result RingAllgatherV(Comm const& comm, common::Span<std::int64_t const> sizes,
                                    common::Span<std::int8_t const> data,
                                    common::Span<std::int8_t> erased_result) {
  auto n_workers = comm.World();
  auto self = comm.Rank();

  auto prev_rank = BootstrapPrev(self, comm.World());
  auto next_rank = BootstrapNext(self, comm.World());

  auto from = comm.Chan(prev_rank);
  auto to = comm.Chan(next_rank);

  // Prefix sums of the per-worker sizes: starts[i] is the byte offset of worker i.
  std::vector<std::int64_t> starts(n_workers + 1, 0);
  std::partial_sum(sizes.cbegin(), sizes.cend(), starts.begin() + 1);
  CHECK_EQ(starts.front(), 0);

  // Place the local contribution into its slot of the output.
  auto own_slot = erased_result.subspan(starts[self], data.size_bytes());
  auto local_bytes = EraseType(data);
  std::copy_n(local_bytes.data(), local_bytes.size(), own_slot.data());

  for (std::int32_t step = 0; step < n_workers; ++step) {
    auto out_idx = (self + n_workers - step) % n_workers;
    auto outgoing = erased_result.subspan(starts[out_idx], sizes[out_idx]);
    to->SendAll(outgoing);

    auto in_idx = (self + n_workers - step - 1) % n_workers;
    auto incoming = erased_result.subspan(starts[in_idx], sizes[in_idx]);
    from->RecvAll(incoming.data(), incoming.size_bytes());

    auto status = from->Block();
    if (!status.OK()) {
      return status;
    }
  }
  return comm.Block();
}
|
||||
} // namespace xgboost::collective::cpu_impl
|
||||
72
src/collective/allgather.h
Normal file
72
src/collective/allgather.h
Normal file
@@ -0,0 +1,72 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
*/
|
||||
#pragma once
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstdint> // for int32_t
|
||||
#include <memory> // for shared_ptr
|
||||
#include <numeric> // for accumulate
|
||||
#include <type_traits> // for remove_cv_t
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "comm.h" // for Comm, Channel, EraseType
|
||||
#include "xgboost/collective/result.h" // for Result
|
||||
#include "xgboost/span.h" // for Span
|
||||
|
||||
namespace xgboost::collective {
|
||||
namespace cpu_impl {
|
||||
/**
|
||||
* @param worker_off Segment offset. For example, if the rank 2 worker specifis worker_off
|
||||
* = 1, then it owns the third segment.
|
||||
*/
|
||||
[[nodiscard]] Result RingAllgather(Comm const& comm, common::Span<std::int8_t> data,
|
||||
std::size_t segment_size, std::int32_t worker_off,
|
||||
std::shared_ptr<Channel> prev_ch,
|
||||
std::shared_ptr<Channel> next_ch);
|
||||
|
||||
[[nodiscard]] Result RingAllgatherV(Comm const& comm, common::Span<std::int64_t const> sizes,
|
||||
common::Span<std::int8_t const> data,
|
||||
common::Span<std::int8_t> erased_result);
|
||||
} // namespace cpu_impl
|
||||
|
||||
template <typename T>
|
||||
[[nodiscard]] Result RingAllgather(Comm const& comm, common::Span<T> data, std::size_t size) {
|
||||
auto n_bytes = sizeof(T) * size;
|
||||
auto erased = EraseType(data);
|
||||
|
||||
auto rank = comm.Rank();
|
||||
auto prev = BootstrapPrev(rank, comm.World());
|
||||
auto next = BootstrapNext(rank, comm.World());
|
||||
|
||||
auto prev_ch = comm.Chan(prev);
|
||||
auto next_ch = comm.Chan(next);
|
||||
auto rc = cpu_impl::RingAllgather(comm, erased, n_bytes, 0, prev_ch, next_ch);
|
||||
if (!rc.OK()) {
|
||||
return rc;
|
||||
}
|
||||
return comm.Block();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
[[nodiscard]] Result RingAllgatherV(Comm const& comm, common::Span<T> data,
|
||||
std::vector<std::remove_cv_t<T>>* p_out) {
|
||||
auto world = comm.World();
|
||||
auto rank = comm.Rank();
|
||||
|
||||
std::vector<std::int64_t> sizes(world, 0);
|
||||
sizes[rank] = data.size_bytes();
|
||||
auto rc = RingAllgather(comm, common::Span{sizes.data(), sizes.size()}, 1);
|
||||
if (!rc.OK()) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
std::vector<T>& result = *p_out;
|
||||
auto n_total_bytes = std::accumulate(sizes.cbegin(), sizes.cend(), 0);
|
||||
result.resize(n_total_bytes / sizeof(T));
|
||||
auto h_result = common::Span{result.data(), result.size()};
|
||||
auto erased_result = EraseType(h_result);
|
||||
auto erased_data = EraseType(data);
|
||||
|
||||
return cpu_impl::RingAllgatherV(comm, sizes, erased_data, erased_result);
|
||||
}
|
||||
} // namespace xgboost::collective
|
||||
90
src/collective/allreduce.cc
Normal file
90
src/collective/allreduce.cc
Normal file
@@ -0,0 +1,90 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
*/
|
||||
#include "allreduce.h"
|
||||
|
||||
#include <algorithm> // for min
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstdint> // for int32_t, int8_t
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "../data/array_interface.h" // for Type, DispatchDType
|
||||
#include "allgather.h" // for RingAllgather
|
||||
#include "comm.h" // for Comm
|
||||
#include "xgboost/collective/result.h" // for Result
|
||||
#include "xgboost/span.h" // for Span
|
||||
|
||||
namespace xgboost::collective::cpu_impl {
|
||||
template <typename T>
|
||||
Result RingScatterReduceTyped(Comm const& comm, common::Span<std::int8_t> data,
|
||||
std::size_t n_bytes_in_seg, Func const& op) {
|
||||
auto rank = comm.Rank();
|
||||
auto world = comm.World();
|
||||
|
||||
auto dst_rank = BootstrapNext(rank, world);
|
||||
auto src_rank = BootstrapPrev(rank, world);
|
||||
auto next_ch = comm.Chan(dst_rank);
|
||||
auto prev_ch = comm.Chan(src_rank);
|
||||
|
||||
std::vector<std::int8_t> buffer(n_bytes_in_seg, 0);
|
||||
auto s_buf = common::Span{buffer.data(), buffer.size()};
|
||||
|
||||
for (std::int32_t r = 0; r < world - 1; ++r) {
|
||||
// send to ring next
|
||||
auto send_off = ((rank + world - r) % world) * n_bytes_in_seg;
|
||||
send_off = std::min(send_off, data.size_bytes());
|
||||
auto seg_nbytes = std::min(data.size_bytes() - send_off, n_bytes_in_seg);
|
||||
auto send_seg = data.subspan(send_off, seg_nbytes);
|
||||
|
||||
next_ch->SendAll(send_seg);
|
||||
|
||||
// receive from ring prev
|
||||
auto recv_off = ((rank + world - r - 1) % world) * n_bytes_in_seg;
|
||||
recv_off = std::min(recv_off, data.size_bytes());
|
||||
seg_nbytes = std::min(data.size_bytes() - recv_off, n_bytes_in_seg);
|
||||
CHECK_EQ(seg_nbytes % sizeof(T), 0);
|
||||
auto recv_seg = data.subspan(recv_off, seg_nbytes);
|
||||
auto seg = s_buf.subspan(0, recv_seg.size());
|
||||
|
||||
prev_ch->RecvAll(seg);
|
||||
auto rc = prev_ch->Block();
|
||||
if (!rc.OK()) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
// accumulate to recv_seg
|
||||
CHECK_EQ(seg.size(), recv_seg.size());
|
||||
op(seg, recv_seg);
|
||||
}
|
||||
|
||||
return Success();
|
||||
}
|
||||
|
||||
/**
 * Ring all-reduce over a type-erased buffer: a scatter-reduce phase followed by a ring
 * all-gather of the reduced segments.
 *
 * @param comm Communicator supplying rank, world size and peer channels.
 * @param data Buffer to reduce in place, viewed as bytes.
 * @param op   Reduction callback over byte spans.
 * @param type Element type of `data`, used to pick the element size.
 */
Result RingAllreduce(Comm const& comm, common::Span<std::int8_t> data, Func const& op,
                     ArrayInterfaceHandler::Type type) {
  return DispatchDType(type, [&](auto t) {
    using T = decltype(t);
    // Split the elements evenly across workers, rounding the segment size up.
    auto elem_bytes = sizeof(T);
    CHECK_EQ(data.size_bytes() % elem_bytes, 0);
    auto n_elems = data.size_bytes() / elem_bytes;
    auto n_workers = comm.World();
    auto segment_bytes = common::DivRoundUp(n_elems, n_workers) * sizeof(T);

    auto status = RingScatterReduceTyped<T>(comm, data, segment_bytes, op);
    if (!status.OK()) {
      return status;
    }

    auto prev_rank = BootstrapPrev(comm.Rank(), comm.World());
    auto next_rank = BootstrapNext(comm.Rank(), comm.World());
    auto from = comm.Chan(prev_rank);
    auto to = comm.Chan(next_rank);

    // The all-gather starts one segment ahead of the rank (worker_off = 1).
    status = RingAllgather(comm, data, segment_bytes, 1, from, to);
    if (!status.OK()) {
      return status;
    }
    return comm.Block();
  });
}
|
||||
} // namespace xgboost::collective::cpu_impl
|
||||
39
src/collective/allreduce.h
Normal file
39
src/collective/allreduce.h
Normal file
@@ -0,0 +1,39 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
*/
|
||||
#pragma once
|
||||
#include <cstdint> // for int8_t
|
||||
#include <functional> // for function
|
||||
#include <type_traits> // for is_invocable_v
|
||||
|
||||
#include "../data/array_interface.h" // for ArrayInterfaceHandler
|
||||
#include "comm.h" // for Comm, RestoreType
|
||||
#include "xgboost/collective/result.h" // for Result
|
||||
#include "xgboost/span.h" // for Span
|
||||
|
||||
namespace xgboost::collective {
|
||||
namespace cpu_impl {
|
||||
using Func =
|
||||
std::function<void(common::Span<std::int8_t const> lhs, common::Span<std::int8_t> out)>;
|
||||
|
||||
Result RingAllreduce(Comm const& comm, common::Span<std::int8_t> data, Func const& op,
|
||||
ArrayInterfaceHandler::Type type);
|
||||
} // namespace cpu_impl
|
||||
|
||||
template <typename T, typename Fn>
|
||||
std::enable_if_t<std::is_invocable_v<Fn, common::Span<T const>, common::Span<T>>, Result> Allreduce(
|
||||
Comm const& comm, common::Span<T> data, Fn redop) {
|
||||
auto erased = EraseType(data);
|
||||
auto type = ToDType<T>::kType;
|
||||
|
||||
auto erased_fn = [type, redop](common::Span<std::int8_t const> lhs,
|
||||
common::Span<std::int8_t> out) {
|
||||
CHECK_EQ(lhs.size(), out.size()) << "Invalid input for reduction.";
|
||||
auto lhs_t = RestoreType<T const>(lhs);
|
||||
auto rhs_t = RestoreType<T>(out);
|
||||
redop(lhs_t, rhs_t);
|
||||
};
|
||||
|
||||
return cpu_impl::RingAllreduce(comm, erased, erased_fn, type);
|
||||
}
|
||||
} // namespace xgboost::collective
|
||||
84
src/collective/broadcast.cc
Normal file
84
src/collective/broadcast.cc
Normal file
@@ -0,0 +1,84 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
*/
|
||||
#include "broadcast.h"
|
||||
|
||||
#include <cmath> // for ceil, log2
|
||||
#include <cstdint> // for int32_t, int8_t
|
||||
#include <utility> // for move
|
||||
|
||||
#include "../common/bitfield.h" // for TrailingZeroBits, RBitField32
|
||||
#include "comm.h" // for Comm
|
||||
#include "xgboost/collective/result.h" // for Result
|
||||
#include "xgboost/span.h" // for Span
|
||||
|
||||
namespace xgboost::collective::cpu_impl {
|
||||
namespace {
|
||||
std::int32_t ShiftedParentRank(std::int32_t shifted_rank, std::int32_t depth) {
|
||||
std::uint32_t mask{std::uint32_t{0} - 1}; // Oxff...
|
||||
RBitField32 maskbits{common::Span<std::uint32_t>{&mask, 1}};
|
||||
RBitField32 rankbits{
|
||||
common::Span<std::uint32_t>{reinterpret_cast<std::uint32_t*>(&shifted_rank), 1}};
|
||||
// prepare for counting trailing zeros.
|
||||
for (std::int32_t i = 0; i < depth + 1; ++i) {
|
||||
if (rankbits.Check(i)) {
|
||||
maskbits.Set(i);
|
||||
} else {
|
||||
maskbits.Clear(i);
|
||||
}
|
||||
}
|
||||
|
||||
CHECK_NE(mask, 0);
|
||||
auto k = TrailingZeroBits(mask);
|
||||
auto shifted_parent = shifted_rank - (1 << k);
|
||||
return shifted_parent;
|
||||
}
|
||||
|
||||
// Renumber `rank` so that `root` becomes rank 0, wrapping around at `world`.
std::int32_t ShiftLeft(std::int32_t rank, std::int32_t world, std::int32_t root) {
  return (rank + world - root) % world;
}
|
||||
// Inverse of the root shift: map a shifted rank back to the original numbering.
std::int32_t ShiftRight(std::int32_t rank, std::int32_t world, std::int32_t root) {
  return (rank + root) % world;
}
|
||||
} // namespace
|
||||
|
||||
/**
 * Broadcast `data` from `root` to every worker using a binomial tree.
 *
 * References:
 *  - https://en.wikipedia.org/wiki/Broadcast_(parallel_pattern)#Binomial_Tree_Broadcast
 *  - https://people.mpi-inf.mpg.de/~mehlhorn/ftp/NewToolbox/collective.pdf
 *
 * @param comm Communicator supplying rank, world size and peer channels.
 * @param data Buffer sent by the root and filled in on every other worker.
 * @param root Rank that owns the data.
 */
Result Broadcast(Comm const& comm, common::Span<std::int8_t> data, std::int32_t root) {
  auto self = comm.Rank();
  auto n_workers = comm.World();

  // Renumber the workers so that the root is rank 0.
  auto v_rank = ShiftLeft(self, n_workers, root);
  std::int32_t max_level = std::ceil(std::log2(static_cast<double>(n_workers))) - 1;

  bool const is_root = (v_rank == 0);
  if (!is_root) {
    // Every non-root worker first receives the payload from its parent.
    auto parent = ShiftRight(ShiftedParentRank(v_rank, max_level), n_workers, root);
    comm.Chan(parent)->RecvAll(data);
    auto status = comm.Chan(parent)->Block();
    if (!status.OK()) {
      return Fail("broadcast failed.", std::move(status));
    }
  }

  for (std::int32_t level = max_level; level >= 0; --level) {
    CHECK_GE((level + 1), 0);  // weird clang-tidy error that the index might be negative
    bool const aligned = v_rank % (1 << (level + 1)) == 0;
    if (!aligned || v_rank + (1 << level) >= n_workers) {
      continue;
    }
    auto v_peer = v_rank + (1 << level);
    auto peer = ShiftRight(v_peer, n_workers, root);
    CHECK_NE(peer, root);
    comm.Chan(peer)->SendAll(data);
  }

  return comm.Block();
}
|
||||
} // namespace xgboost::collective::cpu_impl
|
||||
26
src/collective/broadcast.h
Normal file
26
src/collective/broadcast.h
Normal file
@@ -0,0 +1,26 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
*/
|
||||
#pragma once
|
||||
#include <cstdint> // for int32_t, int8_t
|
||||
|
||||
#include "comm.h" // for Comm
|
||||
#include "xgboost/collective/result.h" // for Result
|
||||
#include "xgboost/span.h" // for Span
|
||||
|
||||
namespace xgboost::collective {
|
||||
namespace cpu_impl {
|
||||
Result Broadcast(Comm const& comm, common::Span<std::int8_t> data, std::int32_t root);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief binomial tree broadcast is used on CPU with the default implementation.
|
||||
*/
|
||||
template <typename T>
|
||||
[[nodiscard]] Result Broadcast(Comm const& comm, common::Span<T> data, std::int32_t root) {
|
||||
auto n_total_bytes = data.size_bytes();
|
||||
auto erased =
|
||||
common::Span<std::int8_t>{reinterpret_cast<std::int8_t*>(data.data()), n_total_bytes};
|
||||
return cpu_impl::Broadcast(comm, erased, root);
|
||||
}
|
||||
} // namespace xgboost::collective
|
||||
304
src/collective/comm.cc
Normal file
304
src/collective/comm.cc
Normal file
@@ -0,0 +1,304 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
*/
|
||||
#include "comm.h"
|
||||
|
||||
#include <algorithm> // for copy
|
||||
#include <chrono> // for seconds
|
||||
#include <memory> // for shared_ptr
|
||||
#include <string> // for string
|
||||
#include <utility> // for move, forward
|
||||
|
||||
#include "allgather.h"
|
||||
#include "protocol.h" // for kMagic
|
||||
#include "xgboost/base.h" // for XGBOOST_STRICT_R_MODE
|
||||
#include "xgboost/collective/socket.h" // for TCPSocket
|
||||
#include "xgboost/json.h" // for Json, Object
|
||||
#include "xgboost/string_view.h" // for StringView
|
||||
|
||||
namespace xgboost::collective {
|
||||
Comm::Comm(std::string const& host, std::int32_t port, std::chrono::seconds timeout,
|
||||
std::int32_t retry, std::string task_id)
|
||||
: timeout_{timeout},
|
||||
retry_{retry},
|
||||
tracker_{host, port, -1},
|
||||
task_id_{std::move(task_id)},
|
||||
loop_{std::make_shared<Loop>(timeout)} {}
|
||||
|
||||
/**
 * Connect to the tracker and run the initial handshake.
 *
 * After the connection is made, the remaining steps are chained on the result: switch
 * the socket to blocking mode, set the receive timeout, verify the magic number, and
 * send the worker's connect message.
 *
 * @param info    Tracker address; the host must not be empty.
 * @param timeout Timeout for connecting and for receiving.
 * @param retry   Number of connection retries.
 * @param task_id Task identifier sent to the tracker.
 * @param out     Socket that receives the tracker connection.
 * @param rank    Rank sent to the tracker.
 * @param world   World size sent to the tracker.
 */
Result ConnectTrackerImpl(proto::PeerInfo info, std::chrono::seconds timeout, std::int32_t retry,
                          std::string const& task_id, TCPSocket* out, std::int32_t rank,
                          std::int32_t world) {
  // get information from tracker
  CHECK(!info.host.empty());
  auto status = Connect(info.host, info.port, retry, timeout, out);
  if (!status.OK()) {
    return Fail("Failed to connect to the tracker.", std::move(status));
  }

  TCPSocket& sock = *out;
  return std::move(status) << [&] { return sock.NonBlocking(false); }
                           << [&] { return sock.RecvTimeout(timeout); }
                           << [&] { return proto::Magic{}.Verify(&sock); }
                           << [&] { return proto::Connect{}.WorkerSend(&sock, world, rank, task_id); };
}
|
||||
|
||||
/**
 * Connect this communicator to its tracker using the stored settings.
 *
 * @param out Socket that receives the tracker connection.
 */
[[nodiscard]] Result Comm::ConnectTracker(TCPSocket* out) const {
  return ConnectTrackerImpl(TrackerInfo(), Timeout(), retry_, task_id_, out, Rank(), World());
}
|
||||
|
||||
/**
 * @brief Establish the peer-to-peer connections between this worker and all other workers.
 *
 * The worker is first linked into the bootstrap ring: it connects to the next worker
 * described by `ninfo` and accepts the previous one from `listener`. Ring allgather is
 * then used to exchange the host name and port each worker holds for its ring-next,
 * which yields the address of every worker. Finally this worker connects to every
 * higher rank and accepts one connection from every lower rank.
 *
 * @param comm        Communicator providing the rank, world size and socket domain.
 * @param listener    Listening socket used to accept incoming connections.
 * @param lport       Port of `listener`; checked against the gathered peer table.
 * @param ninfo       Host, port and rank of the next worker in the ring.
 * @param timeout     Timeout forwarded to Connect and used as the receive timeout.
 * @param retry       Retry count forwarded to Connect.
 * @param out_workers Output; resized to the world size and indexed by rank. The slot of
 *                    this worker's own rank is left empty.
 *
 * @return Success, or the first failure encountered.
 */
[[nodiscard]] Result ConnectWorkers(Comm const& comm, TCPSocket* listener, std::int32_t lport,
                                    proto::PeerInfo ninfo, std::chrono::seconds timeout,
                                    std::int32_t retry,
                                    std::vector<std::shared_ptr<TCPSocket>>* out_workers) {
  auto next = std::make_shared<TCPSocket>();
  auto prev = std::make_shared<TCPSocket>();

  // Join the ring: connect to the next worker, accept the previous one.
  auto rc = Success() << [&] {
    auto rc = Connect(ninfo.host, ninfo.port, retry, timeout, next.get());
    if (!rc.OK()) {
      return Fail("Bootstrap failed to connect to ring next.", std::move(rc));
    }
    return rc;
  } << [&] {
    return next->NonBlocking(true);
  } << [&] {
    SockAddrV4 addr;
    return listener->Accept(prev.get(), &addr);
  } << [&] { return prev->NonBlocking(true); };
  if (!rc.OK()) {
    return rc;
  }

  // exchange host name and port
  // Every rank owns a fixed-size, zero-initialized slot of HOST_NAME_MAX bytes.
  std::vector<std::int8_t> buffer(HOST_NAME_MAX * comm.World(), 0);
  auto s_buffer = common::Span{buffer.data(), buffer.size()};
  auto next_host = s_buffer.subspan(HOST_NAME_MAX * comm.Rank(), HOST_NAME_MAX);
  if (next_host.size() < ninfo.host.size()) {
    return Fail("Got an invalid host name.");
  }
  std::copy(ninfo.host.cbegin(), ninfo.host.cend(), next_host.begin());

  auto prev_ch = std::make_shared<Channel>(comm, prev);
  auto next_ch = std::make_shared<Channel>(comm, next);

  // Wait on both ring channels, reporting the first failure.
  auto block = [&] {
    for (auto const& ch : {prev_ch, next_ch}) {
      auto rc = ch->Block();
      if (!rc.OK()) {
        return rc;
      }
    }
    return Success();
  };

  rc = std::move(rc) << [&] {
    return cpu_impl::RingAllgather(comm, s_buffer, HOST_NAME_MAX, 0, prev_ch, next_ch);
  } << [&] { return block(); };
  if (!rc.OK()) {
    return Fail("Failed to get host names from peers.", std::move(rc));
  }

  std::vector<std::int32_t> peers_port(comm.World(), -1);
  peers_port[comm.Rank()] = ninfo.port;
  rc = std::move(rc) << [&] {
    auto s_ports = common::Span{reinterpret_cast<std::int8_t*>(peers_port.data()),
                                peers_port.size() * sizeof(ninfo.port)};
    return cpu_impl::RingAllgather(comm, s_ports, sizeof(ninfo.port), 0, prev_ch, next_ch);
  } << [&] { return block(); };
  if (!rc.OK()) {
    return Fail("Failed to get the port from peers.", std::move(rc));
  }

  // Slot `r` holds the address of the worker following `r` in the ring.
  std::vector<proto::PeerInfo> peers(comm.World());
  for (auto r = 0; r < comm.World(); ++r) {
    auto nhost = s_buffer.subspan(HOST_NAME_MAX * r, HOST_NAME_MAX);
    auto nport = peers_port[r];
    auto nrank = BootstrapNext(r, comm.World());

    // A host name that fills the whole slot has no NUL terminator, so the read is
    // bounded by the slot size instead of relying on a C-string scan.
    auto const* host_begin = reinterpret_cast<char const*>(nhost.data());
    auto const* host_end = std::find(host_begin, host_begin + nhost.size(), '\0');
    peers[nrank] = {std::string(host_begin, host_end), nport, nrank};
  }
  CHECK_EQ(peers[comm.Rank()].port, lport);
  for (auto const& p : peers) {
    CHECK_NE(p.port, -1);
  }

  std::vector<std::shared_ptr<TCPSocket>>& workers = *out_workers;
  workers.resize(comm.World());

  // Actively connect to every higher rank and announce our own rank.
  for (std::int32_t r = (comm.Rank() + 1); r < comm.World(); ++r) {
    auto const& peer = peers[r];
    std::shared_ptr<TCPSocket> worker{TCPSocket::CreatePtr(comm.Domain())};
    rc = std::move(rc)
         << [&] { return Connect(peer.host, peer.port, retry, timeout, worker.get()); }
         << [&] { return worker->RecvTimeout(timeout); };
    if (!rc.OK()) {
      return rc;
    }

    auto rank = comm.Rank();
    auto n_bytes = worker->SendAll(&rank, sizeof(comm.Rank()));
    if (n_bytes != sizeof(comm.Rank())) {
      return Fail("Failed to send rank.");
    }
    workers[r] = std::move(worker);
  }

  // Accept one connection per lower rank; the peer identifies itself by sending its rank.
  for (std::int32_t r = 0; r < comm.Rank(); ++r) {
    SockAddrV4 addr;
    auto peer = std::shared_ptr<TCPSocket>(TCPSocket::CreatePtr(comm.Domain()));
    rc = std::move(rc) << [&] { return listener->Accept(peer.get(), &addr); }
                       << [&] { return peer->RecvTimeout(timeout); };
    if (!rc.OK()) {
      return rc;
    }
    std::int32_t rank{-1};
    auto n_bytes = peer->RecvAll(&rank, sizeof(rank));
    if (n_bytes != sizeof(comm.Rank())) {
      return Fail("Failed to recv rank.");
    }
    // The rank comes from the network; never use it as an index without validation.
    if (rank < 0 || rank >= comm.World()) {
      return Fail("Received an invalid rank from peer.");
    }
    workers[rank] = std::move(peer);
  }

  // Every slot except our own must be connected by now.
  for (std::int32_t r = 0; r < comm.World(); ++r) {
    if (r == comm.Rank()) {
      continue;
    }
    CHECK(workers[r]);
  }

  return Success();
}
|
||||
|
||||
/**
 * @brief Construct the communicator and bootstrap it immediately.
 *
 * The arguments are forwarded to the Comm base class; Bootstrap() is then run with the
 * stored timeout, retry count and task id. A bootstrap failure is fatal (CHECK).
 */
RabitComm::RabitComm(std::string const& host, std::int32_t port, std::chrono::seconds timeout,
                     std::int32_t retry, std::string task_id)
    // `host` is a const reference: std::move on it cannot move anything, so pass it as is.
    : Comm{host, port, timeout, retry, std::move(task_id)} {
  auto rc = this->Bootstrap(timeout_, retry_, task_id_);
  CHECK(rc.OK()) << rc.Report();
}
|
||||
|
||||
/**
 * @brief Bootstrap the communicator through the tracker.
 *
 * Steps, in order: connect to the tracker, open a listening socket for peers, start a
 * detached thread that waits for an error notice, exchange the start command with the
 * tracker to learn the world size, receive the ring-next peer, derive this worker's
 * rank from it, connect to all the other workers, and wrap each socket in a Channel.
 *
 * @param timeout Timeout forwarded to the tracker and peer connections.
 * @param retry   Retry count forwarded to the tracker and peer connections.
 * @param task_id Task identifier forwarded to the tracker connection.
 *
 * @return Success, or the first failure encountered.
 */
[[nodiscard]] Result RabitComm::Bootstrap(std::chrono::seconds timeout, std::int32_t retry,
                                          std::string task_id) {
  TCPSocket tracker;
  std::int32_t world{-1};
  auto rc = ConnectTrackerImpl(this->TrackerInfo(), timeout, retry, task_id, &tracker, this->Rank(),
                               world);
  if (!rc.OK()) {
    return Fail("Bootstrap failed.", std::move(rc));
  }

  this->domain_ = tracker.Domain();

  // Start command: listening socket that the other workers connect to.
  TCPSocket listener = TCPSocket::Create(tracker.Domain());
  std::int32_t listen_port = listener.BindHost();
  listener.Listen();

  // Socket used solely for receiving an error notice.
  auto sock_domain = tracker.Domain();
  std::shared_ptr<TCPSocket> notice_sock{TCPSocket::CreatePtr(sock_domain)};
  auto error_port = notice_sock->BindHost();
  notice_sock->Listen();
  error_worker_ = std::thread{[this, notice_sock = std::move(notice_sock)] {
    auto incoming = notice_sock->Accept();
    // On Windows accept returns an invalid socket after network is shutdown.
    if (incoming.IsClosed()) {
      return;
    }
    LOG(WARNING) << "Another worker is running into error.";
    std::string payload;
    incoming.Recv(&payload);
    auto parsed = Json::Load(payload);
    auto rc = this->Shutdown();
    if (!rc.OK()) {
      LOG(WARNING) << "Fail to shutdown worker:" << rc.Report();
    }
#if !defined(XGBOOST_STRICT_R_MODE) || XGBOOST_STRICT_R_MODE == 0
    exit(-1);
#else
    LOG(FATAL) << rc.Report();
#endif
  }};
  error_worker_.detach();

  // Tell the tracker our ports and receive the world size.
  proto::Start start;
  rc = std::move(rc) << [&] { return start.WorkerSend(listen_port, &tracker, error_port); }
                     << [&] { return start.WorkerRecv(&tracker, &world); };
  if (!rc.OK()) {
    return rc;
  }
  this->world_ = world;

  // get ring neighbors
  std::string next_str;
  tracker.Recv(&next_str);
  auto next_json = Json::Load(StringView{next_str});

  proto::PeerInfo next_info{next_json};

  // get the rank of this worker: it is the ring predecessor of the next worker.
  this->rank_ = BootstrapPrev(next_info.rank, world);
  this->tracker_.rank = rank_;

  std::vector<std::shared_ptr<TCPSocket>> peer_socks;
  rc = ConnectWorkers(*this, &listener, listen_port, next_info, timeout, retry, &peer_socks);
  if (!rc.OK()) {
    return rc;
  }

  CHECK(this->channels_.empty());
  // One channel per rank, including an empty-socket channel for our own rank.
  for (auto& sock : peer_socks) {
    if (sock) {
      sock->SetNoDelay();
      rc = sock->NonBlocking(true);
      if (!rc.OK()) {
        return rc;
      }
    }
    this->channels_.emplace_back(std::make_shared<Channel>(*this, sock));
  }
  return rc;
}
|
||||
|
||||
/**
 * @brief Shut the communicator down when it is part of a distributed run.
 *
 * A failed shutdown is only logged as a warning.
 */
RabitComm::~RabitComm() noexcept(false) {
  if (IsDistributed()) {
    auto rc = this->Shutdown();
    if (!rc.OK()) {
      LOG(WARNING) << rc.Report();
    }
  }
}
|
||||
|
||||
/**
 * @brief Notify the tracker that this worker is shutting down.
 *
 * Connects to the tracker, waits on Block() for the communicator, then sends the
 * shutdown command as a JSON document.
 *
 * @return Success, or the first failure in the chain.
 */
[[nodiscard]] Result RabitComm::Shutdown() {
  TCPSocket tracker;
  return Success() << [&] {
    return ConnectTrackerImpl(tracker_, timeout_, retry_, task_id_, &tracker, Rank(), World());
  } << [&] {
    return this->Block();
  } << [&] {
    Json jcmd{Object{}};
    jcmd["cmd"] = Integer{static_cast<std::int32_t>(proto::CMD::kShutdown)};
    auto scmd = Json::Dump(jcmd);
    auto n_bytes = tracker.Send(scmd);
    // A short write means the tracker did not get the complete command.
    if (n_bytes != scmd.size()) {
      return Fail("Failed to send cmd.");
    }
    return Success();
  };
}
|
||||
|
||||
/**
 * @brief Send a message to the tracker using the print command.
 *
 * @param msg Message to send.
 * @return Success, or the failure from connecting or sending.
 */
[[nodiscard]] Result RabitComm::LogTracker(std::string msg) const {
  TCPSocket tracker_sock;
  proto::Print print_cmd;
  return Success() << [&] {
    // A fresh connection is opened for every message.
    return this->ConnectTracker(&tracker_sock);
  } << [&] {
    return print_cmd.WorkerSend(&tracker_sock, msg);
  };
}
|
||||
|
||||
/**
 * @brief Report an error result to the tracker.
 *
 * @param res The result to report.
 * @return Success, or the failure from connecting or sending.
 */
[[nodiscard]] Result RabitComm::SignalError(Result const& res) {
  TCPSocket tracker_sock;
  return Success() << [&] {
    return this->ConnectTracker(&tracker_sock);
  } << [&] {
    // The command object is only created once the connection has succeeded.
    return proto::ErrorCMD{}.WorkerSend(&tracker_sock, res);
  };
}
|
||||
} // namespace xgboost::collective
|
||||
156
src/collective/comm.h
Normal file
156
src/collective/comm.h
Normal file
@@ -0,0 +1,156 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
*/
|
||||
#pragma once
|
||||
#include <chrono> // for seconds
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstdint> // for int32_t
|
||||
#include <memory> // for shared_ptr
|
||||
#include <string> // for string
|
||||
#include <thread> // for thread
|
||||
#include <type_traits> // for remove_const_t
|
||||
#include <utility> // for move
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "loop.h" // for Loop
|
||||
#include "protocol.h" // for PeerInfo
|
||||
#include "xgboost/collective/result.h" // for Result
|
||||
#include "xgboost/collective/socket.h" // for TCPSocket
|
||||
#include "xgboost/span.h" // for Span
|
||||
|
||||
namespace xgboost::collective {
|
||||
|
||||
/** @brief Default timeout in seconds (5 minutes). */
inline constexpr std::int32_t DefaultTimeoutSec() {
  return 5 * 60;
}
|
||||
/** @brief Default retry count. */
inline constexpr std::int32_t DefaultRetry() {
  return 3;
}
|
||||
|
||||
// indexing into the ring
|
||||
/**
 * @brief Rank that follows `r` in a ring of `world` workers; wraps around to 0.
 */
inline std::int32_t BootstrapNext(std::int32_t r, std::int32_t world) {
  return (r + world + 1) % world;
}
|
||||
|
||||
/**
 * @brief Rank that precedes `r` in a ring of `world` workers; rank 0 wraps to world - 1.
 */
inline std::int32_t BootstrapPrev(std::int32_t r, std::int32_t world) {
  return (r + world - 1) % world;
}
|
||||
|
||||
class Channel;
|
||||
|
||||
/**
|
||||
* @brief Base communicator storing info about the tracker and other communicators.
|
||||
*/
|
||||
class Comm {
|
||||
protected:
|
||||
std::int32_t world_{1};
|
||||
std::int32_t rank_{0};
|
||||
std::chrono::seconds timeout_{DefaultTimeoutSec()};
|
||||
std::int32_t retry_{DefaultRetry()};
|
||||
|
||||
proto::PeerInfo tracker_;
|
||||
SockDomain domain_{SockDomain::kV4};
|
||||
std::thread error_worker_;
|
||||
std::string task_id_;
|
||||
std::vector<std::shared_ptr<Channel>> channels_;
|
||||
std::shared_ptr<Loop> loop_{new Loop{std::chrono::seconds{
|
||||
DefaultTimeoutSec()}}}; // fixme: require federated comm to have a timeout
|
||||
|
||||
public:
|
||||
Comm() = default;
|
||||
Comm(std::string const& host, std::int32_t port, std::chrono::seconds timeout, std::int32_t retry,
|
||||
std::string task_id);
|
||||
virtual ~Comm() noexcept(false) {} // NOLINT
|
||||
|
||||
Comm(Comm const& that) = delete;
|
||||
Comm& operator=(Comm const& that) = delete;
|
||||
Comm(Comm&& that) = delete;
|
||||
Comm& operator=(Comm&& that) = delete;
|
||||
|
||||
[[nodiscard]] auto TrackerInfo() const { return tracker_; }
|
||||
[[nodiscard]] Result ConnectTracker(TCPSocket* out) const;
|
||||
[[nodiscard]] auto Domain() const { return domain_; }
|
||||
[[nodiscard]] auto Timeout() const { return timeout_; }
|
||||
|
||||
[[nodiscard]] auto Rank() const { return rank_; }
|
||||
[[nodiscard]] auto World() const { return world_; }
|
||||
[[nodiscard]] bool IsDistributed() const { return World() > 1; }
|
||||
void Submit(Loop::Op op) const { loop_->Submit(op); }
|
||||
[[nodiscard]] Result Block() const { return loop_->Block(); }
|
||||
|
||||
[[nodiscard]] virtual std::shared_ptr<Channel> Chan(std::int32_t rank) const {
|
||||
return channels_.at(rank);
|
||||
}
|
||||
[[nodiscard]] virtual bool IsFederated() const = 0;
|
||||
[[nodiscard]] virtual Result LogTracker(std::string msg) const = 0;
|
||||
|
||||
[[nodiscard]] virtual Result SignalError(Result const&) { return Success(); }
|
||||
};
|
||||
|
||||
class RabitComm : public Comm {
|
||||
[[nodiscard]] Result Bootstrap(std::chrono::seconds timeout, std::int32_t retry,
|
||||
std::string task_id);
|
||||
[[nodiscard]] Result Shutdown();
|
||||
|
||||
public:
|
||||
// bootstrapping construction.
|
||||
RabitComm() = default;
|
||||
// ctor for testing where environment is known.
|
||||
RabitComm(std::string const& host, std::int32_t port, std::chrono::seconds timeout,
|
||||
std::int32_t retry, std::string task_id);
|
||||
~RabitComm() noexcept(false) override;
|
||||
|
||||
[[nodiscard]] bool IsFederated() const override { return false; }
|
||||
[[nodiscard]] Result LogTracker(std::string msg) const override;
|
||||
|
||||
[[nodiscard]] Result SignalError(Result const&) override;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Communication channel between workers.
|
||||
*/
|
||||
class Channel {
|
||||
std::shared_ptr<TCPSocket> sock_{nullptr};
|
||||
Result rc_;
|
||||
Comm const& comm_;
|
||||
|
||||
public:
|
||||
explicit Channel(Comm const& comm, std::shared_ptr<TCPSocket> sock)
|
||||
: sock_{std::move(sock)}, comm_{comm} {}
|
||||
|
||||
void SendAll(std::int8_t const* ptr, std::size_t n) {
|
||||
Loop::Op op{Loop::Op::kWrite, comm_.Rank(), const_cast<std::int8_t*>(ptr), n, sock_.get(), 0};
|
||||
CHECK(sock_.get());
|
||||
comm_.Submit(std::move(op));
|
||||
}
|
||||
void SendAll(common::Span<std::int8_t const> data) {
|
||||
this->SendAll(data.data(), data.size_bytes());
|
||||
}
|
||||
|
||||
void RecvAll(std::int8_t* ptr, std::size_t n) {
|
||||
Loop::Op op{Loop::Op::kRead, comm_.Rank(), ptr, n, sock_.get(), 0};
|
||||
CHECK(sock_.get());
|
||||
comm_.Submit(std::move(op));
|
||||
}
|
||||
void RecvAll(common::Span<std::int8_t> data) { this->RecvAll(data.data(), data.size_bytes()); }
|
||||
|
||||
[[nodiscard]] auto Socket() const { return sock_; }
|
||||
[[nodiscard]] Result Block() { return comm_.Block(); }
|
||||
};
|
||||
|
||||
/** @brief Operation codes with fixed numeric values. */
enum class Op {
  kMax = 0,
  kMin = 1,
  kSum = 2,
  kBitwiseAND = 3,
  kBitwiseOR = 4,
  kBitwiseXOR = 5
};
|
||||
|
||||
template <typename T, typename U = std::conditional_t<std::is_const_v<T>,
|
||||
std::add_const_t<std::int8_t>, std::int8_t>>
|
||||
common::Span<U> EraseType(common::Span<T> data) {
|
||||
auto n_total_bytes = data.size_bytes();
|
||||
auto erased = common::Span{reinterpret_cast<std::add_pointer_t<U>>(data.data()), n_total_bytes};
|
||||
return erased;
|
||||
}
|
||||
|
||||
template <typename T, typename U>
|
||||
common::Span<T> RestoreType(common::Span<U> data) {
|
||||
static_assert(std::is_same_v<std::remove_const_t<U>, std::int8_t>);
|
||||
auto n_total_bytes = data.size_bytes();
|
||||
auto restored = common::Span{reinterpret_cast<T*>(data.data()), n_total_bytes / sizeof(T)};
|
||||
return restored;
|
||||
}
|
||||
} // namespace xgboost::collective
|
||||
@@ -57,9 +57,7 @@ namespace collective {
|
||||
* - federated_client_key: Client key file path. Only needed for the SSL mode.
|
||||
* - federated_client_cert: Client certificate file path. Only needed for the SSL mode.
|
||||
*/
|
||||
inline void Init(Json const& config) {
|
||||
Communicator::Init(config);
|
||||
}
|
||||
inline void Init(Json const &config) { Communicator::Init(config); }
|
||||
|
||||
/*!
|
||||
* \brief Finalize the collective communicator.
|
||||
@@ -141,17 +139,89 @@ inline void Broadcast(std::string *sendrecv_data, int root) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Gathers a single value from all processes and distributes the result to all processes.
|
||||
*
|
||||
* @param input The single value.
|
||||
*/
|
||||
template <typename T>
|
||||
inline std::vector<T> Allgather(T const &input) {
|
||||
std::string_view str_input{reinterpret_cast<char const *>(&input), sizeof(T)};
|
||||
auto const output = Communicator::Get()->AllGather(str_input);
|
||||
CHECK_EQ(output.size() % sizeof(T), 0);
|
||||
std::vector<T> result(output.size() / sizeof(T));
|
||||
std::memcpy(reinterpret_cast<void *>(result.data()), output.data(), output.size());
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Gathers data from all processes and distributes it to all processes.
|
||||
*
|
||||
* This assumes all ranks have the same size, and input data has been sliced into the
|
||||
* corresponding position.
|
||||
* This assumes all ranks have the same size.
|
||||
*
|
||||
* @param send_receive_buffer Buffer storing the data.
|
||||
* @param size Size of the data in bytes.
|
||||
* @param input Buffer storing the data.
|
||||
*/
|
||||
inline void Allgather(void *send_receive_buffer, std::size_t size) {
|
||||
Communicator::Get()->AllGather(send_receive_buffer, size);
|
||||
template <typename T>
|
||||
inline std::vector<T> Allgather(std::vector<T> const &input) {
|
||||
if (input.empty()) {
|
||||
return input;
|
||||
}
|
||||
std::string_view str_input{reinterpret_cast<char const *>(input.data()),
|
||||
input.size() * sizeof(T)};
|
||||
auto const output = Communicator::Get()->AllGather(str_input);
|
||||
CHECK_EQ(output.size() % sizeof(T), 0);
|
||||
std::vector<T> result(output.size() / sizeof(T));
|
||||
std::memcpy(reinterpret_cast<void *>(result.data()), output.data(), output.size());
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Gathers variable-length data from all processes and distributes it to all processes.
|
||||
* @param input Buffer storing the data.
|
||||
*/
|
||||
template <typename T>
|
||||
inline std::vector<T> AllgatherV(std::vector<T> const &input) {
|
||||
std::string_view str_input{reinterpret_cast<char const *>(input.data()),
|
||||
input.size() * sizeof(T)};
|
||||
auto const output = Communicator::Get()->AllGatherV(str_input);
|
||||
CHECK_EQ(output.size() % sizeof(T), 0);
|
||||
std::vector<T> result(output.size() / sizeof(T));
|
||||
if (!output.empty()) {
|
||||
std::memcpy(reinterpret_cast<void *>(result.data()), output.data(), output.size());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Gathers variable-length strings from all processes and distributes them to all processes.
|
||||
* @param input Variable-length list of variable-length strings.
|
||||
*/
|
||||
inline std::vector<std::string> AllgatherStrings(std::vector<std::string> const &input) {
|
||||
std::size_t total_size{0};
|
||||
for (auto const &s : input) {
|
||||
total_size += s.length() + 1; // +1 for null-terminators
|
||||
}
|
||||
std::string flat_string;
|
||||
flat_string.reserve(total_size);
|
||||
for (auto const &s : input) {
|
||||
flat_string.append(s);
|
||||
flat_string.push_back('\0'); // Append a null-terminator after each string
|
||||
}
|
||||
|
||||
auto const output = Communicator::Get()->AllGatherV(flat_string);
|
||||
|
||||
std::vector<std::string> result;
|
||||
std::size_t start_index = 0;
|
||||
// Iterate through the output, find each null-terminated substring.
|
||||
for (std::size_t i = 0; i < output.size(); i++) {
|
||||
if (output[i] == '\0') {
|
||||
// Construct a std::string from the char* substring
|
||||
result.emplace_back(&output[start_index]);
|
||||
// Move to the next substring
|
||||
start_index = i + 1;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/*!
|
||||
@@ -226,7 +296,7 @@ inline void Allreduce(double *send_receive_buffer, size_t count) {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
struct AllgatherVResult {
|
||||
struct SpecialAllgatherVResult {
|
||||
std::vector<std::size_t> offsets;
|
||||
std::vector<std::size_t> sizes;
|
||||
std::vector<T> result;
|
||||
@@ -241,14 +311,10 @@ struct AllgatherVResult {
|
||||
* @param sizes Sizes of each input.
|
||||
*/
|
||||
template <typename T>
|
||||
inline AllgatherVResult<T> AllgatherV(std::vector<T> const &inputs,
|
||||
std::vector<std::size_t> const &sizes) {
|
||||
auto num_inputs = sizes.size();
|
||||
|
||||
inline SpecialAllgatherVResult<T> SpecialAllgatherV(std::vector<T> const &inputs,
|
||||
std::vector<std::size_t> const &sizes) {
|
||||
// Gather the sizes across all workers.
|
||||
std::vector<std::size_t> all_sizes(num_inputs * GetWorldSize());
|
||||
std::copy_n(sizes.cbegin(), sizes.size(), all_sizes.begin() + num_inputs * GetRank());
|
||||
collective::Allgather(all_sizes.data(), all_sizes.size() * sizeof(std::size_t));
|
||||
auto const all_sizes = Allgather(sizes);
|
||||
|
||||
// Calculate input offsets (std::exclusive_scan).
|
||||
std::vector<std::size_t> offsets(all_sizes.size());
|
||||
@@ -257,11 +323,7 @@ inline AllgatherVResult<T> AllgatherV(std::vector<T> const &inputs,
|
||||
}
|
||||
|
||||
// Gather all the inputs.
|
||||
auto total_input_size = offsets.back() + all_sizes.back();
|
||||
std::vector<T> all_inputs(total_input_size);
|
||||
std::copy_n(inputs.cbegin(), inputs.size(), all_inputs.begin() + offsets[num_inputs * GetRank()]);
|
||||
// We cannot use allgather here, since each worker might have a different size.
|
||||
Allreduce<Operation::kMax>(all_inputs.data(), all_inputs.size());
|
||||
auto const all_inputs = AllgatherV(inputs);
|
||||
|
||||
return {offsets, all_sizes, all_inputs};
|
||||
}
|
||||
|
||||
@@ -11,9 +11,7 @@
|
||||
#include "../../plugin/federated/federated_communicator.h"
|
||||
#endif
|
||||
|
||||
namespace xgboost {
|
||||
namespace collective {
|
||||
|
||||
namespace xgboost::collective {
|
||||
thread_local std::unique_ptr<Communicator> Communicator::communicator_{new NoOpCommunicator()};
|
||||
thread_local CommunicatorType Communicator::type_{};
|
||||
|
||||
@@ -57,6 +55,4 @@ void Communicator::Finalize() {
|
||||
communicator_.reset(new NoOpCommunicator());
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace collective
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::collective
|
||||
|
||||
@@ -125,13 +125,17 @@ class Communicator {
|
||||
/**
|
||||
* @brief Gathers data from all processes and distributes it to all processes.
|
||||
*
|
||||
* This assumes all ranks have the same size, and input data has been sliced into the
|
||||
* corresponding position.
|
||||
* This assumes all ranks have the same size.
|
||||
*
|
||||
* @param send_receive_buffer Buffer storing the data.
|
||||
* @param size Size of the data in bytes.
|
||||
* @param input Buffer storing the data.
|
||||
*/
|
||||
virtual void AllGather(void *send_receive_buffer, std::size_t size) = 0;
|
||||
virtual std::string AllGather(std::string_view input) = 0;
|
||||
|
||||
/**
|
||||
* @brief Gathers variable-length data from all processes and distributes it to all processes.
|
||||
* @param input Buffer storing the data.
|
||||
*/
|
||||
virtual std::string AllGatherV(std::string_view input) = 0;
|
||||
|
||||
/**
|
||||
* @brief Combines values from all processes and distributes the result back to all processes.
|
||||
|
||||
@@ -40,12 +40,10 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
|
||||
}
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
host_buffer_.resize(send_size * world_size_);
|
||||
dh::safe_cuda(cudaMemcpy(host_buffer_.data() + rank_ * send_size, send_buffer, send_size,
|
||||
cudaMemcpyDefault));
|
||||
Allgather(host_buffer_.data(), host_buffer_.size());
|
||||
dh::safe_cuda(
|
||||
cudaMemcpy(receive_buffer, host_buffer_.data(), host_buffer_.size(), cudaMemcpyDefault));
|
||||
host_buffer_.resize(send_size);
|
||||
dh::safe_cuda(cudaMemcpy(host_buffer_.data(), send_buffer, send_size, cudaMemcpyDefault));
|
||||
auto const output = Allgather(host_buffer_);
|
||||
dh::safe_cuda(cudaMemcpy(receive_buffer, output.data(), output.size(), cudaMemcpyDefault));
|
||||
}
|
||||
|
||||
void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector<std::size_t> *segments,
|
||||
|
||||
@@ -60,11 +60,16 @@ class InMemoryCommunicator : public Communicator {
|
||||
bool IsDistributed() const override { return true; }
|
||||
bool IsFederated() const override { return false; }
|
||||
|
||||
void AllGather(void* in_out, std::size_t size) override {
|
||||
std::string AllGather(std::string_view input) override {
|
||||
std::string output;
|
||||
handler_.Allgather(static_cast<const char*>(in_out), size, &output, sequence_number_++,
|
||||
GetRank());
|
||||
output.copy(static_cast<char*>(in_out), size);
|
||||
handler_.Allgather(input.data(), input.size(), &output, sequence_number_++, GetRank());
|
||||
return output;
|
||||
}
|
||||
|
||||
// Variable-length allgather through the shared in-memory handler.
std::string AllGatherV(std::string_view input) override {
  std::string gathered;
  // Each collective call uses the current sequence number, then advances it.
  auto const sequence = sequence_number_++;
  handler_.AllgatherV(input.data(), input.size(), &gathered, sequence, GetRank());
  return gathered;
}
|
||||
|
||||
void AllReduce(void* in_out, std::size_t size, DataType data_type, Operation operation) override {
|
||||
|
||||
@@ -16,23 +16,49 @@ class AllgatherFunctor {
|
||||
public:
|
||||
std::string const name{"Allgather"};
|
||||
|
||||
AllgatherFunctor(int world_size, int rank) : world_size_{world_size}, rank_{rank} {}
|
||||
AllgatherFunctor(std::size_t world_size, std::size_t rank)
|
||||
: world_size_{world_size}, rank_{rank} {}
|
||||
|
||||
void operator()(char const* input, std::size_t bytes, std::string* buffer) const {
|
||||
if (buffer->empty()) {
|
||||
// Copy the input if this is the first request.
|
||||
buffer->assign(input, bytes);
|
||||
} else {
|
||||
// Splice the input into the common buffer.
|
||||
auto const per_rank = bytes / world_size_;
|
||||
auto const index = rank_ * per_rank;
|
||||
buffer->replace(index, per_rank, input + index, per_rank);
|
||||
// Resize the buffer if this is the first request.
|
||||
buffer->resize(bytes * world_size_);
|
||||
}
|
||||
|
||||
// Splice the input into the common buffer.
|
||||
buffer->replace(rank_ * bytes, bytes, input, bytes);
|
||||
}
|
||||
|
||||
private:
|
||||
std::size_t world_size_;
|
||||
std::size_t rank_;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Functor for variable-length allgather.
|
||||
*/
|
||||
class AllgatherVFunctor {
|
||||
public:
|
||||
std::string const name{"AllgatherV"};
|
||||
|
||||
AllgatherVFunctor(std::size_t world_size, std::size_t rank,
|
||||
std::map<std::size_t, std::string_view>* data)
|
||||
: world_size_{world_size}, rank_{rank}, data_{data} {}
|
||||
|
||||
void operator()(char const* input, std::size_t bytes, std::string* buffer) const {
|
||||
data_->emplace(rank_, std::string_view{input, bytes});
|
||||
if (data_->size() == world_size_) {
|
||||
for (auto const& kv : *data_) {
|
||||
buffer->append(kv.second);
|
||||
}
|
||||
data_->clear();
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
int world_size_;
|
||||
int rank_;
|
||||
std::size_t world_size_;
|
||||
std::size_t rank_;
|
||||
std::map<std::size_t, std::string_view>* data_;
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -154,7 +180,7 @@ class BroadcastFunctor {
|
||||
public:
|
||||
std::string const name{"Broadcast"};
|
||||
|
||||
BroadcastFunctor(int rank, int root) : rank_{rank}, root_{root} {}
|
||||
BroadcastFunctor(std::size_t rank, std::size_t root) : rank_{rank}, root_{root} {}
|
||||
|
||||
void operator()(char const* input, std::size_t bytes, std::string* buffer) const {
|
||||
if (rank_ == root_) {
|
||||
@@ -164,11 +190,11 @@ class BroadcastFunctor {
|
||||
}
|
||||
|
||||
private:
|
||||
int rank_;
|
||||
int root_;
|
||||
std::size_t rank_;
|
||||
std::size_t root_;
|
||||
};
|
||||
|
||||
void InMemoryHandler::Init(int world_size, int) {
|
||||
void InMemoryHandler::Init(std::size_t world_size, std::size_t) {
|
||||
CHECK(world_size_ < world_size) << "In memory handler already initialized.";
|
||||
|
||||
std::unique_lock<std::mutex> lock(mutex_);
|
||||
@@ -178,7 +204,7 @@ void InMemoryHandler::Init(int world_size, int) {
|
||||
cv_.notify_all();
|
||||
}
|
||||
|
||||
void InMemoryHandler::Shutdown(uint64_t sequence_number, int) {
|
||||
void InMemoryHandler::Shutdown(uint64_t sequence_number, std::size_t) {
|
||||
CHECK(world_size_ > 0) << "In memory handler already shutdown.";
|
||||
|
||||
std::unique_lock<std::mutex> lock(mutex_);
|
||||
@@ -194,24 +220,30 @@ void InMemoryHandler::Shutdown(uint64_t sequence_number, int) {
|
||||
}
|
||||
|
||||
void InMemoryHandler::Allgather(char const* input, std::size_t bytes, std::string* output,
|
||||
std::size_t sequence_number, int rank) {
|
||||
std::size_t sequence_number, std::size_t rank) {
|
||||
Handle(input, bytes, output, sequence_number, rank, AllgatherFunctor{world_size_, rank});
|
||||
}
|
||||
|
||||
void InMemoryHandler::AllgatherV(char const* input, std::size_t bytes, std::string* output,
|
||||
std::size_t sequence_number, std::size_t rank) {
|
||||
Handle(input, bytes, output, sequence_number, rank, AllgatherVFunctor{world_size_, rank, &aux_});
|
||||
}
|
||||
|
||||
void InMemoryHandler::Allreduce(char const* input, std::size_t bytes, std::string* output,
|
||||
std::size_t sequence_number, int rank, DataType data_type,
|
||||
std::size_t sequence_number, std::size_t rank, DataType data_type,
|
||||
Operation op) {
|
||||
Handle(input, bytes, output, sequence_number, rank, AllreduceFunctor{data_type, op});
|
||||
}
|
||||
|
||||
void InMemoryHandler::Broadcast(char const* input, std::size_t bytes, std::string* output,
|
||||
std::size_t sequence_number, int rank, int root) {
|
||||
std::size_t sequence_number, std::size_t rank, std::size_t root) {
|
||||
Handle(input, bytes, output, sequence_number, rank, BroadcastFunctor{rank, root});
|
||||
}
|
||||
|
||||
template <class HandlerFunctor>
|
||||
void InMemoryHandler::Handle(char const* input, std::size_t bytes, std::string* output,
|
||||
std::size_t sequence_number, int rank, HandlerFunctor const& functor) {
|
||||
std::size_t sequence_number, std::size_t rank,
|
||||
HandlerFunctor const& functor) {
|
||||
// Pass through if there is only 1 client.
|
||||
if (world_size_ == 1) {
|
||||
if (input != output->data()) {
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
*/
|
||||
#pragma once
|
||||
#include <condition_variable>
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
#include "communicator.h"
|
||||
@@ -31,7 +32,7 @@ class InMemoryHandler {
|
||||
*
|
||||
* This is used when the handler only needs to be initialized once with a known world size.
|
||||
*/
|
||||
explicit InMemoryHandler(int worldSize) : world_size_{worldSize} {}
|
||||
explicit InMemoryHandler(std::size_t worldSize) : world_size_{worldSize} {}
|
||||
|
||||
/**
|
||||
* @brief Initialize the handler with the world size and rank.
|
||||
@@ -41,7 +42,7 @@ class InMemoryHandler {
|
||||
* This is used when multiple objects/threads are accessing the same handler and need to
|
||||
* initialize it collectively.
|
||||
*/
|
||||
void Init(int world_size, int rank);
|
||||
void Init(std::size_t world_size, std::size_t rank);
|
||||
|
||||
/**
|
||||
* @brief Shut down the handler.
|
||||
@@ -51,7 +52,7 @@ class InMemoryHandler {
|
||||
* This is used when multiple objects/threads are accessing the same handler and need to
|
||||
* shut it down collectively.
|
||||
*/
|
||||
void Shutdown(uint64_t sequence_number, int rank);
|
||||
void Shutdown(uint64_t sequence_number, std::size_t rank);
|
||||
|
||||
/**
|
||||
* @brief Perform allgather.
|
||||
@@ -62,7 +63,18 @@ class InMemoryHandler {
|
||||
* @param rank Index of the worker.
|
||||
*/
|
||||
void Allgather(char const* input, std::size_t bytes, std::string* output,
|
||||
std::size_t sequence_number, int rank);
|
||||
std::size_t sequence_number, std::size_t rank);
|
||||
|
||||
/**
|
||||
* @brief Perform variable-length allgather.
|
||||
* @param input The input buffer.
|
||||
* @param bytes Number of bytes in the input buffer.
|
||||
* @param output The output buffer.
|
||||
* @param sequence_number Call sequence number.
|
||||
* @param rank Index of the worker.
|
||||
*/
|
||||
void AllgatherV(char const* input, std::size_t bytes, std::string* output,
|
||||
std::size_t sequence_number, std::size_t rank);
|
||||
|
||||
/**
|
||||
* @brief Perform allreduce.
|
||||
@@ -75,7 +87,7 @@ class InMemoryHandler {
|
||||
* @param op The reduce operation.
|
||||
*/
|
||||
void Allreduce(char const* input, std::size_t bytes, std::string* output,
|
||||
std::size_t sequence_number, int rank, DataType data_type, Operation op);
|
||||
std::size_t sequence_number, std::size_t rank, DataType data_type, Operation op);
|
||||
|
||||
/**
|
||||
* @brief Perform broadcast.
|
||||
@@ -87,7 +99,7 @@ class InMemoryHandler {
|
||||
* @param root Index of the worker to broadcast from.
|
||||
*/
|
||||
void Broadcast(char const* input, std::size_t bytes, std::string* output,
|
||||
std::size_t sequence_number, int rank, int root);
|
||||
std::size_t sequence_number, std::size_t rank, std::size_t root);
|
||||
|
||||
private:
|
||||
/**
|
||||
@@ -102,15 +114,16 @@ class InMemoryHandler {
|
||||
*/
|
||||
template <class HandlerFunctor>
|
||||
void Handle(char const* input, std::size_t size, std::string* output, std::size_t sequence_number,
|
||||
int rank, HandlerFunctor const& functor);
|
||||
std::size_t rank, HandlerFunctor const& functor);
|
||||
|
||||
int world_size_{}; /// Number of workers.
|
||||
int received_{}; /// Number of calls received with the current sequence.
|
||||
int sent_{}; /// Number of calls completed with the current sequence.
|
||||
std::string buffer_{}; /// A shared common buffer.
|
||||
uint64_t sequence_number_{}; /// Call sequence number.
|
||||
mutable std::mutex mutex_; /// Lock.
|
||||
mutable std::condition_variable cv_; /// Conditional variable to wait on.
|
||||
std::size_t world_size_{}; /// Number of workers.
|
||||
std::size_t received_{}; /// Number of calls received with the current sequence.
|
||||
std::size_t sent_{}; /// Number of calls completed with the current sequence.
|
||||
std::string buffer_{}; /// A shared common buffer.
|
||||
std::map<std::size_t, std::string_view> aux_{}; /// A shared auxiliary map.
|
||||
uint64_t sequence_number_{}; /// Call sequence number.
|
||||
mutable std::mutex mutex_; /// Lock.
|
||||
mutable std::condition_variable cv_; /// Conditional variable to wait on.
|
||||
};
|
||||
|
||||
} // namespace collective
|
||||
|
||||
167
src/collective/loop.cc
Normal file
167
src/collective/loop.cc
Normal file
@@ -0,0 +1,167 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
*/
|
||||
#include "loop.h"
|
||||
|
||||
#include <queue> // for queue
|
||||
|
||||
#include "rabit/internal/socket.h" // for PollHelper
|
||||
#include "xgboost/collective/socket.h" // for FailWithCode
|
||||
#include "xgboost/logging.h" // for CHECK
|
||||
|
||||
namespace xgboost::collective {
|
||||
/**
 * @brief Work through all queued socket operations.
 *
 * Each round moves every pending op out of `queue_`, registers its socket with a poll
 * helper, polls with `timeout_`, and then performs a single recv/send for each op whose
 * socket is ready. Ops that are not yet complete are pushed back to `queue_` for the next
 * round. The loop ends when the queue is empty or `stop_` is set.
 *
 * @return Success when the queue has been drained (or a stop was requested), otherwise the
 *         failure reported by poll or by the socket. On failure `stop_` is set.
 */
Result Loop::EmptyQueue() {
  // `__func__` evaluated inside a lambda names the lambda's call operator, not this
  // function. Capture the label once so every Stop matches the Start below.
  char const* const fn_name = __func__;
  timer_.Start(fn_name);
  auto error = [this, fn_name] {
    this->stop_ = true;
    timer_.Stop(fn_name);
  };

  while (!queue_.empty() && !stop_) {
    std::queue<Op> qcopy;
    rabit::utils::PollHelper poll;

    // watch all ops
    while (!queue_.empty()) {
      auto op = queue_.front();
      queue_.pop();

      switch (op.code) {
        case Op::kRead: {
          poll.WatchRead(*op.sock);
          break;
        }
        case Op::kWrite: {
          poll.WatchWrite(*op.sock);
          break;
        }
        default: {
          error();
          return Fail("Invalid socket operation.");
        }
      }
      qcopy.push(op);
    }

    // poll, work on fds that are ready.
    timer_.Start("poll");
    auto rc = poll.Poll(timeout_);
    timer_.Stop("poll");
    if (!rc.OK()) {
      error();
      return rc;
    }
    // we wouldn't be here if the queue is empty.
    CHECK(!qcopy.empty());

    while (!qcopy.empty() && !stop_) {
      auto op = qcopy.front();
      qcopy.pop();

      std::int32_t n_bytes_done{0};
      CHECK(op.sock->NonBlocking());

      switch (op.code) {
        case Op::kRead: {
          if (poll.CheckRead(*op.sock)) {
            n_bytes_done = op.sock->Recv(op.ptr + op.off, op.n - op.off);
          }
          break;
        }
        case Op::kWrite: {
          if (poll.CheckWrite(*op.sock)) {
            n_bytes_done = op.sock->Send(op.ptr + op.off, op.n - op.off);
          }
          break;
        }
        default: {
          error();
          return Fail("Invalid socket operation.");
        }
      }

      if (n_bytes_done == -1) {
        if (!system::LastErrorWouldBlock()) {
          // Build the result first so that it records the socket error code.
          auto rc = system::FailWithCode("Invalid socket output.");
          error();
          return rc;
        }
        // The call would block: nothing was transferred. The offset must not move,
        // adding -1 to the unsigned offset would step it backwards (or wrap it).
        n_bytes_done = 0;
      }
      // NOTE(review): a return value of 0 leaves the op pending for the next round;
      // confirm how a peer that closed the connection is expected to be surfaced.
      op.off += n_bytes_done;
      CHECK_LE(op.off, op.n);

      if (op.off != op.n) {
        // not yet finished, push back to queue for next round.
        queue_.push(op);
      }
    }
  }
  timer_.Stop(fn_name);
  return Success();
}
|
||||
|
||||
void Loop::Process() {
|
||||
// consumer
|
||||
while (true) {
|
||||
std::unique_lock lock{mu_};
|
||||
cv_.wait(lock, [this] { return !this->queue_.empty() || stop_; });
|
||||
if (stop_) {
|
||||
break;
|
||||
}
|
||||
CHECK(!mu_.try_lock());
|
||||
|
||||
this->rc_ = this->EmptyQueue();
|
||||
if (!rc_.OK()) {
|
||||
stop_ = true;
|
||||
cv_.notify_one();
|
||||
break;
|
||||
}
|
||||
|
||||
CHECK(queue_.empty());
|
||||
CHECK(!mu_.try_lock());
|
||||
cv_.notify_one();
|
||||
}
|
||||
|
||||
if (rc_.OK()) {
|
||||
CHECK(queue_.empty());
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * @brief Request the loop to stop and wait for it.
 *
 * Sets `stop_` under the lock, then calls Block(). If the worker thread recorded an
 * exception, it is rethrown here.
 *
 * @return Success when no exception was recorded by the worker.
 */
Result Loop::Stop() {
  {
    // Scope the guard so the mutex is released before Block() locks it again.
    std::lock_guard<std::mutex> guard{mu_};
    stop_ = true;
  }

  // NOTE(review): Block() moves `rc_` out; confirm the state this comparison expects.
  CHECK_EQ(this->Block().OK(), this->rc_.OK());

  if (curr_exce_) {
    std::rethrow_exception(curr_exce_);
  }

  return Success();
}
|
||||
|
||||
/**
 * @brief Create the loop and start its worker thread.
 *
 * The worker runs Process(). Any exception escaping it is stored in `curr_exce_` (only
 * the first one is kept), `rc_` is set to a failure, `stop_` is raised and all waiters
 * on `cv_` are woken up.
 *
 * @param timeout Timeout handed to the poll call made by the worker.
 */
Loop::Loop(std::chrono::seconds timeout) : timeout_{timeout} {
  timer_.Init(__func__);
  worker_ = std::thread{[this] {
    try {
      this->Process();
    } catch (...) {
      // A single handler covers std::exception and everything else; the exception
      // itself is preserved through current_exception.
      std::lock_guard<std::mutex> guard{mu_};
      if (!curr_exce_) {
        curr_exce_ = std::current_exception();
        rc_ = Fail("Exception was thrown");
      }
      stop_ = true;
      cv_.notify_all();
    }
  }};
}
|
||||
} // namespace xgboost::collective
|
||||
83
src/collective/loop.h
Normal file
83
src/collective/loop.h
Normal file
@@ -0,0 +1,83 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
*/
|
||||
#pragma once
|
||||
#include <chrono> // for seconds
|
||||
#include <condition_variable> // for condition_variable
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstdint> // for int8_t, int32_t
|
||||
#include <exception> // for exception_ptr
|
||||
#include <mutex> // for unique_lock, mutex
|
||||
#include <queue> // for queue
|
||||
#include <thread> // for thread
|
||||
#include <utility> // for move
|
||||
|
||||
#include "../common/timer.h" // for Monitor
|
||||
#include "xgboost/collective/result.h" // for Result
|
||||
#include "xgboost/collective/socket.h" // for TCPSocket
|
||||
|
||||
namespace xgboost::collective {
|
||||
class Loop {
|
||||
public:
|
||||
struct Op {
|
||||
enum Code : std::int8_t { kRead = 0, kWrite = 1 } code;
|
||||
std::int32_t rank{-1};
|
||||
std::int8_t* ptr{nullptr};
|
||||
std::size_t n{0};
|
||||
TCPSocket* sock{nullptr};
|
||||
std::size_t off{0};
|
||||
|
||||
Op(Code c, std::int32_t rank, std::int8_t* ptr, std::size_t n, TCPSocket* sock, std::size_t off)
|
||||
: code{c}, rank{rank}, ptr{ptr}, n{n}, sock{sock}, off{off} {}
|
||||
Op(Op const&) = default;
|
||||
Op& operator=(Op const&) = default;
|
||||
Op(Op&&) = default;
|
||||
Op& operator=(Op&&) = default;
|
||||
};
|
||||
|
||||
private:
|
||||
std::thread worker_;
|
||||
std::condition_variable cv_;
|
||||
std::mutex mu_;
|
||||
std::queue<Op> queue_;
|
||||
std::chrono::seconds timeout_;
|
||||
Result rc_;
|
||||
bool stop_{false};
|
||||
std::exception_ptr curr_exce_{nullptr};
|
||||
common::Monitor timer_;
|
||||
|
||||
Result EmptyQueue();
|
||||
void Process();
|
||||
|
||||
public:
|
||||
Result Stop();
|
||||
|
||||
void Submit(Op op) {
|
||||
// producer
|
||||
std::unique_lock lock{mu_};
|
||||
queue_.push(op);
|
||||
lock.unlock();
|
||||
cv_.notify_one();
|
||||
}
|
||||
|
||||
[[nodiscard]] Result Block() {
|
||||
{
|
||||
std::unique_lock lock{mu_};
|
||||
cv_.notify_all();
|
||||
}
|
||||
std::unique_lock lock{mu_};
|
||||
cv_.wait(lock, [this] { return this->queue_.empty() || stop_; });
|
||||
return std::move(rc_);
|
||||
}
|
||||
|
||||
explicit Loop(std::chrono::seconds timeout);
|
||||
|
||||
~Loop() noexcept(false) {
|
||||
this->Stop();
|
||||
|
||||
if (worker_.joinable()) {
|
||||
worker_.join();
|
||||
}
|
||||
}
|
||||
};
|
||||
} // namespace xgboost::collective
|
||||
@@ -17,10 +17,11 @@ class NoOpCommunicator : public Communicator {
|
||||
NoOpCommunicator() : Communicator(1, 0) {}
|
||||
bool IsDistributed() const override { return false; }
|
||||
bool IsFederated() const override { return false; }
|
||||
void AllGather(void *, std::size_t) override {}
|
||||
std::string AllGather(std::string_view) override { return {}; }
|
||||
std::string AllGatherV(std::string_view) override { return {}; }
|
||||
void AllReduce(void *, std::size_t, DataType, Operation) override {}
|
||||
void Broadcast(void *, std::size_t, int) override {}
|
||||
std::string GetProcessorName() override { return ""; }
|
||||
std::string GetProcessorName() override { return {}; }
|
||||
void Print(const std::string &message) override { LOG(CONSOLE) << message; }
|
||||
|
||||
protected:
|
||||
|
||||
214
src/collective/protocol.h
Normal file
214
src/collective/protocol.h
Normal file
@@ -0,0 +1,214 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
*/
|
||||
#pragma once
|
||||
#include <cstdint> // for int32_t
|
||||
#include <string> // for string
|
||||
#include <utility> // for move
|
||||
|
||||
#include "xgboost/collective/result.h" // for Result
|
||||
#include "xgboost/collective/socket.h" // for TCPSocket
|
||||
#include "xgboost/json.h" // for Json
|
||||
|
||||
namespace xgboost::collective::proto {
|
||||
struct PeerInfo {
|
||||
std::string host;
|
||||
std::int32_t port{-1};
|
||||
std::int32_t rank{-1};
|
||||
|
||||
PeerInfo() = default;
|
||||
PeerInfo(std::string host, std::int32_t port, std::int32_t rank)
|
||||
: host{std::move(host)}, port{port}, rank{rank} {}
|
||||
|
||||
explicit PeerInfo(Json const& peer)
|
||||
: host{get<String>(peer["host"])},
|
||||
port{static_cast<std::int32_t>(get<Integer const>(peer["port"]))},
|
||||
rank{static_cast<std::int32_t>(get<Integer const>(peer["rank"]))} {}
|
||||
|
||||
[[nodiscard]] Json ToJson() const {
|
||||
Json info{Object{}};
|
||||
info["rank"] = rank;
|
||||
info["host"] = String{host};
|
||||
info["port"] = Integer{port};
|
||||
return info;
|
||||
}
|
||||
|
||||
[[nodiscard]] auto HostPort() const { return host + ":" + std::to_string(this->port); }
|
||||
};
|
||||
|
||||
struct Magic {
|
||||
static constexpr std::int32_t kMagic = 0xff99;
|
||||
|
||||
[[nodiscard]] Result Verify(xgboost::collective::TCPSocket* p_sock) {
|
||||
std::int32_t magic{kMagic};
|
||||
auto n_bytes = p_sock->SendAll(&magic, sizeof(magic));
|
||||
if (n_bytes != sizeof(magic)) {
|
||||
return Fail("Failed to verify.");
|
||||
}
|
||||
|
||||
magic = 0;
|
||||
n_bytes = p_sock->RecvAll(&magic, sizeof(magic));
|
||||
if (n_bytes != sizeof(magic)) {
|
||||
return Fail("Failed to verify.");
|
||||
}
|
||||
if (magic != kMagic) {
|
||||
return xgboost::collective::Fail("Invalid verification number.");
|
||||
}
|
||||
return Success();
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * @brief Command codes carried in the "cmd" field of the JSON messages.
 *
 * The numeric values are sent over the wire and are therefore spelled out explicitly.
 */
enum class CMD : std::int32_t {
  kInvalid = 0,   // not a valid command
  kStart = 1,     // sent by Start::WorkerSend
  kShutdown = 2,  // sent by ShutdownCMD::Send
  kError = 3,     // sent by ErrorCMD::WorkerSend
  kPrint = 4,     // sent by Print::WorkerSend
};
|
||||
|
||||
struct Connect {
|
||||
[[nodiscard]] Result WorkerSend(TCPSocket* tracker, std::int32_t world, std::int32_t rank,
|
||||
std::string task_id) const {
|
||||
Json jinit{Object{}};
|
||||
jinit["world_size"] = Integer{world};
|
||||
jinit["rank"] = Integer{rank};
|
||||
jinit["task_id"] = String{task_id};
|
||||
std::string msg;
|
||||
Json::Dump(jinit, &msg);
|
||||
auto n_bytes = tracker->Send(msg);
|
||||
if (n_bytes != msg.size()) {
|
||||
return Fail("Failed to send init command from worker.");
|
||||
}
|
||||
return Success();
|
||||
}
|
||||
[[nodiscard]] Result TrackerRecv(TCPSocket* sock, std::int32_t* world, std::int32_t* rank,
|
||||
std::string* task_id) const {
|
||||
std::string init;
|
||||
sock->Recv(&init);
|
||||
auto jinit = Json::Load(StringView{init});
|
||||
*world = get<Integer const>(jinit["world_size"]);
|
||||
*rank = get<Integer const>(jinit["rank"]);
|
||||
*task_id = get<String const>(jinit["task_id"]);
|
||||
return Success();
|
||||
}
|
||||
};
|
||||
|
||||
class Start {
|
||||
private:
|
||||
[[nodiscard]] Result TrackerSend(std::int32_t world, TCPSocket* worker) const {
|
||||
Json jcmd{Object{}};
|
||||
jcmd["world_size"] = Integer{world};
|
||||
auto scmd = Json::Dump(jcmd);
|
||||
auto n_bytes = worker->Send(scmd);
|
||||
if (n_bytes != scmd.size()) {
|
||||
return Fail("Failed to send init command from tracker.");
|
||||
}
|
||||
return Success();
|
||||
}
|
||||
|
||||
public:
|
||||
[[nodiscard]] Result WorkerSend(std::int32_t lport, TCPSocket* tracker,
|
||||
std::int32_t eport) const {
|
||||
Json jcmd{Object{}};
|
||||
jcmd["cmd"] = Integer{static_cast<std::int32_t>(CMD::kStart)};
|
||||
jcmd["port"] = Integer{lport};
|
||||
jcmd["error_port"] = Integer{eport};
|
||||
auto scmd = Json::Dump(jcmd);
|
||||
auto n_bytes = tracker->Send(scmd);
|
||||
if (n_bytes != scmd.size()) {
|
||||
return Fail("Failed to send init command from worker.");
|
||||
}
|
||||
return Success();
|
||||
}
|
||||
[[nodiscard]] Result WorkerRecv(TCPSocket* tracker, std::int32_t* p_world) const {
|
||||
std::string scmd;
|
||||
auto n_bytes = tracker->Recv(&scmd);
|
||||
if (n_bytes <= 0) {
|
||||
return Fail("Failed to recv init command from tracker.");
|
||||
}
|
||||
auto jcmd = Json::Load(scmd);
|
||||
auto world = get<Integer const>(jcmd["world_size"]);
|
||||
if (world <= 0) {
|
||||
return Fail("Invalid world size.");
|
||||
}
|
||||
*p_world = world;
|
||||
return Success();
|
||||
}
|
||||
[[nodiscard]] Result TrackerHandle(Json jcmd, std::int32_t* recv_world, std::int32_t world,
|
||||
std::int32_t* p_port, TCPSocket* p_sock,
|
||||
std::int32_t* eport) const {
|
||||
*p_port = get<Integer const>(jcmd["port"]);
|
||||
if (*p_port <= 0) {
|
||||
return Fail("Invalid port.");
|
||||
}
|
||||
if (*recv_world != -1) {
|
||||
return Fail("Invalid initialization sequence.");
|
||||
}
|
||||
*recv_world = world;
|
||||
*eport = get<Integer const>(jcmd["error_port"]);
|
||||
return TrackerSend(world, p_sock);
|
||||
}
|
||||
};
|
||||
|
||||
struct Print {
|
||||
[[nodiscard]] Result WorkerSend(TCPSocket* tracker, std::string msg) const {
|
||||
Json jcmd{Object{}};
|
||||
jcmd["cmd"] = Integer{static_cast<std::int32_t>(CMD::kPrint)};
|
||||
jcmd["msg"] = String{std::move(msg)};
|
||||
auto scmd = Json::Dump(jcmd);
|
||||
auto n_bytes = tracker->Send(scmd);
|
||||
if (n_bytes != scmd.size()) {
|
||||
return Fail("Failed to send print command from worker.");
|
||||
}
|
||||
return Success();
|
||||
}
|
||||
[[nodiscard]] Result TrackerHandle(Json jcmd, std::string* p_msg) const {
|
||||
if (!IsA<String>(jcmd["msg"])) {
|
||||
return Fail("Invalid print command.");
|
||||
}
|
||||
auto msg = get<String const>(jcmd["msg"]);
|
||||
*p_msg = msg;
|
||||
return Success();
|
||||
}
|
||||
};
|
||||
|
||||
struct ErrorCMD {
|
||||
[[nodiscard]] Result WorkerSend(TCPSocket* tracker, Result const& res) const {
|
||||
auto msg = res.Report();
|
||||
auto code = res.Code().value();
|
||||
Json jcmd{Object{}};
|
||||
jcmd["msg"] = String{std::move(msg)};
|
||||
jcmd["code"] = Integer{code};
|
||||
jcmd["cmd"] = Integer{static_cast<std::int32_t>(CMD::kError)};
|
||||
auto scmd = Json::Dump(jcmd);
|
||||
auto n_bytes = tracker->Send(scmd);
|
||||
if (n_bytes != scmd.size()) {
|
||||
return Fail("Failed to send error command from worker.");
|
||||
}
|
||||
return Success();
|
||||
}
|
||||
[[nodiscard]] Result TrackerHandle(Json jcmd, std::string* p_msg, int* p_code) const {
|
||||
if (!IsA<String>(jcmd["msg"]) || !IsA<Integer>(jcmd["code"])) {
|
||||
return Fail("Invalid error command.");
|
||||
}
|
||||
auto msg = get<String const>(jcmd["msg"]);
|
||||
auto code = get<Integer const>(jcmd["code"]);
|
||||
*p_msg = msg;
|
||||
*p_code = code;
|
||||
return Success();
|
||||
}
|
||||
};
|
||||
|
||||
struct ShutdownCMD {
|
||||
[[nodiscard]] Result Send(TCPSocket* peer) const {
|
||||
Json jcmd{Object{}};
|
||||
jcmd["cmd"] = Integer{static_cast<std::int32_t>(proto::CMD::kShutdown)};
|
||||
auto scmd = Json::Dump(jcmd);
|
||||
auto n_bytes = peer->Send(scmd);
|
||||
if (n_bytes != scmd.size()) {
|
||||
return Fail("Failed to send shutdown command from worker.");
|
||||
}
|
||||
return Success();
|
||||
}
|
||||
};
|
||||
} // namespace xgboost::collective::proto
|
||||
@@ -7,6 +7,7 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "communicator-inl.h"
|
||||
#include "communicator.h"
|
||||
#include "xgboost/json.h"
|
||||
|
||||
@@ -55,10 +56,29 @@ class RabitCommunicator : public Communicator {
|
||||
|
||||
bool IsFederated() const override { return false; }
|
||||
|
||||
void AllGather(void *send_receive_buffer, std::size_t size) override {
|
||||
auto const per_rank = size / GetWorldSize();
|
||||
std::string AllGather(std::string_view input) override {
|
||||
auto const per_rank = input.size();
|
||||
auto const total_size = per_rank * GetWorldSize();
|
||||
auto const index = per_rank * GetRank();
|
||||
rabit::Allgather(static_cast<char *>(send_receive_buffer), size, index, per_rank, per_rank);
|
||||
std::string result(total_size, '\0');
|
||||
result.replace(index, per_rank, input);
|
||||
rabit::Allgather(result.data(), total_size, index, per_rank, per_rank);
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string AllGatherV(std::string_view input) override {
|
||||
auto const size_node_slice = input.size();
|
||||
auto const all_sizes = collective::Allgather(size_node_slice);
|
||||
auto const total_size = std::accumulate(all_sizes.cbegin(), all_sizes.cend(), 0ul);
|
||||
auto const begin_index =
|
||||
std::accumulate(all_sizes.cbegin(), all_sizes.cbegin() + GetRank(), 0ul);
|
||||
auto const size_prev_slice =
|
||||
GetRank() == 0 ? all_sizes[GetWorldSize() - 1] : all_sizes[GetRank() - 1];
|
||||
|
||||
std::string result(total_size, '\0');
|
||||
result.replace(begin_index, size_node_slice, input);
|
||||
rabit::Allgather(result.data(), total_size, begin_index, size_node_slice, size_prev_slice);
|
||||
return result;
|
||||
}
|
||||
|
||||
void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
*/
|
||||
#include "xgboost/collective/socket.h"
|
||||
|
||||
#include <array> // for array
|
||||
#include <cstddef> // std::size_t
|
||||
#include <cstdint> // std::int32_t
|
||||
#include <cstring> // std::memcpy, std::memset
|
||||
@@ -92,13 +93,18 @@ std::size_t TCPSocket::Recv(std::string *p_str) {
|
||||
|
||||
conn = TCPSocket::Create(addr.Domain());
|
||||
CHECK_EQ(static_cast<std::int32_t>(conn.Domain()), static_cast<std::int32_t>(addr.Domain()));
|
||||
conn.SetNonBlock(true);
|
||||
auto non_blocking = conn.NonBlocking();
|
||||
auto rc = conn.NonBlocking(true);
|
||||
if (!rc.OK()) {
|
||||
return Fail("Failed to set socket option.", std::move(rc));
|
||||
}
|
||||
|
||||
Result last_error;
|
||||
auto log_failure = [&host, &last_error](Result err, char const *file, std::int32_t line) {
|
||||
auto log_failure = [&host, &last_error, port](Result err, char const *file, std::int32_t line) {
|
||||
last_error = std::move(err);
|
||||
LOG(WARNING) << std::filesystem::path{file}.filename().string() << "(" << line
|
||||
<< "): Failed to connect to:" << host << " Error:" << last_error.Report();
|
||||
<< "): Failed to connect to:" << host << ":" << port
|
||||
<< " Error:" << last_error.Report();
|
||||
};
|
||||
|
||||
for (std::int32_t attempt = 0; attempt < std::max(retry, 1); ++attempt) {
|
||||
@@ -112,39 +118,42 @@ std::size_t TCPSocket::Recv(std::string *p_str) {
|
||||
}
|
||||
|
||||
auto rc = connect(conn.Handle(), addr_handle, addr_len);
|
||||
if (rc != 0) {
|
||||
auto errcode = system::LastError();
|
||||
if (!system::ErrorWouldBlock(errcode)) {
|
||||
log_failure(Fail("connect failed.", std::error_code{errcode, std::system_category()}),
|
||||
__FILE__, __LINE__);
|
||||
continue;
|
||||
}
|
||||
|
||||
rabit::utils::PollHelper poll;
|
||||
poll.WatchWrite(conn);
|
||||
auto result = poll.Poll(timeout);
|
||||
if (!result.OK()) {
|
||||
log_failure(std::move(result), __FILE__, __LINE__);
|
||||
continue;
|
||||
}
|
||||
if (!poll.CheckWrite(conn)) {
|
||||
log_failure(Fail("poll failed.", std::error_code{errcode, std::system_category()}),
|
||||
__FILE__, __LINE__);
|
||||
continue;
|
||||
}
|
||||
result = conn.GetSockError();
|
||||
if (!result.OK()) {
|
||||
log_failure(std::move(result), __FILE__, __LINE__);
|
||||
continue;
|
||||
}
|
||||
|
||||
conn.SetNonBlock(false);
|
||||
return Success();
|
||||
|
||||
} else {
|
||||
conn.SetNonBlock(false);
|
||||
return Success();
|
||||
if (rc == 0) {
|
||||
return conn.NonBlocking(non_blocking);
|
||||
}
|
||||
|
||||
auto errcode = system::LastError();
|
||||
if (!system::ErrorWouldBlock(errcode)) {
|
||||
log_failure(Fail("connect failed.", std::error_code{errcode, std::system_category()}),
|
||||
__FILE__, __LINE__);
|
||||
continue;
|
||||
}
|
||||
|
||||
rabit::utils::PollHelper poll;
|
||||
poll.WatchWrite(conn);
|
||||
auto result = poll.Poll(timeout);
|
||||
if (!result.OK()) {
|
||||
// poll would fail if there's a socket error, we log the root cause instead of the
|
||||
// poll failure.
|
||||
auto sockerr = conn.GetSockError();
|
||||
if (!sockerr.OK()) {
|
||||
result = std::move(sockerr);
|
||||
}
|
||||
log_failure(std::move(result), __FILE__, __LINE__);
|
||||
continue;
|
||||
}
|
||||
if (!poll.CheckWrite(conn)) {
|
||||
log_failure(Fail("poll failed.", std::error_code{errcode, std::system_category()}), __FILE__,
|
||||
__LINE__);
|
||||
continue;
|
||||
}
|
||||
result = conn.GetSockError();
|
||||
if (!result.OK()) {
|
||||
log_failure(std::move(result), __FILE__, __LINE__);
|
||||
continue;
|
||||
}
|
||||
|
||||
return conn.NonBlocking(non_blocking);
|
||||
}
|
||||
|
||||
std::stringstream ss;
|
||||
@@ -152,4 +161,13 @@ std::size_t TCPSocket::Recv(std::string *p_str) {
|
||||
conn.Close();
|
||||
return Fail(ss.str(), std::move(last_error));
|
||||
}
|
||||
|
||||
/**
 * @brief Get the host name of the local machine.
 *
 * @param p_out Output, the host name. Written only on success.
 * @return Failure with the system error code when gethostname fails.
 */
[[nodiscard]] Result GetHostName(std::string *p_out) {
  // HOST_NAME_MAX doesn't account for the terminating null byte, reserve room for it.
  std::array<char, HOST_NAME_MAX + 1> buf{};
  if (gethostname(buf.data(), HOST_NAME_MAX + 1) != 0) {
    return system::FailWithCode("Failed to get host name.");
  }
  // A truncated name is not guaranteed to be terminated, make sure the buffer is before
  // it's read as a C string.
  buf.back() = '\0';
  *p_out = buf.data();
  return Success();
}
|
||||
} // namespace xgboost::collective
|
||||
|
||||
296
src/collective/tracker.cc
Normal file
296
src/collective/tracker.cc
Normal file
@@ -0,0 +1,296 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
*/
|
||||
#if defined(__unix__) || defined(__APPLE__)
|
||||
#include <netdb.h> // gethostbyname
|
||||
#include <sys/socket.h> // socket, AF_INET6, AF_INET, connect, getsockname
|
||||
#endif // defined(__unix__) || defined(__APPLE__)
|
||||
|
||||
#if !defined(NOMINMAX) && defined(_WIN32)
|
||||
#define NOMINMAX
|
||||
#endif // !defined(NOMINMAX)
|
||||
|
||||
#if defined(_WIN32)
|
||||
#include <winsock2.h>
|
||||
#include <ws2tcpip.h>
|
||||
#endif // defined(_WIN32)
|
||||
|
||||
#include <algorithm> // for sort
|
||||
#include <chrono> // for seconds
|
||||
#include <cstdint> // for int32_t
|
||||
#include <string> // for string
|
||||
#include <utility> // for move, forward
|
||||
|
||||
#include "../common/json_utils.h"
|
||||
#include "comm.h"
|
||||
#include "protocol.h" // for kMagic, PeerInfo
|
||||
#include "tracker.h"
|
||||
#include "xgboost/collective/result.h" // for Result, Fail, Success
|
||||
#include "xgboost/collective/socket.h" // for GetHostName, FailWithCode, MakeSockAddress, ...
|
||||
#include "xgboost/json.h"
|
||||
|
||||
namespace xgboost::collective {
|
||||
Tracker::Tracker(Json const& config)
|
||||
: n_workers_{static_cast<std::int32_t>(
|
||||
RequiredArg<Integer const>(config, "n_workers", __func__))},
|
||||
port_{static_cast<std::int32_t>(OptionalArg<Integer const>(config, "port", Integer::Int{0}))},
|
||||
timeout_{std::chrono::seconds{OptionalArg<Integer const>(
|
||||
config, "timeout", static_cast<std::int64_t>(collective::DefaultTimeoutSec()))}} {}
|
||||
|
||||
/**
 * @brief Perform the handshake with a newly accepted worker and read its command.
 *
 * The connection is verified and the init message is received first. The command that
 * follows is dispatched to its protocol handler. The outcome is stored in `rc_`; the
 * peer info is only set when every step succeeded.
 *
 * @param world World size passed to the start command handler.
 * @param sock  The accepted connection, ownership is taken.
 * @param addr  Address of the worker.
 */
RabitTracker::WorkerProxy::WorkerProxy(std::int32_t world, TCPSocket sock, SockAddrV4 addr)
    : sock_{std::move(sock)} {
  auto host = addr.Addr();

  std::int32_t rank{0};
  rc_ = Success()
        << [&] { return proto::Magic{}.Verify(&sock_); }
        << [&] { return proto::Connect{}.TrackerRecv(&sock_, &world_, &rank, &task_id_); };
  if (!rc_.OK()) {
    return;
  }

  std::string cmd;
  sock_.Recv(&cmd);
  auto jcmd = Json::Load(StringView{cmd});
  cmd_ = static_cast<proto::CMD>(get<Integer const>(jcmd["cmd"]));

  std::int32_t port{0};
  switch (cmd_) {
    case proto::CMD::kStart: {
      rc_ = proto::Start{}.TrackerHandle(jcmd, &world_, world, &port, &sock_, &eport_);
      break;
    }
    case proto::CMD::kPrint: {
      rc_ = proto::Print{}.TrackerHandle(jcmd, &msg_);
      break;
    }
    case proto::CMD::kError: {
      rc_ = proto::ErrorCMD{}.TrackerHandle(jcmd, &msg_, &code_);
      break;
    }
    default: {
      // Other commands carry no payload that is read here.
      break;
    }
  }
  if (!rc_.OK()) {
    return;
  }

  info_ = proto::PeerInfo{host, port, rank};
}
|
||||
|
||||
/**
 * @brief Create the tracker and start listening.
 *
 * The host is taken from the optional "host" key of the configuration and defaults to
 * the address found by GetHostAddress. The listener is bound to that host; `port_` is
 * passed to Bind by pointer. A failed bind is fatal.
 */
RabitTracker::RabitTracker(Json const& config) : Tracker{config} {
  std::string default_host;
  // NOTE(review): this result is overwritten by the bind below without being inspected.
  auto rc = collective::GetHostAddress(&default_host);
  auto host = OptionalArg<String>(config, "host", default_host);

  listener_ = TCPSocket::Create(SockDomain::kV4);
  rc = listener_.Bind(host, &this->port_);
  CHECK(rc.OK()) << rc.Report();
  listener_.Listen();
}
|
||||
|
||||
/**
 * @brief Tell every worker about the peer selected for it by BootstrapNext.
 *
 * Workers are sorted with WorkerCmp first. Each worker is sent the peer info (host, port
 * and rank) of its selected peer from a dedicated thread. Afterward the host and the
 * error port of every worker are recorded in `worker_error_handles_`.
 *
 * @param p_workers The workers that are waiting to start, sorted in place.
 */
Result RabitTracker::Bootstrap(std::vector<WorkerProxy>* p_workers) {
  auto& workers = *p_workers;

  std::sort(workers.begin(), workers.end(), WorkerCmp{});

  std::vector<std::thread> senders;
  for (std::int32_t r = 0; r < n_workers_; ++r) {
    auto& worker = workers[r];
    auto next = BootstrapNext(r, n_workers_);
    auto const& next_w = workers[next];
    senders.emplace_back([next, &worker, &next_w] {
      auto jnext = proto::PeerInfo{next_w.Host(), next_w.Port(), next}.ToJson();
      std::string payload;
      Json::Dump(jnext, &payload);
      worker.Send(StringView{payload});
    });
  }

  // All messages must be out before the function returns, the threads hold references
  // into the worker list.
  for (auto& sender : senders) {
    sender.join();
  }

  for (auto const& w : workers) {
    worker_error_handles_.emplace_back(w.Host(), w.ErrorPort());
  }
  return Success();
}
|
||||
|
||||
[[nodiscard]] std::future<Result> RabitTracker::Run() {
|
||||
// a state machine to keep track of consistency.
|
||||
struct State {
|
||||
std::int32_t const n_workers;
|
||||
|
||||
std::int32_t n_shutdown{0};
|
||||
bool during_restart{false};
|
||||
std::vector<WorkerProxy> pending;
|
||||
|
||||
explicit State(std::int32_t world) : n_workers{world} {}
|
||||
State(State const& that) = delete;
|
||||
State& operator=(State&& that) = delete;
|
||||
|
||||
void Start(WorkerProxy&& worker) {
|
||||
CHECK_LT(pending.size(), n_workers);
|
||||
CHECK_LE(n_shutdown, n_workers);
|
||||
|
||||
pending.emplace_back(std::forward<WorkerProxy>(worker));
|
||||
|
||||
CHECK_LE(pending.size(), n_workers);
|
||||
}
|
||||
void Shutdown() {
|
||||
CHECK_GE(n_shutdown, 0);
|
||||
CHECK_LT(n_shutdown, n_workers);
|
||||
|
||||
++n_shutdown;
|
||||
|
||||
CHECK_LE(n_shutdown, n_workers);
|
||||
}
|
||||
void Error() {
|
||||
CHECK_LE(pending.size(), n_workers);
|
||||
CHECK_LE(n_shutdown, n_workers);
|
||||
|
||||
during_restart = true;
|
||||
}
|
||||
[[nodiscard]] bool Ready() const {
|
||||
CHECK_LE(pending.size(), n_workers);
|
||||
return static_cast<std::int32_t>(pending.size()) == n_workers;
|
||||
}
|
||||
void Bootstrap() {
|
||||
CHECK_EQ(pending.size(), n_workers);
|
||||
CHECK_LE(n_shutdown, n_workers);
|
||||
|
||||
// A reset.
|
||||
n_shutdown = 0;
|
||||
during_restart = false;
|
||||
pending.clear();
|
||||
}
|
||||
[[nodiscard]] bool ShouldContinue() const {
|
||||
CHECK_LE(pending.size(), n_workers);
|
||||
CHECK_LE(n_shutdown, n_workers);
|
||||
// - Without error, we should shutdown after all workers are offline.
|
||||
// - With error, all workers are offline, and we have during_restart as true.
|
||||
return n_shutdown != n_workers || during_restart;
|
||||
}
|
||||
};
|
||||
|
||||
return std::async(std::launch::async, [this] {
|
||||
State state{this->n_workers_};
|
||||
|
||||
while (state.ShouldContinue()) {
|
||||
TCPSocket sock;
|
||||
SockAddrV4 addr;
|
||||
auto rc = listener_.Accept(&sock, &addr);
|
||||
if (!rc.OK()) {
|
||||
return Fail("Failed to accept connection.", std::move(rc));
|
||||
}
|
||||
|
||||
auto worker = WorkerProxy{n_workers_, std::move(sock), std::move(addr)};
|
||||
if (!worker.Status().OK()) {
|
||||
return Fail("Failed to initialize worker proxy.", std::move(worker.Status()));
|
||||
}
|
||||
switch (worker.Command()) {
|
||||
case proto::CMD::kStart: {
|
||||
state.Start(std::move(worker));
|
||||
if (state.Ready()) {
|
||||
rc = this->Bootstrap(&state.pending);
|
||||
state.Bootstrap();
|
||||
}
|
||||
if (!rc.OK()) {
|
||||
return rc;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
case proto::CMD::kShutdown: {
|
||||
state.Shutdown();
|
||||
continue;
|
||||
}
|
||||
case proto::CMD::kError: {
|
||||
if (state.during_restart) {
|
||||
continue;
|
||||
}
|
||||
state.Error();
|
||||
auto msg = worker.Msg();
|
||||
auto code = worker.Code();
|
||||
LOG(WARNING) << "Recieved error from [" << worker.Host() << ":" << worker.Rank()
|
||||
<< "]: " << msg << " code:" << code;
|
||||
auto host = worker.Host();
|
||||
// We signal all workers for the error, if they haven't aborted already.
|
||||
for (auto& w : worker_error_handles_) {
|
||||
if (w.first == host) {
|
||||
continue;
|
||||
}
|
||||
TCPSocket out;
|
||||
// retry is set to 1, just let the worker timeout or error. Otherwise the
|
||||
// tracker and the worker might be waiting for each other.
|
||||
auto rc = Connect(w.first, w.second, 1, timeout_, &out);
|
||||
// send signal to stop the worker.
|
||||
proto::ShutdownCMD shutdown;
|
||||
rc = shutdown.Send(&out);
|
||||
if (!rc.OK()) {
|
||||
return Fail("Failed to inform workers to stop.");
|
||||
}
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
case proto::CMD::kPrint: {
|
||||
LOG(CONSOLE) << worker.Msg();
|
||||
continue;
|
||||
}
|
||||
case proto::CMD::kInvalid:
|
||||
default: {
|
||||
return Fail("Invalid command received.");
|
||||
}
|
||||
}
|
||||
}
|
||||
return Success();
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * @brief Find an IP address of the local machine.
 *
 * The host name is resolved first. If the resolved address does not start with "127.",
 * it is returned. Otherwise a UDP socket is connected to a fixed address and the local
 * address of that socket is returned.
 *
 * @param out Output. Holds the host name once it has been obtained and the IP address on
 *            success.
 * @return Failure when any of the steps fails.
 */
[[nodiscard]] Result GetHostAddress(std::string* out) {
  auto rc = GetHostName(out);
  if (!rc.OK()) {
    return rc;
  }
  auto host = gethostbyname(out->c_str());
  if (host == nullptr) {
    // gethostbyname returns a null pointer on failure, there is nothing to convert.
    return Fail("Failed to get host by name.");
  }

  // get ip address from host
  std::string ip;
  rc = INetNToP(host, &ip);
  if (!rc.OK()) {
    return rc;
  }

  if (!(ip.size() >= 4 && ip.substr(0, 4) == "127.")) {
    // return if this is a public IP address.
    // not entirely accurate, we have other reserved IPs
    *out = ip;
    return Success();
  }

  // Create a UDP socket to probe the public IP address, it's fine even if it's
  // unreachable.
  auto sock = socket(AF_INET, SOCK_DGRAM, 0);
  if (sock == -1) {
    return Fail("Failed to create socket.");
  }

  auto paddr = MakeSockAddress(StringView{"10.255.255.255"}, 1);
  sockaddr const* addr_handle = reinterpret_cast<const sockaddr*>(&paddr.V4().Handle());
  socklen_t addr_len{sizeof(paddr.V4().Handle())};
  auto err = connect(sock, addr_handle, addr_len);
  if (err != 0) {
    // Build the result before closing so that it records the error of connect, then
    // release the socket instead of leaking it.
    auto failure = system::FailWithCode("Failed to find IP address.");
    static_cast<void>(system::CloseSocket(sock));
    return failure;
  }

  // get the IP address from socket descriptor
  struct sockaddr_in addr;
  socklen_t len = sizeof(addr);
  if (getsockname(sock, reinterpret_cast<struct sockaddr*>(&addr), &len) == -1) {
    // Don't leak the socket on this path either.
    static_cast<void>(system::CloseSocket(sock));
    return Fail("Failed to get sock name.");
  }
  ip = inet_ntoa(addr.sin_addr);

  err = system::CloseSocket(sock);
  if (err != 0) {
    return system::FailWithCode("Failed to close socket.");
  }

  *out = ip;
  return Success();
}
|
||||
} // namespace xgboost::collective
|
||||
141
src/collective/tracker.h
Normal file
141
src/collective/tracker.h
Normal file
@@ -0,0 +1,141 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
*/
|
||||
#pragma once
|
||||
#include <chrono> // for seconds
|
||||
#include <cstdint> // for int32_t
|
||||
#include <future> // for future
|
||||
#include <string> // for string
|
||||
#include <utility> // for pair
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "protocol.h"
|
||||
#include "xgboost/collective/result.h" // for Result
|
||||
#include "xgboost/collective/socket.h" // for TCPSocket
|
||||
#include "xgboost/json.h" // for Json
|
||||
|
||||
namespace xgboost::collective {
|
||||
/**
|
||||
*
|
||||
* @brief Implementation of RABIT tracker.
|
||||
*
|
||||
* * What is a tracker
|
||||
*
|
||||
* The implementation of collective follows what RABIT did in the past. It requires a
|
||||
* tracker to coordinate initialization and error recovery of workers. While the
|
||||
* original implementation attempted to attain error resilience inside the collective
|
||||
* module, which turned out to be too challenging due to the large amount of external
|
||||
* states. The new implementation here differs from RABIT in the way that neither state
|
||||
* recovery nor resilience is handled inside the collective, it merely provides the
|
||||
* mechanism to signal error to other workers through the use of a centralized tracker.
|
||||
*
|
||||
* There are three major functionalities provided by a tracker, namely:
|
||||
* - Initialization. Share the node addresses among all workers.
|
||||
* - Logging.
|
||||
* - Signal error. If an exception is thrown in one (or many) of the workers, it can
|
||||
* signal an error to the tracker and the tracker will notify other workers.
|
||||
*/
|
||||
class Tracker {
|
||||
protected:
|
||||
std::int32_t n_workers_{0};
|
||||
std::int32_t port_{-1};
|
||||
std::chrono::seconds timeout_{0};
|
||||
|
||||
public:
|
||||
explicit Tracker(Json const& config);
|
||||
Tracker(std::int32_t n_worders, std::int32_t port, std::chrono::seconds timeout)
|
||||
: n_workers_{n_worders}, port_{port}, timeout_{timeout} {}
|
||||
|
||||
virtual ~Tracker() noexcept(false){}; // NOLINT
|
||||
[[nodiscard]] virtual std::future<Result> Run() = 0;
|
||||
[[nodiscard]] virtual Json WorkerArgs() const = 0;
|
||||
[[nodiscard]] std::chrono::seconds Timeout() const { return timeout_; }
|
||||
};
|
||||
|
||||
class RabitTracker : public Tracker {
|
||||
// a wrapper for connected worker socket.
|
||||
class WorkerProxy {
|
||||
TCPSocket sock_;
|
||||
proto::PeerInfo info_;
|
||||
std::int32_t eport_{0};
|
||||
std::int32_t world_{-1};
|
||||
std::string task_id_;
|
||||
|
||||
proto::CMD cmd_{proto::CMD::kInvalid};
|
||||
std::string msg_;
|
||||
std::int32_t code_{0};
|
||||
Result rc_;
|
||||
|
||||
public:
|
||||
explicit WorkerProxy(std::int32_t world, TCPSocket sock, SockAddrV4 addr);
|
||||
WorkerProxy(WorkerProxy const& that) = delete;
|
||||
WorkerProxy(WorkerProxy&& that) = default;
|
||||
WorkerProxy& operator=(WorkerProxy const&) = delete;
|
||||
WorkerProxy& operator=(WorkerProxy&&) = default;
|
||||
|
||||
[[nodiscard]] auto Host() const { return info_.host; }
|
||||
[[nodiscard]] auto TaskID() const { return task_id_; }
|
||||
[[nodiscard]] auto Port() const { return info_.port; }
|
||||
[[nodiscard]] auto Rank() const { return info_.rank; }
|
||||
[[nodiscard]] auto ErrorPort() const { return eport_; }
|
||||
[[nodiscard]] auto Command() const { return cmd_; }
|
||||
[[nodiscard]] auto Msg() const { return msg_; }
|
||||
[[nodiscard]] auto Code() const { return code_; }
|
||||
|
||||
[[nodiscard]] Result const& Status() const { return rc_; }
|
||||
[[nodiscard]] Result& Status() { return rc_; }
|
||||
|
||||
void Send(StringView value) { this->sock_.Send(value); }
|
||||
};
|
||||
// provide an ordering for workers, this helps us get deterministic topology.
|
||||
struct WorkerCmp {
|
||||
[[nodiscard]] bool operator()(WorkerProxy const& lhs, WorkerProxy const& rhs) {
|
||||
auto const& lh = lhs.Host();
|
||||
auto const& rh = rhs.Host();
|
||||
|
||||
if (lh != rh) {
|
||||
return lh < rh;
|
||||
}
|
||||
return lhs.TaskID() < rhs.TaskID();
|
||||
}
|
||||
};
|
||||
|
||||
private:
|
||||
std::string host_;
|
||||
// record for how to reach out to workers if error happens.
|
||||
std::vector<std::pair<std::string, std::int32_t>> worker_error_handles_;
|
||||
// listening socket for incoming workers.
|
||||
TCPSocket listener_;
|
||||
|
||||
Result Bootstrap(std::vector<WorkerProxy>* p_workers);
|
||||
|
||||
public:
|
||||
explicit RabitTracker(StringView host, std::int32_t n_worders, std::int32_t port,
|
||||
std::chrono::seconds timeout)
|
||||
: Tracker{n_worders, port, timeout}, host_{host.c_str(), host.size()} {
|
||||
listener_ = TCPSocket::Create(SockDomain::kV4);
|
||||
auto rc = listener_.Bind(host, &this->port_);
|
||||
CHECK(rc.OK()) << rc.Report();
|
||||
listener_.Listen();
|
||||
}
|
||||
|
||||
explicit RabitTracker(Json const& config);
|
||||
~RabitTracker() noexcept(false) override = default;
|
||||
|
||||
std::future<Result> Run() override;
|
||||
|
||||
[[nodiscard]] std::int32_t Port() const { return port_; }
|
||||
[[nodiscard]] Json WorkerArgs() const override {
|
||||
Json args{Object{}};
|
||||
args["DMLC_TRACKER_URI"] = String{host_};
|
||||
args["DMLC_TRACKER_PORT"] = this->Port();
|
||||
return args;
|
||||
}
|
||||
};
|
||||
|
||||
// Probe the public IP address of the host, need a better method.
|
||||
//
|
||||
// This is directly translated from the previous Python implementation, we should find a
|
||||
// more rigorous approach, can use some expertise in network programming.
|
||||
[[nodiscard]] Result GetHostAddress(std::string* out);
|
||||
} // namespace xgboost::collective
|
||||
@@ -5,17 +5,16 @@
|
||||
#ifndef XGBOOST_COMMON_BITFIELD_H_
|
||||
#define XGBOOST_COMMON_BITFIELD_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <bitset>
|
||||
#include <cinttypes>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <algorithm> // for min
|
||||
#include <bitset> // for bitset
|
||||
#include <cstdint> // for uint32_t, uint64_t, uint8_t
|
||||
#include <ostream> // for ostream
|
||||
#include <type_traits> // for conditional_t, is_signed_v
|
||||
|
||||
#if defined(__CUDACC__)
|
||||
#include <thrust/copy.h>
|
||||
#include <thrust/device_ptr.h>
|
||||
|
||||
#include "device_helpers.cuh"
|
||||
#elif defined(__HIP_PLATFORM_AMD__)
|
||||
#include <thrust/copy.h>
|
||||
@@ -23,8 +22,8 @@
|
||||
#include "device_helpers.hip.h"
|
||||
#endif // defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
|
||||
|
||||
#include "xgboost/span.h"
|
||||
#include "common.h"
|
||||
#include "xgboost/span.h" // for Span
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
@@ -79,7 +78,7 @@ struct BitFieldContainer {
|
||||
private:
|
||||
value_type* bits_{nullptr};
|
||||
size_type n_values_{0};
|
||||
static_assert(!std::is_signed<VT>::value, "Must use an unsiged type as the underlying storage.");
|
||||
static_assert(!std::is_signed_v<VT>, "Must use an unsiged type as the underlying storage.");
|
||||
|
||||
public:
|
||||
XGBOOST_DEVICE static Pos ToBitPos(index_type pos) {
|
||||
@@ -244,11 +243,39 @@ struct RBitsPolicy : public BitFieldContainer<VT, RBitsPolicy<VT>> {
|
||||
|
||||
// Format: <Const><Direction>BitField<size of underlying type in bits>, underlying type
|
||||
// must be unsigned.
|
||||
using LBitField64 = BitFieldContainer<uint64_t, LBitsPolicy<uint64_t>>;
|
||||
using RBitField8 = BitFieldContainer<uint8_t, RBitsPolicy<unsigned char>>;
|
||||
using LBitField64 = BitFieldContainer<std::uint64_t, LBitsPolicy<std::uint64_t>>;
|
||||
using RBitField8 = BitFieldContainer<std::uint8_t, RBitsPolicy<unsigned char>>;
|
||||
|
||||
using LBitField32 = BitFieldContainer<uint32_t, LBitsPolicy<uint32_t>>;
|
||||
using CLBitField32 = BitFieldContainer<uint32_t, LBitsPolicy<uint32_t, true>, true>;
|
||||
using LBitField32 = BitFieldContainer<std::uint32_t, LBitsPolicy<std::uint32_t>>;
|
||||
using CLBitField32 = BitFieldContainer<std::uint32_t, LBitsPolicy<std::uint32_t, true>, true>;
|
||||
using RBitField32 = BitFieldContainer<std::uint32_t, RBitsPolicy<std::uint32_t>>;
|
||||
|
||||
namespace detail {
|
||||
/**
 * @brief Portable count of the zero bits below the lowest set bit.
 *
 * Scans from the least significant bit upwards and stops at the first set bit. Returns
 * the full bit width (32) when no bit is set.
 */
inline std::uint32_t TrailingZeroBitsImpl(std::uint32_t value) {
  constexpr std::uint32_t kNumBits = sizeof(std::uint32_t) * 8;
  std::uint32_t n_zeros = 0;
  // The width test comes first so that the shift amount never reaches the bit width.
  while (n_zeros < kNumBits && ((value >> n_zeros) & 1u) == 0u) {
    ++n_zeros;
  }
  return n_zeros;
}
|
||||
} // namespace detail
|
||||
|
||||
/**
 * @brief Count the zero bits below the lowest set bit of a 32-bit value.
 *
 * Zero is answered directly with the bit width (32) and never reaches the compiler
 * builtin or intrinsic. Other values use the GCC-style builtin, the MSVC intrinsic, or
 * the portable loop, depending on the compiler.
 */
inline std::uint32_t TrailingZeroBits(std::uint32_t value) {
  constexpr std::uint32_t kNumBits = sizeof(std::uint32_t) * 8;
  if (value != 0) {
#if defined(__GNUC__)
    return static_cast<std::uint32_t>(__builtin_ctz(value));
#elif defined(_MSC_VER)
    return _tzcnt_u32(value);
#else
    return detail::TrailingZeroBitsImpl(value);
#endif  // __GNUC__
  }
  return kNumBits;
}
|
||||
} // namespace xgboost
|
||||
|
||||
#endif // XGBOOST_COMMON_BITFIELD_H_
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
#ifndef XGBOOST_COMMON_COMMON_H_
|
||||
#define XGBOOST_COMMON_COMMON_H_
|
||||
|
||||
#include <algorithm> // for max
|
||||
#include <array> // for array
|
||||
#include <cmath> // for ceil
|
||||
#include <cstddef> // for size_t
|
||||
@@ -203,7 +202,7 @@ inline void SetDevice(std::int32_t device) {
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Last index of a group in a CSR style of index pointer.
|
||||
* @brief Last index of a group in a CSR style of index pointer.
|
||||
*/
|
||||
template <typename Indexable>
|
||||
XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) {
|
||||
|
||||
@@ -135,7 +135,7 @@ void SortByWeight(dh::device_vector<float>* weights, dh::device_vector<Entry>* s
|
||||
#endif
|
||||
}
|
||||
|
||||
void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
|
||||
void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
|
||||
dh::device_vector<Entry>* p_sorted_entries,
|
||||
dh::device_vector<float>* p_sorted_weights,
|
||||
dh::caching_device_vector<size_t>* p_column_sizes_scan) {
|
||||
@@ -252,13 +252,13 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c
|
||||
sorted_entries.data().get(), [] __device__(Entry const& e) -> data::COOTuple {
|
||||
return {0, e.index, e.fvalue}; // row_idx is not needed for scaning column size.
|
||||
});
|
||||
detail::GetColumnSizesScan(ctx->Ordinal(), info.num_col_, num_cuts_per_feature,
|
||||
detail::GetColumnSizesScan(ctx->Device(), info.num_col_, num_cuts_per_feature,
|
||||
IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr,
|
||||
&column_sizes_scan);
|
||||
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
|
||||
if (sketch_container->HasCategorical()) {
|
||||
auto p_weight = entry_weight.empty() ? nullptr : &entry_weight;
|
||||
detail::RemoveDuplicatedCategories(ctx->Ordinal(), info, d_cuts_ptr, &sorted_entries, p_weight,
|
||||
detail::RemoveDuplicatedCategories(ctx->Device(), info, d_cuts_ptr, &sorted_entries, p_weight,
|
||||
&column_sizes_scan);
|
||||
}
|
||||
|
||||
@@ -359,7 +359,7 @@ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_b
|
||||
|
||||
HistogramCuts cuts;
|
||||
SketchContainer sketch_container(info.feature_types, max_bin, info.num_col_, info.num_row_,
|
||||
ctx->Ordinal());
|
||||
ctx->Device());
|
||||
CHECK_EQ(has_weight || !hessian.empty(), !d_weight.empty());
|
||||
for (const auto& page : p_fmat->GetBatches<SparsePage>()) {
|
||||
std::size_t page_nnz = page.data.Size();
|
||||
|
||||
@@ -86,9 +86,9 @@ __global__ void GetColumnSizeSharedMemKernel(IterSpan<BatchIt> batch_iter,
|
||||
}
|
||||
|
||||
template <std::uint32_t kBlockThreads, typename Kernel>
|
||||
std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) {
|
||||
std::uint32_t EstimateGridSize(DeviceOrd device, Kernel kernel, std::size_t shared_mem) {
|
||||
int n_mps = 0;
|
||||
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
|
||||
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device.ordinal));
|
||||
int n_blocks_per_mp = 0;
|
||||
dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
|
||||
kBlockThreads, shared_mem));
|
||||
@@ -110,11 +110,11 @@ std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t s
|
||||
* \param out_column_size Output buffer for the size of each column.
|
||||
*/
|
||||
template <typename BatchIt, bool force_use_global_memory = false, bool force_use_u64 = false>
|
||||
void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan<BatchIt> batch_iter,
|
||||
void LaunchGetColumnSizeKernel(DeviceOrd device, IterSpan<BatchIt> batch_iter,
|
||||
data::IsValidFunctor is_valid, Span<std::size_t> out_column_size) {
|
||||
thrust::fill_n(thrust::device, dh::tbegin(out_column_size), out_column_size.size(), 0);
|
||||
|
||||
std::size_t max_shared_memory = dh::MaxSharedMemory(device);
|
||||
std::size_t max_shared_memory = dh::MaxSharedMemory(device.ordinal);
|
||||
// Not strictly correct as we should use number of samples to determine the type of
|
||||
// counter. However, the sample size is not known due to sliding window on number of
|
||||
// elements.
|
||||
@@ -158,7 +158,7 @@ void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan<BatchIt> batch_iter
|
||||
}
|
||||
|
||||
template <typename BatchIt>
|
||||
void GetColumnSizesScan(int device, size_t num_columns, std::size_t num_cuts_per_feature,
|
||||
void GetColumnSizesScan(DeviceOrd device, size_t num_columns, std::size_t num_cuts_per_feature,
|
||||
IterSpan<BatchIt> batch_iter, data::IsValidFunctor is_valid,
|
||||
HostDeviceVector<SketchContainer::OffsetT>* cuts_ptr,
|
||||
dh::caching_device_vector<size_t>* column_sizes_scan) {
|
||||
@@ -228,7 +228,8 @@ size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz,
|
||||
// Count the valid entries in each column and copy them out.
|
||||
template <typename AdapterBatch, typename BatchIter>
|
||||
void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Range1d range,
|
||||
float missing, size_t columns, size_t cuts_per_feature, int device,
|
||||
float missing, size_t columns, size_t cuts_per_feature,
|
||||
DeviceOrd device,
|
||||
HostDeviceVector<SketchContainer::OffsetT>* cut_sizes_scan,
|
||||
dh::caching_device_vector<size_t>* column_sizes_scan,
|
||||
dh::device_vector<Entry>* sorted_entries) {
|
||||
@@ -252,7 +253,7 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Ran
|
||||
void SortByWeight(dh::device_vector<float>* weights,
|
||||
dh::device_vector<Entry>* sorted_entries);
|
||||
|
||||
void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
|
||||
void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
|
||||
dh::device_vector<Entry>* p_sorted_entries,
|
||||
dh::device_vector<float>* p_sorted_weights,
|
||||
dh::caching_device_vector<size_t>* p_column_sizes_scan);
|
||||
@@ -290,7 +291,7 @@ inline HistogramCuts DeviceSketch(Context const* ctx, DMatrix* p_fmat, bst_bin_t
|
||||
|
||||
template <typename AdapterBatch>
|
||||
void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
|
||||
int device, size_t columns, size_t begin, size_t end,
|
||||
DeviceOrd device, size_t columns, size_t begin, size_t end,
|
||||
float missing, SketchContainer *sketch_container,
|
||||
int num_cuts) {
|
||||
// Copy current subset of valid elements into temporary storage and sort
|
||||
@@ -335,11 +336,11 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
|
||||
template <typename Batch>
|
||||
void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
|
||||
int num_cuts_per_feature,
|
||||
bool is_ranking, float missing, int device,
|
||||
bool is_ranking, float missing, DeviceOrd device,
|
||||
size_t columns, size_t begin, size_t end,
|
||||
SketchContainer *sketch_container) {
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
dh::safe_cuda(cudaSetDevice(device));
|
||||
dh::safe_cuda(cudaSetDevice(device.ordinal));
|
||||
info.weights_.SetDevice(device);
|
||||
auto weights = info.weights_.ConstDeviceSpan();
|
||||
|
||||
@@ -451,14 +452,14 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
|
||||
size_t num_rows = batch.NumRows();
|
||||
size_t num_cols = batch.NumCols();
|
||||
size_t num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(num_bins, num_rows);
|
||||
int32_t device = sketch_container->DeviceIdx();
|
||||
auto device = sketch_container->DeviceIdx();
|
||||
bool weighted = !info.weights_.Empty();
|
||||
|
||||
if (weighted) {
|
||||
sketch_batch_num_elements = detail::SketchBatchNumElements(
|
||||
sketch_batch_num_elements,
|
||||
num_rows, num_cols, std::numeric_limits<size_t>::max(),
|
||||
device, num_cuts_per_feature, true);
|
||||
device.ordinal, num_cuts_per_feature, true);
|
||||
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
|
||||
size_t end =
|
||||
std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
|
||||
@@ -471,7 +472,7 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
|
||||
sketch_batch_num_elements = detail::SketchBatchNumElements(
|
||||
sketch_batch_num_elements,
|
||||
num_rows, num_cols, std::numeric_limits<size_t>::max(),
|
||||
device, num_cuts_per_feature, false);
|
||||
device.ordinal, num_cuts_per_feature, false);
|
||||
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
|
||||
size_t end =
|
||||
std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
|
||||
|
||||
@@ -33,19 +33,19 @@ struct HostDeviceVectorImpl {
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int)
|
||||
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd)
|
||||
: impl_(nullptr) {
|
||||
impl_ = new HostDeviceVectorImpl<T>(size, v);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int)
|
||||
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd)
|
||||
: impl_(nullptr) {
|
||||
impl_ = new HostDeviceVectorImpl<T>(init);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int)
|
||||
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd)
|
||||
: impl_(nullptr) {
|
||||
impl_ = new HostDeviceVectorImpl<T>(init);
|
||||
}
|
||||
@@ -81,7 +81,7 @@ template <typename T>
|
||||
size_t HostDeviceVector<T>::Size() const { return impl_->Vec().size(); }
|
||||
|
||||
template <typename T>
|
||||
int HostDeviceVector<T>::DeviceIdx() const { return -1; }
|
||||
DeviceOrd HostDeviceVector<T>::Device() const { return DeviceOrd::CPU(); }
|
||||
|
||||
template <typename T>
|
||||
T* HostDeviceVector<T>::DevicePointer() { return nullptr; }
|
||||
@@ -165,9 +165,6 @@ bool HostDeviceVector<T>::DeviceCanWrite() const {
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::SetDevice(int) const {}
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::SetDevice(DeviceOrd) const {}
|
||||
|
||||
@@ -178,6 +175,7 @@ template class HostDeviceVector<GradientPair>;
|
||||
template class HostDeviceVector<GradientPairPrecise>;
|
||||
template class HostDeviceVector<int32_t>; // bst_node_t
|
||||
template class HostDeviceVector<uint8_t>;
|
||||
template class HostDeviceVector<int8_t>;
|
||||
template class HostDeviceVector<FeatureType>;
|
||||
template class HostDeviceVector<Entry>;
|
||||
template class HostDeviceVector<uint64_t>; // bst_row_t
|
||||
|
||||
@@ -25,8 +25,8 @@ void SetCudaSetDeviceHandler(void (*handler)(int)) {
|
||||
template <typename T>
|
||||
class HostDeviceVectorImpl {
|
||||
public:
|
||||
HostDeviceVectorImpl(size_t size, T v, int device) : device_(device) {
|
||||
if (device >= 0) {
|
||||
HostDeviceVectorImpl(size_t size, T v, DeviceOrd device) : device_(device) {
|
||||
if (device.IsCUDA()) {
|
||||
gpu_access_ = GPUAccess::kWrite;
|
||||
SetDevice();
|
||||
data_d_->resize(size, v);
|
||||
@@ -37,8 +37,8 @@ class HostDeviceVectorImpl {
|
||||
|
||||
// Initializer can be std::vector<T> or std::initializer_list<T>
|
||||
template <class Initializer>
|
||||
HostDeviceVectorImpl(const Initializer& init, int device) : device_(device) {
|
||||
if (device >= 0) {
|
||||
HostDeviceVectorImpl(const Initializer& init, DeviceOrd device) : device_(device) {
|
||||
if (device.IsCUDA()) {
|
||||
gpu_access_ = GPUAccess::kWrite;
|
||||
LazyResizeDevice(init.size());
|
||||
Copy(init);
|
||||
@@ -54,16 +54,16 @@ class HostDeviceVectorImpl {
|
||||
gpu_access_{that.gpu_access_} {}
|
||||
|
||||
~HostDeviceVectorImpl() {
|
||||
if (device_ >= 0) {
|
||||
if (device_.IsCUDA()) {
|
||||
SetDevice();
|
||||
}
|
||||
}
|
||||
|
||||
size_t Size() const {
|
||||
[[nodiscard]] size_t Size() const {
|
||||
return HostCanRead() ? data_h_.size() : data_d_ ? data_d_->size() : 0;
|
||||
}
|
||||
|
||||
int DeviceIdx() const { return device_; }
|
||||
[[nodiscard]] DeviceOrd Device() const { return device_; }
|
||||
|
||||
T* DevicePointer() {
|
||||
LazySyncDevice(GPUAccess::kWrite);
|
||||
@@ -138,8 +138,7 @@ class HostDeviceVectorImpl {
|
||||
} else {
|
||||
auto ptr = other->ConstDevicePointer();
|
||||
SetDevice();
|
||||
CHECK_EQ(this->DeviceIdx(), other->DeviceIdx());
|
||||
|
||||
CHECK_EQ(this->Device(), other->Device());
|
||||
dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size,
|
||||
ptr,
|
||||
other->Size() * sizeof(T),
|
||||
@@ -157,24 +156,25 @@ class HostDeviceVectorImpl {
|
||||
return data_h_;
|
||||
}
|
||||
|
||||
void SetDevice(int device) {
|
||||
void SetDevice(DeviceOrd device) {
|
||||
if (device_ == device) { return; }
|
||||
if (device_ >= 0) {
|
||||
if (device_.IsCUDA()) {
|
||||
LazySyncHost(GPUAccess::kNone);
|
||||
}
|
||||
|
||||
if (device_ >= 0 && device >= 0) {
|
||||
CHECK_EQ(device_, device) << "New device ordinal is different from previous one.";
|
||||
if (device_.IsCUDA() && device.IsCUDA()) {
|
||||
CHECK_EQ(device_.ordinal, device.ordinal)
|
||||
<< "New device ordinal is different from previous one.";
|
||||
}
|
||||
device_ = device;
|
||||
if (device_ >= 0) {
|
||||
if (device_.IsCUDA()) {
|
||||
LazyResizeDevice(data_h_.size());
|
||||
}
|
||||
}
|
||||
|
||||
void Resize(size_t new_size, T v) {
|
||||
if (new_size == Size()) { return; }
|
||||
if ((Size() == 0 && device_ >= 0) || (DeviceCanWrite() && device_ >= 0)) {
|
||||
if ((Size() == 0 && device_.IsCUDA()) || (DeviceCanWrite() && device_.IsCUDA())) {
|
||||
// fast on-device resize
|
||||
gpu_access_ = GPUAccess::kWrite;
|
||||
SetDevice();
|
||||
@@ -221,16 +221,16 @@ class HostDeviceVectorImpl {
|
||||
gpu_access_ = access;
|
||||
}
|
||||
|
||||
bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; }
|
||||
bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); }
|
||||
bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); }
|
||||
bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; }
|
||||
bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); }
|
||||
bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); }
|
||||
GPUAccess Access() const { return gpu_access_; }
|
||||
[[nodiscard]] bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; }
|
||||
[[nodiscard]] bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); }
|
||||
[[nodiscard]] bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); }
|
||||
[[nodiscard]] bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; }
|
||||
[[nodiscard]] bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); }
|
||||
[[nodiscard]] bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); }
|
||||
[[nodiscard]] GPUAccess Access() const { return gpu_access_; }
|
||||
|
||||
private:
|
||||
int device_{-1};
|
||||
DeviceOrd device_{DeviceOrd::CPU()};
|
||||
std::vector<T> data_h_{};
|
||||
std::unique_ptr<dh::device_vector<T>> data_d_{};
|
||||
GPUAccess gpu_access_{GPUAccess::kNone};
|
||||
@@ -264,11 +264,11 @@ class HostDeviceVectorImpl {
|
||||
}
|
||||
|
||||
void SetDevice() {
|
||||
CHECK_GE(device_, 0);
|
||||
CHECK_GE(device_.ordinal, 0);
|
||||
if (cudaSetDeviceHandler == nullptr) {
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
dh::safe_cuda(cudaSetDevice(device_.ordinal));
|
||||
} else {
|
||||
(*cudaSetDeviceHandler)(device_);
|
||||
(*cudaSetDeviceHandler)(device_.ordinal);
|
||||
}
|
||||
|
||||
if (!data_d_) {
|
||||
@@ -278,15 +278,15 @@ class HostDeviceVectorImpl {
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device)
|
||||
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd device)
|
||||
: impl_(new HostDeviceVectorImpl<T>(size, v, device)) {}
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
|
||||
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd device)
|
||||
: impl_(new HostDeviceVectorImpl<T>(init, device)) {}
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
|
||||
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd device)
|
||||
: impl_(new HostDeviceVectorImpl<T>(init, device)) {}
|
||||
|
||||
template <typename T>
|
||||
@@ -314,7 +314,9 @@ template <typename T>
|
||||
size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }
|
||||
|
||||
template <typename T>
|
||||
int HostDeviceVector<T>::DeviceIdx() const { return impl_->DeviceIdx(); }
|
||||
DeviceOrd HostDeviceVector<T>::Device() const {
|
||||
return impl_->Device();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T* HostDeviceVector<T>::DevicePointer() {
|
||||
@@ -394,14 +396,9 @@ GPUAccess HostDeviceVector<T>::DeviceAccess() const {
|
||||
return impl_->Access();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::SetDevice(int device) const {
|
||||
impl_->SetDevice(device);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::SetDevice(DeviceOrd device) const {
|
||||
impl_->SetDevice(device.ordinal);
|
||||
impl_->SetDevice(device);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@@ -416,6 +413,7 @@ template class HostDeviceVector<GradientPair>;
|
||||
template class HostDeviceVector<GradientPairPrecise>;
|
||||
template class HostDeviceVector<int32_t>; // bst_node_t
|
||||
template class HostDeviceVector<uint8_t>;
|
||||
template class HostDeviceVector<int8_t>;
|
||||
template class HostDeviceVector<FeatureType>;
|
||||
template class HostDeviceVector<Entry>;
|
||||
template class HostDeviceVector<uint64_t>; // bst_row_t
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
#define XGBOOST_COMMON_IO_H_
|
||||
|
||||
#include <dmlc/io.h>
|
||||
#include <rabit/rabit.h>
|
||||
#include <rabit/internal/io.h> // for MemoryFixSizeBuffer, MemoryBufferStream
|
||||
|
||||
#include <algorithm> // for min, fill_n, copy_n
|
||||
#include <array> // for array
|
||||
@@ -382,7 +382,8 @@ class PrivateMmapConstStream : public AlignedResourceReadStream {
|
||||
* @param length See the `length` parameter of `mmap` for details.
|
||||
*/
|
||||
explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length)
|
||||
: AlignedResourceReadStream{std::make_shared<MmapResource>(path, offset, length)} {}
|
||||
: AlignedResourceReadStream{std::shared_ptr<MmapResource>{ // NOLINT
|
||||
new MmapResource{std::move(path), offset, length}}} {}
|
||||
~PrivateMmapConstStream() noexcept(false) override;
|
||||
};
|
||||
|
||||
|
||||
74
src/common/json_utils.h
Normal file
74
src/common/json_utils.h
Normal file
@@ -0,0 +1,74 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
*
|
||||
* @brief Utils tailored for XGBoost.
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <string> // for string
|
||||
#include <type_traits> // for enable_if_t, remove_const_t
|
||||
|
||||
#include "xgboost/json.h"
|
||||
#include "xgboost/string_view.h" // for StringView
|
||||
|
||||
namespace xgboost {
|
||||
namespace detail {
|
||||
template <typename Head>
|
||||
bool TypeCheckImpl(Json const &value) {
|
||||
return IsA<Head>(value);
|
||||
}
|
||||
|
||||
template <typename Head, typename... JT>
|
||||
std::enable_if_t<sizeof...(JT) != 0, bool> TypeCheckImpl(Json const &value) {
|
||||
return IsA<Head>(value) || TypeCheckImpl<JT...>(value);
|
||||
}
|
||||
|
||||
/**
 * @brief Build the list of expected type names for an error message.
 *
 * Each name is wrapped in back-ticks and the names are separated by ", ". At least one
 * type must be supplied.
 */
template <typename Head, typename... JT>
std::string TypeCheckError() {
  std::string names = "`" + Head{}.TypeStr() + "`";
  // Append the remaining names in order, each preceded by the separator.
  ((names += ", `" + JT{}.TypeStr() + "`"), ...);
  return names;
}
|
||||
} // namespace detail
|
||||
|
||||
/**
|
||||
* @brief Type check for JSON-based parameters
|
||||
*
|
||||
* @tparam JT Expected JSON types.
|
||||
* @param value Value to be checked.
|
||||
*/
|
||||
template <typename... JT>
|
||||
void TypeCheck(Json const &value, StringView name) {
|
||||
if (!detail::TypeCheckImpl<JT...>(value)) {
|
||||
LOG(FATAL) << "Invalid type for: `" << name << "`, expecting one of the: {`"
|
||||
<< detail::TypeCheckError<JT...>() << "}, got: `" << value.GetValue().TypeStr()
|
||||
<< "`";
|
||||
}
|
||||
}
|
||||
|
||||
template <typename JT>
|
||||
auto const &RequiredArg(Json const &in, StringView key, StringView func) {
|
||||
auto const &obj = get<Object const>(in);
|
||||
auto it = obj.find(key);
|
||||
if (it == obj.cend() || IsA<Null>(it->second)) {
|
||||
LOG(FATAL) << "Argument `" << key << "` is required for `" << func << "`.";
|
||||
}
|
||||
TypeCheck<JT>(it->second, StringView{key});
|
||||
return get<std::remove_const_t<JT> const>(it->second);
|
||||
}
|
||||
|
||||
template <typename JT, typename T>
|
||||
auto const &OptionalArg(Json const &in, StringView key, T const &dft) {
|
||||
auto const &obj = get<Object const>(in);
|
||||
auto it = obj.find(key);
|
||||
if (it != obj.cend() && !IsA<Null>(it->second)) {
|
||||
TypeCheck<JT>(it->second, key);
|
||||
|
||||
return get<std::remove_const_t<JT> const>(it->second);
|
||||
}
|
||||
return dft;
|
||||
}
|
||||
} // namespace xgboost
|
||||
@@ -44,7 +44,7 @@ void ElementWiseTransformDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_
|
||||
|
||||
template <typename T, int32_t D, typename Fn>
|
||||
void ElementWiseKernel(Context const* ctx, linalg::TensorView<T, D> t, Fn&& fn) {
|
||||
ctx->IsCPU() ? ElementWiseKernelHost(t, ctx->Threads(), fn) : ElementWiseKernelDevice(t, fn);
|
||||
ctx->IsCUDA() ? ElementWiseKernelDevice(t, fn) : ElementWiseKernelHost(t, ctx->Threads(), fn);
|
||||
}
|
||||
} // namespace linalg
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -55,7 +55,7 @@ void ElementWiseTransformDevice(linalg::TensorView<T, D>, Fn&&, void* = nullptr)
|
||||
|
||||
template <typename T, int32_t D, typename Fn>
|
||||
void ElementWiseKernel(Context const* ctx, linalg::TensorView<T, D> t, Fn&& fn) {
|
||||
if (!ctx->IsCPU()) {
|
||||
if (ctx->IsCUDA()) {
|
||||
common::AssertGPUSupport();
|
||||
}
|
||||
ElementWiseKernelHost(t, ctx->Threads(), fn);
|
||||
|
||||
@@ -11,13 +11,14 @@
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
|
||||
if (ctx->IsCPU()) {
|
||||
if (ctx->IsCUDA()) {
|
||||
return cuda_impl::Reduce(ctx, values);
|
||||
} else {
|
||||
auto const& h_values = values.ConstHostVector();
|
||||
auto result = cpu_impl::Reduce(ctx, h_values.cbegin(), h_values.cend(), 0.0);
|
||||
static_assert(std::is_same<decltype(result), double>::value);
|
||||
return result;
|
||||
}
|
||||
return cuda_impl::Reduce(ctx, values);
|
||||
}
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -8,11 +8,9 @@
|
||||
#include "xgboost/context.h" // Context
|
||||
#include "xgboost/host_device_vector.h" // HostDeviceVector
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
namespace cuda_impl {
|
||||
namespace xgboost::common::cuda_impl {
|
||||
double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
|
||||
values.SetDevice(ctx->gpu_id);
|
||||
values.SetDevice(ctx->Device());
|
||||
auto const d_values = values.ConstDeviceSpan();
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
|
||||
@@ -24,6 +22,4 @@ double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
|
||||
thrust::plus<float>{});
|
||||
#endif
|
||||
}
|
||||
} // namespace cuda_impl
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::common::cuda_impl
|
||||
|
||||
@@ -24,9 +24,9 @@ struct OptionalWeights {
|
||||
inline OptionalWeights MakeOptionalWeights(Context const* ctx,
|
||||
HostDeviceVector<float> const& weights) {
|
||||
if (ctx->IsCUDA()) {
|
||||
weights.SetDevice(ctx->gpu_id);
|
||||
weights.SetDevice(ctx->Device());
|
||||
}
|
||||
return OptionalWeights{ctx->IsCPU() ? weights.ConstHostSpan() : weights.ConstDeviceSpan()};
|
||||
return OptionalWeights{ctx->IsCUDA() ? weights.ConstDeviceSpan() : weights.ConstHostSpan()};
|
||||
}
|
||||
} // namespace xgboost::common
|
||||
#endif // XGBOOST_COMMON_OPTIONAL_WEIGHT_H_
|
||||
|
||||
@@ -242,11 +242,10 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
|
||||
// summary does the output element come from) result by definition of merged rank. So we
|
||||
// run it in 2 passes to obtain the merge path and then customize the standard merge
|
||||
// algorithm.
|
||||
void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
|
||||
void MergeImpl(DeviceOrd device, Span<SketchEntry const> const &d_x,
|
||||
Span<bst_row_t const> const &x_ptr, Span<SketchEntry const> const &d_y,
|
||||
Span<bst_row_t const> const &y_ptr, Span<SketchEntry> out, Span<bst_row_t> out_ptr) {
|
||||
dh::safe_cuda(cudaSetDevice(device));
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device.ordinal));
|
||||
CHECK_EQ(d_x.size() + d_y.size(), out.size());
|
||||
CHECK_EQ(x_ptr.size(), out_ptr.size());
|
||||
CHECK_EQ(y_ptr.size(), out_ptr.size());
|
||||
@@ -344,8 +343,7 @@ void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
|
||||
void SketchContainer::Push(Span<Entry const> entries, Span<size_t> columns_ptr,
|
||||
common::Span<OffsetT> cuts_ptr,
|
||||
size_t total_cuts, Span<float> weights) {
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_.ordinal));
|
||||
Span<SketchEntry> out;
|
||||
dh::device_vector<SketchEntry> cuts;
|
||||
bool first_window = this->Current().empty();
|
||||
@@ -404,7 +402,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
|
||||
* pruning or merging. We preserve the first type and remove the second type.
|
||||
*/
|
||||
timer_.Start(__func__);
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
dh::safe_cuda(cudaSetDevice(device_.ordinal));
|
||||
CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1);
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
|
||||
@@ -461,7 +459,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
|
||||
|
||||
void SketchContainer::Prune(size_t to) {
|
||||
timer_.Start(__func__);
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
dh::safe_cuda(cudaSetDevice(device_.ordinal));
|
||||
|
||||
OffsetT to_total = 0;
|
||||
auto& h_columns_ptr = columns_ptr_b_.HostVector();
|
||||
@@ -496,8 +494,7 @@ void SketchContainer::Prune(size_t to) {
|
||||
|
||||
void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
|
||||
Span<SketchEntry const> that) {
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_.ordinal));
|
||||
timer_.Start(__func__);
|
||||
if (this->Current().size() == 0) {
|
||||
CHECK_EQ(this->columns_ptr_.HostVector().back(), 0);
|
||||
@@ -532,8 +529,7 @@ void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
|
||||
}
|
||||
|
||||
void SketchContainer::FixError() {
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_.ordinal));
|
||||
auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan();
|
||||
auto in = dh::ToSpan(this->Current());
|
||||
dh::LaunchN(in.size(), [=] __device__(size_t idx) {
|
||||
@@ -558,7 +554,7 @@ void SketchContainer::FixError() {
|
||||
}
|
||||
|
||||
void SketchContainer::AllReduce(bool is_column_split) {
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
dh::safe_cuda(cudaSetDevice(device_.ordinal));
|
||||
auto world = collective::GetWorldSize();
|
||||
if (world == 1 || is_column_split) {
|
||||
return;
|
||||
@@ -585,15 +581,15 @@ void SketchContainer::AllReduce(bool is_column_split) {
|
||||
auto offset = rank * d_columns_ptr.size();
|
||||
thrust::copy(thrust::device, d_columns_ptr.data(), d_columns_ptr.data() + d_columns_ptr.size(),
|
||||
gathered_ptrs.begin() + offset);
|
||||
collective::AllReduce<collective::Operation::kSum>(device_, gathered_ptrs.data().get(),
|
||||
collective::AllReduce<collective::Operation::kSum>(device_.ordinal, gathered_ptrs.data().get(),
|
||||
gathered_ptrs.size());
|
||||
|
||||
// Get the data from all workers.
|
||||
std::vector<size_t> recv_lengths;
|
||||
dh::caching_device_vector<char> recvbuf;
|
||||
collective::AllGatherV(device_, this->Current().data().get(),
|
||||
collective::AllGatherV(device_.ordinal, this->Current().data().get(),
|
||||
dh::ToSpan(this->Current()).size_bytes(), &recv_lengths, &recvbuf);
|
||||
collective::Synchronize(device_);
|
||||
collective::Synchronize(device_.ordinal);
|
||||
|
||||
// Segment the received data.
|
||||
auto s_recvbuf = dh::ToSpan(recvbuf);
|
||||
@@ -640,7 +636,7 @@ struct InvalidCatOp {
|
||||
|
||||
void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
|
||||
timer_.Start(__func__);
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
dh::safe_cuda(cudaSetDevice(device_.ordinal));
|
||||
p_cuts->min_vals_.Resize(num_columns_);
|
||||
|
||||
// Sync between workers.
|
||||
@@ -690,21 +686,41 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
|
||||
});
|
||||
CHECK_EQ(num_columns_, d_in_columns_ptr.size() - 1);
|
||||
max_values.resize(d_in_columns_ptr.size() - 1);
|
||||
|
||||
// In some cases (e.g. column-wise data split), we may have empty columns, so we need to keep
|
||||
// track of the unique keys (feature indices) after the `thrust::reduce_by_key` call.
|
||||
dh::caching_device_vector<size_t> d_max_keys(d_in_columns_ptr.size() - 1);
|
||||
dh::caching_device_vector<SketchEntry> d_max_values(d_in_columns_ptr.size() - 1);
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
thrust::reduce_by_key(thrust::cuda::par(alloc), key_it, key_it + in_cut_values.size(), val_it,
|
||||
thrust::make_discard_iterator(), d_max_values.begin(),
|
||||
thrust::equal_to<bst_feature_t>{},
|
||||
[] __device__(auto l, auto r) { return l.value > r.value ? l : r; });
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
thrust::reduce_by_key(thrust::hip::par(alloc), key_it, key_it + in_cut_values.size(), val_it,
|
||||
thrust::make_discard_iterator(), d_max_values.begin(),
|
||||
thrust::equal_to<bst_feature_t>{},
|
||||
[] __device__(auto l, auto r) { return l.value > r.value ? l : r; });
|
||||
#endif
|
||||
auto new_end = thrust::reduce_by_key(
|
||||
thrust::cuda::par(alloc), key_it, key_it + in_cut_values.size(), val_it, d_max_keys.begin(),
|
||||
d_max_values.begin(), thrust::equal_to<bst_feature_t>{},
|
||||
[] __device__(auto l, auto r) { return l.value > r.value ? l : r; });
|
||||
d_max_keys.erase(new_end.first, d_max_keys.end());
|
||||
d_max_values.erase(new_end.second, d_max_values.end());
|
||||
|
||||
dh::CopyDeviceSpanToVector(&max_values, dh::ToSpan(d_max_values));
|
||||
// The device vector needs to be initialized explicitly since we may have some missing columns.
|
||||
SketchEntry default_entry{};
|
||||
dh::caching_device_vector<SketchEntry> d_max_results(d_in_columns_ptr.size() - 1,
|
||||
default_entry);
|
||||
thrust::scatter(thrust::cuda::par(alloc), d_max_values.begin(), d_max_values.end(),
|
||||
d_max_keys.begin(), d_max_results.begin());
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
auto new_end = thrust::reduce_by_key(
|
||||
thrust::hip::par(alloc), key_it, key_it + in_cut_values.size(), val_it, d_max_keys.begin(),
|
||||
d_max_values.begin(), thrust::equal_to<bst_feature_t>{},
|
||||
[] __device__(auto l, auto r) { return l.value > r.value ? l : r; });
|
||||
d_max_keys.erase(new_end.first, d_max_keys.end());
|
||||
d_max_values.erase(new_end.second, d_max_values.end());
|
||||
|
||||
// The device vector needs to be initialized explicitly since we may have some missing columns.
|
||||
SketchEntry default_entry{};
|
||||
dh::caching_device_vector<SketchEntry> d_max_results(d_in_columns_ptr.size() - 1,
|
||||
default_entry);
|
||||
thrust::scatter(thrust::hip::par(alloc), d_max_values.begin(), d_max_values.end(),
|
||||
d_max_keys.begin(), d_max_results.begin());
|
||||
#endif
|
||||
dh::CopyDeviceSpanToVector(&max_values, dh::ToSpan(d_max_results));
|
||||
auto max_it = MakeIndexTransformIter([&](auto i) {
|
||||
if (IsCat(h_feature_types, i)) {
|
||||
return max_values[i].value;
|
||||
|
||||
@@ -41,7 +41,7 @@ class SketchContainer {
|
||||
bst_row_t num_rows_;
|
||||
bst_feature_t num_columns_;
|
||||
int32_t num_bins_;
|
||||
int32_t device_;
|
||||
DeviceOrd device_;
|
||||
|
||||
// Double buffer as neither prune nor merge can be performed inplace.
|
||||
dh::device_vector<SketchEntry> entries_a_;
|
||||
@@ -93,35 +93,32 @@ class SketchContainer {
|
||||
* \param num_rows Total number of rows in known dataset (typically the rows in current worker).
|
||||
* \param device GPU ID.
|
||||
*/
|
||||
SketchContainer(HostDeviceVector<FeatureType> const &feature_types,
|
||||
int32_t max_bin, bst_feature_t num_columns,
|
||||
bst_row_t num_rows, int32_t device)
|
||||
: num_rows_{num_rows},
|
||||
num_columns_{num_columns}, num_bins_{max_bin}, device_{device} {
|
||||
CHECK_GE(device, 0);
|
||||
// Initialize Sketches for this dmatrix
|
||||
this->columns_ptr_.SetDevice(device_);
|
||||
this->columns_ptr_.Resize(num_columns + 1);
|
||||
this->columns_ptr_b_.SetDevice(device_);
|
||||
this->columns_ptr_b_.Resize(num_columns + 1);
|
||||
SketchContainer(HostDeviceVector<FeatureType> const& feature_types, int32_t max_bin,
|
||||
bst_feature_t num_columns, bst_row_t num_rows, DeviceOrd device)
|
||||
: num_rows_{num_rows}, num_columns_{num_columns}, num_bins_{max_bin}, device_{device} {
|
||||
CHECK(device.IsCUDA());
|
||||
// Initialize Sketches for this dmatrix
|
||||
this->columns_ptr_.SetDevice(device_);
|
||||
this->columns_ptr_.Resize(num_columns + 1);
|
||||
this->columns_ptr_b_.SetDevice(device_);
|
||||
this->columns_ptr_b_.Resize(num_columns + 1);
|
||||
|
||||
this->feature_types_.Resize(feature_types.Size());
|
||||
this->feature_types_.Copy(feature_types);
|
||||
// Pull to device.
|
||||
this->feature_types_.SetDevice(device);
|
||||
this->feature_types_.ConstDeviceSpan();
|
||||
this->feature_types_.ConstHostSpan();
|
||||
this->feature_types_.Resize(feature_types.Size());
|
||||
this->feature_types_.Copy(feature_types);
|
||||
// Pull to device.
|
||||
this->feature_types_.SetDevice(device);
|
||||
this->feature_types_.ConstDeviceSpan();
|
||||
this->feature_types_.ConstHostSpan();
|
||||
|
||||
auto d_feature_types = feature_types_.ConstDeviceSpan();
|
||||
has_categorical_ =
|
||||
!d_feature_types.empty() &&
|
||||
thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types),
|
||||
common::IsCatOp{});
|
||||
auto d_feature_types = feature_types_.ConstDeviceSpan();
|
||||
has_categorical_ =
|
||||
!d_feature_types.empty() &&
|
||||
thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types), common::IsCatOp{});
|
||||
|
||||
timer_.Init(__func__);
|
||||
}
|
||||
timer_.Init(__func__);
|
||||
}
|
||||
/* \brief Return GPU ID for this container. */
|
||||
int32_t DeviceIdx() const { return device_; }
|
||||
[[nodiscard]] DeviceOrd DeviceIdx() const { return device_; }
|
||||
/* \brief Whether the predictor matrix contains categorical features. */
|
||||
bool HasCategorical() const { return has_categorical_; }
|
||||
/* \brief Accumulate weights of duplicated entries in input. */
|
||||
@@ -175,9 +172,7 @@ class SketchContainer {
|
||||
template <typename KeyComp = thrust::equal_to<size_t>>
|
||||
size_t Unique(KeyComp key_comp = thrust::equal_to<size_t>{}) {
|
||||
timer_.Start(__func__);
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_.ordinal));
|
||||
this->columns_ptr_.SetDevice(device_);
|
||||
Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan();
|
||||
CHECK_EQ(d_column_scan.size(), num_columns_ + 1);
|
||||
@@ -195,7 +190,7 @@ class SketchContainer {
|
||||
d_column_scan.data() + d_column_scan.size(), entries.data(),
|
||||
entries.data() + entries.size(), scan_out.DevicePointer(),
|
||||
entries.data(), detail::SketchUnique{}, key_comp);
|
||||
#else
|
||||
#elif defined(XGBOOST_USE_CUDA)
|
||||
size_t n_uniques = dh::SegmentedUnique(
|
||||
thrust::cuda::par(alloc), d_column_scan.data(),
|
||||
d_column_scan.data() + d_column_scan.size(), entries.data(),
|
||||
|
||||
@@ -35,13 +35,13 @@ struct WQSummary {
|
||||
/*! \brief an entry in the sketch summary */
|
||||
struct Entry {
|
||||
/*! \brief minimum rank */
|
||||
RType rmin;
|
||||
RType rmin{};
|
||||
/*! \brief maximum rank */
|
||||
RType rmax;
|
||||
RType rmax{};
|
||||
/*! \brief maximum weight */
|
||||
RType wmin;
|
||||
RType wmin{};
|
||||
/*! \brief the value of data */
|
||||
DType value;
|
||||
DType value{};
|
||||
// constructor
|
||||
XGBOOST_DEVICE Entry() {} // NOLINT
|
||||
// constructor
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
/**
|
||||
* Copyright 2023 by XGBoost contributors
|
||||
* Copyright 2023, XGBoost contributors
|
||||
*/
|
||||
#include "quantile_loss_utils.h"
|
||||
|
||||
#include <cctype> // std::isspace
|
||||
#include <istream> // std::istream
|
||||
#include <ostream> // std::ostream
|
||||
#include <string> // std::string
|
||||
#include <vector> // std::vector
|
||||
#include <cctype> // for isspace
|
||||
#include <istream> // for istream
|
||||
#include <ostream> // for ostream
|
||||
#include <string> // for string
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "xgboost/json.h" // F32Array,TypeCheck,get,Number
|
||||
#include "xgboost/json_io.h" // JsonWriter
|
||||
#include "../common/json_utils.h" // for TypeCheck
|
||||
#include "xgboost/json.h" // for F32Array, get, Number
|
||||
#include "xgboost/json_io.h" // for JsonWriter
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
namespace xgboost::common {
|
||||
std::ostream& operator<<(std::ostream& os, const ParamFloatArray& array) {
|
||||
auto const& t = array.Get();
|
||||
xgboost::F32Array arr{t.size()};
|
||||
@@ -70,5 +70,4 @@ std::istream& operator>>(std::istream& is, ParamFloatArray& array) {
|
||||
}
|
||||
|
||||
DMLC_REGISTER_PARAMETER(QuantileLossParam);
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::common
|
||||
|
||||
@@ -197,10 +197,10 @@ class RankingCache {
|
||||
CHECK_EQ(info.group_ptr_.back(), info.labels.Size())
|
||||
<< error::GroupSize() << "the size of label.";
|
||||
}
|
||||
if (ctx->IsCPU()) {
|
||||
this->InitOnCPU(ctx, info);
|
||||
} else {
|
||||
if (ctx->IsCUDA()) {
|
||||
this->InitOnCUDA(ctx, info);
|
||||
} else {
|
||||
this->InitOnCPU(ctx, info);
|
||||
}
|
||||
if (!info.weights_.Empty()) {
|
||||
CHECK_EQ(Groups(), info.weights_.Size()) << error::GroupWeight();
|
||||
@@ -218,7 +218,7 @@ class RankingCache {
|
||||
// Constructed as [1, n_samples] if group ptr is not supplied by the user
|
||||
common::Span<bst_group_t const> DataGroupPtr(Context const* ctx) const {
|
||||
group_ptr_.SetDevice(ctx->Device());
|
||||
return ctx->IsCPU() ? group_ptr_.ConstHostSpan() : group_ptr_.ConstDeviceSpan();
|
||||
return ctx->IsCUDA() ? group_ptr_.ConstDeviceSpan() : group_ptr_.ConstHostSpan();
|
||||
}
|
||||
|
||||
[[nodiscard]] auto const& Param() const { return param_; }
|
||||
@@ -231,10 +231,10 @@ class RankingCache {
|
||||
sorted_idx_cache_.SetDevice(ctx->Device());
|
||||
sorted_idx_cache_.Resize(predt.size());
|
||||
}
|
||||
if (ctx->IsCPU()) {
|
||||
return this->MakeRankOnCPU(ctx, predt);
|
||||
} else {
|
||||
if (ctx->IsCUDA()) {
|
||||
return this->MakeRankOnCUDA(ctx, predt);
|
||||
} else {
|
||||
return this->MakeRankOnCPU(ctx, predt);
|
||||
}
|
||||
}
|
||||
// The function simply returns an uninitialized buffer as this is only used by the
|
||||
@@ -307,10 +307,10 @@ class NDCGCache : public RankingCache {
|
||||
public:
|
||||
NDCGCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
|
||||
: RankingCache{ctx, info, p} {
|
||||
if (ctx->IsCPU()) {
|
||||
this->InitOnCPU(ctx, info);
|
||||
} else {
|
||||
if (ctx->IsCUDA()) {
|
||||
this->InitOnCUDA(ctx, info);
|
||||
} else {
|
||||
this->InitOnCPU(ctx, info);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -318,7 +318,7 @@ class NDCGCache : public RankingCache {
|
||||
return inv_idcg_.View(ctx->Device());
|
||||
}
|
||||
common::Span<double const> Discount(Context const* ctx) const {
|
||||
return ctx->IsCPU() ? discounts_.ConstHostSpan() : discounts_.ConstDeviceSpan();
|
||||
return ctx->IsCUDA() ? discounts_.ConstDeviceSpan() : discounts_.ConstHostSpan();
|
||||
}
|
||||
linalg::VectorView<double> Dcg(Context const* ctx) {
|
||||
if (dcg_.Size() == 0) {
|
||||
@@ -387,10 +387,10 @@ class PreCache : public RankingCache {
|
||||
public:
|
||||
PreCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
|
||||
: RankingCache{ctx, info, p} {
|
||||
if (ctx->IsCPU()) {
|
||||
this->InitOnCPU(ctx, info);
|
||||
} else {
|
||||
if (ctx->IsCUDA()) {
|
||||
this->InitOnCUDA(ctx, info);
|
||||
} else {
|
||||
this->InitOnCPU(ctx, info);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -399,7 +399,7 @@ class PreCache : public RankingCache {
|
||||
pre_.SetDevice(ctx->Device());
|
||||
pre_.Resize(this->Groups());
|
||||
}
|
||||
return ctx->IsCPU() ? pre_.HostSpan() : pre_.DeviceSpan();
|
||||
return ctx->IsCUDA() ? pre_.DeviceSpan() : pre_.HostSpan();
|
||||
}
|
||||
};
|
||||
|
||||
@@ -418,10 +418,10 @@ class MAPCache : public RankingCache {
|
||||
public:
|
||||
MAPCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
|
||||
: RankingCache{ctx, info, p}, n_samples_{static_cast<std::size_t>(info.num_row_)} {
|
||||
if (ctx->IsCPU()) {
|
||||
this->InitOnCPU(ctx, info);
|
||||
} else {
|
||||
if (ctx->IsCUDA()) {
|
||||
this->InitOnCUDA(ctx, info);
|
||||
} else {
|
||||
this->InitOnCPU(ctx, info);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -430,21 +430,21 @@ class MAPCache : public RankingCache {
|
||||
n_rel_.SetDevice(ctx->Device());
|
||||
n_rel_.Resize(n_samples_);
|
||||
}
|
||||
return ctx->IsCPU() ? n_rel_.HostSpan() : n_rel_.DeviceSpan();
|
||||
return ctx->IsCUDA() ? n_rel_.DeviceSpan() : n_rel_.HostSpan();
|
||||
}
|
||||
common::Span<double> Acc(Context const* ctx) {
|
||||
if (acc_.Empty()) {
|
||||
acc_.SetDevice(ctx->Device());
|
||||
acc_.Resize(n_samples_);
|
||||
}
|
||||
return ctx->IsCPU() ? acc_.HostSpan() : acc_.DeviceSpan();
|
||||
return ctx->IsCUDA() ? acc_.DeviceSpan() : acc_.HostSpan();
|
||||
}
|
||||
common::Span<double> Map(Context const* ctx) {
|
||||
if (map_.Empty()) {
|
||||
map_.SetDevice(ctx->Device());
|
||||
map_.Resize(this->Groups());
|
||||
}
|
||||
return ctx->IsCPU() ? map_.HostSpan() : map_.DeviceSpan();
|
||||
return ctx->IsCUDA() ? map_.DeviceSpan() : map_.HostSpan();
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -76,7 +76,7 @@ class RefResourceView {
|
||||
|
||||
[[nodiscard]] size_type size() const { return size_; } // NOLINT
|
||||
[[nodiscard]] size_type size_bytes() const { // NOLINT
|
||||
return Span{data(), size()}.size_bytes();
|
||||
return Span<const value_type>{data(), size()}.size_bytes();
|
||||
}
|
||||
[[nodiscard]] value_type* data() { return ptr_; }; // NOLINT
|
||||
[[nodiscard]] value_type const* data() const { return ptr_; }; // NOLINT
|
||||
|
||||
@@ -15,8 +15,7 @@
|
||||
#include "xgboost/linalg.h" // Tensor, UnravelIndex, Apply
|
||||
#include "xgboost/logging.h" // CHECK_EQ
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
namespace xgboost::common {
|
||||
void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
|
||||
HostDeviceVector<float> const& weights, linalg::Tensor<float, 1>* out) {
|
||||
if (!ctx->IsCPU()) {
|
||||
@@ -46,11 +45,13 @@ void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
|
||||
}
|
||||
|
||||
void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<float>* out) {
|
||||
v.SetDevice(ctx->gpu_id);
|
||||
out->SetDevice(ctx->gpu_id);
|
||||
v.SetDevice(ctx->Device());
|
||||
out->SetDevice(ctx->Device());
|
||||
out->Reshape(1);
|
||||
|
||||
if (ctx->IsCPU()) {
|
||||
if (ctx->IsCUDA()) {
|
||||
cuda_impl::Mean(ctx, v.View(ctx->Device()), out->View(ctx->Device()));
|
||||
} else {
|
||||
auto h_v = v.HostView();
|
||||
float n = v.Size();
|
||||
MemStackAllocator<float, DefaultMaxThreads()> tloc(ctx->Threads(), 0.0f);
|
||||
@@ -58,9 +59,6 @@ void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<flo
|
||||
[&](auto i) { tloc[omp_get_thread_num()] += h_v(i) / n; });
|
||||
auto ret = std::accumulate(tloc.cbegin(), tloc.cend(), .0f);
|
||||
out->HostView()(0) = ret;
|
||||
} else {
|
||||
cuda_impl::Mean(ctx, v.View(ctx->Device()), out->View(ctx->Device()));
|
||||
}
|
||||
}
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::common
|
||||
|
||||
@@ -15,19 +15,16 @@
|
||||
#include "xgboost/host_device_vector.h" // HostDeviceVector
|
||||
#include "xgboost/linalg.h" // linalg::TensorView, UnravelIndex, Apply
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
namespace cuda_impl {
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
namespace cub = hipcub;
|
||||
#endif
|
||||
|
||||
namespace xgboost::common::cuda_impl {
|
||||
void Median(Context const* ctx, linalg::TensorView<float const, 2> t,
|
||||
common::OptionalWeights weights, linalg::Tensor<float, 1>* out) {
|
||||
CHECK_GE(t.Shape(1), 1);
|
||||
HostDeviceVector<std::size_t> segments(t.Shape(1) + 1, 0);
|
||||
segments.SetDevice(ctx->gpu_id);
|
||||
segments.SetDevice(ctx->Device());
|
||||
auto d_segments = segments.DeviceSpan();
|
||||
dh::LaunchN(d_segments.size(), ctx->CUDACtx()->Stream(),
|
||||
[=] XGBOOST_DEVICE(std::size_t i) { d_segments[i] = t.Shape(0) * i; });
|
||||
@@ -36,7 +33,7 @@ void Median(Context const* ctx, linalg::TensorView<float const, 2> t,
|
||||
return linalg::detail::Apply(t, linalg::UnravelIndex(i, t.Shape()));
|
||||
});
|
||||
|
||||
out->SetDevice(ctx->gpu_id);
|
||||
out->SetDevice(ctx->Device());
|
||||
out->Reshape(t.Shape(1));
|
||||
if (weights.Empty()) {
|
||||
common::SegmentedQuantile(ctx, 0.5, dh::tcbegin(d_segments), dh::tcend(d_segments), val_it,
|
||||
@@ -65,6 +62,4 @@ void Mean(Context const* ctx, linalg::VectorView<float const> v, linalg::VectorV
|
||||
dh::TemporaryArray<char> temp{bytes};
|
||||
cub::DeviceReduce::Sum(temp.data().get(), bytes, it, out.Values().data(), v.Size(), s);
|
||||
}
|
||||
} // namespace cuda_impl
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::common::cuda_impl
|
||||
|
||||
@@ -160,7 +160,7 @@ void SegmentedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_begin, Se
|
||||
auto d_sorted_idx = dh::ToSpan(sorted_idx);
|
||||
auto val = thrust::make_permutation_iterator(val_begin, dh::tcbegin(d_sorted_idx));
|
||||
|
||||
quantiles->SetDevice(ctx->gpu_id);
|
||||
quantiles->SetDevice(ctx->Device());
|
||||
quantiles->Resize(n_segments);
|
||||
auto d_results = quantiles->DeviceSpan();
|
||||
|
||||
@@ -226,7 +226,7 @@ void SegmentedWeightedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_b
|
||||
#endif
|
||||
|
||||
auto n_segments = std::distance(seg_beg, seg_end) - 1;
|
||||
quantiles->SetDevice(ctx->gpu_id);
|
||||
quantiles->SetDevice(ctx->Device());
|
||||
quantiles->Resize(n_segments);
|
||||
auto d_results = quantiles->DeviceSpan();
|
||||
auto d_weight_cdf = dh::ToSpan(weights_cdf);
|
||||
|
||||
@@ -3,14 +3,23 @@
|
||||
*/
|
||||
#include "threading_utils.h"
|
||||
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <algorithm>     // for max
#include <exception>     // for exception
#include <filesystem>    // for path, exists
#include <fstream>       // for ifstream
#include <string>        // for string
#include <system_error>  // for error_code
|
||||
|
||||
#include "xgboost/logging.h"
|
||||
#include "common.h" // for DivRoundUp
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
int32_t GetCfsCPUCount() noexcept {
|
||||
namespace xgboost::common {
|
||||
/**
|
||||
* Modified from
|
||||
* github.com/psiha/sweater/blob/master/include/boost/sweater/hardware_concurrency.hpp
|
||||
*
|
||||
* MIT License: Copyright (c) 2016 Domagoj Šarić
|
||||
*/
|
||||
std::int32_t GetCGroupV1Count(std::filesystem::path const& quota_path,
|
||||
std::filesystem::path const& peroid_path) {
|
||||
#if defined(__linux__)
|
||||
// https://bugs.openjdk.java.net/browse/JDK-8146115
|
||||
// http://hg.openjdk.java.net/jdk/hs/rev/7f22774a5f42
|
||||
@@ -31,8 +40,8 @@ int32_t GetCfsCPUCount() noexcept {
|
||||
}
|
||||
};
|
||||
// complete fair scheduler from Linux
|
||||
auto const cfs_quota(read_int("/sys/fs/cgroup/cpu/cpu.cfs_quota_us"));
|
||||
auto const cfs_period(read_int("/sys/fs/cgroup/cpu/cpu.cfs_period_us"));
|
||||
auto const cfs_quota(read_int(quota_path.c_str()));
|
||||
auto const cfs_period(read_int(peroid_path.c_str()));
|
||||
if ((cfs_quota > 0) && (cfs_period > 0)) {
|
||||
return std::max(cfs_quota / cfs_period, 1);
|
||||
}
|
||||
@@ -40,6 +49,47 @@ int32_t GetCfsCPUCount() noexcept {
|
||||
return -1;
|
||||
}
|
||||
|
||||
std::int32_t GetCGroupV2Count(std::filesystem::path const& bandwidth_path) noexcept(true) {
|
||||
std::int32_t cnt{-1};
|
||||
#if defined(__linux__)
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
std::int32_t a{0}, b{0};
|
||||
|
||||
auto warn = [] { LOG(WARNING) << "Invalid cgroupv2 file."; };
|
||||
try {
|
||||
std::ifstream fin{bandwidth_path, std::ios::in};
|
||||
fin >> a;
|
||||
fin >> b;
|
||||
} catch (std::exception const&) {
|
||||
warn();
|
||||
return cnt;
|
||||
}
|
||||
if (a > 0 && b > 0) {
|
||||
cnt = std::max(common::DivRoundUp(a, b), 1);
|
||||
}
|
||||
#endif // defined(__linux__)
|
||||
return cnt;
|
||||
}
|
||||
|
||||
std::int32_t GetCfsCPUCount() noexcept {
|
||||
namespace fs = std::filesystem;
|
||||
fs::path const bandwidth_path{"/sys/fs/cgroup/cpu.max"};
|
||||
auto has_v2 = fs::exists(bandwidth_path);
|
||||
if (has_v2) {
|
||||
return GetCGroupV2Count(bandwidth_path);
|
||||
}
|
||||
|
||||
fs::path const quota_path{"/sys/fs/cgroup/cpu/cpu.cfs_quota_us"};
|
||||
fs::path const peroid_path{"/sys/fs/cgroup/cpu/cpu.cfs_period_us"};
|
||||
auto has_v1 = fs::exists(quota_path) && fs::exists(peroid_path);
|
||||
if (has_v1) {
|
||||
return GetCGroupV1Count(quota_path, peroid_path);
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
std::int32_t OmpGetNumThreads(std::int32_t n_threads) {
|
||||
// Don't use parallel if we are in a parallel region.
|
||||
if (omp_in_parallel()) {
|
||||
@@ -54,5 +104,4 @@ std::int32_t OmpGetNumThreads(std::int32_t n_threads) {
|
||||
n_threads = std::max(n_threads, 1);
|
||||
return n_threads;
|
||||
}
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::common
|
||||
|
||||
@@ -253,11 +253,6 @@ inline std::int32_t OmpGetThreadLimit() {
|
||||
* \brief Get thread limit from CFS.
|
||||
*
|
||||
* This function has non-trivial overhead and should not be called repeatedly.
|
||||
*
|
||||
* Modified from
|
||||
* github.com/psiha/sweater/blob/master/include/boost/sweater/hardware_concurrency.hpp
|
||||
*
|
||||
* MIT License: Copyright (c) 2016 Domagoj Šarić
|
||||
*/
|
||||
std::int32_t GetCfsCPUCount() noexcept;
|
||||
|
||||
|
||||
@@ -62,8 +62,8 @@ class Transform {
|
||||
template <typename Functor>
|
||||
struct Evaluator {
|
||||
public:
|
||||
Evaluator(Functor func, Range range, int32_t n_threads, int32_t device_idx)
|
||||
: func_(func), range_{std::move(range)}, n_threads_{n_threads}, device_{device_idx} {}
|
||||
Evaluator(Functor func, Range range, int32_t n_threads, DeviceOrd device)
|
||||
: func_(func), range_{std::move(range)}, n_threads_{n_threads}, device_{device} {}
|
||||
|
||||
/*!
|
||||
* \brief Evaluate the functor with input pointers to HostDeviceVector.
|
||||
@@ -73,7 +73,7 @@ class Transform {
|
||||
*/
|
||||
template <typename... HDV>
|
||||
void Eval(HDV... vectors) const {
|
||||
bool on_device = device_ >= 0;
|
||||
bool on_device = device_.IsCUDA();
|
||||
|
||||
if (on_device) {
|
||||
LaunchCUDA(func_, vectors...);
|
||||
@@ -118,11 +118,11 @@ class Transform {
|
||||
}
|
||||
// Recursive unpack for Shard.
|
||||
template <typename T>
|
||||
void UnpackShard(int device, const HostDeviceVector<T> *vector) const {
|
||||
void UnpackShard(DeviceOrd device, const HostDeviceVector<T> *vector) const {
|
||||
vector->SetDevice(device);
|
||||
}
|
||||
template <typename Head, typename... Rest>
|
||||
void UnpackShard(int device,
|
||||
void UnpackShard(DeviceOrd device,
|
||||
const HostDeviceVector<Head> *_vector,
|
||||
const HostDeviceVector<Rest> *... _vectors) const {
|
||||
_vector->SetDevice(device);
|
||||
@@ -142,13 +142,7 @@ class Transform {
|
||||
// granularity is used in data vector.
|
||||
size_t shard_size = range_size;
|
||||
Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipSetDevice(device_));
|
||||
#elif defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
#endif
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_.ordinal));
|
||||
const int kGrids =
|
||||
static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
|
||||
if (kGrids == 0) {
|
||||
@@ -182,7 +176,7 @@ class Transform {
|
||||
/*! \brief Range object specifying parallel threads index range. */
|
||||
Range range_;
|
||||
int32_t n_threads_;
|
||||
int32_t device_;
|
||||
DeviceOrd device_;
|
||||
};
|
||||
|
||||
public:
|
||||
@@ -200,8 +194,8 @@ class Transform {
|
||||
*/
|
||||
template <typename Functor>
|
||||
static Evaluator<Functor> Init(Functor func, Range const range, int32_t n_threads,
|
||||
int32_t device_idx) {
|
||||
return Evaluator<Functor>{func, std::move(range), n_threads, device_idx};
|
||||
DeviceOrd device) {
|
||||
return Evaluator<Functor>{func, std::move(range), n_threads, device};
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -20,7 +20,6 @@ namespace xgboost {
|
||||
|
||||
DMLC_REGISTER_PARAMETER(Context);
|
||||
|
||||
bst_d_ordinal_t constexpr Context::kCpuId;
|
||||
std::int64_t constexpr Context::kDefaultSeed;
|
||||
|
||||
Context::Context() : cfs_cpu_count_{common::GetCfsCPUCount()} {}
|
||||
@@ -82,7 +81,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
std::int32_t parsed_id{Context::kCpuId};
|
||||
std::int32_t parsed_id{DeviceOrd::CPUOrdinal()};
|
||||
auto res = std::from_chars(ordinal.c_str(), ordinal.c_str() + ordinal.size(), parsed_id);
|
||||
if (res.ec != std::errc()) {
|
||||
return std::nullopt;
|
||||
@@ -119,7 +118,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
|
||||
|
||||
auto split_it = std::find(s_device.cbegin(), s_device.cend(), ':');
|
||||
DeviceOrd device;
|
||||
device.ordinal = Context::InvalidOrdinal(); // mark it invalid for check.
|
||||
device.ordinal = DeviceOrd::InvalidOrdinal(); // mark it invalid for check.
|
||||
if (split_it == s_device.cend()) {
|
||||
// no ordinal.
|
||||
if (s_device == DeviceSym::CPU()) {
|
||||
@@ -147,7 +146,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
|
||||
device = DeviceOrd::CUDA(opt_id.value());
|
||||
}
|
||||
|
||||
if (device.ordinal < Context::kCpuId) {
|
||||
if (device.ordinal < DeviceOrd::CPUOrdinal()) {
|
||||
fatal();
|
||||
}
|
||||
device = CUDAOrdinal(device, fail_on_invalid_gpu_id);
|
||||
@@ -156,6 +155,28 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
|
||||
}
|
||||
} // namespace
|
||||
|
||||
/**
 * @brief Stream insertion for DeviceOrd.
 *
 * Writes the value returned by ord.Name() to the stream.
 *
 * @param os  Output stream to write to.
 * @param ord Device ordinal to print (taken by value).
 * @return The same stream, so insertions can be chained.
 */
std::ostream& operator<<(std::ostream& os, DeviceOrd ord) {
|
||||
os << ord.Name();
|
||||
return os;
|
||||
}
|
||||
|
||||
/**
 * @brief Initialize the context from key/value arguments.
 *
 * Forwards kwargs to UpdateAllowUnknown(). If that call reports any entries as unknown, an
 * error message listing every unknown key (double-quoted, comma-separated, wrapped in braces)
 * is assembled and emitted through LOG(FATAL).
 *
 * @param kwargs Parameters to apply to this Context.
 */
void Context::Init(Args const& kwargs) {
|
||||
auto unknown = this->UpdateAllowUnknown(kwargs);
|
||||
if (!unknown.empty()) {
|
||||
std::stringstream ss;
|
||||
// Counts the keys written so far; used to decide whether a separator is still needed.
std::size_t i = 0;
|
||||
ss << "[Internal Error] Unknown parameters passed to the Context {";
|
||||
// Only the key of each unknown entry is reported; the value is ignored.
for (auto const& [k, _] : unknown) {
|
||||
ss << '"' << k << '"';
|
||||
// Emit ", " after every key except the last one.
if (++i != unknown.size()) {
|
||||
ss << ", ";
|
||||
}
|
||||
}
|
||||
ss << "}\n";
|
||||
LOG(FATAL) << ss.str();
|
||||
}
|
||||
}
|
||||
|
||||
void Context::ConfigureGpuId(bool require_gpu) {
|
||||
if (this->IsCPU() && require_gpu) {
|
||||
this->UpdateAllowUnknown(Args{{kDevice, DeviceSym::CUDA()}});
|
||||
@@ -178,7 +199,7 @@ void Context::SetDeviceOrdinal(Args const& kwargs) {
|
||||
error::WarnDeprecatedGPUId();
|
||||
auto opt_id = ParseInt(StringView{gpu_id_it->second});
|
||||
CHECK(opt_id.has_value()) << "Invalid value for `gpu_id`. Got:" << gpu_id_it->second;
|
||||
if (opt_id.value() > Context::kCpuId) {
|
||||
if (opt_id.value() > DeviceOrd::CPUOrdinal()) {
|
||||
this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CUDA(opt_id.value()).Name()}});
|
||||
} else {
|
||||
this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CPU().Name()}});
|
||||
@@ -194,9 +215,9 @@ void Context::SetDeviceOrdinal(Args const& kwargs) {
|
||||
this->SetDevice(new_d);
|
||||
|
||||
if (this->IsCPU()) {
|
||||
CHECK_EQ(this->device_.ordinal, kCpuId);
|
||||
CHECK_EQ(this->device_.ordinal, DeviceOrd::CPUOrdinal());
|
||||
} else {
|
||||
CHECK_GT(this->device_.ordinal, kCpuId);
|
||||
CHECK_GT(this->device_.ordinal, DeviceOrd::CPUOrdinal());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
28
src/data/adapter.cc
Normal file
28
src/data/adapter.cc
Normal file
@@ -0,0 +1,28 @@
|
||||
/**
|
||||
* Copyright 2019-2023, XGBoost Contributors
|
||||
*/
|
||||
#include "adapter.h"
|
||||
|
||||
#include "../c_api/c_api_error.h" // for API_BEGIN, API_END
|
||||
#include "xgboost/c_api.h"
|
||||
|
||||
namespace xgboost::data {
|
||||
/**
 * @brief Ask the user-supplied callback for the next batch.
 *
 * The callback stored in next_callback_ is invoked with three arguments: data_handle_, a
 * capture-less lambda, and `this` as an opaque handle. The lambda casts the handle back to
 * IteratorAdapter and forwards the received batch to SetData().
 *
 * @return true when the callback returns a non-zero value (at_first_ is cleared before
 *         returning); false when it returns zero.
 */
template <typename DataIterHandle, typename XGBCallbackDataIterNext, typename XGBoostBatchCSR>
|
||||
bool IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>::Next() {
|
||||
if ((*next_callback_)(
|
||||
data_handle_,
|
||||
[](void *handle, XGBoostBatchCSR batch) -> int {
|
||||
// The lambda has no explicit return statement, so its int result has to be produced by
// the API_BEGIN/API_END macro pair.
// NOTE(review): presumably these macros turn exceptions into an error code - confirm in
// c_api_error.h.
API_BEGIN();
|
||||
static_cast<IteratorAdapter *>(handle)->SetData(batch);
|
||||
API_END();
|
||||
},
|
||||
this) != 0) {
|
||||
// At least one batch has been requested successfully; BeforeFirst() checks this flag.
at_first_ = false;
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
template class IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>;
|
||||
} // namespace xgboost::data
|
||||
@@ -1,5 +1,5 @@
|
||||
/*!
|
||||
* Copyright (c) 2019~2021 by Contributors
|
||||
/**
|
||||
* Copyright 2019-2023, XGBoost Contributors
|
||||
* \file adapter.h
|
||||
*/
|
||||
#ifndef XGBOOST_DATA_ADAPTER_H_
|
||||
@@ -16,11 +16,9 @@
|
||||
#include <utility> // std::move
|
||||
#include <vector>
|
||||
|
||||
#include "../c_api/c_api_error.h"
|
||||
#include "../common/error_msg.h" // for MaxFeatureSize
|
||||
#include "../common/math.h"
|
||||
#include "array_interface.h"
|
||||
#include "arrow-cdi.h"
|
||||
#include "xgboost/base.h"
|
||||
#include "xgboost/data.h"
|
||||
#include "xgboost/logging.h"
|
||||
@@ -743,8 +741,10 @@ class FileAdapter : dmlc::DataIter<FileAdapterBatch> {
|
||||
dmlc::Parser<uint32_t>* parser_;
|
||||
};
|
||||
|
||||
/*! \brief Data iterator that takes callback to return data, used in JVM package for
|
||||
* accepting data iterator. */
|
||||
/**
|
||||
* @brief Data iterator that takes callback to return data, used in JVM package for accepting data
|
||||
* iterator.
|
||||
*/
|
||||
template <typename DataIterHandle, typename XGBCallbackDataIterNext, typename XGBoostBatchCSR>
|
||||
class IteratorAdapter : public dmlc::DataIter<FileAdapterBatch> {
|
||||
public:
|
||||
@@ -758,23 +758,9 @@ class IteratorAdapter : public dmlc::DataIter<FileAdapterBatch> {
|
||||
CHECK(at_first_) << "Cannot reset IteratorAdapter";
|
||||
}
|
||||
|
||||
bool Next() override {
|
||||
if ((*next_callback_)(
|
||||
data_handle_,
|
||||
[](void *handle, XGBoostBatchCSR batch) -> int {
|
||||
API_BEGIN();
|
||||
static_cast<IteratorAdapter *>(handle)->SetData(batch);
|
||||
API_END();
|
||||
},
|
||||
this) != 0) {
|
||||
at_first_ = false;
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
[[nodiscard]] bool Next() override;
|
||||
|
||||
FileAdapterBatch const& Value() const override {
|
||||
[[nodiscard]] FileAdapterBatch const& Value() const override {
|
||||
return *batch_.get();
|
||||
}
|
||||
|
||||
@@ -822,12 +808,12 @@ class IteratorAdapter : public dmlc::DataIter<FileAdapterBatch> {
|
||||
block_.index = dmlc::BeginPtr(index_);
|
||||
block_.value = dmlc::BeginPtr(value_);
|
||||
|
||||
batch_.reset(new FileAdapterBatch(&block_, row_offset_));
|
||||
batch_ = std::make_unique<FileAdapterBatch>(&block_, row_offset_);
|
||||
row_offset_ += offset_.size() - 1;
|
||||
}
|
||||
|
||||
size_t NumColumns() const { return columns_; }
|
||||
size_t NumRows() const { return kAdapterUnknownSize; }
|
||||
[[nodiscard]] std::size_t NumColumns() const { return columns_; }
|
||||
[[nodiscard]] std::size_t NumRows() const { return kAdapterUnknownSize; }
|
||||
|
||||
private:
|
||||
std::vector<size_t> offset_;
|
||||
@@ -849,356 +835,6 @@ class IteratorAdapter : public dmlc::DataIter<FileAdapterBatch> {
|
||||
std::unique_ptr<FileAdapterBatch> batch_;
|
||||
};
|
||||
|
||||
enum ColumnDType : uint8_t {
|
||||
kUnknown,
|
||||
kInt8,
|
||||
kUInt8,
|
||||
kInt16,
|
||||
kUInt16,
|
||||
kInt32,
|
||||
kUInt32,
|
||||
kInt64,
|
||||
kUInt64,
|
||||
kFloat,
|
||||
kDouble
|
||||
};
|
||||
|
||||
class Column {
|
||||
public:
|
||||
Column() = default;
|
||||
|
||||
Column(size_t col_idx, size_t length, size_t null_count, const uint8_t* bitmap)
|
||||
: col_idx_{col_idx}, length_{length}, null_count_{null_count}, bitmap_{bitmap} {}
|
||||
|
||||
virtual ~Column() = default;
|
||||
|
||||
Column(const Column&) = delete;
|
||||
Column& operator=(const Column&) = delete;
|
||||
Column(Column&&) = delete;
|
||||
Column& operator=(Column&&) = delete;
|
||||
|
||||
// whether the valid bit is set for this element
|
||||
bool IsValid(size_t row_idx) const {
|
||||
return (!bitmap_ || (bitmap_[row_idx/8] & (1 << (row_idx%8))));
|
||||
}
|
||||
|
||||
virtual COOTuple GetElement(size_t row_idx) const = 0;
|
||||
|
||||
virtual bool IsValidElement(size_t row_idx) const = 0;
|
||||
|
||||
virtual std::vector<float> AsFloatVector() const = 0;
|
||||
|
||||
virtual std::vector<uint64_t> AsUint64Vector() const = 0;
|
||||
|
||||
size_t Length() const { return length_; }
|
||||
|
||||
protected:
|
||||
size_t col_idx_;
|
||||
size_t length_;
|
||||
size_t null_count_;
|
||||
const uint8_t* bitmap_;
|
||||
};
|
||||
|
||||
// Only columns of primitive types are supported. An ArrowColumnarBatch is a
|
||||
// collection of std::shared_ptr<PrimitiveColumn>. These columns can be of different data types.
|
||||
// Hence, PrimitiveColumn is a class template; and all concrete PrimitiveColumns
|
||||
// derive from the abstract class Column.
|
||||
template <typename T>
|
||||
class PrimitiveColumn : public Column {
|
||||
static constexpr float kNaN = std::numeric_limits<float>::quiet_NaN();
|
||||
|
||||
public:
|
||||
PrimitiveColumn(size_t idx, size_t length, size_t null_count,
|
||||
const uint8_t* bitmap, const T* data, float missing)
|
||||
: Column{idx, length, null_count, bitmap}, data_{data}, missing_{missing} {}
|
||||
|
||||
COOTuple GetElement(size_t row_idx) const override {
|
||||
CHECK(data_ && row_idx < length_) << "Column is empty or out-of-bound index of the column";
|
||||
return { row_idx, col_idx_, IsValidElement(row_idx) ?
|
||||
static_cast<float>(data_[row_idx]) : kNaN };
|
||||
}
|
||||
|
||||
bool IsValidElement(size_t row_idx) const override {
|
||||
// std::isfinite needs to cast to double to prevent msvc report error
|
||||
return IsValid(row_idx)
|
||||
&& std::isfinite(static_cast<double>(data_[row_idx]))
|
||||
&& static_cast<float>(data_[row_idx]) != missing_;
|
||||
}
|
||||
|
||||
std::vector<float> AsFloatVector() const override {
|
||||
CHECK(data_) << "Column is empty";
|
||||
std::vector<float> fv(length_);
|
||||
std::transform(data_, data_ + length_, fv.begin(),
|
||||
[](T v) { return static_cast<float>(v); });
|
||||
return fv;
|
||||
}
|
||||
|
||||
std::vector<uint64_t> AsUint64Vector() const override {
|
||||
CHECK(data_) << "Column is empty";
|
||||
std::vector<uint64_t> iv(length_);
|
||||
std::transform(data_, data_ + length_, iv.begin(),
|
||||
[](T v) { return static_cast<uint64_t>(v); });
|
||||
return iv;
|
||||
}
|
||||
|
||||
private:
|
||||
const T* data_;
|
||||
float missing_; // user specified missing value
|
||||
};
|
||||
|
||||
struct ColumnarMetaInfo {
|
||||
// data type of the column
|
||||
ColumnDType type{ColumnDType::kUnknown};
|
||||
// location of the column in an Arrow record batch
|
||||
int64_t loc{-1};
|
||||
};
|
||||
|
||||
struct ArrowSchemaImporter {
|
||||
std::vector<ColumnarMetaInfo> columns;
|
||||
|
||||
// map Arrow format strings to types
|
||||
static ColumnDType FormatMap(char const* format_str) {
|
||||
CHECK(format_str) << "Format string cannot be empty";
|
||||
switch (format_str[0]) {
|
||||
case 'c':
|
||||
return ColumnDType::kInt8;
|
||||
case 'C':
|
||||
return ColumnDType::kUInt8;
|
||||
case 's':
|
||||
return ColumnDType::kInt16;
|
||||
case 'S':
|
||||
return ColumnDType::kUInt16;
|
||||
case 'i':
|
||||
return ColumnDType::kInt32;
|
||||
case 'I':
|
||||
return ColumnDType::kUInt32;
|
||||
case 'l':
|
||||
return ColumnDType::kInt64;
|
||||
case 'L':
|
||||
return ColumnDType::kUInt64;
|
||||
case 'f':
|
||||
return ColumnDType::kFloat;
|
||||
case 'g':
|
||||
return ColumnDType::kDouble;
|
||||
default:
|
||||
CHECK(false) << "Column data type not supported by XGBoost";
|
||||
return ColumnDType::kUnknown;
|
||||
}
|
||||
}
|
||||
|
||||
void Import(struct ArrowSchema *schema) {
|
||||
if (schema) {
|
||||
CHECK(std::string(schema->format) == "+s"); // NOLINT
|
||||
CHECK(columns.empty());
|
||||
for (auto i = 0; i < schema->n_children; ++i) {
|
||||
std::string name{schema->children[i]->name};
|
||||
ColumnDType type = FormatMap(schema->children[i]->format);
|
||||
ColumnarMetaInfo col_info{type, i};
|
||||
columns.push_back(col_info);
|
||||
}
|
||||
if (schema->release) {
|
||||
schema->release(schema);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class ArrowColumnarBatch {
|
||||
public:
|
||||
ArrowColumnarBatch(struct ArrowArray *rb, struct ArrowSchemaImporter* schema)
|
||||
: rb_{rb}, schema_{schema} {
|
||||
CHECK(rb_) << "Cannot import non-existent record batch";
|
||||
CHECK(!schema_->columns.empty()) << "Cannot import record batch without a schema";
|
||||
}
|
||||
|
||||
size_t Import(float missing) {
|
||||
auto& infov = schema_->columns;
|
||||
for (size_t i = 0; i < infov.size(); ++i) {
|
||||
columns_.push_back(CreateColumn(i, infov[i], missing));
|
||||
}
|
||||
|
||||
// Compute the starting location for every row in this batch
|
||||
auto batch_size = rb_->length;
|
||||
auto num_columns = columns_.size();
|
||||
row_offsets_.resize(batch_size + 1, 0);
|
||||
for (auto i = 0; i < batch_size; ++i) {
|
||||
row_offsets_[i+1] = row_offsets_[i];
|
||||
for (size_t j = 0; j < num_columns; ++j) {
|
||||
if (GetColumn(j).IsValidElement(i)) {
|
||||
row_offsets_[i+1]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
// return number of elements in the batch
|
||||
return row_offsets_.back();
|
||||
}
|
||||
|
||||
ArrowColumnarBatch(const ArrowColumnarBatch&) = delete;
|
||||
ArrowColumnarBatch& operator=(const ArrowColumnarBatch&) = delete;
|
||||
ArrowColumnarBatch(ArrowColumnarBatch&&) = delete;
|
||||
ArrowColumnarBatch& operator=(ArrowColumnarBatch&&) = delete;
|
||||
|
||||
virtual ~ArrowColumnarBatch() {
|
||||
if (rb_ && rb_->release) {
|
||||
rb_->release(rb_);
|
||||
rb_ = nullptr;
|
||||
}
|
||||
columns_.clear();
|
||||
}
|
||||
|
||||
size_t Size() const { return rb_ ? rb_->length : 0; }
|
||||
|
||||
size_t NumColumns() const { return columns_.size(); }
|
||||
|
||||
size_t NumElements() const { return row_offsets_.back(); }
|
||||
|
||||
const Column& GetColumn(size_t col_idx) const {
|
||||
return *columns_[col_idx];
|
||||
}
|
||||
|
||||
void ShiftRowOffsets(size_t batch_offset) {
|
||||
std::transform(row_offsets_.begin(), row_offsets_.end(), row_offsets_.begin(),
|
||||
[=](size_t c) { return c + batch_offset; });
|
||||
}
|
||||
|
||||
const std::vector<size_t>& RowOffsets() const { return row_offsets_; }
|
||||
|
||||
private:
|
||||
std::shared_ptr<Column> CreateColumn(size_t idx,
|
||||
ColumnarMetaInfo info,
|
||||
float missing) const {
|
||||
if (info.loc < 0) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto loc_in_batch = info.loc;
|
||||
auto length = rb_->length;
|
||||
auto null_count = rb_->null_count;
|
||||
auto buffers0 = rb_->children[loc_in_batch]->buffers[0];
|
||||
auto buffers1 = rb_->children[loc_in_batch]->buffers[1];
|
||||
const uint8_t* bitmap = buffers0 ? reinterpret_cast<const uint8_t*>(buffers0) : nullptr;
|
||||
const uint8_t* data = buffers1 ? reinterpret_cast<const uint8_t*>(buffers1) : nullptr;
|
||||
|
||||
// if null_count is not computed, compute it here
|
||||
if (null_count < 0) {
|
||||
if (!bitmap) {
|
||||
null_count = 0;
|
||||
} else {
|
||||
null_count = length;
|
||||
for (auto i = 0; i < length; ++i) {
|
||||
if (bitmap[i/8] & (1 << (i%8))) {
|
||||
null_count--;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
switch (info.type) {
|
||||
case ColumnDType::kInt8:
|
||||
return std::make_shared<PrimitiveColumn<int8_t>>(
|
||||
idx, length, null_count, bitmap,
|
||||
reinterpret_cast<const int8_t*>(data), missing);
|
||||
case ColumnDType::kUInt8:
|
||||
return std::make_shared<PrimitiveColumn<uint8_t>>(
|
||||
idx, length, null_count, bitmap, data, missing);
|
||||
case ColumnDType::kInt16:
|
||||
return std::make_shared<PrimitiveColumn<int16_t>>(
|
||||
idx, length, null_count, bitmap,
|
||||
reinterpret_cast<const int16_t*>(data), missing);
|
||||
case ColumnDType::kUInt16:
|
||||
return std::make_shared<PrimitiveColumn<uint16_t>>(
|
||||
idx, length, null_count, bitmap,
|
||||
reinterpret_cast<const uint16_t*>(data), missing);
|
||||
case ColumnDType::kInt32:
|
||||
return std::make_shared<PrimitiveColumn<int32_t>>(
|
||||
idx, length, null_count, bitmap,
|
||||
reinterpret_cast<const int32_t*>(data), missing);
|
||||
case ColumnDType::kUInt32:
|
||||
return std::make_shared<PrimitiveColumn<uint32_t>>(
|
||||
idx, length, null_count, bitmap,
|
||||
reinterpret_cast<const uint32_t*>(data), missing);
|
||||
case ColumnDType::kInt64:
|
||||
return std::make_shared<PrimitiveColumn<int64_t>>(
|
||||
idx, length, null_count, bitmap,
|
||||
reinterpret_cast<const int64_t*>(data), missing);
|
||||
case ColumnDType::kUInt64:
|
||||
return std::make_shared<PrimitiveColumn<uint64_t>>(
|
||||
idx, length, null_count, bitmap,
|
||||
reinterpret_cast<const uint64_t*>(data), missing);
|
||||
case ColumnDType::kFloat:
|
||||
return std::make_shared<PrimitiveColumn<float>>(
|
||||
idx, length, null_count, bitmap,
|
||||
reinterpret_cast<const float*>(data), missing);
|
||||
case ColumnDType::kDouble:
|
||||
return std::make_shared<PrimitiveColumn<double>>(
|
||||
idx, length, null_count, bitmap,
|
||||
reinterpret_cast<const double*>(data), missing);
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
struct ArrowArray* rb_;
|
||||
struct ArrowSchemaImporter* schema_;
|
||||
std::vector<std::shared_ptr<Column>> columns_;
|
||||
std::vector<size_t> row_offsets_;
|
||||
};
|
||||
|
||||
using ArrowColumnarBatchVec = std::vector<std::unique_ptr<ArrowColumnarBatch>>;
|
||||
class RecordBatchesIterAdapter: public dmlc::DataIter<ArrowColumnarBatchVec> {
|
||||
public:
|
||||
RecordBatchesIterAdapter(XGDMatrixCallbackNext* next_callback, int nbatch)
|
||||
: next_callback_{next_callback}, nbatches_{nbatch} {}
|
||||
|
||||
void BeforeFirst() override {
|
||||
CHECK(at_first_) << "Cannot reset RecordBatchesIterAdapter";
|
||||
}
|
||||
|
||||
bool Next() override {
|
||||
batches_.clear();
|
||||
while (batches_.size() < static_cast<size_t>(nbatches_) && (*next_callback_)(this) != 0) {
|
||||
at_first_ = false;
|
||||
}
|
||||
|
||||
if (batches_.size() > 0) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
void SetData(struct ArrowArray* rb, struct ArrowSchema* schema) {
|
||||
// Schema is only imported once at the beginning, regardless of how many
|
||||
// batches are coming.
|
||||
// But even if the schema is not imported, we still need to release its C data
|
||||
// exported from Arrow.
|
||||
if (at_first_ && schema) {
|
||||
schema_.Import(schema);
|
||||
} else {
|
||||
if (schema && schema->release) {
|
||||
schema->release(schema);
|
||||
}
|
||||
}
|
||||
if (rb) {
|
||||
batches_.push_back(std::make_unique<ArrowColumnarBatch>(rb, &schema_));
|
||||
}
|
||||
}
|
||||
|
||||
const ArrowColumnarBatchVec& Value() const override {
|
||||
return batches_;
|
||||
}
|
||||
|
||||
size_t NumColumns() const { return schema_.columns.size(); }
|
||||
size_t NumRows() const { return kAdapterUnknownSize; }
|
||||
|
||||
private:
|
||||
XGDMatrixCallbackNext *next_callback_;
|
||||
bool at_first_{true};
|
||||
int nbatches_;
|
||||
struct ArrowSchemaImporter schema_;
|
||||
ArrowColumnarBatchVec batches_;
|
||||
};
|
||||
|
||||
class SparsePageAdapterBatch {
|
||||
HostSparsePageView page_;
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "../common/bitfield.h"
|
||||
#include "../common/bitfield.h" // for RBitField8
|
||||
#include "../common/common.h"
|
||||
#include "../common/error_msg.h" // for NoF128
|
||||
#include "xgboost/base.h"
|
||||
@@ -106,7 +106,20 @@ struct ArrayInterfaceErrors {
|
||||
*/
|
||||
class ArrayInterfaceHandler {
|
||||
public:
|
||||
enum Type : std::int8_t { kF2, kF4, kF8, kF16, kI1, kI2, kI4, kI8, kU1, kU2, kU4, kU8 };
|
||||
enum Type : std::int8_t {
|
||||
kF2 = 0,
|
||||
kF4 = 1,
|
||||
kF8 = 2,
|
||||
kF16 = 3,
|
||||
kI1 = 4,
|
||||
kI2 = 5,
|
||||
kI4 = 6,
|
||||
kI8 = 7,
|
||||
kU1 = 8,
|
||||
kU2 = 9,
|
||||
kU4 = 10,
|
||||
kU8 = 11,
|
||||
};
|
||||
|
||||
template <typename PtrType>
|
||||
static PtrType GetPtrFromArrayData(Object::Map const &obj) {
|
||||
@@ -589,6 +602,57 @@ class ArrayInterface {
|
||||
ArrayInterfaceHandler::Type type{ArrayInterfaceHandler::kF16};
|
||||
};
|
||||
|
||||
/**
 * @brief Call a generic functor with a value whose static type matches a runtime dtype tag.
 *
 * Each ArrayInterfaceHandler::Type enumerator is mapped to a C++ type, and `dispatch` is
 * invoked with a value-initialized object of that type; its result is returned.
 *
 *   - kF2  -> __half, only when XGBOOST_USE_CUDA or __HIP_PLATFORM_AMD__ is defined;
 *             otherwise LOG(FATAL) is issued.
 *   - kF4  -> float, kF8 -> double
 *   - kF16 -> long double, guarded by a CHECK that sizeof(long double) == 16
 *   - kI1/kI2/kI4/kI8 -> std::int8_t / int16_t / int32_t / int64_t
 *   - kU1/kU2/kU4/kU8 -> std::uint8_t / uint16_t / uint32_t / uint64_t
 *
 * @tparam Fn      Callable accepting a single argument of any of the types listed above.
 * @param dtype    Runtime element-type tag.
 * @param dispatch Functor to invoke; only the type of the argument passed to it is meaningful.
 * @return Whatever `dispatch` returns for the selected type. The return type is deduced, so
 *         every return statement must produce the same type.
 */
template <typename Fn>
|
||||
auto DispatchDType(ArrayInterfaceHandler::Type dtype, Fn dispatch) {
|
||||
switch (dtype) {
|
||||
case ArrayInterfaceHandler::kF2: {
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
|
||||
return dispatch(__half{});
|
||||
#else
|
||||
LOG(FATAL) << "half type is only supported for CUDA input.";
|
||||
break;
|
||||
#endif
|
||||
}
|
||||
case ArrayInterfaceHandler::kF4: {
|
||||
return dispatch(float{});
|
||||
}
|
||||
case ArrayInterfaceHandler::kF8: {
|
||||
return dispatch(double{});
|
||||
}
|
||||
case ArrayInterfaceHandler::kF16: {
|
||||
using T = long double;
|
||||
CHECK(sizeof(T) == 16) << error::NoF128();
|
||||
return dispatch(T{});
|
||||
}
|
||||
case ArrayInterfaceHandler::kI1: {
|
||||
return dispatch(std::int8_t{});
|
||||
}
|
||||
case ArrayInterfaceHandler::kI2: {
|
||||
return dispatch(std::int16_t{});
|
||||
}
|
||||
case ArrayInterfaceHandler::kI4: {
|
||||
return dispatch(std::int32_t{});
|
||||
}
|
||||
case ArrayInterfaceHandler::kI8: {
|
||||
return dispatch(std::int64_t{});
|
||||
}
|
||||
case ArrayInterfaceHandler::kU1: {
|
||||
return dispatch(std::uint8_t{});
|
||||
}
|
||||
case ArrayInterfaceHandler::kU2: {
|
||||
return dispatch(std::uint16_t{});
|
||||
}
|
||||
case ArrayInterfaceHandler::kU4: {
|
||||
return dispatch(std::uint32_t{});
|
||||
}
|
||||
case ArrayInterfaceHandler::kU8: {
|
||||
return dispatch(std::uint64_t{});
|
||||
}
|
||||
}
|
||||
|
||||
// Reached only when no case returned (the non-GPU kF2 path breaks out of the switch, as would
// a tag value outside the enumerators). Yields a value-initialized object of the type that
// `dispatch` returns for std::int8_t.
return std::result_of_t<Fn(std::int8_t)>();
|
||||
}
|
||||
|
||||
template <std::int32_t D, typename Fn>
|
||||
void DispatchDType(ArrayInterface<D> const array, DeviceOrd device, Fn fn) {
|
||||
// Only used for cuDF at the moment.
|
||||
@@ -604,60 +668,7 @@ void DispatchDType(ArrayInterface<D> const array, DeviceOrd device, Fn fn) {
|
||||
std::numeric_limits<std::size_t>::max()},
|
||||
array.shape, array.strides, device});
|
||||
};
|
||||
switch (array.type) {
|
||||
case ArrayInterfaceHandler::kF2: {
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
|
||||
dispatch(__half{});
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
case ArrayInterfaceHandler::kF4: {
|
||||
dispatch(float{});
|
||||
break;
|
||||
}
|
||||
case ArrayInterfaceHandler::kF8: {
|
||||
dispatch(double{});
|
||||
break;
|
||||
}
|
||||
case ArrayInterfaceHandler::kF16: {
|
||||
using T = long double;
|
||||
CHECK(sizeof(long double) == 16) << error::NoF128();
|
||||
dispatch(T{});
|
||||
break;
|
||||
}
|
||||
case ArrayInterfaceHandler::kI1: {
|
||||
dispatch(std::int8_t{});
|
||||
break;
|
||||
}
|
||||
case ArrayInterfaceHandler::kI2: {
|
||||
dispatch(std::int16_t{});
|
||||
break;
|
||||
}
|
||||
case ArrayInterfaceHandler::kI4: {
|
||||
dispatch(std::int32_t{});
|
||||
break;
|
||||
}
|
||||
case ArrayInterfaceHandler::kI8: {
|
||||
dispatch(std::int64_t{});
|
||||
break;
|
||||
}
|
||||
case ArrayInterfaceHandler::kU1: {
|
||||
dispatch(std::uint8_t{});
|
||||
break;
|
||||
}
|
||||
case ArrayInterfaceHandler::kU2: {
|
||||
dispatch(std::uint16_t{});
|
||||
break;
|
||||
}
|
||||
case ArrayInterfaceHandler::kU4: {
|
||||
dispatch(std::uint32_t{});
|
||||
break;
|
||||
}
|
||||
case ArrayInterfaceHandler::kU8: {
|
||||
dispatch(std::uint64_t{});
|
||||
break;
|
||||
}
|
||||
}
|
||||
DispatchDType(array.type, dispatch);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -1,66 +0,0 @@
|
||||
/* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define ARROW_FLAG_DICTIONARY_ORDERED 1
|
||||
#define ARROW_FLAG_NULLABLE 2
|
||||
#define ARROW_FLAG_MAP_KEYS_SORTED 4
|
||||
|
||||
struct ArrowSchema {
|
||||
// Array type description
|
||||
const char* format;
|
||||
const char* name;
|
||||
const char* metadata;
|
||||
int64_t flags;
|
||||
int64_t n_children;
|
||||
struct ArrowSchema** children;
|
||||
struct ArrowSchema* dictionary;
|
||||
|
||||
// Release callback
|
||||
void (*release)(struct ArrowSchema*);
|
||||
// Opaque producer-specific data
|
||||
void* private_data;
|
||||
};
|
||||
|
||||
struct ArrowArray {
|
||||
// Array data description
|
||||
int64_t length;
|
||||
int64_t null_count;
|
||||
int64_t offset;
|
||||
int64_t n_buffers;
|
||||
int64_t n_children;
|
||||
const void** buffers;
|
||||
struct ArrowArray** children;
|
||||
struct ArrowArray* dictionary;
|
||||
|
||||
// Release callback
|
||||
void (*release)(struct ArrowArray*);
|
||||
// Opaque producer-specific data
|
||||
void* private_data;
|
||||
};
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
104
src/data/data.cc
104
src/data/data.cc
@@ -635,22 +635,39 @@ void MetaInfo::GetInfo(char const* key, bst_ulong* out_len, DataType dtype,
|
||||
}
|
||||
|
||||
void MetaInfo::SetFeatureInfo(const char* key, const char **info, const bst_ulong size) {
|
||||
if (size != 0 && this->num_col_ != 0) {
|
||||
if (size != 0 && this->num_col_ != 0 && !IsColumnSplit()) {
|
||||
CHECK_EQ(size, this->num_col_) << "Length of " << key << " must be equal to number of columns.";
|
||||
CHECK(info);
|
||||
}
|
||||
if (!std::strcmp(key, "feature_type")) {
|
||||
feature_type_names.clear();
|
||||
auto& h_feature_types = feature_types.HostVector();
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
auto elem = info[i];
|
||||
feature_type_names.emplace_back(elem);
|
||||
}
|
||||
if (IsColumnSplit()) {
|
||||
feature_type_names = collective::AllgatherStrings(feature_type_names);
|
||||
CHECK_EQ(feature_type_names.size(), num_col_)
|
||||
<< "Length of " << key << " must be equal to number of columns.";
|
||||
}
|
||||
auto& h_feature_types = feature_types.HostVector();
|
||||
LoadFeatureType(feature_type_names, &h_feature_types);
|
||||
} else if (!std::strcmp(key, "feature_name")) {
|
||||
feature_names.clear();
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
feature_names.emplace_back(info[i]);
|
||||
if (IsColumnSplit()) {
|
||||
std::vector<std::string> local_feature_names{};
|
||||
auto const rank = collective::GetRank();
|
||||
for (std::size_t i = 0; i < size; ++i) {
|
||||
auto elem = std::to_string(rank) + "." + info[i];
|
||||
local_feature_names.emplace_back(elem);
|
||||
}
|
||||
feature_names = collective::AllgatherStrings(local_feature_names);
|
||||
CHECK_EQ(feature_names.size(), num_col_)
|
||||
<< "Length of " << key << " must be equal to number of columns.";
|
||||
} else {
|
||||
feature_names.clear();
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
feature_names.emplace_back(info[i]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LOG(FATAL) << "Unknown feature info name: " << key;
|
||||
@@ -687,13 +704,13 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
|
||||
|
||||
linalg::Stack(&this->labels, that.labels);
|
||||
|
||||
this->weights_.SetDevice(that.weights_.DeviceIdx());
|
||||
this->weights_.SetDevice(that.weights_.Device());
|
||||
this->weights_.Extend(that.weights_);
|
||||
|
||||
this->labels_lower_bound_.SetDevice(that.labels_lower_bound_.DeviceIdx());
|
||||
this->labels_lower_bound_.SetDevice(that.labels_lower_bound_.Device());
|
||||
this->labels_lower_bound_.Extend(that.labels_lower_bound_);
|
||||
|
||||
this->labels_upper_bound_.SetDevice(that.labels_upper_bound_.DeviceIdx());
|
||||
this->labels_upper_bound_.SetDevice(that.labels_upper_bound_.Device());
|
||||
this->labels_upper_bound_.Extend(that.labels_upper_bound_);
|
||||
|
||||
linalg::Stack(&this->base_margin_, that.base_margin_);
|
||||
@@ -723,13 +740,13 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
|
||||
}
|
||||
if (!that.feature_weights.Empty()) {
|
||||
this->feature_weights.Resize(that.feature_weights.Size());
|
||||
this->feature_weights.SetDevice(that.feature_weights.DeviceIdx());
|
||||
this->feature_weights.SetDevice(that.feature_weights.Device());
|
||||
this->feature_weights.Copy(that.feature_weights);
|
||||
}
|
||||
}
|
||||
|
||||
void MetaInfo::SynchronizeNumberOfColumns() {
|
||||
if (IsVerticalFederated()) {
|
||||
if (IsColumnSplit()) {
|
||||
collective::Allreduce<collective::Operation::kSum>(&num_col_, 1);
|
||||
} else {
|
||||
collective::Allreduce<collective::Operation::kMax>(&num_col_, 1);
|
||||
@@ -738,22 +755,22 @@ void MetaInfo::SynchronizeNumberOfColumns() {
|
||||
|
||||
namespace {
|
||||
template <typename T>
|
||||
void CheckDevice(std::int32_t device, HostDeviceVector<T> const& v) {
|
||||
bool valid = v.Device().IsCPU() || device == Context::kCpuId || v.DeviceIdx() == device;
|
||||
void CheckDevice(DeviceOrd device, HostDeviceVector<T> const& v) {
|
||||
bool valid = v.Device().IsCPU() || device.IsCPU() || v.Device() == device;
|
||||
if (!valid) {
|
||||
LOG(FATAL) << "Invalid device ordinal. Data is associated with a different device ordinal than "
|
||||
"the booster. The device ordinal of the data is: "
|
||||
<< v.DeviceIdx() << "; the device ordinal of the Booster is: " << device;
|
||||
<< v.Device() << "; the device ordinal of the Booster is: " << device;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, std::int32_t D>
|
||||
void CheckDevice(std::int32_t device, linalg::Tensor<T, D> const& v) {
|
||||
void CheckDevice(DeviceOrd device, linalg::Tensor<T, D> const& v) {
|
||||
CheckDevice(device, *v.Data());
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
void MetaInfo::Validate(std::int32_t device) const {
|
||||
void MetaInfo::Validate(DeviceOrd device) const {
|
||||
if (group_ptr_.size() != 0 && weights_.Size() != 0) {
|
||||
CHECK_EQ(group_ptr_.size(), weights_.Size() + 1) << error::GroupWeight();
|
||||
return;
|
||||
@@ -850,14 +867,6 @@ DMatrix* TryLoadBinary(std::string fname, bool silent) {
|
||||
} // namespace
|
||||
|
||||
DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_split_mode) {
|
||||
auto need_split = false;
|
||||
if (collective::IsFederated()) {
|
||||
LOG(CONSOLE) << "XGBoost federated mode detected, not splitting data among workers";
|
||||
} else if (collective::IsDistributed()) {
|
||||
LOG(CONSOLE) << "XGBoost distributed mode detected, will split data among workers";
|
||||
need_split = true;
|
||||
}
|
||||
|
||||
std::string fname, cache_file;
|
||||
auto dlm_pos = uri.find('#');
|
||||
if (dlm_pos != std::string::npos) {
|
||||
@@ -865,24 +874,6 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
|
||||
fname = uri.substr(0, dlm_pos);
|
||||
CHECK_EQ(cache_file.find('#'), std::string::npos)
|
||||
<< "Only one `#` is allowed in file path for cache file specification.";
|
||||
if (need_split && data_split_mode == DataSplitMode::kRow) {
|
||||
std::ostringstream os;
|
||||
std::vector<std::string> cache_shards = common::Split(cache_file, ':');
|
||||
for (size_t i = 0; i < cache_shards.size(); ++i) {
|
||||
size_t pos = cache_shards[i].rfind('.');
|
||||
if (pos == std::string::npos) {
|
||||
os << cache_shards[i] << ".r" << collective::GetRank() << "-"
|
||||
<< collective::GetWorldSize();
|
||||
} else {
|
||||
os << cache_shards[i].substr(0, pos) << ".r" << collective::GetRank() << "-"
|
||||
<< collective::GetWorldSize() << cache_shards[i].substr(pos, cache_shards[i].length());
|
||||
}
|
||||
if (i + 1 != cache_shards.size()) {
|
||||
os << ':';
|
||||
}
|
||||
}
|
||||
cache_file = os.str();
|
||||
}
|
||||
} else {
|
||||
fname = uri;
|
||||
}
|
||||
@@ -894,19 +885,7 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
|
||||
}
|
||||
|
||||
int partid = 0, npart = 1;
|
||||
if (need_split && data_split_mode == DataSplitMode::kRow) {
|
||||
partid = collective::GetRank();
|
||||
npart = collective::GetWorldSize();
|
||||
} else {
|
||||
// test option to load in part
|
||||
npart = 1;
|
||||
}
|
||||
|
||||
if (npart != 1) {
|
||||
LOG(CONSOLE) << "Load part of data " << partid << " of " << npart << " parts";
|
||||
}
|
||||
|
||||
DMatrix* dmat{nullptr};
|
||||
DMatrix* dmat{};
|
||||
|
||||
if (cache_file.empty()) {
|
||||
fname = data::ValidateFileFormat(fname);
|
||||
@@ -916,6 +895,8 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
|
||||
dmat = DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), Context{}.Threads(),
|
||||
cache_file, data_split_mode);
|
||||
} else {
|
||||
CHECK(data_split_mode != DataSplitMode::kCol)
|
||||
<< "Column-wise data split is not supported for external memory.";
|
||||
data::FileIterator iter{fname, static_cast<uint32_t>(partid), static_cast<uint32_t>(npart)};
|
||||
dmat = new data::SparsePageDMatrix{&iter,
|
||||
iter.Proxy(),
|
||||
@@ -926,17 +907,7 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
|
||||
cache_file};
|
||||
}
|
||||
|
||||
if (need_split && data_split_mode == DataSplitMode::kCol) {
|
||||
if (!cache_file.empty()) {
|
||||
LOG(FATAL) << "Column-wise data split is not support for external memory.";
|
||||
}
|
||||
LOG(CONSOLE) << "Splitting data by column";
|
||||
auto* sliced = dmat->SliceCol(npart, partid);
|
||||
delete dmat;
|
||||
return sliced;
|
||||
} else {
|
||||
return dmat;
|
||||
}
|
||||
return dmat;
|
||||
}
|
||||
|
||||
template <typename DataIterHandle, typename DMatrixHandle, typename DataIterResetCallback,
|
||||
@@ -1011,9 +982,6 @@ template DMatrix* DMatrix::Create<data::CSCArrayAdapter>(data::CSCArrayAdapter*
|
||||
template DMatrix* DMatrix::Create(
|
||||
data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>* adapter,
|
||||
float missing, int nthread, const std::string& cache_prefix, DataSplitMode data_split_mode);
|
||||
template DMatrix* DMatrix::Create<data::RecordBatchesIterAdapter>(
|
||||
data::RecordBatchesIterAdapter* adapter, float missing, int nthread, const std::string&,
|
||||
DataSplitMode data_split_mode);
|
||||
|
||||
SparsePage SparsePage::GetTranspose(int num_columns, int32_t n_threads) const {
|
||||
SparsePage transpose;
|
||||
|
||||
@@ -33,13 +33,13 @@ template <typename T, int32_t D>
|
||||
void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tensor<T, D>* p_out) {
|
||||
ArrayInterface<D> array(arr_interface);
|
||||
if (array.n == 0) {
|
||||
p_out->SetDevice(0);
|
||||
p_out->SetDevice(DeviceOrd::CUDA(0));
|
||||
p_out->Reshape(array.shape);
|
||||
return;
|
||||
}
|
||||
CHECK_EQ(array.valid.Capacity(), 0)
|
||||
<< "Meta info like label or weight can not have missing value.";
|
||||
auto ptr_device = SetDeviceToPtr(array.data);
|
||||
auto ptr_device = DeviceOrd::CUDA(SetDeviceToPtr(array.data));
|
||||
p_out->SetDevice(ptr_device);
|
||||
|
||||
if (array.is_contiguous && array.type == ToDType<T>::kType) {
|
||||
@@ -55,7 +55,7 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
|
||||
return;
|
||||
}
|
||||
p_out->Reshape(array.shape);
|
||||
auto t = p_out->View(DeviceOrd::CUDA(ptr_device));
|
||||
auto t = p_out->View(ptr_device);
|
||||
linalg::ElementWiseTransformDevice(
|
||||
t,
|
||||
[=] __device__(size_t i, T) {
|
||||
@@ -91,7 +91,7 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector<bst_group_t>* p_
|
||||
});
|
||||
dh::caching_device_vector<bool> flag(1);
|
||||
auto d_flag = dh::ToSpan(flag);
|
||||
auto d = SetDeviceToPtr(array_interface.data);
|
||||
auto d = DeviceOrd::CUDA(SetDeviceToPtr(array_interface.data));
|
||||
dh::LaunchN(1, [=] __device__(size_t) { d_flag[0] = true; });
|
||||
dh::LaunchN(array_interface.Shape(0) - 1, [=] __device__(size_t i) {
|
||||
auto typed = TypedIndex<uint32_t, 1>{array_interface};
|
||||
|
||||
@@ -28,8 +28,8 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
|
||||
CudfAdapterBatch(common::Span<ArrayInterface<1>> columns, size_t num_rows)
|
||||
: columns_(columns),
|
||||
num_rows_(num_rows) {}
|
||||
size_t Size() const { return num_rows_ * columns_.size(); }
|
||||
__device__ __forceinline__ COOTuple GetElement(size_t idx) const {
|
||||
[[nodiscard]] std::size_t Size() const { return num_rows_ * columns_.size(); }
|
||||
[[nodiscard]] __device__ __forceinline__ COOTuple GetElement(size_t idx) const {
|
||||
size_t column_idx = idx % columns_.size();
|
||||
size_t row_idx = idx / columns_.size();
|
||||
auto const& column = columns_[column_idx];
|
||||
@@ -39,7 +39,7 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
|
||||
return {row_idx, column_idx, value};
|
||||
}
|
||||
|
||||
__device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
|
||||
[[nodiscard]] __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
|
||||
auto const& column = columns_[fidx];
|
||||
float value = column.valid.Data() == nullptr || column.valid.Check(ridx)
|
||||
? column(ridx)
|
||||
@@ -47,8 +47,8 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
|
||||
return value;
|
||||
}
|
||||
|
||||
XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; }
|
||||
XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); }
|
||||
[[nodiscard]] XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; }
|
||||
[[nodiscard]] XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); }
|
||||
|
||||
private:
|
||||
common::Span<ArrayInterface<1>> columns_;
|
||||
@@ -120,16 +120,14 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
|
||||
return;
|
||||
}
|
||||
|
||||
device_idx_ = dh::CudaGetPointerDevice(first_column.data);
|
||||
CHECK_NE(device_idx_, Context::kCpuId);
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_idx_));
|
||||
|
||||
device_ = DeviceOrd::CUDA(dh::CudaGetPointerDevice(first_column.data));
|
||||
CHECK(device_.IsCUDA());
|
||||
dh::safe_cuda(cudaSetDevice(device_.ordinal));
|
||||
for (auto& json_col : json_columns) {
|
||||
auto column = ArrayInterface<1>(get<Object const>(json_col));
|
||||
columns.push_back(column);
|
||||
num_rows_ = std::max(num_rows_, column.Shape(0));
|
||||
CHECK_EQ(device_idx_, dh::CudaGetPointerDevice(column.data))
|
||||
CHECK_EQ(device_.ordinal, dh::CudaGetPointerDevice(column.data))
|
||||
<< "All columns should use the same device.";
|
||||
CHECK_EQ(num_rows_, column.Shape(0))
|
||||
<< "All columns should have same number of rows.";
|
||||
@@ -145,15 +143,15 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
|
||||
return batch_;
|
||||
}
|
||||
|
||||
size_t NumRows() const { return num_rows_; }
|
||||
size_t NumColumns() const { return columns_.size(); }
|
||||
int32_t DeviceIdx() const { return device_idx_; }
|
||||
[[nodiscard]] std::size_t NumRows() const { return num_rows_; }
|
||||
[[nodiscard]] std::size_t NumColumns() const { return columns_.size(); }
|
||||
[[nodiscard]] DeviceOrd Device() const { return device_; }
|
||||
|
||||
private:
|
||||
CudfAdapterBatch batch_;
|
||||
dh::device_vector<ArrayInterface<1>> columns_;
|
||||
size_t num_rows_{0};
|
||||
int32_t device_idx_{Context::kCpuId};
|
||||
DeviceOrd device_{DeviceOrd::CPU()};
|
||||
};
|
||||
|
||||
class CupyAdapterBatch : public detail::NoMetaInfo {
|
||||
@@ -161,22 +159,22 @@ class CupyAdapterBatch : public detail::NoMetaInfo {
|
||||
CupyAdapterBatch() = default;
|
||||
explicit CupyAdapterBatch(ArrayInterface<2> array_interface)
|
||||
: array_interface_(std::move(array_interface)) {}
|
||||
size_t Size() const {
|
||||
[[nodiscard]] std::size_t Size() const {
|
||||
return array_interface_.Shape(0) * array_interface_.Shape(1);
|
||||
}
|
||||
__device__ COOTuple GetElement(size_t idx) const {
|
||||
[[nodiscard]]__device__ COOTuple GetElement(size_t idx) const {
|
||||
size_t column_idx = idx % array_interface_.Shape(1);
|
||||
size_t row_idx = idx / array_interface_.Shape(1);
|
||||
float value = array_interface_(row_idx, column_idx);
|
||||
return {row_idx, column_idx, value};
|
||||
}
|
||||
__device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
|
||||
[[nodiscard]] __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
|
||||
float value = array_interface_(ridx, fidx);
|
||||
return value;
|
||||
}
|
||||
|
||||
XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); }
|
||||
XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); }
|
||||
[[nodiscard]] XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); }
|
||||
[[nodiscard]] XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); }
|
||||
|
||||
private:
|
||||
ArrayInterface<2> array_interface_;
|
||||
@@ -191,29 +189,28 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {
|
||||
if (array_interface_.Shape(0) == 0) {
|
||||
return;
|
||||
}
|
||||
device_idx_ = dh::CudaGetPointerDevice(array_interface_.data);
|
||||
CHECK_NE(device_idx_, Context::kCpuId);
|
||||
device_ = DeviceOrd::CUDA(dh::CudaGetPointerDevice(array_interface_.data));
|
||||
CHECK(device_.IsCUDA());
|
||||
}
|
||||
explicit CupyAdapter(std::string cuda_interface_str)
|
||||
: CupyAdapter{StringView{cuda_interface_str}} {}
|
||||
const CupyAdapterBatch& Value() const override { return batch_; }
|
||||
[[nodiscard]] const CupyAdapterBatch& Value() const override { return batch_; }
|
||||
|
||||
size_t NumRows() const { return array_interface_.Shape(0); }
|
||||
size_t NumColumns() const { return array_interface_.Shape(1); }
|
||||
int32_t DeviceIdx() const { return device_idx_; }
|
||||
[[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape(0); }
|
||||
[[nodiscard]] std::size_t NumColumns() const { return array_interface_.Shape(1); }
|
||||
[[nodiscard]] DeviceOrd Device() const { return device_; }
|
||||
|
||||
private:
|
||||
ArrayInterface<2> array_interface_;
|
||||
CupyAdapterBatch batch_;
|
||||
int32_t device_idx_ {Context::kCpuId};
|
||||
DeviceOrd device_{DeviceOrd::CPU()};
|
||||
};
|
||||
|
||||
// Returns maximum row length
|
||||
template <typename AdapterBatchT>
|
||||
std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, int device_idx,
|
||||
std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, DeviceOrd device,
|
||||
float missing) {
|
||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device.ordinal));
|
||||
IsValidFunctor is_valid(missing);
|
||||
dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));
|
||||
|
||||
|
||||
@@ -98,23 +98,18 @@ __global__ void CompressBinEllpackKernel(
|
||||
}
|
||||
|
||||
// Construct an ELLPACK matrix with the given number of empty rows.
|
||||
EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
|
||||
bool is_dense, size_t row_stride,
|
||||
size_t n_rows)
|
||||
: is_dense(is_dense),
|
||||
cuts_(std::move(cuts)),
|
||||
row_stride(row_stride),
|
||||
n_rows(n_rows) {
|
||||
EllpackPageImpl::EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, bool is_dense,
|
||||
size_t row_stride, size_t n_rows)
|
||||
: is_dense(is_dense), cuts_(std::move(cuts)), row_stride(row_stride), n_rows(n_rows) {
|
||||
monitor_.Init("ellpack_page");
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device));
|
||||
dh::safe_cuda(cudaSetDevice(device.ordinal));
|
||||
|
||||
monitor_.Start("InitCompressedData");
|
||||
InitCompressedData(device);
|
||||
monitor_.Stop("InitCompressedData");
|
||||
}
|
||||
|
||||
EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
|
||||
EllpackPageImpl::EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts,
|
||||
const SparsePage &page, bool is_dense,
|
||||
size_t row_stride,
|
||||
common::Span<FeatureType const> feature_types)
|
||||
@@ -128,7 +123,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
|
||||
EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& param)
|
||||
: is_dense(dmat->IsDense()) {
|
||||
monitor_.Init("ellpack_page");
|
||||
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
|
||||
dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
|
||||
|
||||
n_rows = dmat->Info().num_row_;
|
||||
|
||||
@@ -143,15 +138,15 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchP
|
||||
monitor_.Stop("Quantiles");
|
||||
|
||||
monitor_.Start("InitCompressedData");
|
||||
this->InitCompressedData(ctx->gpu_id);
|
||||
this->InitCompressedData(ctx->Device());
|
||||
monitor_.Stop("InitCompressedData");
|
||||
|
||||
dmat->Info().feature_types.SetDevice(ctx->gpu_id);
|
||||
dmat->Info().feature_types.SetDevice(ctx->Device());
|
||||
auto ft = dmat->Info().feature_types.ConstDeviceSpan();
|
||||
monitor_.Start("BinningCompression");
|
||||
CHECK(dmat->SingleColBlock());
|
||||
for (const auto& batch : dmat->GetBatches<SparsePage>()) {
|
||||
CreateHistIndices(ctx->gpu_id, batch, ft);
|
||||
CreateHistIndices(ctx->Device(), batch, ft);
|
||||
}
|
||||
monitor_.Stop("BinningCompression");
|
||||
}
|
||||
@@ -214,7 +209,7 @@ struct TupleScanOp {
|
||||
// to remove missing data
|
||||
template <typename AdapterBatchT>
|
||||
void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType const> feature_types,
|
||||
EllpackPageImpl* dst, int device_idx, float missing) {
|
||||
EllpackPageImpl* dst, DeviceOrd device, float missing) {
|
||||
// Some witchcraft happens here
|
||||
// The goal is to copy valid elements out of the input to an ELLPACK matrix
|
||||
// with a given row stride, using no extra working memory. Standard stream
|
||||
@@ -246,7 +241,7 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
|
||||
// Tuple[2] = The index in the input data
|
||||
using Tuple = thrust::tuple<size_t, size_t, size_t>;
|
||||
|
||||
auto device_accessor = dst->GetDeviceAccessor(device_idx);
|
||||
auto device_accessor = dst->GetDeviceAccessor(device);
|
||||
common::CompressedBufferWriter writer(device_accessor.NumSymbols());
|
||||
auto d_compressed_buffer = dst->gidx_buffer.DevicePointer();
|
||||
|
||||
@@ -298,10 +293,9 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
|
||||
#endif
|
||||
}
|
||||
|
||||
void WriteNullValues(EllpackPageImpl* dst, int device_idx,
|
||||
common::Span<size_t> row_counts) {
|
||||
void WriteNullValues(EllpackPageImpl* dst, DeviceOrd device, common::Span<size_t> row_counts) {
|
||||
// Write the null values
|
||||
auto device_accessor = dst->GetDeviceAccessor(device_idx);
|
||||
auto device_accessor = dst->GetDeviceAccessor(device);
|
||||
common::CompressedBufferWriter writer(device_accessor.NumSymbols());
|
||||
auto d_compressed_buffer = dst->gidx_buffer.DevicePointer();
|
||||
auto row_stride = dst->row_stride;
|
||||
@@ -318,11 +312,11 @@ void WriteNullValues(EllpackPageImpl* dst, int device_idx,
|
||||
}
|
||||
|
||||
template <typename AdapterBatch>
|
||||
EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device, bool is_dense,
|
||||
EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense,
|
||||
common::Span<size_t> row_counts_span,
|
||||
common::Span<FeatureType const> feature_types, size_t row_stride,
|
||||
size_t n_rows, common::HistogramCuts const& cuts) {
|
||||
dh::safe_cuda(cudaSetDevice(device));
|
||||
dh::safe_cuda(cudaSetDevice(device.ordinal));
|
||||
|
||||
*this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows);
|
||||
CopyDataToEllpack(batch, feature_types, this, device, missing);
|
||||
@@ -331,7 +325,7 @@ EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device,
|
||||
|
||||
#define ELLPACK_BATCH_SPECIALIZE(__BATCH_T) \
|
||||
template EllpackPageImpl::EllpackPageImpl( \
|
||||
__BATCH_T batch, float missing, int device, bool is_dense, \
|
||||
__BATCH_T batch, float missing, DeviceOrd device, bool is_dense, \
|
||||
common::Span<size_t> row_counts_span, common::Span<FeatureType const> feature_types, \
|
||||
size_t row_stride, size_t n_rows, common::HistogramCuts const& cuts);
|
||||
|
||||
@@ -388,9 +382,9 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
|
||||
[&](size_t i) { return page.row_ptr[i + 1] - page.row_ptr[i]; });
|
||||
row_stride = *std::max_element(it, it + page.Size());
|
||||
|
||||
CHECK_GE(ctx->gpu_id, 0);
|
||||
CHECK(ctx->IsCUDA());
|
||||
monitor_.Start("InitCompressedData");
|
||||
InitCompressedData(ctx->gpu_id);
|
||||
InitCompressedData(ctx->Device());
|
||||
monitor_.Stop("InitCompressedData");
|
||||
|
||||
// copy gidx
|
||||
@@ -400,7 +394,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
|
||||
dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
|
||||
cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
|
||||
|
||||
auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft);
|
||||
auto accessor = this->GetDeviceAccessor(ctx->Device(), ft);
|
||||
auto null = accessor.NullValue();
|
||||
CopyGHistToEllpack(page, d_row_ptr, row_stride, d_compressed_buffer, null);
|
||||
}
|
||||
@@ -425,8 +419,7 @@ struct CopyPage {
|
||||
};
|
||||
|
||||
// Copy the data from the given EllpackPage to the current page.
|
||||
size_t EllpackPageImpl::Copy(int device, EllpackPageImpl const *page,
|
||||
size_t offset) {
|
||||
size_t EllpackPageImpl::Copy(DeviceOrd device, EllpackPageImpl const* page, size_t offset) {
|
||||
monitor_.Start("Copy");
|
||||
size_t num_elements = page->n_rows * page->row_stride;
|
||||
CHECK_EQ(row_stride, page->row_stride);
|
||||
@@ -486,7 +479,7 @@ struct CompactPage {
|
||||
};
|
||||
|
||||
// Compacts the data from the given EllpackPage into the current page.
|
||||
void EllpackPageImpl::Compact(int device, EllpackPageImpl const* page,
|
||||
void EllpackPageImpl::Compact(DeviceOrd device, EllpackPageImpl const* page,
|
||||
common::Span<size_t> row_indexes) {
|
||||
monitor_.Start("Compact");
|
||||
CHECK_EQ(row_stride, page->row_stride);
|
||||
@@ -499,13 +492,12 @@ void EllpackPageImpl::Compact(int device, EllpackPageImpl const* page,
|
||||
}
|
||||
|
||||
// Initialize the buffer to store compressed features.
|
||||
void EllpackPageImpl::InitCompressedData(int device) {
|
||||
void EllpackPageImpl::InitCompressedData(DeviceOrd device) {
|
||||
size_t num_symbols = NumSymbols();
|
||||
|
||||
// Required buffer size for storing data matrix in ELLPack format.
|
||||
size_t compressed_size_bytes =
|
||||
common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows,
|
||||
num_symbols);
|
||||
common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, num_symbols);
|
||||
gidx_buffer.SetDevice(device);
|
||||
// Don't call fill unnecessarily
|
||||
if (gidx_buffer.Size() == 0) {
|
||||
@@ -517,7 +509,7 @@ void EllpackPageImpl::InitCompressedData(int device) {
|
||||
}
|
||||
|
||||
// Compress a CSR page into ELLPACK.
|
||||
void EllpackPageImpl::CreateHistIndices(int device,
|
||||
void EllpackPageImpl::CreateHistIndices(DeviceOrd device,
|
||||
const SparsePage& row_batch,
|
||||
common::Span<FeatureType const> feature_types) {
|
||||
if (row_batch.Size() == 0) return;
|
||||
@@ -527,7 +519,7 @@ void EllpackPageImpl::CreateHistIndices(int device,
|
||||
|
||||
// bin and compress entries in batches of rows
|
||||
size_t gpu_batch_nrows =
|
||||
std::min(dh::TotalMemory(device) / (16 * row_stride * sizeof(Entry)),
|
||||
std::min(dh::TotalMemory(device.ordinal) / (16 * row_stride * sizeof(Entry)),
|
||||
static_cast<size_t>(row_batch.Size()));
|
||||
|
||||
size_t gpu_nbatches = common::DivRoundUp(row_batch.Size(), gpu_batch_nrows);
|
||||
@@ -592,7 +584,7 @@ size_t EllpackPageImpl::MemCostBytes(size_t num_rows, size_t row_stride,
|
||||
}
|
||||
|
||||
EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(
|
||||
int device, common::Span<FeatureType const> feature_types) const {
|
||||
DeviceOrd device, common::Span<FeatureType const> feature_types) const {
|
||||
gidx_buffer.SetDevice(device);
|
||||
return {device,
|
||||
cuts_,
|
||||
@@ -606,7 +598,7 @@ EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(
|
||||
}
|
||||
EllpackDeviceAccessor EllpackPageImpl::GetHostAccessor(
|
||||
common::Span<FeatureType const> feature_types) const {
|
||||
return {Context::kCpuId,
|
||||
return {DeviceOrd::CPU(),
|
||||
cuts_,
|
||||
is_dense,
|
||||
row_stride,
|
||||
|
||||
@@ -35,16 +35,17 @@ struct EllpackDeviceAccessor {
|
||||
|
||||
common::Span<const FeatureType> feature_types;
|
||||
|
||||
EllpackDeviceAccessor(int device, const common::HistogramCuts& cuts,
|
||||
bool is_dense, size_t row_stride, size_t base_rowid,
|
||||
size_t n_rows,common::CompressedIterator<uint32_t> gidx_iter,
|
||||
EllpackDeviceAccessor(DeviceOrd device, const common::HistogramCuts& cuts, bool is_dense,
|
||||
size_t row_stride, size_t base_rowid, size_t n_rows,
|
||||
common::CompressedIterator<uint32_t> gidx_iter,
|
||||
common::Span<FeatureType const> feature_types)
|
||||
: is_dense(is_dense),
|
||||
row_stride(row_stride),
|
||||
base_rowid(base_rowid),
|
||||
n_rows(n_rows) ,gidx_iter(gidx_iter),
|
||||
n_rows(n_rows),
|
||||
gidx_iter(gidx_iter),
|
||||
feature_types{feature_types} {
|
||||
if (device == Context::kCpuId) {
|
||||
if (device.IsCPU()) {
|
||||
gidx_fvalue_map = cuts.cut_values_.ConstHostSpan();
|
||||
feature_segments = cuts.cut_ptrs_.ConstHostSpan();
|
||||
min_fvalue = cuts.min_vals_.ConstHostSpan();
|
||||
@@ -59,7 +60,7 @@ struct EllpackDeviceAccessor {
|
||||
}
|
||||
// Get a matrix element, uses binary search for lookup. Returns NaN if missing.
|
||||
// Given a row index and a feature index, returns the corresponding cut value
|
||||
__device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const {
|
||||
[[nodiscard]] __device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const {
|
||||
ridx -= base_rowid;
|
||||
auto row_begin = row_stride * ridx;
|
||||
auto row_end = row_begin + row_stride;
|
||||
@@ -77,7 +78,7 @@ struct EllpackDeviceAccessor {
|
||||
}
|
||||
|
||||
template <bool is_cat>
|
||||
__device__ uint32_t SearchBin(float value, size_t column_id) const {
|
||||
[[nodiscard]] __device__ uint32_t SearchBin(float value, size_t column_id) const {
|
||||
auto beg = feature_segments[column_id];
|
||||
auto end = feature_segments[column_id + 1];
|
||||
uint32_t idx = 0;
|
||||
@@ -99,7 +100,7 @@ struct EllpackDeviceAccessor {
|
||||
return idx;
|
||||
}
|
||||
|
||||
__device__ bst_float GetFvalue(size_t ridx, size_t fidx) const {
|
||||
[[nodiscard]] __device__ bst_float GetFvalue(size_t ridx, size_t fidx) const {
|
||||
auto gidx = GetBinIndex(ridx, fidx);
|
||||
if (gidx == -1) {
|
||||
return nan("");
|
||||
@@ -108,18 +109,18 @@ struct EllpackDeviceAccessor {
|
||||
}
|
||||
|
||||
// Check if the row id is within range of the current batch.
|
||||
__device__ bool IsInRange(size_t row_id) const {
|
||||
[[nodiscard]] __device__ bool IsInRange(size_t row_id) const {
|
||||
return row_id >= base_rowid && row_id < base_rowid + n_rows;
|
||||
}
|
||||
/*! \brief Return the total number of symbols (total number of bins plus 1 for
|
||||
* not found). */
|
||||
XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; }
|
||||
[[nodiscard]] XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; }
|
||||
|
||||
XGBOOST_DEVICE size_t NullValue() const { return gidx_fvalue_map.size(); }
|
||||
[[nodiscard]] XGBOOST_DEVICE size_t NullValue() const { return gidx_fvalue_map.size(); }
|
||||
|
||||
XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); }
|
||||
[[nodiscard]] XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); }
|
||||
|
||||
XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); }
|
||||
[[nodiscard]] XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); }
|
||||
};
|
||||
|
||||
|
||||
@@ -141,14 +142,13 @@ class EllpackPageImpl {
|
||||
* This is used in the sampling case. The ELLPACK page is constructed from an existing EllpackInfo
|
||||
* and the given number of rows.
|
||||
*/
|
||||
EllpackPageImpl(int device, common::HistogramCuts cuts, bool is_dense,
|
||||
size_t row_stride, size_t n_rows);
|
||||
EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, bool is_dense, size_t row_stride,
|
||||
size_t n_rows);
|
||||
/*!
|
||||
* \brief Constructor used for external memory.
|
||||
*/
|
||||
EllpackPageImpl(int device, common::HistogramCuts cuts,
|
||||
const SparsePage &page, bool is_dense, size_t row_stride,
|
||||
common::Span<FeatureType const> feature_types);
|
||||
EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, const SparsePage& page,
|
||||
bool is_dense, size_t row_stride, common::Span<FeatureType const> feature_types);
|
||||
|
||||
/*!
|
||||
* \brief Constructor from an existing DMatrix.
|
||||
@@ -159,7 +159,7 @@ class EllpackPageImpl {
|
||||
explicit EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& parm);
|
||||
|
||||
template <typename AdapterBatch>
|
||||
explicit EllpackPageImpl(AdapterBatch batch, float missing, int device, bool is_dense,
|
||||
explicit EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense,
|
||||
common::Span<size_t> row_counts_span,
|
||||
common::Span<FeatureType const> feature_types, size_t row_stride,
|
||||
size_t n_rows, common::HistogramCuts const& cuts);
|
||||
@@ -176,7 +176,7 @@ class EllpackPageImpl {
|
||||
* @param offset The number of elements to skip before copying.
|
||||
* @returns The number of elements copied.
|
||||
*/
|
||||
size_t Copy(int device, EllpackPageImpl const *page, size_t offset);
|
||||
size_t Copy(DeviceOrd device, EllpackPageImpl const *page, size_t offset);
|
||||
|
||||
/*! \brief Compact the given ELLPACK page into the current page.
|
||||
*
|
||||
@@ -184,11 +184,10 @@ class EllpackPageImpl {
|
||||
* @param page The ELLPACK page to compact from.
|
||||
* @param row_indexes Row indexes for the compacted page.
|
||||
*/
|
||||
void Compact(int device, EllpackPageImpl const* page, common::Span<size_t> row_indexes);
|
||||
|
||||
void Compact(DeviceOrd device, EllpackPageImpl const* page, common::Span<size_t> row_indexes);
|
||||
|
||||
/*! \return Number of instances in the page. */
|
||||
size_t Size() const;
|
||||
[[nodiscard]] size_t Size() const;
|
||||
|
||||
/*! \brief Set the base row id for this page. */
|
||||
void SetBaseRowId(std::size_t row_id) {
|
||||
@@ -204,12 +203,12 @@ class EllpackPageImpl {
|
||||
|
||||
/*! \brief Return the total number of symbols (total number of bins plus 1 for
|
||||
* not found). */
|
||||
size_t NumSymbols() const { return cuts_.TotalBins() + 1; }
|
||||
[[nodiscard]] std::size_t NumSymbols() const { return cuts_.TotalBins() + 1; }
|
||||
|
||||
EllpackDeviceAccessor
|
||||
GetDeviceAccessor(int device,
|
||||
common::Span<FeatureType const> feature_types = {}) const;
|
||||
EllpackDeviceAccessor GetHostAccessor(common::Span<FeatureType const> feature_types = {}) const;
|
||||
[[nodiscard]] EllpackDeviceAccessor GetDeviceAccessor(
|
||||
DeviceOrd device, common::Span<FeatureType const> feature_types = {}) const;
|
||||
[[nodiscard]] EllpackDeviceAccessor GetHostAccessor(
|
||||
common::Span<FeatureType const> feature_types = {}) const;
|
||||
|
||||
private:
|
||||
/*!
|
||||
@@ -218,13 +217,13 @@ class EllpackPageImpl {
|
||||
* @param device The GPU device to use.
|
||||
* @param row_batch The CSR page.
|
||||
*/
|
||||
void CreateHistIndices(int device,
|
||||
void CreateHistIndices(DeviceOrd device,
|
||||
const SparsePage& row_batch,
|
||||
common::Span<FeatureType const> feature_types);
|
||||
/*!
|
||||
* \brief Initialize the buffer to store compressed features.
|
||||
*/
|
||||
void InitCompressedData(int device);
|
||||
void InitCompressedData(DeviceOrd device);
|
||||
|
||||
|
||||
public:
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
|
||||
namespace xgboost::data {
|
||||
void EllpackPageSource::Fetch() {
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
dh::safe_cuda(cudaSetDevice(device_.ordinal));
|
||||
if (!this->ReadCache()) {
|
||||
if (count_ != 0 && !sync_) {
|
||||
// source is initialized to be the 0th page during construction, so when count_ is 0
|
||||
|
||||
@@ -23,14 +23,14 @@ class EllpackPageSource : public PageSourceIncMixIn<EllpackPage> {
|
||||
BatchParam param_;
|
||||
common::Span<FeatureType const> feature_types_;
|
||||
std::unique_ptr<common::HistogramCuts> cuts_;
|
||||
std::int32_t device_;
|
||||
DeviceOrd device_;
|
||||
|
||||
public:
|
||||
EllpackPageSource(float missing, int nthreads, bst_feature_t n_features, size_t n_batches,
|
||||
std::shared_ptr<Cache> cache, BatchParam param,
|
||||
std::unique_ptr<common::HistogramCuts> cuts, bool is_dense, size_t row_stride,
|
||||
common::Span<FeatureType const> feature_types,
|
||||
std::shared_ptr<SparsePageSource> source, std::int32_t device)
|
||||
std::shared_ptr<SparsePageSource> source, DeviceOrd device)
|
||||
: PageSourceIncMixIn(missing, nthreads, n_features, n_batches, cache, false),
|
||||
is_dense_{is_dense},
|
||||
row_stride_{row_stride},
|
||||
@@ -50,6 +50,7 @@ inline void EllpackPageSource::Fetch() {
|
||||
// silence the warning about unused variables.
|
||||
(void)(row_stride_);
|
||||
(void)(is_dense_);
|
||||
(void)(device_);
|
||||
common::AssertGPUSupport();
|
||||
}
|
||||
#endif // !defined(XGBOOST_USE_CUDA)
|
||||
|
||||
@@ -36,8 +36,7 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro
|
||||
auto pctx = MakeProxy(proxy_)->Ctx();
|
||||
|
||||
Context ctx;
|
||||
ctx.UpdateAllowUnknown(
|
||||
Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}});
|
||||
ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}});
|
||||
// hardcoded parameter.
|
||||
BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()};
|
||||
|
||||
@@ -139,7 +138,7 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p,
|
||||
return HostAdapterDispatch(proxy, [&](auto const& value) {
|
||||
size_t n_threads = ctx->Threads();
|
||||
size_t n_features = column_sizes.size();
|
||||
linalg::Tensor<std::size_t, 2> column_sizes_tloc({n_threads, n_features}, Context::kCpuId);
|
||||
linalg::Tensor<std::size_t, 2> column_sizes_tloc({n_threads, n_features}, DeviceOrd::CPU());
|
||||
column_sizes_tloc.Data()->Fill(0ul);
|
||||
auto view = column_sizes_tloc.HostView();
|
||||
common::ParallelFor(value.Size(), n_threads, common::Sched::Static(256), [&](auto i) {
|
||||
|
||||
@@ -48,10 +48,9 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
|
||||
int32_t current_device;
|
||||
|
||||
dh::safe_cuda(cudaGetDevice(¤t_device));
|
||||
|
||||
auto get_device = [&]() -> int32_t {
|
||||
std::int32_t d = (ctx->gpu_id == Context::kCpuId) ? current_device : ctx->gpu_id;
|
||||
CHECK_NE(d, Context::kCpuId);
|
||||
auto get_device = [&]() {
|
||||
auto d = (ctx->IsCPU()) ? DeviceOrd::CUDA(current_device) : ctx->Device();
|
||||
CHECK(!d.IsCPU());
|
||||
return d;
|
||||
};
|
||||
|
||||
@@ -61,11 +60,8 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
|
||||
common::HistogramCuts cuts;
|
||||
do {
|
||||
// We use do while here as the first batch is fetched in ctor
|
||||
// ctx_.gpu_id = proxy->DeviceIdx();
|
||||
CHECK_LT(ctx->gpu_id, common::AllVisibleGPUs());
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(get_device()));
|
||||
|
||||
CHECK_LT(ctx->Ordinal(), common::AllVisibleGPUs());
|
||||
dh::safe_cuda(cudaSetDevice(get_device().ordinal));
|
||||
if (cols == 0) {
|
||||
cols = num_cols();
|
||||
collective::Allreduce<collective::Operation::kMax>(&cols, 1);
|
||||
@@ -103,8 +99,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
|
||||
auto n_features = cols;
|
||||
CHECK_GE(n_features, 1) << "Data must has at least 1 column.";
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(get_device()));
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(get_device().ordinal));
|
||||
if (!ref) {
|
||||
HostDeviceVector<FeatureType> ft;
|
||||
common::SketchContainer final_sketch(
|
||||
@@ -143,9 +138,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
|
||||
size_t n_batches_for_verification = 0;
|
||||
while (iter.Next()) {
|
||||
init_page();
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(get_device()));
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(get_device().ordinal));
|
||||
auto rows = num_rows();
|
||||
dh::device_vector<size_t> row_counts(rows + 1, 0);
|
||||
common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
|
||||
@@ -197,18 +190,18 @@ BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(Context const* ctx,
|
||||
if (!ellpack_) {
|
||||
ellpack_.reset(new EllpackPage());
|
||||
if (ctx->IsCUDA()) {
|
||||
this->Info().feature_types.SetDevice(ctx->gpu_id);
|
||||
this->Info().feature_types.SetDevice(ctx->Device());
|
||||
*ellpack_->Impl() =
|
||||
EllpackPageImpl(ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
|
||||
} else if (fmat_ctx_.IsCUDA()) {
|
||||
this->Info().feature_types.SetDevice(fmat_ctx_.gpu_id);
|
||||
this->Info().feature_types.SetDevice(fmat_ctx_.Device());
|
||||
*ellpack_->Impl() =
|
||||
EllpackPageImpl(&fmat_ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
|
||||
} else {
|
||||
// Can happen when QDM is initialized on CPU, but a GPU version is queried by a different QDM
|
||||
// for cut reference.
|
||||
auto cuda_ctx = ctx->MakeCUDA();
|
||||
this->Info().feature_types.SetDevice(cuda_ctx.gpu_id);
|
||||
this->Info().feature_types.SetDevice(cuda_ctx.Device());
|
||||
*ellpack_->Impl() =
|
||||
EllpackPageImpl(&cuda_ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
|
||||
}
|
||||
|
||||
@@ -11,18 +11,18 @@ void DMatrixProxy::SetArrayData(StringView interface_str) {
|
||||
this->batch_ = adapter;
|
||||
this->Info().num_col_ = adapter->NumColumns();
|
||||
this->Info().num_row_ = adapter->NumRows();
|
||||
this->ctx_.gpu_id = Context::kCpuId;
|
||||
this->ctx_.Init(Args{{"device", "cpu"}});
|
||||
}
|
||||
|
||||
void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices,
|
||||
char const *c_values, bst_feature_t n_features, bool on_host) {
|
||||
void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices, char const *c_values,
|
||||
bst_feature_t n_features, bool on_host) {
|
||||
CHECK(on_host) << "Not implemented on device.";
|
||||
std::shared_ptr<CSRArrayAdapter> adapter{new CSRArrayAdapter(
|
||||
StringView{c_indptr}, StringView{c_indices}, StringView{c_values}, n_features)};
|
||||
this->batch_ = adapter;
|
||||
this->Info().num_col_ = adapter->NumColumns();
|
||||
this->Info().num_row_ = adapter->NumRows();
|
||||
this->ctx_.gpu_id = Context::kCpuId;
|
||||
this->ctx_.Init(Args{{"device", "cpu"}});
|
||||
}
|
||||
|
||||
namespace cuda_impl {
|
||||
|
||||
@@ -11,13 +11,13 @@ void DMatrixProxy::FromCudaColumnar(StringView interface_str) {
|
||||
this->batch_ = adapter;
|
||||
this->Info().num_col_ = adapter->NumColumns();
|
||||
this->Info().num_row_ = adapter->NumRows();
|
||||
if (adapter->DeviceIdx() < 0) {
|
||||
if (adapter->Device().IsCPU()) {
|
||||
// empty data
|
||||
CHECK_EQ(this->Info().num_row_, 0);
|
||||
ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
|
||||
return;
|
||||
}
|
||||
ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx());
|
||||
ctx_ = ctx_.MakeCUDA(adapter->Device().ordinal);
|
||||
}
|
||||
|
||||
void DMatrixProxy::FromCudaArray(StringView interface_str) {
|
||||
@@ -25,13 +25,13 @@ void DMatrixProxy::FromCudaArray(StringView interface_str) {
|
||||
this->batch_ = adapter;
|
||||
this->Info().num_col_ = adapter->NumColumns();
|
||||
this->Info().num_row_ = adapter->NumRows();
|
||||
if (adapter->DeviceIdx() < 0) {
|
||||
if (adapter->Device().IsCPU()) {
|
||||
// empty data
|
||||
CHECK_EQ(this->Info().num_row_, 0);
|
||||
ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
|
||||
return;
|
||||
}
|
||||
ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx());
|
||||
ctx_ = ctx_.MakeCUDA(adapter->Device().ordinal);
|
||||
}
|
||||
|
||||
namespace cuda_impl {
|
||||
|
||||
@@ -46,7 +46,7 @@ class DMatrixProxy : public DMatrix {
|
||||
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
|
||||
public:
|
||||
int DeviceIdx() const { return ctx_.gpu_id; }
|
||||
DeviceOrd Device() const { return ctx_.Device(); }
|
||||
|
||||
void SetCUDAArray(char const* c_interface) {
|
||||
common::AssertGPUSupport();
|
||||
|
||||
@@ -75,11 +75,9 @@ DMatrix* SimpleDMatrix::SliceCol(int num_slices, int slice_id) {
|
||||
}
|
||||
|
||||
void SimpleDMatrix::ReindexFeatures(Context const* ctx) {
|
||||
if (info_.IsVerticalFederated()) {
|
||||
std::vector<uint64_t> buffer(collective::GetWorldSize());
|
||||
buffer[collective::GetRank()] = info_.num_col_;
|
||||
collective::Allgather(buffer.data(), buffer.size() * sizeof(uint64_t));
|
||||
auto offset = std::accumulate(buffer.cbegin(), buffer.cbegin() + collective::GetRank(), 0);
|
||||
if (info_.IsColumnSplit() && collective::GetWorldSize() > 1) {
|
||||
auto const cols = collective::Allgather(info_.num_col_);
|
||||
auto const offset = std::accumulate(cols.cbegin(), cols.cbegin() + collective::GetRank(), 0ul);
|
||||
if (offset == 0) {
|
||||
return;
|
||||
}
|
||||
@@ -253,7 +251,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
|
||||
}
|
||||
if (batch.BaseMargin() != nullptr) {
|
||||
info_.base_margin_ = decltype(info_.base_margin_){
|
||||
batch.BaseMargin(), batch.BaseMargin() + batch.Size(), {batch.Size()}, Context::kCpuId};
|
||||
batch.BaseMargin(), batch.BaseMargin() + batch.Size(), {batch.Size()}, DeviceOrd::CPU()};
|
||||
}
|
||||
if (batch.Qid() != nullptr) {
|
||||
qids.insert(qids.end(), batch.Qid(), batch.Qid() + batch.Size());
|
||||
@@ -361,78 +359,4 @@ template SimpleDMatrix::SimpleDMatrix(FileAdapter* adapter, float missing, int n
|
||||
template SimpleDMatrix::SimpleDMatrix(
|
||||
IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>* adapter,
|
||||
float missing, int nthread, DataSplitMode data_split_mode);
|
||||
|
||||
/**
 * \brief Specialization that fills the in-memory sparse page from Arrow record batches.
 *
 * The adapter is rewound and then consumed group by group. For every group the record
 * batches are imported in parallel, their row offsets are shifted into the global
 * numbering, and their non-NaN entries and row offsets are copied into the sparse page.
 * Once the adapter is exhausted, the column count, data split mode, row count and
 * non-zero count are written to info_, and the local context is stored in fmat_ctx_.
 *
 * \param adapter         Source of record-batch groups; iterated via BeforeFirst()/Next().
 * \param missing         Forwarded unchanged to each batch's Import().
 * \param nthread         Stored in the local Context; ctx.Threads() sizes the OpenMP regions.
 * \param data_split_mode Written to info_.data_split_mode before ReindexFeatures() is called.
 */
template <>
|
||||
SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, int nthread,
|
||||
DataSplitMode data_split_mode) {
|
||||
Context ctx;
|
||||
ctx.nthread = nthread;
|
||||
|
||||
auto& offset_vec = sparse_page_->offset.HostVector();
|
||||
auto& data_vec = sparse_page_->data.HostVector();
|
||||
// Running totals over every group consumed so far (rows and stored elements).
uint64_t total_batch_size = 0;
|
||||
uint64_t total_elements = 0;
|
||||
|
||||
adapter->BeforeFirst();
|
||||
// Iterate over batches of input data
|
||||
while (adapter->Next()) {
|
||||
auto& batches = adapter->Value();
|
||||
// Per-group counters, accumulated by the OpenMP reduction below.
size_t num_elements = 0;
|
||||
size_t num_rows = 0;
|
||||
// Import Arrow RecordBatches
|
||||
// The value returned by Import() is summed into num_elements, Size() into num_rows.
#pragma omp parallel for reduction(+ : num_elements, num_rows) num_threads(ctx.Threads())
|
||||
for (int i = 0; i < static_cast<int>(batches.size()); ++i) { // NOLINT
|
||||
num_elements += batches[i]->Import(missing);
|
||||
num_rows += batches[i]->Size();
|
||||
}
|
||||
total_elements += num_elements;
|
||||
total_batch_size += num_rows;
|
||||
// Compute global offset for every row and starting row for every batch
|
||||
std::vector<uint64_t> batch_offsets(batches.size());
|
||||
// Each iteration after the first reads values produced for batch i - 1.
for (size_t i = 0; i < batches.size(); ++i) {
|
||||
if (i == 0) {
|
||||
// The totals already include this group, so subtracting the group's own counts
// gives the number of rows / elements that precede its first batch.
batch_offsets[i] = total_batch_size - num_rows;
|
||||
batches[i]->ShiftRowOffsets(total_elements - num_elements);
|
||||
} else {
|
||||
// Continue from where the previous batch ends: its starting row plus its size,
// and its last (already shifted) row offset.
batch_offsets[i] = batch_offsets[i - 1] + batches[i - 1]->Size();
|
||||
batches[i]->ShiftRowOffsets(batches[i - 1]->RowOffsets().back());
|
||||
}
|
||||
}
|
||||
// Pre-allocate DMatrix memory
|
||||
data_vec.resize(total_elements);
|
||||
offset_vec.resize(total_batch_size + 1);
|
||||
// Copy data into DMatrix
|
||||
// Both worksharing loops are marked nowait; the first writes only data_vec and the
// second writes only offset_vec.
#pragma omp parallel num_threads(ctx.Threads())
|
||||
{
|
||||
#pragma omp for nowait
|
||||
for (int i = 0; i < static_cast<int>(batches.size()); ++i) { // NOLINT
|
||||
// First write position in data_vec for this batch, taken from its shifted offsets.
size_t begin = batches[i]->RowOffsets()[0];
|
||||
for (size_t k = 0; k < batches[i]->Size(); ++k) {
|
||||
for (size_t j = 0; j < batches[i]->NumColumns(); ++j) {
|
||||
auto element = batches[i]->GetColumn(j).GetElement(k);
|
||||
// NaN-valued elements are not stored.
// NOTE(review): presumably Import(missing) represents missing values as NaN - confirm.
if (!std::isnan(element.value)) {
|
||||
data_vec[begin++] = Entry(element.column_idx, element.value);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#pragma omp for nowait
|
||||
for (int i = 0; i < static_cast<int>(batches.size()); ++i) {
|
||||
auto& offsets = batches[i]->RowOffsets();
|
||||
// The leading offset of each batch is skipped; the remaining ones are placed right
// after the batch's starting row. offset_vec[0] is not written by this loop.
std::copy(offsets.begin() + 1, offsets.end(), offset_vec.begin() + batch_offsets[i] + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Synchronise worker columns
|
||||
info_.num_col_ = adapter->NumColumns();
|
||||
info_.data_split_mode = data_split_mode;
|
||||
ReindexFeatures(&ctx);
|
||||
info_.SynchronizeNumberOfColumns();
|
||||
|
||||
info_.num_row_ = total_batch_size;
|
||||
info_.num_nonzero_ = data_vec.size();
|
||||
// Sanity check: the last row offset must match the number of stored entries.
CHECK_EQ(offset_vec.back(), info_.num_nonzero_);
|
||||
|
||||
fmat_ctx_ = ctx;
|
||||
}
|
||||
} // namespace xgboost::data
|
||||
|
||||
@@ -10,9 +10,7 @@
|
||||
#include "xgboost/context.h" // for Context
|
||||
#include "xgboost/data.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
namespace xgboost::data {
|
||||
// Does not currently support metainfo as no on-device data source contains this
|
||||
// Current implementation assumes a single batch. More batches can
|
||||
// be supported in future. Does not currently support inferring row/column size
|
||||
@@ -21,14 +19,14 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr
|
||||
DataSplitMode data_split_mode) {
|
||||
CHECK(data_split_mode != DataSplitMode::kCol)
|
||||
<< "Column-wise data split is currently not supported on the GPU.";
|
||||
auto device = (adapter->DeviceIdx() < 0 || adapter->NumRows() == 0) ? dh::CurrentDevice()
|
||||
: adapter->DeviceIdx();
|
||||
CHECK_GE(device, 0);
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device));
|
||||
auto device = (adapter->Device().IsCPU() || adapter->NumRows() == 0)
|
||||
? DeviceOrd::CUDA(dh::CurrentDevice())
|
||||
: adapter->Device();
|
||||
CHECK(device.IsCUDA());
|
||||
dh::safe_cuda(cudaSetDevice(device.ordinal));
|
||||
|
||||
Context ctx;
|
||||
ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}});
|
||||
ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", device.Name()}});
|
||||
|
||||
CHECK(adapter->NumRows() != kAdapterUnknownSize);
|
||||
CHECK(adapter->NumColumns() != kAdapterUnknownSize);
|
||||
@@ -53,5 +51,4 @@ template SimpleDMatrix::SimpleDMatrix(CudfAdapter* adapter, float missing,
|
||||
int nthread, DataSplitMode data_split_mode);
|
||||
template SimpleDMatrix::SimpleDMatrix(CupyAdapter* adapter, float missing,
|
||||
int nthread, DataSplitMode data_split_mode);
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::data
|
||||
|
||||
@@ -54,11 +54,9 @@ void CopyDataToDMatrix(AdapterBatchT batch, common::Span<Entry> data,
|
||||
}
|
||||
|
||||
template <typename AdapterBatchT>
|
||||
void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
|
||||
int device_idx, float missing) {
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||
|
||||
void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset, DeviceOrd device,
|
||||
float missing) {
|
||||
dh::safe_cuda(cudaSetDevice(device.ordinal));
|
||||
IsValidFunctor is_valid(missing);
|
||||
// Count elements per row
|
||||
dh::LaunchN(batch.Size(), [=] __device__(size_t idx) {
|
||||
@@ -71,22 +69,19 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
|
||||
});
|
||||
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
thrust::exclusive_scan(thrust::hip::par(alloc),
|
||||
thrust::device_pointer_cast(offset.data()),
|
||||
thrust::device_pointer_cast(offset.data() + offset.size()),
|
||||
thrust::device_pointer_cast(offset.data()));
|
||||
#elif defined(XGBOOST_USE_CUDA)
|
||||
thrust::exclusive_scan(thrust::cuda::par(alloc),
|
||||
thrust::device_pointer_cast(offset.data()),
|
||||
thrust::device_pointer_cast(offset.data() + offset.size()),
|
||||
thrust::device_pointer_cast(offset.data()));
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
thrust::exclusive_scan(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
|
||||
thrust::device_pointer_cast(offset.data() + offset.size()),
|
||||
thrust::device_pointer_cast(offset.data()));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
thrust::exclusive_scan(thrust::hip::par(alloc), thrust::device_pointer_cast(offset.data()),
|
||||
thrust::device_pointer_cast(offset.data() + offset.size()),
|
||||
thrust::device_pointer_cast(offset.data()));
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename AdapterBatchT>
|
||||
size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missing,
|
||||
size_t CopyToSparsePage(AdapterBatchT const& batch, DeviceOrd device, float missing,
|
||||
SparsePage* page) {
|
||||
bool valid = NoInfInData(batch, IsValidFunctor{missing});
|
||||
CHECK(valid) << error::InfInData();
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*!
|
||||
* Copyright 2015-2022 by XGBoost Contributors
|
||||
/**
|
||||
* Copyright 2015-2023, XGBoost Contributors
|
||||
* \file simple_dmatrix.h
|
||||
* \brief In-memory version of DMatrix.
|
||||
* \author Tianqi Chen
|
||||
@@ -15,8 +15,7 @@
|
||||
|
||||
#include "gradient_index.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
namespace xgboost::data {
|
||||
// Used for single batch data.
|
||||
class SimpleDMatrix : public DMatrix {
|
||||
public:
|
||||
@@ -65,9 +64,10 @@ class SimpleDMatrix : public DMatrix {
|
||||
/**
|
||||
* \brief Reindex the features based on a global view.
|
||||
*
|
||||
* In some cases (e.g. vertical federated learning), features are loaded locally with indices
|
||||
* starting from 0. However, all the algorithms assume the features are globally indexed, so we
|
||||
* reindex the features based on the offset needed to obtain the global view.
|
||||
* In some cases (e.g. column-wise data split and vertical federated learning), features are
|
||||
* loaded locally with indices starting from 0. However, all the algorithms assume the features
|
||||
* are globally indexed, so we reindex the features based on the offset needed to obtain the
|
||||
* global view.
|
||||
*/
|
||||
void ReindexFeatures(Context const* ctx);
|
||||
|
||||
@@ -75,6 +75,5 @@ class SimpleDMatrix : public DMatrix {
|
||||
// Context used only for DMatrix initialization.
|
||||
Context fmat_ctx_;
|
||||
};
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::data
|
||||
#endif // XGBOOST_DATA_SIMPLE_DMATRIX_H_
|
||||
|
||||
@@ -45,7 +45,8 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
|
||||
ellpack_page_source_.reset(); // make sure resource is released before making new ones.
|
||||
ellpack_page_source_ = std::make_shared<EllpackPageSource>(
|
||||
this->missing_, ctx->Threads(), this->Info().num_col_, this->n_batches_, cache_info_.at(id),
|
||||
param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_, ctx->gpu_id);
|
||||
param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_,
|
||||
ctx->Device());
|
||||
} else {
|
||||
CHECK(sparse_page_source_);
|
||||
ellpack_page_source_->Reset();
|
||||
|
||||
@@ -19,11 +19,11 @@ std::size_t NFeaturesDevice(DMatrixProxy *proxy) {
|
||||
} // namespace detail
|
||||
|
||||
void DevicePush(DMatrixProxy *proxy, float missing, SparsePage *page) {
|
||||
auto device = proxy->DeviceIdx();
|
||||
if (device < 0) {
|
||||
device = dh::CurrentDevice();
|
||||
auto device = proxy->Device();
|
||||
if (device.IsCPU()) {
|
||||
device = DeviceOrd::CUDA(dh::CurrentDevice());
|
||||
}
|
||||
CHECK_GE(device, 0);
|
||||
CHECK(device.IsCUDA());
|
||||
|
||||
cuda_impl::Dispatch(proxy,
|
||||
[&](auto const &value) { CopyToSparsePage(value, device, missing, page); });
|
||||
|
||||
@@ -177,15 +177,15 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
|
||||
}
|
||||
// A heuristic for the number of pre-fetched batches. We can make it part of BatchParam
|
||||
// to let user adjust number of pre-fetched batches when needed.
|
||||
uint32_t constexpr kPreFetch = 3;
|
||||
|
||||
size_t n_prefetch_batches = std::min(kPreFetch, n_batches_);
|
||||
std::int32_t n_prefetches = std::max(nthreads_, 3);
|
||||
std::int32_t n_prefetch_batches =
|
||||
std::min(static_cast<std::uint32_t>(n_prefetches), n_batches_);
|
||||
CHECK_GT(n_prefetch_batches, 0) << "total batches:" << n_batches_;
|
||||
std::size_t fetch_it = count_;
|
||||
|
||||
exce_.Rethrow();
|
||||
|
||||
for (std::size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
|
||||
for (std::int32_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
|
||||
fetch_it %= n_batches_; // ring
|
||||
if (ring_->at(fetch_it).valid()) {
|
||||
continue;
|
||||
|
||||
@@ -212,7 +212,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair,
|
||||
bst_target_t const n_groups = model_.learner_model_param->OutputLength();
|
||||
monitor_.Start("BoostNewTrees");
|
||||
|
||||
predt->predictions.SetDevice(ctx_->Ordinal());
|
||||
predt->predictions.SetDevice(ctx_->Device());
|
||||
auto out = linalg::MakeTensorView(ctx_, &predt->predictions, p_fmat->Info().num_row_,
|
||||
model_.learner_model_param->OutputLength());
|
||||
CHECK_NE(n_groups, 0);
|
||||
@@ -248,7 +248,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair,
|
||||
} else {
|
||||
CHECK_EQ(in_gpair->Size() % n_groups, 0U) << "must have exactly ngroup * nrow gpairs";
|
||||
linalg::Matrix<GradientPair> tmp{{in_gpair->Shape(0), static_cast<std::size_t>(1ul)},
|
||||
ctx_->Ordinal()};
|
||||
ctx_->Device()};
|
||||
bool update_predict = true;
|
||||
for (bst_target_t gid = 0; gid < n_groups; ++gid) {
|
||||
node_position.clear();
|
||||
@@ -736,7 +736,7 @@ class Dart : public GBTree {
|
||||
|
||||
PredictionCacheEntry predts; // temporary storage for prediction
|
||||
if (ctx_->IsCUDA()) {
|
||||
predts.predictions.SetDevice(ctx_->gpu_id);
|
||||
predts.predictions.SetDevice(ctx_->Device());
|
||||
}
|
||||
predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0);
|
||||
// multi-target is not yet supported.
|
||||
@@ -761,8 +761,8 @@ class Dart : public GBTree {
|
||||
CHECK_EQ(p_out_preds->predictions.Size(), predts.predictions.Size());
|
||||
|
||||
size_t n_rows = p_fmat->Info().num_row_;
|
||||
if (predts.predictions.DeviceIdx() != Context::kCpuId) {
|
||||
p_out_preds->predictions.SetDevice(predts.predictions.DeviceIdx());
|
||||
if (predts.predictions.Device().IsCUDA()) {
|
||||
p_out_preds->predictions.SetDevice(predts.predictions.Device());
|
||||
GPUDartPredictInc(p_out_preds->predictions.DeviceSpan(),
|
||||
predts.predictions.DeviceSpan(), w, n_rows, n_groups,
|
||||
group);
|
||||
@@ -801,8 +801,8 @@ class Dart : public GBTree {
|
||||
|
||||
StringView msg{"Unsupported data type for inplace predict."};
|
||||
PredictionCacheEntry predts;
|
||||
if (ctx_->gpu_id != Context::kCpuId) {
|
||||
predts.predictions.SetDevice(ctx_->gpu_id);
|
||||
if (ctx_->IsCUDA()) {
|
||||
predts.predictions.SetDevice(ctx_->Device());
|
||||
}
|
||||
predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0);
|
||||
|
||||
@@ -838,8 +838,8 @@ class Dart : public GBTree {
|
||||
CHECK_EQ(predts.predictions.Size(), p_out_preds->predictions.Size());
|
||||
|
||||
size_t n_rows = p_fmat->Info().num_row_;
|
||||
if (predts.predictions.DeviceIdx() != Context::kCpuId) {
|
||||
p_out_preds->predictions.SetDevice(predts.predictions.DeviceIdx());
|
||||
if (predts.predictions.Device().IsCUDA()) {
|
||||
p_out_preds->predictions.SetDevice(predts.predictions.Device());
|
||||
auto base_score = model_.learner_model_param->BaseScore(predts.predictions.Device());
|
||||
GPUDartInplacePredictInc(p_out_preds->predictions.DeviceSpan(),
|
||||
predts.predictions.DeviceSpan(), w, n_rows, base_score, n_groups,
|
||||
|
||||
@@ -106,14 +106,30 @@ void GBTreeModel::Load(dmlc::Stream* fi) {
|
||||
Validate(*this);
|
||||
}
|
||||
|
||||
namespace {
|
||||
std::int32_t IOThreads(Context const* ctx) {
|
||||
CHECK(ctx);
|
||||
std::int32_t n_threads = ctx->Threads();
|
||||
// CRAN checks for number of threads used by examples, but we might not have the right
|
||||
// number of threads when serializing/unserializing models as nthread is a booster
|
||||
// parameter, which is only effective after booster initialization.
|
||||
//
|
||||
// The threshold ratio of CPU time to user time for R is 2.5, we set the number of
|
||||
// threads to 2.
|
||||
#if defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
|
||||
n_threads = std::min(2, n_threads);
|
||||
#endif
|
||||
return n_threads;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void GBTreeModel::SaveModel(Json* p_out) const {
|
||||
auto& out = *p_out;
|
||||
CHECK_EQ(param.num_trees, static_cast<int>(trees.size()));
|
||||
out["gbtree_model_param"] = ToJson(param);
|
||||
std::vector<Json> trees_json(trees.size());
|
||||
|
||||
CHECK(ctx_);
|
||||
common::ParallelFor(trees.size(), ctx_->Threads(), [&](auto t) {
|
||||
common::ParallelFor(trees.size(), IOThreads(ctx_), [&](auto t) {
|
||||
auto const& tree = trees[t];
|
||||
Json jtree{Object{}};
|
||||
tree->SaveModel(&jtree);
|
||||
@@ -151,9 +167,7 @@ void GBTreeModel::LoadModel(Json const& in) {
|
||||
CHECK_EQ(tree_info_json.size(), param.num_trees);
|
||||
tree_info.resize(param.num_trees);
|
||||
|
||||
CHECK(ctx_);
|
||||
|
||||
common::ParallelFor(param.num_trees, ctx_->Threads(), [&](auto t) {
|
||||
common::ParallelFor(param.num_trees, IOThreads(ctx_), [&](auto t) {
|
||||
auto tree_id = get<Integer const>(trees_json[t]["id"]);
|
||||
trees.at(tree_id).reset(new RegTree{});
|
||||
trees[tree_id]->LoadModel(trees_json[t]);
|
||||
|
||||
@@ -278,7 +278,7 @@ LearnerModelParam::LearnerModelParam(Context const* ctx, LearnerModelParamLegacy
|
||||
std::swap(base_score_, base_margin);
|
||||
// Make sure read access everywhere for thread-safe prediction.
|
||||
std::as_const(base_score_).HostView();
|
||||
if (!ctx->IsCPU()) {
|
||||
if (ctx->IsCUDA()) {
|
||||
std::as_const(base_score_).View(ctx->Device());
|
||||
}
|
||||
CHECK(std::as_const(base_score_).Data()->HostCanRead());
|
||||
@@ -287,7 +287,7 @@ LearnerModelParam::LearnerModelParam(Context const* ctx, LearnerModelParamLegacy
|
||||
linalg::TensorView<float const, 1> LearnerModelParam::BaseScore(DeviceOrd device) const {
|
||||
// multi-class is not yet supported.
|
||||
CHECK_EQ(base_score_.Size(), 1) << ModelNotFitted();
|
||||
if (device.IsCPU()) {
|
||||
if (!device.IsCUDA()) {
|
||||
// Make sure that we won't run into race condition.
|
||||
CHECK(base_score_.Data()->HostCanRead());
|
||||
return base_score_.HostView();
|
||||
@@ -305,10 +305,10 @@ linalg::TensorView<float const, 1> LearnerModelParam::BaseScore(Context const* c
|
||||
|
||||
void LearnerModelParam::Copy(LearnerModelParam const& that) {
|
||||
base_score_.Reshape(that.base_score_.Shape());
|
||||
base_score_.Data()->SetDevice(that.base_score_.DeviceIdx());
|
||||
base_score_.Data()->SetDevice(that.base_score_.Device());
|
||||
base_score_.Data()->Copy(*that.base_score_.Data());
|
||||
std::as_const(base_score_).HostView();
|
||||
if (that.base_score_.DeviceIdx() != Context::kCpuId) {
|
||||
if (!that.base_score_.Device().IsCPU()) {
|
||||
std::as_const(base_score_).View(that.base_score_.Device());
|
||||
}
|
||||
CHECK_EQ(base_score_.Data()->DeviceCanRead(), that.base_score_.Data()->DeviceCanRead());
|
||||
@@ -424,7 +424,7 @@ class LearnerConfiguration : public Learner {
|
||||
if (mparam_.boost_from_average && !UsePtr(gbm_)->ModelFitted()) {
|
||||
if (p_fmat) {
|
||||
auto const& info = p_fmat->Info();
|
||||
info.Validate(Ctx()->Ordinal());
|
||||
info.Validate(Ctx()->Device());
|
||||
// We estimate it from input data.
|
||||
linalg::Tensor<float, 1> base_score;
|
||||
InitEstimation(info, &base_score);
|
||||
@@ -446,7 +446,7 @@ class LearnerConfiguration : public Learner {
|
||||
monitor_.Init("Learner");
|
||||
for (std::shared_ptr<DMatrix> const& d : cache) {
|
||||
if (d) {
|
||||
prediction_container_.Cache(d, Context::kCpuId);
|
||||
prediction_container_.Cache(d, DeviceOrd::CPU());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1057,7 +1057,7 @@ class LearnerIO : public LearnerConfiguration {
|
||||
? std::numeric_limits<float>::quiet_NaN()
|
||||
: obj_->ProbToMargin(mparam_.base_score)},
|
||||
{1},
|
||||
Context::kCpuId},
|
||||
DeviceOrd::CPU()},
|
||||
obj_->Task(), tparam_.multi_strategy);
|
||||
|
||||
if (attributes_.find("objective") != attributes_.cend()) {
|
||||
@@ -1282,7 +1282,7 @@ class LearnerImpl : public LearnerIO {
|
||||
|
||||
this->ValidateDMatrix(train.get(), true);
|
||||
|
||||
auto& predt = prediction_container_.Cache(train, ctx_.gpu_id);
|
||||
auto& predt = prediction_container_.Cache(train, ctx_.Device());
|
||||
|
||||
monitor_.Start("PredictRaw");
|
||||
this->PredictRaw(train.get(), &predt, true, 0, 0);
|
||||
@@ -1312,7 +1312,7 @@ class LearnerImpl : public LearnerIO {
|
||||
CHECK_EQ(this->learner_model_param_.OutputLength(), in_gpair->Shape(1))
|
||||
<< "The number of columns in gradient should be equal to the number of targets/classes in "
|
||||
"the model.";
|
||||
auto& predt = prediction_container_.Cache(train, ctx_.gpu_id);
|
||||
auto& predt = prediction_container_.Cache(train, ctx_.Device());
|
||||
gbm_->DoBoost(train.get(), in_gpair, &predt, obj_.get());
|
||||
monitor_.Stop("BoostOneIter");
|
||||
}
|
||||
@@ -1330,17 +1330,19 @@ class LearnerImpl : public LearnerIO {
|
||||
if (metrics_.empty() && tparam_.disable_default_eval_metric <= 0) {
|
||||
metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric(), &ctx_));
|
||||
auto config = obj_->DefaultMetricConfig();
|
||||
metrics_.back()->LoadConfig(config);
|
||||
if (!IsA<Null>(config)) {
|
||||
metrics_.back()->LoadConfig(config);
|
||||
}
|
||||
metrics_.back()->Configure({cfg_.begin(), cfg_.end()});
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < data_sets.size(); ++i) {
|
||||
std::shared_ptr<DMatrix> m = data_sets[i];
|
||||
auto &predt = prediction_container_.Cache(m, ctx_.gpu_id);
|
||||
auto &predt = prediction_container_.Cache(m, ctx_.Device());
|
||||
this->ValidateDMatrix(m.get(), false);
|
||||
this->PredictRaw(m.get(), &predt, false, 0, 0);
|
||||
|
||||
auto &out = output_predictions_.Cache(m, ctx_.gpu_id).predictions;
|
||||
auto &out = output_predictions_.Cache(m, ctx_.Device()).predictions;
|
||||
out.Resize(predt.predictions.Size());
|
||||
out.Copy(predt.predictions);
|
||||
|
||||
@@ -1376,7 +1378,7 @@ class LearnerImpl : public LearnerIO {
|
||||
} else if (pred_leaf) {
|
||||
gbm_->PredictLeaf(data.get(), out_preds, layer_begin, layer_end);
|
||||
} else {
|
||||
auto& prediction = prediction_container_.Cache(data, ctx_.gpu_id);
|
||||
auto& prediction = prediction_container_.Cache(data, ctx_.Device());
|
||||
this->PredictRaw(data.get(), &prediction, training, layer_begin, layer_end);
|
||||
// Copy the prediction cache to output prediction. out_preds comes from C API
|
||||
out_preds->SetDevice(ctx_.Device());
|
||||
@@ -1456,7 +1458,7 @@ class LearnerImpl : public LearnerIO {
|
||||
|
||||
void ValidateDMatrix(DMatrix* p_fmat, bool is_training) const {
|
||||
MetaInfo const& info = p_fmat->Info();
|
||||
info.Validate(ctx_.gpu_id);
|
||||
info.Validate(ctx_.Device());
|
||||
|
||||
if (is_training) {
|
||||
CHECK_EQ(learner_model_param_.num_feature, p_fmat->Info().num_col_)
|
||||
|
||||
@@ -48,7 +48,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
|
||||
}
|
||||
|
||||
void LazyInitDevice(DMatrix *p_fmat, const LearnerModelParam &model_param) {
|
||||
if (ctx_->gpu_id < 0) return;
|
||||
if (ctx_->IsCPU()) return;
|
||||
|
||||
num_row_ = static_cast<size_t>(p_fmat->Info().num_row_);
|
||||
|
||||
@@ -60,8 +60,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
|
||||
return;
|
||||
}
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
|
||||
// The begin and end indices for the section of each column associated with
|
||||
// this device
|
||||
std::vector<std::pair<bst_uint, bst_uint>> column_segments;
|
||||
@@ -135,7 +134,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
|
||||
++group_idx) {
|
||||
// Get gradient
|
||||
auto grad = GradientPair(0, 0);
|
||||
if (ctx_->gpu_id >= 0) {
|
||||
if (ctx_->IsCUDA()) {
|
||||
grad = GetBiasGradient(group_idx, model->learner_model_param->num_output_group);
|
||||
}
|
||||
auto dbias = static_cast<float>(
|
||||
@@ -144,7 +143,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
|
||||
model->Bias()[group_idx] += dbias;
|
||||
|
||||
// Update residual
|
||||
if (ctx_->gpu_id >= 0) {
|
||||
if (ctx_->IsCUDA()) {
|
||||
UpdateBiasResidual(dbias, group_idx, model->learner_model_param->num_output_group);
|
||||
}
|
||||
}
|
||||
@@ -155,7 +154,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
|
||||
bst_float &w = (*model)[fidx][group_idx];
|
||||
// Get gradient
|
||||
auto grad = GradientPair(0, 0);
|
||||
if (ctx_->gpu_id >= 0) {
|
||||
if (ctx_->IsCUDA()) {
|
||||
grad = GetGradient(group_idx, model->learner_model_param->num_output_group, fidx);
|
||||
}
|
||||
auto dw = static_cast<float>(tparam_.learning_rate *
|
||||
@@ -164,15 +163,14 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
|
||||
tparam_.reg_lambda_denorm));
|
||||
w += dw;
|
||||
|
||||
if (ctx_->gpu_id >= 0) {
|
||||
if (ctx_->IsCUDA()) {
|
||||
UpdateResidual(dw, group_idx, model->learner_model_param->num_output_group, fidx);
|
||||
}
|
||||
}
|
||||
|
||||
// This needs to be public because of the __device__ lambda.
|
||||
GradientPair GetBiasGradient(int group_idx, int num_group) {
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
|
||||
auto counting = thrust::make_counting_iterator(0ull);
|
||||
auto f = [=] __device__(size_t idx) {
|
||||
return idx * num_group + group_idx;
|
||||
@@ -196,8 +194,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
|
||||
|
||||
// This needs to be public because of the __device__ lambda.
|
||||
GradientPair GetGradient(int group_idx, int num_group, int fidx) {
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
|
||||
common::Span<xgboost::Entry> d_col = dh::ToSpan(data_).subspan(row_ptr_[fidx]);
|
||||
size_t col_size = row_ptr_[fidx + 1] - row_ptr_[fidx];
|
||||
common::Span<GradientPair> d_gpair = dh::ToSpan(gpair_);
|
||||
|
||||
@@ -23,8 +23,7 @@
|
||||
#include "xgboost/linalg.h"
|
||||
#include "xgboost/metric.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace metric {
|
||||
namespace xgboost::metric {
|
||||
// Tag this file; the tag is used later to force static linking.
|
||||
DMLC_REGISTRY_FILE_TAG(auc);
|
||||
/**
|
||||
@@ -257,10 +256,10 @@ template <typename Curve>
|
||||
class EvalAUC : public MetricNoCache {
|
||||
double Eval(const HostDeviceVector<bst_float> &preds, const MetaInfo &info) override {
|
||||
double auc {0};
|
||||
if (ctx_->gpu_id != Context::kCpuId) {
|
||||
preds.SetDevice(ctx_->gpu_id);
|
||||
info.labels.SetDevice(ctx_->gpu_id);
|
||||
info.weights_.SetDevice(ctx_->gpu_id);
|
||||
if (ctx_->Device().IsCUDA()) {
|
||||
preds.SetDevice(ctx_->Device());
|
||||
info.labels.SetDevice(ctx_->Device());
|
||||
info.weights_.SetDevice(ctx_->Device());
|
||||
}
|
||||
// We use the global size to handle empty dataset.
|
||||
std::array<size_t, 2> meta{info.labels.Size(), preds.Size()};
|
||||
@@ -329,7 +328,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
|
||||
double auc{0};
|
||||
uint32_t valid_groups = 0;
|
||||
auto n_threads = ctx_->Threads();
|
||||
if (ctx_->gpu_id == Context::kCpuId) {
|
||||
if (ctx_->IsCPU()) {
|
||||
std::tie(auc, valid_groups) =
|
||||
RankingAUC<true>(ctx_, predts.ConstHostVector(), info, n_threads);
|
||||
} else {
|
||||
@@ -344,7 +343,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
|
||||
double auc{0};
|
||||
auto n_threads = ctx_->Threads();
|
||||
CHECK_NE(n_classes, 0);
|
||||
if (ctx_->gpu_id == Context::kCpuId) {
|
||||
if (ctx_->IsCPU()) {
|
||||
auc = MultiClassOVR(ctx_, predts.ConstHostVector(), info, n_classes, n_threads, BinaryROCAUC);
|
||||
} else {
|
||||
auc = GPUMultiClassROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_, n_classes);
|
||||
@@ -355,7 +354,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
|
||||
std::tuple<double, double, double>
|
||||
EvalBinary(HostDeviceVector<float> const &predts, MetaInfo const &info) {
|
||||
double fp, tp, auc;
|
||||
if (ctx_->gpu_id == Context::kCpuId) {
|
||||
if (ctx_->IsCPU()) {
|
||||
std::tie(fp, tp, auc) = BinaryROCAUC(ctx_, predts.ConstHostVector(),
|
||||
info.labels.HostView().Slice(linalg::All(), 0),
|
||||
common::OptionalWeights{info.weights_.ConstHostSpan()});
|
||||
@@ -367,7 +366,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
|
||||
}
|
||||
|
||||
public:
|
||||
char const* Name() const override {
|
||||
[[nodiscard]] char const* Name() const override {
|
||||
return "auc";
|
||||
}
|
||||
};
|
||||
@@ -405,7 +404,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
|
||||
std::tuple<double, double, double>
|
||||
EvalBinary(HostDeviceVector<float> const &predts, MetaInfo const &info) {
|
||||
double pr, re, auc;
|
||||
if (ctx_->gpu_id == Context::kCpuId) {
|
||||
if (ctx_->IsCPU()) {
|
||||
std::tie(pr, re, auc) =
|
||||
BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0),
|
||||
common::OptionalWeights{info.weights_.ConstHostSpan()});
|
||||
@@ -418,7 +417,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
|
||||
|
||||
double EvalMultiClass(HostDeviceVector<float> const &predts, MetaInfo const &info,
|
||||
size_t n_classes) {
|
||||
if (ctx_->gpu_id == Context::kCpuId) {
|
||||
if (ctx_->IsCPU()) {
|
||||
auto n_threads = this->ctx_->Threads();
|
||||
return MultiClassOVR(ctx_, predts.ConstHostSpan(), info, n_classes, n_threads, BinaryPRAUC);
|
||||
} else {
|
||||
@@ -431,7 +430,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
|
||||
double auc{0};
|
||||
uint32_t valid_groups = 0;
|
||||
auto n_threads = ctx_->Threads();
|
||||
if (ctx_->gpu_id == Context::kCpuId) {
|
||||
if (ctx_->IsCPU()) {
|
||||
auto labels = info.labels.Data()->ConstHostSpan();
|
||||
if (std::any_of(labels.cbegin(), labels.cend(), PRAUCLabelInvalid{})) {
|
||||
InvalidLabels();
|
||||
@@ -446,7 +445,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
|
||||
}
|
||||
|
||||
public:
|
||||
const char *Name() const override { return "aucpr"; }
|
||||
[[nodiscard]] const char *Name() const override { return "aucpr"; }
|
||||
};
|
||||
|
||||
XGBOOST_REGISTER_METRIC(AUCPR, "aucpr")
|
||||
@@ -473,5 +472,4 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *, common::Span<f
|
||||
return {};
|
||||
}
|
||||
#endif
|
||||
} // namespace metric
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::metric
|
||||
|
||||
@@ -926,8 +926,7 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
|
||||
common::Span<float const> predts,
|
||||
MetaInfo const &info,
|
||||
std::shared_ptr<DeviceAUCCache> *p_cache) {
|
||||
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
|
||||
if (predts.empty()) {
|
||||
return std::make_pair(0.0, static_cast<uint32_t>(0));
|
||||
}
|
||||
|
||||
@@ -46,7 +46,26 @@ template <typename Fn>
|
||||
PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) {
|
||||
PackedReduceResult result;
|
||||
auto labels = info.labels.View(ctx->Device());
|
||||
if (ctx->IsCPU()) {
|
||||
if (ctx->IsCUDA()) {
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
thrust::counting_iterator<size_t> begin(0);
|
||||
thrust::counting_iterator<size_t> end = begin + labels.Size();
|
||||
result = thrust::transform_reduce(
|
||||
thrust::cuda::par(alloc), begin, end,
|
||||
[=] XGBOOST_DEVICE(size_t i) {
|
||||
auto idx = linalg::UnravelIndex(i, labels.Shape());
|
||||
auto sample_id = std::get<0>(idx);
|
||||
auto target_id = std::get<1>(idx);
|
||||
auto res = loss(i, sample_id, target_id);
|
||||
float v{std::get<0>(res)}, wt{std::get<1>(res)};
|
||||
return PackedReduceResult{v, wt};
|
||||
},
|
||||
PackedReduceResult{}, thrust::plus<PackedReduceResult>());
|
||||
#else
|
||||
common::AssertGPUSupport();
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
} else {
|
||||
auto n_threads = ctx->Threads();
|
||||
std::vector<double> score_tloc(n_threads, 0.0);
|
||||
std::vector<double> weight_tloc(n_threads, 0.0);
|
||||
@@ -69,41 +88,6 @@ PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) {
|
||||
double residue_sum = std::accumulate(score_tloc.cbegin(), score_tloc.cend(), 0.0);
|
||||
double weights_sum = std::accumulate(weight_tloc.cbegin(), weight_tloc.cend(), 0.0);
|
||||
result = PackedReduceResult{residue_sum, weights_sum};
|
||||
} else {
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
thrust::counting_iterator<size_t> begin(0);
|
||||
thrust::counting_iterator<size_t> end = begin + labels.Size();
|
||||
result = thrust::transform_reduce(
|
||||
thrust::cuda::par(alloc), begin, end,
|
||||
[=] XGBOOST_DEVICE(size_t i) {
|
||||
auto idx = linalg::UnravelIndex(i, labels.Shape());
|
||||
auto sample_id = std::get<0>(idx);
|
||||
auto target_id = std::get<1>(idx);
|
||||
auto res = loss(i, sample_id, target_id);
|
||||
float v{std::get<0>(res)}, wt{std::get<1>(res)};
|
||||
return PackedReduceResult{v, wt};
|
||||
},
|
||||
PackedReduceResult{}, thrust::plus<PackedReduceResult>());
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
thrust::counting_iterator<size_t> begin(0);
|
||||
thrust::counting_iterator<size_t> end = begin + labels.Size();
|
||||
|
||||
result = thrust::transform_reduce(
|
||||
thrust::hip::par(alloc), begin, end,
|
||||
[=] XGBOOST_DEVICE(size_t i) {
|
||||
auto idx = linalg::UnravelIndex(i, labels.Shape());
|
||||
auto sample_id = std::get<0>(idx);
|
||||
auto target_id = std::get<1>(idx);
|
||||
auto res = loss(i, sample_id, target_id);
|
||||
float v{std::get<0>(res)}, wt{std::get<1>(res)};
|
||||
return PackedReduceResult{v, wt};
|
||||
},
|
||||
PackedReduceResult{}, thrust::plus<PackedReduceResult>());
|
||||
#else
|
||||
common::AssertGPUSupport();
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
}
|
||||
return result;
|
||||
}
|
||||
@@ -201,10 +185,10 @@ class PseudoErrorLoss : public MetricNoCache {
|
||||
CHECK_EQ(info.labels.Shape(0), info.num_row_);
|
||||
auto labels = info.labels.View(ctx_->Device());
|
||||
preds.SetDevice(ctx_->Device());
|
||||
auto predts = ctx_->IsCPU() ? preds.ConstHostSpan() : preds.ConstDeviceSpan();
|
||||
auto predts = ctx_->IsCUDA() ? preds.ConstDeviceSpan() : preds.ConstHostSpan();
|
||||
info.weights_.SetDevice(ctx_->Device());
|
||||
common::OptionalWeights weights(ctx_->IsCPU() ? info.weights_.ConstHostSpan()
|
||||
: info.weights_.ConstDeviceSpan());
|
||||
common::OptionalWeights weights(ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
|
||||
: info.weights_.ConstHostSpan());
|
||||
float slope = this->param_.huber_slope;
|
||||
CHECK_NE(slope, 0.0) << "slope for pseudo huber cannot be 0.";
|
||||
PackedReduceResult result =
|
||||
@@ -367,10 +351,10 @@ struct EvalEWiseBase : public MetricNoCache {
|
||||
}
|
||||
auto labels = info.labels.View(ctx_->Device());
|
||||
info.weights_.SetDevice(ctx_->Device());
|
||||
common::OptionalWeights weights(ctx_->IsCPU() ? info.weights_.ConstHostSpan()
|
||||
: info.weights_.ConstDeviceSpan());
|
||||
common::OptionalWeights weights(ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
|
||||
: info.weights_.ConstHostSpan());
|
||||
preds.SetDevice(ctx_->Device());
|
||||
auto predts = ctx_->IsCPU() ? preds.ConstHostSpan() : preds.ConstDeviceSpan();
|
||||
auto predts = ctx_->IsCUDA() ? preds.ConstDeviceSpan() : preds.ConstHostSpan();
|
||||
|
||||
auto d_policy = policy_;
|
||||
auto result =
|
||||
|
||||
@@ -149,24 +149,24 @@ class MultiClassMetricsReduction {
|
||||
|
||||
#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
|
||||
PackedReduceResult Reduce(const Context& tparam, int device, size_t n_class,
|
||||
PackedReduceResult Reduce(const Context& ctx, DeviceOrd device, size_t n_class,
|
||||
const HostDeviceVector<bst_float>& weights,
|
||||
const HostDeviceVector<bst_float>& labels,
|
||||
const HostDeviceVector<bst_float>& preds) {
|
||||
PackedReduceResult result;
|
||||
|
||||
if (device < 0) {
|
||||
if (device.IsCPU()) {
|
||||
result =
|
||||
CpuReduceMetrics(weights, labels, preds, n_class, tparam.Threads());
|
||||
CpuReduceMetrics(weights, labels, preds, n_class, ctx.Threads());
|
||||
}
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
else { // NOLINT
|
||||
device_ = tparam.gpu_id;
|
||||
device_ = ctx.Device();
|
||||
preds.SetDevice(device_);
|
||||
labels.SetDevice(device_);
|
||||
weights.SetDevice(device_);
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
dh::safe_cuda(cudaSetDevice(device_.ordinal));
|
||||
result = DeviceReduceMetrics(weights, labels, preds, n_class);
|
||||
}
|
||||
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
@@ -176,8 +176,8 @@ class MultiClassMetricsReduction {
|
||||
private:
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
dh::PinnedMemory label_error_;
|
||||
int device_{-1};
|
||||
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
DeviceOrd device_{DeviceOrd::CPU()};
|
||||
#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
};
|
||||
|
||||
/*!
|
||||
@@ -198,7 +198,7 @@ struct EvalMClassBase : public MetricNoCache {
|
||||
CHECK_GE(nclass, 1U)
|
||||
<< "mlogloss and merror are only used for multi-class classification,"
|
||||
<< " use logloss for binary classification";
|
||||
int device = ctx_->gpu_id;
|
||||
auto device = ctx_->Device();
|
||||
auto result =
|
||||
reducer_.Reduce(*ctx_, device, nclass, info.weights_, *info.labels.Data(), preds);
|
||||
dat[0] = result.Residue();
|
||||
|
||||
@@ -41,7 +41,7 @@ PackedReduceResult PreScore(Context const *ctx, MetaInfo const &info,
|
||||
auto d_gptr = p_cache->DataGroupPtr(ctx);
|
||||
auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
|
||||
|
||||
predt.SetDevice(ctx->gpu_id);
|
||||
predt.SetDevice(ctx->Device());
|
||||
auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan());
|
||||
auto topk = p_cache->Param().TopK();
|
||||
auto d_weight = common::MakeOptionalWeights(ctx, info.weights_);
|
||||
@@ -96,7 +96,7 @@ PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
|
||||
CHECK_EQ(d_weight.weights.size(), p_cache->Groups());
|
||||
}
|
||||
auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
|
||||
predt.SetDevice(ctx->gpu_id);
|
||||
predt.SetDevice(ctx->Device());
|
||||
auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), predt.Size());
|
||||
|
||||
auto d_group_ptr = p_cache->DataGroupPtr(ctx);
|
||||
|
||||
@@ -148,19 +148,18 @@ class ElementWiseSurvivalMetricsReduction {
|
||||
const HostDeviceVector<bst_float>& preds) {
|
||||
PackedReduceResult result;
|
||||
|
||||
if (ctx.gpu_id < 0) {
|
||||
if (ctx.IsCPU()) {
|
||||
result = CpuReduceMetrics(weights, labels_lower_bound, labels_upper_bound,
|
||||
preds, ctx.Threads());
|
||||
}
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
else { // NOLINT
|
||||
preds.SetDevice(ctx.gpu_id);
|
||||
labels_lower_bound.SetDevice(ctx.gpu_id);
|
||||
labels_upper_bound.SetDevice(ctx.gpu_id);
|
||||
weights.SetDevice(ctx.gpu_id);
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(ctx.gpu_id));
|
||||
preds.SetDevice(ctx.Device());
|
||||
labels_lower_bound.SetDevice(ctx.Device());
|
||||
labels_upper_bound.SetDevice(ctx.Device());
|
||||
weights.SetDevice(ctx.Device());
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(ctx.Ordinal()));
|
||||
result = DeviceReduceMetrics(weights, labels_lower_bound, labels_upper_bound, preds);
|
||||
}
|
||||
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
|
||||
@@ -96,13 +96,13 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
|
||||
inline void UpdateTreeLeaf(Context const* ctx, HostDeviceVector<bst_node_t> const& position,
|
||||
std::int32_t group_idx, MetaInfo const& info, float learning_rate,
|
||||
HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree) {
|
||||
if (ctx->IsCPU()) {
|
||||
detail::UpdateTreeLeafHost(ctx, position.ConstHostVector(), group_idx, info, learning_rate,
|
||||
predt, alpha, p_tree);
|
||||
} else {
|
||||
position.SetDevice(ctx->gpu_id);
|
||||
if (ctx->IsCUDA()) {
|
||||
position.SetDevice(ctx->Device());
|
||||
detail::UpdateTreeLeafDevice(ctx, position.ConstDeviceSpan(), group_idx, info, learning_rate,
|
||||
predt, alpha, p_tree);
|
||||
} else {
|
||||
detail::UpdateTreeLeafHost(ctx, position.ConstHostVector(), group_idx, info, learning_rate,
|
||||
predt, alpha, p_tree);
|
||||
}
|
||||
}
|
||||
} // namespace obj
|
||||
|
||||
@@ -42,7 +42,7 @@ class AFTObj : public ObjFunction {
|
||||
|
||||
template <typename Distribution>
|
||||
void GetGradientImpl(const HostDeviceVector<bst_float>& preds, const MetaInfo& info,
|
||||
linalg::Matrix<GradientPair>* out_gpair, size_t ndata, int device,
|
||||
linalg::Matrix<GradientPair>* out_gpair, size_t ndata, DeviceOrd device,
|
||||
bool is_null_weight, float aft_loss_distribution_scale) {
|
||||
common::Transform<>::Init(
|
||||
[=] XGBOOST_DEVICE(size_t _idx,
|
||||
@@ -75,7 +75,7 @@ class AFTObj : public ObjFunction {
|
||||
CHECK_EQ(info.labels_upper_bound_.Size(), ndata);
|
||||
out_gpair->SetDevice(ctx_->Device());
|
||||
out_gpair->Reshape(ndata, 1);
|
||||
const int device = ctx_->gpu_id;
|
||||
const auto device = ctx_->Device();
|
||||
const float aft_loss_distribution_scale = param_.aft_loss_distribution_scale;
|
||||
const bool is_null_weight = info.weights_.Size() == 0;
|
||||
if (!is_null_weight) {
|
||||
@@ -108,7 +108,7 @@ class AFTObj : public ObjFunction {
|
||||
_preds[_idx] = exp(_preds[_idx]);
|
||||
},
|
||||
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
|
||||
io_preds->DeviceIdx())
|
||||
io_preds->Device())
|
||||
.Eval(io_preds);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*!
|
||||
* Copyright 2018-2022 by XGBoost Contributors
|
||||
/**
|
||||
* Copyright 2018-2023, XGBoost Contributors
|
||||
* \file hinge.cc
|
||||
* \brief Provides an implementation of the hinge loss function
|
||||
* \author Henry Gouk
|
||||
@@ -13,8 +13,7 @@
|
||||
#include "../common/transform.h"
|
||||
#include "../common/common.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace obj {
|
||||
namespace xgboost::obj {
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
DMLC_REGISTRY_FILE_TAG(hinge_obj_gpu);
|
||||
@@ -63,7 +62,7 @@ class HingeObj : public ObjFunction {
|
||||
_out_gpair[_idx] = GradientPair(g, h);
|
||||
},
|
||||
common::Range{0, static_cast<int64_t>(ndata)}, this->ctx_->Threads(),
|
||||
ctx_->gpu_id).Eval(
|
||||
ctx_->Device()).Eval(
|
||||
out_gpair->Data(), &preds, info.labels.Data(), &info.weights_);
|
||||
}
|
||||
|
||||
@@ -73,11 +72,11 @@ class HingeObj : public ObjFunction {
|
||||
_preds[_idx] = _preds[_idx] > 0.0 ? 1.0 : 0.0;
|
||||
},
|
||||
common::Range{0, static_cast<int64_t>(io_preds->Size()), 1}, this->ctx_->Threads(),
|
||||
io_preds->DeviceIdx())
|
||||
io_preds->Device())
|
||||
.Eval(io_preds);
|
||||
}
|
||||
|
||||
const char* DefaultEvalMetric() const override {
|
||||
[[nodiscard]] const char* DefaultEvalMetric() const override {
|
||||
return "error";
|
||||
}
|
||||
|
||||
@@ -93,5 +92,4 @@ XGBOOST_REGISTER_OBJECTIVE(HingeObj, "binary:hinge")
|
||||
.describe("Hinge loss. Expects labels to be in [0,1f]")
|
||||
.set_body([]() { return new HingeObj(); });
|
||||
|
||||
} // namespace obj
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::obj
|
||||
|
||||
@@ -20,8 +20,8 @@ void FitIntercept::InitEstimation(MetaInfo const& info, linalg::Vector<float>* b
|
||||
CheckInitInputs(info);
|
||||
}
|
||||
// Avoid altering any state in child objective.
|
||||
HostDeviceVector<float> dummy_predt(info.labels.Size(), 0.0f, this->ctx_->gpu_id);
|
||||
linalg::Matrix<GradientPair> gpair(info.labels.Shape(), this->ctx_->gpu_id);
|
||||
HostDeviceVector<float> dummy_predt(info.labels.Size(), 0.0f, this->ctx_->Device());
|
||||
linalg::Matrix<GradientPair> gpair(info.labels.Shape(), this->ctx_->Device());
|
||||
|
||||
Json config{Object{}};
|
||||
this->SaveConfig(&config);
|
||||
|
||||
@@ -103,19 +103,19 @@ class LambdaRankObj : public FitIntercept {
|
||||
|
||||
// Update position biased for unbiased click data
|
||||
void UpdatePositionBias() {
|
||||
li_full_.SetDevice(ctx_->gpu_id);
|
||||
lj_full_.SetDevice(ctx_->gpu_id);
|
||||
li_.SetDevice(ctx_->gpu_id);
|
||||
lj_.SetDevice(ctx_->gpu_id);
|
||||
li_full_.SetDevice(ctx_->Device());
|
||||
lj_full_.SetDevice(ctx_->Device());
|
||||
li_.SetDevice(ctx_->Device());
|
||||
lj_.SetDevice(ctx_->Device());
|
||||
|
||||
if (ctx_->IsCPU()) {
|
||||
cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()),
|
||||
lj_full_.View(ctx_->Device()), &ti_plus_, &tj_minus_,
|
||||
&li_, &lj_, p_cache_);
|
||||
} else {
|
||||
if (ctx_->IsCUDA()) {
|
||||
cuda_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()),
|
||||
lj_full_.View(ctx_->Device()), &ti_plus_, &tj_minus_,
|
||||
&li_, &lj_, p_cache_);
|
||||
} else {
|
||||
cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()),
|
||||
lj_full_.View(ctx_->Device()), &ti_plus_, &tj_minus_,
|
||||
&li_, &lj_, p_cache_);
|
||||
}
|
||||
|
||||
li_full_.Data()->Fill(0.0);
|
||||
|
||||
@@ -296,12 +296,12 @@ void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector<float> const
|
||||
linalg::VectorView<double> li, linalg::VectorView<double> lj,
|
||||
linalg::Matrix<GradientPair>* out_gpair) {
|
||||
// boilerplate
|
||||
std::int32_t device_id = ctx->gpu_id;
|
||||
dh::safe_cuda(cudaSetDevice(device_id));
|
||||
auto device = ctx->Device();
|
||||
dh::safe_cuda(cudaSetDevice(device.ordinal));
|
||||
auto n_groups = p_cache->Groups();
|
||||
|
||||
info.labels.SetDevice(device_id);
|
||||
preds.SetDevice(device_id);
|
||||
info.labels.SetDevice(device);
|
||||
preds.SetDevice(device);
|
||||
out_gpair->SetDevice(ctx->Device());
|
||||
out_gpair->Reshape(preds.Size(), 1);
|
||||
|
||||
|
||||
@@ -63,7 +63,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
|
||||
const int nclass = param_.num_class;
|
||||
const auto ndata = static_cast<int64_t>(preds.Size() / nclass);
|
||||
|
||||
auto device = ctx_->gpu_id;
|
||||
auto device = ctx_->Device();
|
||||
out_gpair->SetDevice(device);
|
||||
info.labels.SetDevice(device);
|
||||
info.weights_.SetDevice(device);
|
||||
@@ -133,7 +133,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
|
||||
const int nclass = param_.num_class;
|
||||
const auto ndata = static_cast<int64_t>(io_preds->Size() / nclass);
|
||||
|
||||
auto device = io_preds->DeviceIdx();
|
||||
auto device = io_preds->Device();
|
||||
if (prob) {
|
||||
common::Transform<>::Init(
|
||||
[=] XGBOOST_DEVICE(size_t _idx, common::Span<bst_float> _preds) {
|
||||
|
||||
@@ -70,16 +70,16 @@ class QuantileRegression : public ObjFunction {
|
||||
out_gpair->Reshape(info.num_row_, n_targets);
|
||||
auto gpair = out_gpair->View(ctx_->Device());
|
||||
|
||||
info.weights_.SetDevice(ctx_->gpu_id);
|
||||
common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
|
||||
: info.weights_.ConstDeviceSpan()};
|
||||
info.weights_.SetDevice(ctx_->Device());
|
||||
common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
|
||||
: info.weights_.ConstHostSpan()};
|
||||
|
||||
preds.SetDevice(ctx_->gpu_id);
|
||||
preds.SetDevice(ctx_->Device());
|
||||
auto predt = linalg::MakeVec(&preds);
|
||||
auto n_samples = info.num_row_;
|
||||
|
||||
alpha_.SetDevice(ctx_->gpu_id);
|
||||
auto alpha = ctx_->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan();
|
||||
alpha_.SetDevice(ctx_->Device());
|
||||
auto alpha = ctx_->IsCUDA() ? alpha_.ConstDeviceSpan() : alpha_.ConstHostSpan();
|
||||
|
||||
linalg::ElementWiseKernel(
|
||||
ctx_, gpair, [=] XGBOOST_DEVICE(std::size_t i, GradientPair const&) mutable {
|
||||
@@ -103,11 +103,48 @@ class QuantileRegression : public ObjFunction {
|
||||
CHECK(!alpha_.Empty());
|
||||
|
||||
auto n_targets = this->Targets(info);
|
||||
base_score->SetDevice(ctx_->gpu_id);
|
||||
base_score->SetDevice(ctx_->Device());
|
||||
base_score->Reshape(n_targets);
|
||||
|
||||
double sw{0};
|
||||
if (ctx_->IsCPU()) {
|
||||
if (ctx_->IsCUDA()) {
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
alpha_.SetDevice(ctx_->Device());
|
||||
auto d_alpha = alpha_.ConstDeviceSpan();
|
||||
auto d_labels = info.labels.View(ctx_->Device());
|
||||
auto seg_it = dh::MakeTransformIterator<std::size_t>(
|
||||
thrust::make_counting_iterator(0ul),
|
||||
[=] XGBOOST_DEVICE(std::size_t i) { return i * d_labels.Shape(0); });
|
||||
CHECK_EQ(d_labels.Shape(1), 1);
|
||||
auto val_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
|
||||
[=] XGBOOST_DEVICE(std::size_t i) {
|
||||
auto sample_idx = i % d_labels.Shape(0);
|
||||
return d_labels(sample_idx, 0);
|
||||
});
|
||||
auto n = d_labels.Size() * d_alpha.size();
|
||||
CHECK_EQ(base_score->Size(), d_alpha.size());
|
||||
if (info.weights_.Empty()) {
|
||||
common::SegmentedQuantile(ctx_, d_alpha.data(), seg_it, seg_it + d_alpha.size() + 1, val_it,
|
||||
val_it + n, base_score->Data());
|
||||
sw = info.num_row_;
|
||||
} else {
|
||||
info.weights_.SetDevice(ctx_->Device());
|
||||
auto d_weights = info.weights_.ConstDeviceSpan();
|
||||
auto weight_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
|
||||
[=] XGBOOST_DEVICE(std::size_t i) {
|
||||
auto sample_idx = i % d_labels.Shape(0);
|
||||
return d_weights[sample_idx];
|
||||
});
|
||||
common::SegmentedWeightedQuantile(ctx_, d_alpha.data(), seg_it, seg_it + d_alpha.size() + 1,
|
||||
val_it, val_it + n, weight_it, weight_it + n,
|
||||
base_score->Data());
|
||||
sw = dh::Reduce(ctx_->CUDACtx()->CTP(), dh::tcbegin(d_weights), dh::tcend(d_weights), 0.0,
|
||||
thrust::plus<double>{});
|
||||
}
|
||||
#else
|
||||
common::AssertGPUSupport();
|
||||
#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
} else {
|
||||
auto quantiles = base_score->HostView();
|
||||
auto h_weights = info.weights_.ConstHostVector();
|
||||
if (info.weights_.Empty()) {
|
||||
@@ -127,43 +164,6 @@ class QuantileRegression : public ObjFunction {
|
||||
linalg::cend(h_labels), std::cbegin(h_weights));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
alpha_.SetDevice(ctx_->gpu_id);
|
||||
auto d_alpha = alpha_.ConstDeviceSpan();
|
||||
auto d_labels = info.labels.View(ctx_->Device());
|
||||
auto seg_it = dh::MakeTransformIterator<std::size_t>(
|
||||
thrust::make_counting_iterator(0ul),
|
||||
[=] XGBOOST_DEVICE(std::size_t i) { return i * d_labels.Shape(0); });
|
||||
CHECK_EQ(d_labels.Shape(1), 1);
|
||||
auto val_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
|
||||
[=] XGBOOST_DEVICE(std::size_t i) {
|
||||
auto sample_idx = i % d_labels.Shape(0);
|
||||
return d_labels(sample_idx, 0);
|
||||
});
|
||||
auto n = d_labels.Size() * d_alpha.size();
|
||||
CHECK_EQ(base_score->Size(), d_alpha.size());
|
||||
if (info.weights_.Empty()) {
|
||||
common::SegmentedQuantile(ctx_, d_alpha.data(), seg_it, seg_it + d_alpha.size() + 1, val_it,
|
||||
val_it + n, base_score->Data());
|
||||
sw = info.num_row_;
|
||||
} else {
|
||||
info.weights_.SetDevice(ctx_->gpu_id);
|
||||
auto d_weights = info.weights_.ConstDeviceSpan();
|
||||
auto weight_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
|
||||
[=] XGBOOST_DEVICE(std::size_t i) {
|
||||
auto sample_idx = i % d_labels.Shape(0);
|
||||
return d_weights[sample_idx];
|
||||
});
|
||||
common::SegmentedWeightedQuantile(ctx_, d_alpha.data(), seg_it, seg_it + d_alpha.size() + 1,
|
||||
val_it, val_it + n, weight_it, weight_it + n,
|
||||
base_score->Data());
|
||||
sw = dh::Reduce(ctx_->CUDACtx()->CTP(), dh::tcbegin(d_weights), dh::tcend(d_weights), 0.0,
|
||||
thrust::plus<double>{});
|
||||
}
|
||||
#else
|
||||
common::AssertGPUSupport();
|
||||
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
}
|
||||
|
||||
// For multiple quantiles, we should extend the base score to a vector instead of
|
||||
|
||||
@@ -116,7 +116,7 @@ class RegLossObj : public FitIntercept {
|
||||
|
||||
size_t const ndata = preds.Size();
|
||||
out_gpair->SetDevice(ctx_->Device());
|
||||
auto device = ctx_->gpu_id;
|
||||
auto device = ctx_->Device();
|
||||
|
||||
bool is_null_weight = info.weights_.Size() == 0;
|
||||
auto scale_pos_weight = param_.scale_pos_weight;
|
||||
@@ -124,7 +124,7 @@ class RegLossObj : public FitIntercept {
|
||||
additional_input_.HostVector().begin()[1] = is_null_weight;
|
||||
|
||||
const size_t nthreads = ctx_->Threads();
|
||||
bool on_device = device >= 0;
|
||||
bool on_device = device.IsCUDA();
|
||||
// On CPU we run the transformation with each thread processing a contiguous block of data
|
||||
// for better performance.
|
||||
const size_t n_data_blocks = std::max(static_cast<size_t>(1), (on_device ? ndata : nthreads));
|
||||
@@ -175,7 +175,7 @@ class RegLossObj : public FitIntercept {
|
||||
_preds[_idx] = Loss::PredTransform(_preds[_idx]);
|
||||
},
|
||||
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
|
||||
io_preds->DeviceIdx())
|
||||
io_preds->Device())
|
||||
.Eval(io_preds);
|
||||
}
|
||||
|
||||
@@ -246,16 +246,16 @@ class PseudoHuberRegression : public FitIntercept {
|
||||
CHECK_NE(slope, 0.0) << "slope for pseudo huber cannot be 0.";
|
||||
auto labels = info.labels.View(ctx_->Device());
|
||||
|
||||
out_gpair->SetDevice(ctx_->gpu_id);
|
||||
out_gpair->SetDevice(ctx_->Device());
|
||||
out_gpair->Reshape(info.num_row_, this->Targets(info));
|
||||
auto gpair = out_gpair->View(ctx_->Device());
|
||||
|
||||
preds.SetDevice(ctx_->gpu_id);
|
||||
preds.SetDevice(ctx_->Device());
|
||||
auto predt = linalg::MakeVec(&preds);
|
||||
|
||||
info.weights_.SetDevice(ctx_->gpu_id);
|
||||
common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
|
||||
: info.weights_.ConstDeviceSpan()};
|
||||
info.weights_.SetDevice(ctx_->Device());
|
||||
common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
|
||||
: info.weights_.ConstHostSpan()};
|
||||
|
||||
linalg::ElementWiseKernel(ctx_, labels, [=] XGBOOST_DEVICE(size_t i, float const y) mutable {
|
||||
auto sample_id = std::get<0>(linalg::UnravelIndex(i, labels.Shape()));
|
||||
@@ -287,6 +287,13 @@ class PseudoHuberRegression : public FitIntercept {
|
||||
}
|
||||
FromJson(in["pseudo_huber_param"], &param_);
|
||||
}
|
||||
// Builds the JSON configuration for this objective's default evaluation metric.
// The returned object has two entries: "name", holding the string returned by
// DefaultEvalMetric(), and "pseudo_huber_param", holding the JSON form of param_.
[[nodiscard]] Json DefaultMetricConfig() const override {
|
||||
// The parameter struct must already be initialised before it is serialised.
CHECK(param_.GetInitialised());
|
||||
Json config{Object{}};
|
||||
config["name"] = String{this->DefaultEvalMetric()};
|
||||
// Forward the objective's own parameters alongside the metric name.
config["pseudo_huber_param"] = ToJson(param_);
|
||||
return config;
|
||||
}
|
||||
};
|
||||
|
||||
XGBOOST_REGISTER_OBJECTIVE(PseudoHuberRegression, "reg:pseudohubererror")
|
||||
@@ -320,7 +327,7 @@ class PoissonRegression : public FitIntercept {
|
||||
size_t const ndata = preds.Size();
|
||||
out_gpair->SetDevice(ctx_->Device());
|
||||
out_gpair->Reshape(info.num_row_, this->Targets(info));
|
||||
auto device = ctx_->gpu_id;
|
||||
auto device = ctx_->Device();
|
||||
label_correct_.Resize(1);
|
||||
label_correct_.Fill(1);
|
||||
|
||||
@@ -362,7 +369,7 @@ class PoissonRegression : public FitIntercept {
|
||||
_preds[_idx] = expf(_preds[_idx]);
|
||||
},
|
||||
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
|
||||
io_preds->DeviceIdx())
|
||||
io_preds->Device())
|
||||
.Eval(io_preds);
|
||||
}
|
||||
void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
|
||||
@@ -505,7 +512,7 @@ class GammaRegression : public FitIntercept {
|
||||
CHECK_NE(info.labels.Size(), 0U) << "label set cannot be empty";
|
||||
CHECK_EQ(preds.Size(), info.labels.Size()) << "labels are not correctly provided";
|
||||
const size_t ndata = preds.Size();
|
||||
auto device = ctx_->gpu_id;
|
||||
auto device = ctx_->Device();
|
||||
out_gpair->SetDevice(ctx_->Device());
|
||||
out_gpair->Reshape(info.num_row_, this->Targets(info));
|
||||
label_correct_.Resize(1);
|
||||
@@ -548,7 +555,7 @@ class GammaRegression : public FitIntercept {
|
||||
_preds[_idx] = expf(_preds[_idx]);
|
||||
},
|
||||
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
|
||||
io_preds->DeviceIdx())
|
||||
io_preds->Device())
|
||||
.Eval(io_preds);
|
||||
}
|
||||
void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
|
||||
@@ -606,7 +613,7 @@ class TweedieRegression : public FitIntercept {
|
||||
out_gpair->SetDevice(ctx_->Device());
|
||||
out_gpair->Reshape(info.num_row_, this->Targets(info));
|
||||
|
||||
auto device = ctx_->gpu_id;
|
||||
auto device = ctx_->Device();
|
||||
label_correct_.Resize(1);
|
||||
label_correct_.Fill(1);
|
||||
|
||||
@@ -653,7 +660,7 @@ class TweedieRegression : public FitIntercept {
|
||||
_preds[_idx] = expf(_preds[_idx]);
|
||||
},
|
||||
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
|
||||
io_preds->DeviceIdx())
|
||||
io_preds->Device())
|
||||
.Eval(io_preds);
|
||||
}
|
||||
|
||||
@@ -704,11 +711,11 @@ class MeanAbsoluteError : public ObjFunction {
|
||||
out_gpair->Reshape(info.num_row_, this->Targets(info));
|
||||
auto gpair = out_gpair->View(ctx_->Device());
|
||||
|
||||
preds.SetDevice(ctx_->gpu_id);
|
||||
preds.SetDevice(ctx_->Device());
|
||||
auto predt = linalg::MakeVec(&preds);
|
||||
info.weights_.SetDevice(ctx_->gpu_id);
|
||||
common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
|
||||
: info.weights_.ConstDeviceSpan()};
|
||||
info.weights_.SetDevice(ctx_->Device());
|
||||
common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
|
||||
: info.weights_.ConstHostSpan()};
|
||||
|
||||
linalg::ElementWiseKernel(ctx_, labels, [=] XGBOOST_DEVICE(std::size_t i, float y) mutable {
|
||||
auto sign = [](auto x) {
|
||||
|
||||
@@ -180,33 +180,30 @@ struct DeviceAdapterLoader {
|
||||
|
||||
XGBOOST_DEV_INLINE DeviceAdapterLoader(Batch const batch, bool use_shared,
|
||||
bst_feature_t num_features, bst_row_t num_rows,
|
||||
size_t entry_start, float missing) :
|
||||
batch{batch},
|
||||
columns{num_features},
|
||||
use_shared{use_shared},
|
||||
is_valid{missing} {
|
||||
extern __shared__ float _smem[];
|
||||
smem = _smem;
|
||||
if (use_shared) {
|
||||
uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
size_t shared_elements = blockDim.x * num_features;
|
||||
dh::BlockFill(smem, shared_elements, nanf(""));
|
||||
__syncthreads();
|
||||
if (global_idx < num_rows) {
|
||||
auto beg = global_idx * columns;
|
||||
auto end = (global_idx + 1) * columns;
|
||||
for (size_t i = beg; i < end; ++i) {
|
||||
auto value = batch.GetElement(i).value;
|
||||
if (is_valid(value)) {
|
||||
smem[threadIdx.x * num_features + (i - beg)] = value;
|
||||
}
|
||||
size_t entry_start, float missing)
|
||||
: batch{batch}, columns{num_features}, use_shared{use_shared}, is_valid{missing} {
|
||||
extern __shared__ float _smem[];
|
||||
smem = _smem;
|
||||
if (use_shared) {
|
||||
uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
size_t shared_elements = blockDim.x * num_features;
|
||||
dh::BlockFill(smem, shared_elements, nanf(""));
|
||||
__syncthreads();
|
||||
if (global_idx < num_rows) {
|
||||
auto beg = global_idx * columns;
|
||||
auto end = (global_idx + 1) * columns;
|
||||
for (size_t i = beg; i < end; ++i) {
|
||||
auto value = batch.GetElement(i).value;
|
||||
if (is_valid(value)) {
|
||||
smem[threadIdx.x * num_features + (i - beg)] = value;
|
||||
}
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const {
|
||||
[[nodiscard]] XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const {
|
||||
if (use_shared) {
|
||||
return smem[threadIdx.x * columns + fidx];
|
||||
}
|
||||
@@ -340,11 +337,11 @@ class DeviceModel {
|
||||
size_t tree_end_; // NOLINT
|
||||
int num_group;
|
||||
|
||||
void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) {
|
||||
dh::safe_cuda(cudaSetDevice(gpu_id));
|
||||
void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, DeviceOrd device) {
|
||||
dh::safe_cuda(cudaSetDevice(device.ordinal));
|
||||
|
||||
// Copy decision trees to device
|
||||
tree_segments = HostDeviceVector<size_t>({}, gpu_id);
|
||||
tree_segments = HostDeviceVector<size_t>({}, device);
|
||||
auto& h_tree_segments = tree_segments.HostVector();
|
||||
h_tree_segments.reserve((tree_end - tree_begin) + 1);
|
||||
size_t sum = 0;
|
||||
@@ -354,8 +351,8 @@ class DeviceModel {
|
||||
h_tree_segments.push_back(sum);
|
||||
}
|
||||
|
||||
nodes = HostDeviceVector<RegTree::Node>(h_tree_segments.back(), RegTree::Node(), gpu_id);
|
||||
stats = HostDeviceVector<RTreeNodeStat>(h_tree_segments.back(), RTreeNodeStat(), gpu_id);
|
||||
nodes = HostDeviceVector<RegTree::Node>(h_tree_segments.back(), RegTree::Node(), device);
|
||||
stats = HostDeviceVector<RTreeNodeStat>(h_tree_segments.back(), RTreeNodeStat(), device);
|
||||
auto d_nodes = nodes.DevicePointer();
|
||||
auto d_stats = stats.DevicePointer();
|
||||
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
|
||||
@@ -370,12 +367,12 @@ class DeviceModel {
|
||||
sizeof(RTreeNodeStat) * src_stats.size(), cudaMemcpyDefault));
|
||||
}
|
||||
|
||||
tree_group = HostDeviceVector<int>(model.tree_info.size(), 0, gpu_id);
|
||||
tree_group = HostDeviceVector<int>(model.tree_info.size(), 0, device);
|
||||
auto& h_tree_group = tree_group.HostVector();
|
||||
std::memcpy(h_tree_group.data(), model.tree_info.data(), sizeof(int) * model.tree_info.size());
|
||||
|
||||
// Initialize categorical splits.
|
||||
split_types.SetDevice(gpu_id);
|
||||
split_types.SetDevice(device);
|
||||
std::vector<FeatureType>& h_split_types = split_types.HostVector();
|
||||
h_split_types.resize(h_tree_segments.back());
|
||||
for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
|
||||
@@ -384,8 +381,8 @@ class DeviceModel {
|
||||
h_split_types.begin() + h_tree_segments[tree_idx - tree_begin]);
|
||||
}
|
||||
|
||||
categories = HostDeviceVector<uint32_t>({}, gpu_id);
|
||||
categories_tree_segments = HostDeviceVector<uint32_t>(1, 0, gpu_id);
|
||||
categories = HostDeviceVector<uint32_t>({}, device);
|
||||
categories_tree_segments = HostDeviceVector<uint32_t>(1, 0, device);
|
||||
std::vector<uint32_t> &h_categories = categories.HostVector();
|
||||
std::vector<uint32_t> &h_split_cat_segments = categories_tree_segments.HostVector();
|
||||
for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
|
||||
@@ -398,7 +395,7 @@ class DeviceModel {
|
||||
}
|
||||
|
||||
categories_node_segments = HostDeviceVector<RegTree::CategoricalSplitMatrix::Segment>(
|
||||
h_tree_segments.back(), {}, gpu_id);
|
||||
h_tree_segments.back(), {}, device);
|
||||
std::vector<RegTree::CategoricalSplitMatrix::Segment>& h_categories_node_segments =
|
||||
categories_node_segments.HostVector();
|
||||
for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
|
||||
@@ -490,8 +487,8 @@ struct PathInfo {
|
||||
void ExtractPaths(
|
||||
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>> *paths,
|
||||
DeviceModel *model, dh::device_vector<uint32_t> *path_categories,
|
||||
int gpu_id) {
|
||||
dh::safe_cuda(cudaSetDevice(gpu_id));
|
||||
DeviceOrd device) {
|
||||
dh::safe_cuda(cudaSetDevice(device.ordinal));
|
||||
auto& device_model = *model;
|
||||
|
||||
dh::caching_device_vector<PathInfo> info(device_model.nodes.Size());
|
||||
@@ -654,11 +651,12 @@ __global__ void MaskBitVectorKernel(
|
||||
common::Span<std::uint32_t const> d_categories, BitVector decision_bits, BitVector missing_bits,
|
||||
std::size_t tree_begin, std::size_t tree_end, std::size_t num_features, std::size_t num_rows,
|
||||
std::size_t entry_start, std::size_t num_nodes, bool use_shared, float missing) {
|
||||
// This needs to be always instantiated since the data is loaded cooperatively by all threads.
|
||||
SparsePageLoader loader(data, use_shared, num_features, num_rows, entry_start, missing);
|
||||
auto const row_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (row_idx >= num_rows) {
|
||||
return;
|
||||
}
|
||||
SparsePageLoader loader(data, use_shared, num_features, num_rows, entry_start, missing);
|
||||
|
||||
std::size_t tree_offset = 0;
|
||||
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
|
||||
@@ -689,10 +687,10 @@ __global__ void MaskBitVectorKernel(
|
||||
}
|
||||
}
|
||||
|
||||
__device__ float GetLeafWeightByBitVector(bst_row_t ridx, TreeView const& tree,
|
||||
BitVector const& decision_bits,
|
||||
BitVector const& missing_bits, std::size_t num_nodes,
|
||||
std::size_t tree_offset) {
|
||||
__device__ bst_node_t GetLeafIndexByBitVector(bst_row_t ridx, TreeView const& tree,
|
||||
BitVector const& decision_bits,
|
||||
BitVector const& missing_bits, std::size_t num_nodes,
|
||||
std::size_t tree_offset) {
|
||||
bst_node_t nidx = 0;
|
||||
RegTree::Node n = tree.d_tree[nidx];
|
||||
while (!n.IsLeaf()) {
|
||||
@@ -704,9 +702,19 @@ __device__ float GetLeafWeightByBitVector(bst_row_t ridx, TreeView const& tree,
|
||||
}
|
||||
n = tree.d_tree[nidx];
|
||||
}
|
||||
return nidx;
|
||||
}
|
||||
|
||||
__device__ float GetLeafWeightByBitVector(bst_row_t ridx, TreeView const& tree,
|
||||
BitVector const& decision_bits,
|
||||
BitVector const& missing_bits, std::size_t num_nodes,
|
||||
std::size_t tree_offset) {
|
||||
auto const nidx =
|
||||
GetLeafIndexByBitVector(ridx, tree, decision_bits, missing_bits, num_nodes, tree_offset);
|
||||
return tree.d_tree[nidx].LeafValue();
|
||||
}
|
||||
|
||||
template <bool predict_leaf>
|
||||
__global__ void PredictByBitVectorKernel(
|
||||
common::Span<RegTree::Node const> d_nodes, common::Span<float> d_out_predictions,
|
||||
common::Span<std::size_t const> d_tree_segments, common::Span<int const> d_tree_group,
|
||||
@@ -722,27 +730,39 @@ __global__ void PredictByBitVectorKernel(
|
||||
}
|
||||
|
||||
std::size_t tree_offset = 0;
|
||||
if (num_group == 1) {
|
||||
float sum = 0;
|
||||
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
|
||||
if constexpr (predict_leaf) {
|
||||
for (size_t tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
|
||||
TreeView d_tree{tree_begin, tree_idx, d_nodes,
|
||||
d_tree_segments, d_tree_split_types, d_cat_tree_segments,
|
||||
d_cat_node_segments, d_categories};
|
||||
sum += GetLeafWeightByBitVector(row_idx, d_tree, decision_bits, missing_bits, num_nodes,
|
||||
tree_offset);
|
||||
auto const leaf = GetLeafIndexByBitVector(row_idx, d_tree, decision_bits, missing_bits,
|
||||
num_nodes, tree_offset);
|
||||
d_out_predictions[row_idx * (tree_end - tree_begin) + tree_idx] = static_cast<float>(leaf);
|
||||
tree_offset += d_tree.d_tree.size();
|
||||
}
|
||||
d_out_predictions[row_idx] += sum;
|
||||
} else {
|
||||
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
|
||||
auto const tree_group = d_tree_group[tree_idx];
|
||||
TreeView d_tree{tree_begin, tree_idx, d_nodes,
|
||||
d_tree_segments, d_tree_split_types, d_cat_tree_segments,
|
||||
d_cat_node_segments, d_categories};
|
||||
bst_uint out_prediction_idx = row_idx * num_group + tree_group;
|
||||
d_out_predictions[out_prediction_idx] += GetLeafWeightByBitVector(
|
||||
row_idx, d_tree, decision_bits, missing_bits, num_nodes, tree_offset);
|
||||
tree_offset += d_tree.d_tree.size();
|
||||
if (num_group == 1) {
|
||||
float sum = 0;
|
||||
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
|
||||
TreeView d_tree{tree_begin, tree_idx, d_nodes,
|
||||
d_tree_segments, d_tree_split_types, d_cat_tree_segments,
|
||||
d_cat_node_segments, d_categories};
|
||||
sum += GetLeafWeightByBitVector(row_idx, d_tree, decision_bits, missing_bits, num_nodes,
|
||||
tree_offset);
|
||||
tree_offset += d_tree.d_tree.size();
|
||||
}
|
||||
d_out_predictions[row_idx] += sum;
|
||||
} else {
|
||||
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
|
||||
auto const tree_group = d_tree_group[tree_idx];
|
||||
TreeView d_tree{tree_begin, tree_idx, d_nodes,
|
||||
d_tree_segments, d_tree_split_types, d_cat_tree_segments,
|
||||
d_cat_node_segments, d_categories};
|
||||
bst_uint out_prediction_idx = row_idx * num_group + tree_group;
|
||||
d_out_predictions[out_prediction_idx] += GetLeafWeightByBitVector(
|
||||
row_idx, d_tree, decision_bits, missing_bits, num_nodes, tree_offset);
|
||||
tree_offset += d_tree.d_tree.size();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -754,21 +774,29 @@ class ColumnSplitHelper {
|
||||
void PredictBatch(DMatrix* dmat, HostDeviceVector<float>* out_preds,
|
||||
gbm::GBTreeModel const& model, DeviceModel const& d_model) const {
|
||||
CHECK(dmat->PageExists<SparsePage>()) << "Column split for external memory is not support.";
|
||||
PredictDMatrix(dmat, out_preds, d_model, model.learner_model_param->num_feature,
|
||||
model.learner_model_param->num_output_group);
|
||||
PredictDMatrix<false>(dmat, out_preds, d_model, model.learner_model_param->num_feature,
|
||||
model.learner_model_param->num_output_group);
|
||||
}
|
||||
|
||||
void PredictLeaf(DMatrix* dmat, HostDeviceVector<float>* out_preds, gbm::GBTreeModel const& model,
|
||||
DeviceModel const& d_model) const {
|
||||
CHECK(dmat->PageExists<SparsePage>()) << "Column split for external memory is not support.";
|
||||
PredictDMatrix<true>(dmat, out_preds, d_model, model.learner_model_param->num_feature,
|
||||
model.learner_model_param->num_output_group);
|
||||
}
|
||||
|
||||
private:
|
||||
using BitType = BitVector::value_type;
|
||||
|
||||
template <bool predict_leaf>
|
||||
void PredictDMatrix(DMatrix* dmat, HostDeviceVector<float>* out_preds, DeviceModel const& model,
|
||||
bst_feature_t num_features, std::uint32_t num_group) const {
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
|
||||
dh::caching_device_vector<BitType> decision_storage{};
|
||||
dh::caching_device_vector<BitType> missing_storage{};
|
||||
|
||||
auto constexpr kBlockThreads = 128;
|
||||
auto const max_shared_memory_bytes = dh::MaxSharedMemory(ctx_->gpu_id);
|
||||
auto const max_shared_memory_bytes = dh::MaxSharedMemory(ctx_->Ordinal());
|
||||
auto const shared_memory_bytes =
|
||||
SharedMemoryBytes<kBlockThreads>(num_features, max_shared_memory_bytes);
|
||||
auto const use_shared = shared_memory_bytes != 0;
|
||||
@@ -781,8 +809,8 @@ class ColumnSplitHelper {
|
||||
BitVector decision_bits{dh::ToSpan(decision_storage)};
|
||||
BitVector missing_bits{dh::ToSpan(missing_storage)};
|
||||
|
||||
batch.offset.SetDevice(ctx_->gpu_id);
|
||||
batch.data.SetDevice(ctx_->gpu_id);
|
||||
batch.offset.SetDevice(ctx_->Device());
|
||||
batch.data.SetDevice(ctx_->Device());
|
||||
std::size_t entry_start = 0;
|
||||
SparsePageView data(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), num_features);
|
||||
|
||||
@@ -798,7 +826,7 @@ class ColumnSplitHelper {
|
||||
AllReduceBitVectors(&decision_storage, &missing_storage);
|
||||
|
||||
dh::LaunchKernel {grid, kBlockThreads, 0, ctx_->CUDACtx()->Stream()} (
|
||||
PredictByBitVectorKernel, model.nodes.ConstDeviceSpan(),
|
||||
PredictByBitVectorKernel<predict_leaf>, model.nodes.ConstDeviceSpan(),
|
||||
out_preds->DeviceSpan().subspan(batch_offset), model.tree_segments.ConstDeviceSpan(),
|
||||
model.tree_group.ConstDeviceSpan(), model.split_types.ConstDeviceSpan(),
|
||||
model.categories_tree_segments.ConstDeviceSpan(),
|
||||
@@ -813,15 +841,14 @@ class ColumnSplitHelper {
|
||||
void AllReduceBitVectors(dh::caching_device_vector<BitType>* decision_storage,
|
||||
dh::caching_device_vector<BitType>* missing_storage) const {
|
||||
collective::AllReduce<collective::Operation::kBitwiseOR>(
|
||||
ctx_->gpu_id, decision_storage->data().get(), decision_storage->size());
|
||||
ctx_->Ordinal(), decision_storage->data().get(), decision_storage->size());
|
||||
collective::AllReduce<collective::Operation::kBitwiseAND>(
|
||||
ctx_->gpu_id, missing_storage->data().get(), missing_storage->size());
|
||||
collective::Synchronize(ctx_->gpu_id);
|
||||
ctx_->Ordinal(), missing_storage->data().get(), missing_storage->size());
|
||||
}
|
||||
|
||||
void ResizeBitVectors(dh::caching_device_vector<BitType>* decision_storage,
|
||||
dh::caching_device_vector<BitType>* missing_storage,
|
||||
std::size_t total_bits) const {
|
||||
dh::caching_device_vector<BitType>* missing_storage,
|
||||
std::size_t total_bits) const {
|
||||
auto const size = BitVector::ComputeStorageSize(total_bits);
|
||||
if (decision_storage->size() < size) {
|
||||
decision_storage->resize(size);
|
||||
@@ -844,12 +871,12 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
size_t num_features,
|
||||
HostDeviceVector<bst_float>* predictions,
|
||||
size_t batch_offset, bool is_dense) const {
|
||||
batch.offset.SetDevice(ctx_->gpu_id);
|
||||
batch.data.SetDevice(ctx_->gpu_id);
|
||||
batch.offset.SetDevice(ctx_->Device());
|
||||
batch.data.SetDevice(ctx_->Device());
|
||||
const uint32_t BLOCK_THREADS = 128;
|
||||
size_t num_rows = batch.Size();
|
||||
auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(num_rows, BLOCK_THREADS));
|
||||
auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id);
|
||||
auto max_shared_memory_bytes = ConfigureDevice(ctx_->Device());
|
||||
size_t shared_memory_bytes =
|
||||
SharedMemoryBytes<BLOCK_THREADS>(num_features, max_shared_memory_bytes);
|
||||
bool use_shared = shared_memory_bytes != 0;
|
||||
@@ -905,12 +932,12 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
if (tree_end - tree_begin == 0) {
|
||||
return;
|
||||
}
|
||||
out_preds->SetDevice(ctx_->gpu_id);
|
||||
out_preds->SetDevice(ctx_->Device());
|
||||
auto const& info = dmat->Info();
|
||||
DeviceModel d_model;
|
||||
d_model.Init(model, tree_begin, tree_end, ctx_->gpu_id);
|
||||
d_model.Init(model, tree_begin, tree_end, ctx_->Device());
|
||||
|
||||
if (dmat->Info().IsColumnSplit()) {
|
||||
if (info.IsColumnSplit()) {
|
||||
column_split_helper_.PredictBatch(dmat, out_preds, model, d_model);
|
||||
return;
|
||||
}
|
||||
@@ -925,10 +952,10 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
} else {
|
||||
size_t batch_offset = 0;
|
||||
for (auto const& page : dmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
|
||||
dmat->Info().feature_types.SetDevice(ctx_->gpu_id);
|
||||
dmat->Info().feature_types.SetDevice(ctx_->Device());
|
||||
auto feature_types = dmat->Info().feature_types.ConstDeviceSpan();
|
||||
this->PredictInternal(
|
||||
page.Impl()->GetDeviceAccessor(ctx_->gpu_id, feature_types),
|
||||
page.Impl()->GetDeviceAccessor(ctx_->Device(), feature_types),
|
||||
d_model,
|
||||
out_preds,
|
||||
batch_offset);
|
||||
@@ -942,16 +969,15 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
: Predictor::Predictor{ctx}, column_split_helper_{ctx} {}
|
||||
|
||||
~GPUPredictor() override {
|
||||
if (ctx_->gpu_id >= 0 && ctx_->gpu_id < common::AllVisibleGPUs()) {
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
|
||||
if (ctx_->IsCUDA() && ctx_->Ordinal() < common::AllVisibleGPUs()) {
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
|
||||
}
|
||||
}
|
||||
|
||||
void PredictBatch(DMatrix* dmat, PredictionCacheEntry* predts,
|
||||
const gbm::GBTreeModel& model, uint32_t tree_begin,
|
||||
uint32_t tree_end = 0) const override {
|
||||
int device = ctx_->gpu_id;
|
||||
CHECK_GE(device, 0) << "Set `gpu_id' to positive value for processing GPU data.";
|
||||
CHECK(ctx_->Device().IsCUDA()) << "Set `device' to `cuda` for processing GPU data.";
|
||||
auto* out_preds = &predts->predictions;
|
||||
if (tree_end == 0) {
|
||||
tree_end = model.trees.size();
|
||||
@@ -969,9 +995,9 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
auto m = std::any_cast<std::shared_ptr<Adapter>>(x);
|
||||
CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature)
|
||||
<< "Number of columns in data must equal to trained model.";
|
||||
CHECK_EQ(dh::CurrentDevice(), m->DeviceIdx())
|
||||
<< "XGBoost is running on device: " << this->ctx_->gpu_id << ", "
|
||||
<< "but data is on: " << m->DeviceIdx();
|
||||
CHECK_EQ(dh::CurrentDevice(), m->Device().ordinal)
|
||||
<< "XGBoost is running on device: " << this->ctx_->Device().Name() << ", "
|
||||
<< "but data is on: " << m->Device().Name();
|
||||
if (p_m) {
|
||||
p_m->Info().num_row_ = m->NumRows();
|
||||
this->InitOutPredictions(p_m->Info(), &(out_preds->predictions), model);
|
||||
@@ -980,16 +1006,16 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
info.num_row_ = m->NumRows();
|
||||
this->InitOutPredictions(info, &(out_preds->predictions), model);
|
||||
}
|
||||
out_preds->predictions.SetDevice(m->DeviceIdx());
|
||||
out_preds->predictions.SetDevice(m->Device());
|
||||
|
||||
const uint32_t BLOCK_THREADS = 128;
|
||||
auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(m->NumRows(), BLOCK_THREADS));
|
||||
|
||||
auto max_shared_memory_bytes = dh::MaxSharedMemory(m->DeviceIdx());
|
||||
auto max_shared_memory_bytes = dh::MaxSharedMemory(m->Device().ordinal);
|
||||
size_t shared_memory_bytes =
|
||||
SharedMemoryBytes<BLOCK_THREADS>(m->NumColumns(), max_shared_memory_bytes);
|
||||
DeviceModel d_model;
|
||||
d_model.Init(model, tree_begin, tree_end, m->DeviceIdx());
|
||||
d_model.Init(model, tree_begin, tree_end, m->Device());
|
||||
|
||||
bool use_shared = shared_memory_bytes != 0;
|
||||
size_t entry_start = 0;
|
||||
@@ -1039,10 +1065,10 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
if (tree_weights != nullptr) {
|
||||
LOG(FATAL) << "Dart booster feature " << not_implemented;
|
||||
}
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
|
||||
|
||||
out_contribs->SetDevice(ctx_->gpu_id);
|
||||
CHECK(!p_fmat->Info().IsColumnSplit())
|
||||
<< "Predict contribution support for column-wise data split is not yet implemented.";
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
|
||||
out_contribs->SetDevice(ctx_->Device());
|
||||
if (tree_end == 0 || tree_end > model.trees.size()) {
|
||||
tree_end = static_cast<uint32_t>(model.trees.size());
|
||||
}
|
||||
@@ -1060,12 +1086,12 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>>
|
||||
device_paths;
|
||||
DeviceModel d_model;
|
||||
d_model.Init(model, 0, tree_end, ctx_->gpu_id);
|
||||
d_model.Init(model, 0, tree_end, ctx_->Device());
|
||||
dh::device_vector<uint32_t> categories;
|
||||
ExtractPaths(&device_paths, &d_model, &categories, ctx_->gpu_id);
|
||||
ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
|
||||
for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
|
||||
batch.data.SetDevice(ctx_->gpu_id);
|
||||
batch.offset.SetDevice(ctx_->gpu_id);
|
||||
batch.data.SetDevice(ctx_->Device());
|
||||
batch.offset.SetDevice(ctx_->Device());
|
||||
SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
|
||||
model.learner_model_param->num_feature);
|
||||
auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns;
|
||||
@@ -1074,7 +1100,7 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
dh::tend(phis));
|
||||
}
|
||||
// Add the base margin term to last column
|
||||
p_fmat->Info().base_margin_.SetDevice(ctx_->gpu_id);
|
||||
p_fmat->Info().base_margin_.SetDevice(ctx_->Device());
|
||||
const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan();
|
||||
|
||||
auto base_score = model.learner_model_param->BaseScore(ctx_);
|
||||
@@ -1099,10 +1125,8 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
if (tree_weights != nullptr) {
|
||||
LOG(FATAL) << "Dart booster feature " << not_implemented;
|
||||
}
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
|
||||
|
||||
out_contribs->SetDevice(ctx_->gpu_id);
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
|
||||
out_contribs->SetDevice(ctx_->Device());
|
||||
if (tree_end == 0 || tree_end > model.trees.size()) {
|
||||
tree_end = static_cast<uint32_t>(model.trees.size());
|
||||
}
|
||||
@@ -1121,12 +1145,12 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>>
|
||||
device_paths;
|
||||
DeviceModel d_model;
|
||||
d_model.Init(model, 0, tree_end, ctx_->gpu_id);
|
||||
d_model.Init(model, 0, tree_end, ctx_->Device());
|
||||
dh::device_vector<uint32_t> categories;
|
||||
ExtractPaths(&device_paths, &d_model, &categories, ctx_->gpu_id);
|
||||
ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
|
||||
for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
|
||||
batch.data.SetDevice(ctx_->gpu_id);
|
||||
batch.offset.SetDevice(ctx_->gpu_id);
|
||||
batch.data.SetDevice(ctx_->Device());
|
||||
batch.offset.SetDevice(ctx_->Device());
|
||||
SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
|
||||
model.learner_model_param->num_feature);
|
||||
auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns;
|
||||
@@ -1135,7 +1159,7 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
dh::tend(phis));
|
||||
}
|
||||
// Add the base margin term to last column
|
||||
p_fmat->Info().base_margin_.SetDevice(ctx_->gpu_id);
|
||||
p_fmat->Info().base_margin_.SetDevice(ctx_->Device());
|
||||
const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan();
|
||||
|
||||
auto base_score = model.learner_model_param->BaseScore(ctx_);
|
||||
@@ -1160,30 +1184,35 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
void PredictLeaf(DMatrix *p_fmat, HostDeviceVector<bst_float> *predictions,
|
||||
const gbm::GBTreeModel &model,
|
||||
unsigned tree_end) const override {
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
|
||||
auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id);
|
||||
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
|
||||
auto max_shared_memory_bytes = ConfigureDevice(ctx_->Device());
|
||||
|
||||
const MetaInfo& info = p_fmat->Info();
|
||||
bst_row_t num_rows = info.num_row_;
|
||||
if (tree_end == 0 || tree_end > model.trees.size()) {
|
||||
tree_end = static_cast<uint32_t>(model.trees.size());
|
||||
}
|
||||
predictions->SetDevice(ctx_->Device());
|
||||
predictions->Resize(num_rows * tree_end);
|
||||
DeviceModel d_model;
|
||||
d_model.Init(model, 0, tree_end, this->ctx_->Device());
|
||||
|
||||
if (info.IsColumnSplit()) {
|
||||
column_split_helper_.PredictLeaf(p_fmat, predictions, model, d_model);
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr uint32_t kBlockThreads = 128;
|
||||
size_t shared_memory_bytes = SharedMemoryBytes<kBlockThreads>(
|
||||
info.num_col_, max_shared_memory_bytes);
|
||||
bool use_shared = shared_memory_bytes != 0;
|
||||
bst_feature_t num_features = info.num_col_;
|
||||
bst_row_t num_rows = info.num_row_;
|
||||
size_t entry_start = 0;
|
||||
|
||||
if (tree_end == 0 || tree_end > model.trees.size()) {
|
||||
tree_end = static_cast<uint32_t>(model.trees.size());
|
||||
}
|
||||
predictions->SetDevice(ctx_->gpu_id);
|
||||
predictions->Resize(num_rows * tree_end);
|
||||
DeviceModel d_model;
|
||||
d_model.Init(model, 0, tree_end, this->ctx_->gpu_id);
|
||||
|
||||
if (p_fmat->PageExists<SparsePage>()) {
|
||||
for (auto const& batch : p_fmat->GetBatches<SparsePage>()) {
|
||||
batch.data.SetDevice(ctx_->gpu_id);
|
||||
batch.offset.SetDevice(ctx_->gpu_id);
|
||||
batch.data.SetDevice(ctx_->Device());
|
||||
batch.offset.SetDevice(ctx_->Device());
|
||||
bst_row_t batch_offset = 0;
|
||||
SparsePageView data{batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
|
||||
model.learner_model_param->num_feature};
|
||||
@@ -1208,7 +1237,7 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
} else {
|
||||
for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
|
||||
bst_row_t batch_offset = 0;
|
||||
EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->gpu_id)};
|
||||
EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->Device())};
|
||||
size_t num_rows = batch.Size();
|
||||
auto grid =
|
||||
static_cast<uint32_t>(common::DivRoundUp(num_rows, kBlockThreads));
|
||||
@@ -1236,9 +1265,9 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
|
||||
private:
|
||||
/*! \brief Reconfigure the device when GPU is changed. */
|
||||
static size_t ConfigureDevice(int device) {
|
||||
if (device >= 0) {
|
||||
return dh::MaxSharedMemory(device);
|
||||
static size_t ConfigureDevice(DeviceOrd device) {
|
||||
if (device.IsCUDA()) {
|
||||
return dh::MaxSharedMemory(device.ordinal);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -49,8 +49,8 @@ void Predictor::InitOutPredictions(const MetaInfo& info, HostDeviceVector<bst_fl
|
||||
std::size_t n{model.learner_model_param->OutputLength() * info.num_row_};
|
||||
|
||||
const HostDeviceVector<bst_float>* base_margin = info.base_margin_.Data();
|
||||
if (ctx_->gpu_id >= 0) {
|
||||
out_preds->SetDevice(ctx_->gpu_id);
|
||||
if (ctx_->Device().IsCUDA()) {
|
||||
out_preds->SetDevice(ctx_->Device());
|
||||
}
|
||||
if (!base_margin->Empty()) {
|
||||
out_preds->Resize(n);
|
||||
|
||||
@@ -19,8 +19,7 @@
|
||||
#include "xgboost/linalg.h" // TensorView, Tensor, Constant
|
||||
#include "xgboost/logging.h" // CHECK_EQ
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
namespace xgboost::tree {
|
||||
namespace cpu_impl {
|
||||
void FitStump(Context const* ctx, MetaInfo const& info,
|
||||
linalg::TensorView<GradientPair const, 2> gpair,
|
||||
@@ -68,13 +67,12 @@ inline void FitStump(Context const*, MetaInfo const&, linalg::TensorView<Gradien
|
||||
|
||||
void FitStump(Context const* ctx, MetaInfo const& info, linalg::Matrix<GradientPair> const& gpair,
|
||||
bst_target_t n_targets, linalg::Vector<float>* out) {
|
||||
out->SetDevice(ctx->gpu_id);
|
||||
out->SetDevice(ctx->Device());
|
||||
out->Reshape(n_targets);
|
||||
|
||||
gpair.SetDevice(ctx->Device());
|
||||
auto gpair_t = gpair.View(ctx->Device());
|
||||
ctx->IsCPU() ? cpu_impl::FitStump(ctx, info, gpair_t, out->HostView())
|
||||
: cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device()));
|
||||
ctx->IsCUDA() ? cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device()))
|
||||
: cpu_impl::FitStump(ctx, info, gpair_t, out->HostView());
|
||||
}
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::tree
|
||||
|
||||
@@ -21,9 +21,7 @@
|
||||
#include "xgboost/logging.h" // CHECK_EQ
|
||||
#include "xgboost/span.h" // span
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
namespace cuda_impl {
|
||||
namespace xgboost::tree::cuda_impl {
|
||||
void FitStump(Context const* ctx, MetaInfo const& info,
|
||||
linalg::TensorView<GradientPair const, 2> gpair, linalg::VectorView<float> out) {
|
||||
auto n_targets = out.Size();
|
||||
@@ -56,7 +54,7 @@ void FitStump(Context const* ctx, MetaInfo const& info,
|
||||
thrust::reduce_by_key(policy, key_it, key_it + gpair.Size(), grad_it,
|
||||
thrust::make_discard_iterator(), dh::tbegin(d_sum.Values()));
|
||||
|
||||
collective::GlobalSum(info, ctx->gpu_id, reinterpret_cast<double*>(d_sum.Values().data()),
|
||||
collective::GlobalSum(info, ctx->Device(), reinterpret_cast<double*>(d_sum.Values().data()),
|
||||
d_sum.Size() * 2);
|
||||
|
||||
thrust::for_each_n(policy, thrust::make_counting_iterator(0ul), n_targets,
|
||||
@@ -65,6 +63,4 @@ void FitStump(Context const* ctx, MetaInfo const& info,
|
||||
CalcUnregularizedWeight(d_sum(i).GetGrad(), d_sum(i).GetHess()));
|
||||
});
|
||||
}
|
||||
} // namespace cuda_impl
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::tree::cuda_impl
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user