merge latest change from upstream
This commit is contained in:
@@ -89,19 +89,15 @@ Coll *FederatedColl::MakeCUDAVar() {
|
||||
|
||||
[[nodiscard]] Result FederatedColl::Broadcast(Comm const &comm, common::Span<std::int8_t> data,
|
||||
std::int32_t root) {
|
||||
if (comm.Rank() == root) {
|
||||
return BroadcastImpl(comm, &this->sequence_number_, data, root);
|
||||
} else {
|
||||
return BroadcastImpl(comm, &this->sequence_number_, data, root);
|
||||
}
|
||||
return BroadcastImpl(comm, &this->sequence_number_, data, root);
|
||||
}
|
||||
|
||||
[[nodiscard]] Result FederatedColl::Allgather(Comm const &comm, common::Span<std::int8_t> data,
|
||||
std::int64_t size) {
|
||||
[[nodiscard]] Result FederatedColl::Allgather(Comm const &comm, common::Span<std::int8_t> data) {
|
||||
using namespace federated; // NOLINT
|
||||
auto fed = dynamic_cast<FederatedComm const *>(&comm);
|
||||
CHECK(fed);
|
||||
auto stub = fed->Handle();
|
||||
auto size = data.size_bytes() / comm.World();
|
||||
|
||||
auto offset = comm.Rank() * size;
|
||||
auto segment = data.subspan(offset, size);
|
||||
|
||||
@@ -53,8 +53,7 @@ Coll *FederatedColl::MakeCUDAVar() {
|
||||
};
|
||||
}
|
||||
|
||||
[[nodiscard]] Result CUDAFederatedColl::Allgather(Comm const &comm, common::Span<std::int8_t> data,
|
||||
std::int64_t size) {
|
||||
[[nodiscard]] Result CUDAFederatedColl::Allgather(Comm const &comm, common::Span<std::int8_t> data) {
|
||||
auto cufed = dynamic_cast<CUDAFederatedComm const *>(&comm);
|
||||
CHECK(cufed);
|
||||
std::vector<std::int8_t> h_data(data.size());
|
||||
@@ -63,7 +62,7 @@ Coll *FederatedColl::MakeCUDAVar() {
|
||||
return GetCUDAResult(
|
||||
cudaMemcpy(h_data.data(), data.data(), data.size(), cudaMemcpyDeviceToHost));
|
||||
} << [&] {
|
||||
return p_impl_->Allgather(comm, common::Span{h_data.data(), h_data.size()}, size);
|
||||
return p_impl_->Allgather(comm, common::Span{h_data.data(), h_data.size()});
|
||||
} << [&] {
|
||||
return GetCUDAResult(cudaMemcpyAsync(data.data(), h_data.data(), data.size(),
|
||||
cudaMemcpyHostToDevice, cufed->Stream()));
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost contributors
|
||||
* Copyright 2023-2024, XGBoost contributors
|
||||
*/
|
||||
#include "../../src/collective/comm.h" // for Comm, Coll
|
||||
#include "federated_coll.h" // for FederatedColl
|
||||
@@ -16,8 +16,7 @@ class CUDAFederatedColl : public Coll {
|
||||
ArrayInterfaceHandler::Type type, Op op) override;
|
||||
[[nodiscard]] Result Broadcast(Comm const &comm, common::Span<std::int8_t> data,
|
||||
std::int32_t root) override;
|
||||
[[nodiscard]] Result Allgather(Comm const &, common::Span<std::int8_t> data,
|
||||
std::int64_t size) override;
|
||||
[[nodiscard]] Result Allgather(Comm const &, common::Span<std::int8_t> data) override;
|
||||
[[nodiscard]] Result AllgatherV(Comm const &comm, common::Span<std::int8_t const> data,
|
||||
common::Span<std::int64_t const> sizes,
|
||||
common::Span<std::int64_t> recv_segments,
|
||||
|
||||
@@ -1,12 +1,9 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost contributors
|
||||
* Copyright 2023-2024, XGBoost contributors
|
||||
*/
|
||||
#pragma once
|
||||
#include "../../src/collective/coll.h" // for Coll
|
||||
#include "../../src/collective/comm.h" // for Comm
|
||||
#include "../../src/common/io.h" // for ReadAll
|
||||
#include "../../src/common/json_utils.h" // for OptionalArg
|
||||
#include "xgboost/json.h" // for Json
|
||||
|
||||
namespace xgboost::collective {
|
||||
class FederatedColl : public Coll {
|
||||
@@ -20,8 +17,7 @@ class FederatedColl : public Coll {
|
||||
ArrayInterfaceHandler::Type type, Op op) override;
|
||||
[[nodiscard]] Result Broadcast(Comm const &comm, common::Span<std::int8_t> data,
|
||||
std::int32_t root) override;
|
||||
[[nodiscard]] Result Allgather(Comm const &, common::Span<std::int8_t> data,
|
||||
std::int64_t) override;
|
||||
[[nodiscard]] Result Allgather(Comm const &, common::Span<std::int8_t> data) override;
|
||||
[[nodiscard]] Result AllgatherV(Comm const &comm, common::Span<std::int8_t const> data,
|
||||
common::Span<std::int64_t const> sizes,
|
||||
common::Span<std::int64_t> recv_segments,
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
* Copyright 2023-2024, XGBoost Contributors
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
#include "../../src/common/device_helpers.cuh" // for CUDAStreamView
|
||||
#include "federated_comm.h" // for FederatedComm
|
||||
#include "xgboost/context.h" // for Context
|
||||
#include "xgboost/logging.h"
|
||||
|
||||
namespace xgboost::collective {
|
||||
class CUDAFederatedComm : public FederatedComm {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost contributors
|
||||
* Copyright 2023-2024, XGBoost contributors
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
#include <string> // for string
|
||||
|
||||
#include "../../src/collective/comm.h" // for HostComm
|
||||
#include "../../src/common/json_utils.h" // for OptionalArg
|
||||
#include "xgboost/json.h"
|
||||
|
||||
namespace xgboost::collective {
|
||||
@@ -51,6 +50,10 @@ class FederatedComm : public HostComm {
|
||||
std::int32_t rank) {
|
||||
this->Init(host, port, world, rank, {}, {}, {});
|
||||
}
|
||||
[[nodiscard]] Result Shutdown() final {
|
||||
this->ResetState();
|
||||
return Success();
|
||||
}
|
||||
~FederatedComm() override { stub_.reset(); }
|
||||
|
||||
[[nodiscard]] std::shared_ptr<Channel> Chan(std::int32_t) const override {
|
||||
@@ -65,5 +68,13 @@ class FederatedComm : public HostComm {
|
||||
[[nodiscard]] federated::Federated::Stub* Handle() const { return stub_.get(); }
|
||||
|
||||
[[nodiscard]] Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const override;
|
||||
/**
|
||||
* @brief Get a string ID for the current process.
|
||||
*/
|
||||
[[nodiscard]] Result ProcessorName(std::string* out) const final {
|
||||
auto rank = this->Rank();
|
||||
*out = "rank:" + std::to_string(rank);
|
||||
return Success();
|
||||
};
|
||||
};
|
||||
} // namespace xgboost::collective
|
||||
|
||||
@@ -1,22 +1,18 @@
|
||||
/**
|
||||
* Copyright 2022-2023, XGBoost contributors
|
||||
* Copyright 2022-2024, XGBoost contributors
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <federated.old.grpc.pb.h>
|
||||
|
||||
#include <cstdint> // for int32_t
|
||||
#include <future> // for future
|
||||
|
||||
#include "../../src/collective/in_memory_handler.h"
|
||||
#include "../../src/collective/tracker.h" // for Tracker
|
||||
#include "xgboost/collective/result.h" // for Result
|
||||
|
||||
namespace xgboost::federated {
|
||||
class FederatedService final : public Federated::Service {
|
||||
public:
|
||||
explicit FederatedService(std::int32_t world_size)
|
||||
: handler_{static_cast<std::size_t>(world_size)} {}
|
||||
explicit FederatedService(std::int32_t world_size) : handler_{world_size} {}
|
||||
|
||||
grpc::Status Allgather(grpc::ServerContext* context, AllgatherRequest const* request,
|
||||
AllgatherReply* reply) override;
|
||||
|
||||
@@ -125,14 +125,14 @@ Result FederatedTracker::Shutdown() {
|
||||
|
||||
[[nodiscard]] Json FederatedTracker::WorkerArgs() const {
|
||||
auto rc = this->WaitUntilReady();
|
||||
CHECK(rc.OK()) << rc.Report();
|
||||
SafeColl(rc);
|
||||
|
||||
std::string host;
|
||||
rc = GetHostAddress(&host);
|
||||
CHECK(rc.OK());
|
||||
Json args{Object{}};
|
||||
args["DMLC_TRACKER_URI"] = String{host};
|
||||
args["DMLC_TRACKER_PORT"] = this->Port();
|
||||
args["dmlc_tracker_uri"] = String{host};
|
||||
args["dmlc_tracker_port"] = this->Port();
|
||||
return args;
|
||||
}
|
||||
} // namespace xgboost::collective
|
||||
|
||||
@@ -17,8 +17,7 @@ namespace xgboost::collective {
|
||||
namespace federated {
|
||||
class FederatedService final : public Federated::Service {
|
||||
public:
|
||||
explicit FederatedService(std::int32_t world_size)
|
||||
: handler_{static_cast<std::size_t>(world_size)} {}
|
||||
explicit FederatedService(std::int32_t world_size) : handler_{world_size} {}
|
||||
|
||||
grpc::Status Allgather(grpc::ServerContext* context, AllgatherRequest const* request,
|
||||
AllgatherReply* reply) override;
|
||||
|
||||
334
plugin/sycl/common/hist_util.cc
Normal file
334
plugin/sycl/common/hist_util.cc
Normal file
@@ -0,0 +1,334 @@
|
||||
/*!
|
||||
* Copyright 2017-2023 by Contributors
|
||||
* \file hist_util.cc
|
||||
*/
|
||||
#include <vector>
|
||||
#include <limits>
|
||||
#include <algorithm>
|
||||
|
||||
#include "../data/gradient_index.h"
|
||||
#include "hist_util.h"
|
||||
|
||||
#include <CL/sycl.hpp>
|
||||
|
||||
namespace xgboost {
|
||||
namespace sycl {
|
||||
namespace common {
|
||||
|
||||
/*!
|
||||
* \brief Fill histogram with zeroes
|
||||
*/
|
||||
template<typename GradientSumT>
|
||||
void InitHist(::sycl::queue qu, GHistRow<GradientSumT, MemoryType::on_device>* hist,
|
||||
size_t size, ::sycl::event* event) {
|
||||
*event = qu.fill(hist->Begin(),
|
||||
xgboost::detail::GradientPairInternal<GradientSumT>(), size, *event);
|
||||
}
|
||||
template void InitHist(::sycl::queue qu,
|
||||
GHistRow<float, MemoryType::on_device>* hist,
|
||||
size_t size, ::sycl::event* event);
|
||||
template void InitHist(::sycl::queue qu,
|
||||
GHistRow<double, MemoryType::on_device>* hist,
|
||||
size_t size, ::sycl::event* event);
|
||||
|
||||
/*!
|
||||
* \brief Compute Subtraction: dst = src1 - src2
|
||||
*/
|
||||
template<typename GradientSumT>
|
||||
::sycl::event SubtractionHist(::sycl::queue qu,
|
||||
GHistRow<GradientSumT, MemoryType::on_device>* dst,
|
||||
const GHistRow<GradientSumT, MemoryType::on_device>& src1,
|
||||
const GHistRow<GradientSumT, MemoryType::on_device>& src2,
|
||||
size_t size, ::sycl::event event_priv) {
|
||||
GradientSumT* pdst = reinterpret_cast<GradientSumT*>(dst->Data());
|
||||
const GradientSumT* psrc1 = reinterpret_cast<const GradientSumT*>(src1.DataConst());
|
||||
const GradientSumT* psrc2 = reinterpret_cast<const GradientSumT*>(src2.DataConst());
|
||||
|
||||
auto event_final = qu.submit([&](::sycl::handler& cgh) {
|
||||
cgh.depends_on(event_priv);
|
||||
cgh.parallel_for<>(::sycl::range<1>(2 * size), [pdst, psrc1, psrc2](::sycl::item<1> pid) {
|
||||
const size_t i = pid.get_id(0);
|
||||
pdst[i] = psrc1[i] - psrc2[i];
|
||||
});
|
||||
});
|
||||
return event_final;
|
||||
}
|
||||
template ::sycl::event SubtractionHist(::sycl::queue qu,
|
||||
GHistRow<float, MemoryType::on_device>* dst,
|
||||
const GHistRow<float, MemoryType::on_device>& src1,
|
||||
const GHistRow<float, MemoryType::on_device>& src2,
|
||||
size_t size, ::sycl::event event_priv);
|
||||
template ::sycl::event SubtractionHist(::sycl::queue qu,
|
||||
GHistRow<double, MemoryType::on_device>* dst,
|
||||
const GHistRow<double, MemoryType::on_device>& src1,
|
||||
const GHistRow<double, MemoryType::on_device>& src2,
|
||||
size_t size, ::sycl::event event_priv);
|
||||
|
||||
// Kernel with buffer using
|
||||
template<typename FPType, typename BinIdxType, bool isDense>
|
||||
::sycl::event BuildHistKernel(::sycl::queue qu,
|
||||
const USMVector<GradientPair, MemoryType::on_device>& gpair_device,
|
||||
const RowSetCollection::Elem& row_indices,
|
||||
const GHistIndexMatrix& gmat,
|
||||
GHistRow<FPType, MemoryType::on_device>* hist,
|
||||
GHistRow<FPType, MemoryType::on_device>* hist_buffer,
|
||||
::sycl::event event_priv) {
|
||||
const size_t size = row_indices.Size();
|
||||
const size_t* rid = row_indices.begin;
|
||||
const size_t n_columns = isDense ? gmat.nfeatures : gmat.row_stride;
|
||||
const GradientPair::ValueT* pgh =
|
||||
reinterpret_cast<const GradientPair::ValueT*>(gpair_device.DataConst());
|
||||
const BinIdxType* gradient_index = gmat.index.data<BinIdxType>();
|
||||
const uint32_t* offsets = gmat.index.Offset();
|
||||
FPType* hist_data = reinterpret_cast<FPType*>(hist->Data());
|
||||
const size_t nbins = gmat.nbins;
|
||||
|
||||
const size_t max_work_group_size =
|
||||
qu.get_device().get_info<::sycl::info::device::max_work_group_size>();
|
||||
const size_t work_group_size = n_columns < max_work_group_size ? n_columns : max_work_group_size;
|
||||
|
||||
const size_t max_nblocks = hist_buffer->Size() / (nbins * 2);
|
||||
const size_t min_block_size = 128;
|
||||
size_t nblocks = std::min(max_nblocks, size / min_block_size + !!(size % min_block_size));
|
||||
const size_t block_size = size / nblocks + !!(size % nblocks);
|
||||
FPType* hist_buffer_data = reinterpret_cast<FPType*>(hist_buffer->Data());
|
||||
|
||||
auto event_fill = qu.fill(hist_buffer_data, FPType(0), nblocks * nbins * 2, event_priv);
|
||||
auto event_main = qu.submit([&](::sycl::handler& cgh) {
|
||||
cgh.depends_on(event_fill);
|
||||
cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(nblocks, work_group_size),
|
||||
::sycl::range<2>(1, work_group_size)),
|
||||
[=](::sycl::nd_item<2> pid) {
|
||||
size_t block = pid.get_global_id(0);
|
||||
size_t feat = pid.get_global_id(1);
|
||||
|
||||
FPType* hist_local = hist_buffer_data + block * nbins * 2;
|
||||
for (size_t idx = 0; idx < block_size; ++idx) {
|
||||
size_t i = block * block_size + idx;
|
||||
if (i < size) {
|
||||
const size_t icol_start = n_columns * rid[i];
|
||||
const size_t idx_gh = rid[i];
|
||||
|
||||
pid.barrier(::sycl::access::fence_space::local_space);
|
||||
const BinIdxType* gr_index_local = gradient_index + icol_start;
|
||||
|
||||
for (size_t j = feat; j < n_columns; j += work_group_size) {
|
||||
uint32_t idx_bin = static_cast<uint32_t>(gr_index_local[j]);
|
||||
if constexpr (isDense) {
|
||||
idx_bin += offsets[j];
|
||||
}
|
||||
if (idx_bin < nbins) {
|
||||
hist_local[2 * idx_bin] += pgh[2 * idx_gh];
|
||||
hist_local[2 * idx_bin+1] += pgh[2 * idx_gh+1];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
auto event_save = qu.submit([&](::sycl::handler& cgh) {
|
||||
cgh.depends_on(event_main);
|
||||
cgh.parallel_for<>(::sycl::range<1>(nbins), [=](::sycl::item<1> pid) {
|
||||
size_t idx_bin = pid.get_id(0);
|
||||
|
||||
FPType gsum = 0.0f;
|
||||
FPType hsum = 0.0f;
|
||||
|
||||
for (size_t j = 0; j < nblocks; ++j) {
|
||||
gsum += hist_buffer_data[j * nbins * 2 + 2 * idx_bin];
|
||||
hsum += hist_buffer_data[j * nbins * 2 + 2 * idx_bin + 1];
|
||||
}
|
||||
|
||||
hist_data[2 * idx_bin] = gsum;
|
||||
hist_data[2 * idx_bin + 1] = hsum;
|
||||
});
|
||||
});
|
||||
return event_save;
|
||||
}
|
||||
|
||||
// Kernel with atomic using
|
||||
template<typename FPType, typename BinIdxType, bool isDense>
|
||||
::sycl::event BuildHistKernel(::sycl::queue qu,
|
||||
const USMVector<GradientPair, MemoryType::on_device>& gpair_device,
|
||||
const RowSetCollection::Elem& row_indices,
|
||||
const GHistIndexMatrix& gmat,
|
||||
GHistRow<FPType, MemoryType::on_device>* hist,
|
||||
::sycl::event event_priv) {
|
||||
const size_t size = row_indices.Size();
|
||||
const size_t* rid = row_indices.begin;
|
||||
const size_t n_columns = isDense ? gmat.nfeatures : gmat.row_stride;
|
||||
const GradientPair::ValueT* pgh =
|
||||
reinterpret_cast<const GradientPair::ValueT*>(gpair_device.DataConst());
|
||||
const BinIdxType* gradient_index = gmat.index.data<BinIdxType>();
|
||||
const uint32_t* offsets = gmat.index.Offset();
|
||||
FPType* hist_data = reinterpret_cast<FPType*>(hist->Data());
|
||||
const size_t nbins = gmat.nbins;
|
||||
|
||||
const size_t max_work_group_size =
|
||||
qu.get_device().get_info<::sycl::info::device::max_work_group_size>();
|
||||
const size_t feat_local = n_columns < max_work_group_size ? n_columns : max_work_group_size;
|
||||
|
||||
auto event_fill = qu.fill(hist_data, FPType(0), nbins * 2, event_priv);
|
||||
auto event_main = qu.submit([&](::sycl::handler& cgh) {
|
||||
cgh.depends_on(event_fill);
|
||||
cgh.parallel_for<>(::sycl::range<2>(size, feat_local),
|
||||
[=](::sycl::item<2> pid) {
|
||||
size_t i = pid.get_id(0);
|
||||
size_t feat = pid.get_id(1);
|
||||
|
||||
const size_t icol_start = n_columns * rid[i];
|
||||
const size_t idx_gh = rid[i];
|
||||
|
||||
const BinIdxType* gr_index_local = gradient_index + icol_start;
|
||||
|
||||
for (size_t j = feat; j < n_columns; j += feat_local) {
|
||||
uint32_t idx_bin = static_cast<uint32_t>(gr_index_local[j]);
|
||||
if constexpr (isDense) {
|
||||
idx_bin += offsets[j];
|
||||
}
|
||||
if (idx_bin < nbins) {
|
||||
AtomicRef<FPType> gsum(hist_data[2 * idx_bin]);
|
||||
AtomicRef<FPType> hsum(hist_data[2 * idx_bin + 1]);
|
||||
gsum.fetch_add(pgh[2 * idx_gh]);
|
||||
hsum.fetch_add(pgh[2 * idx_gh + 1]);
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
return event_main;
|
||||
}
|
||||
|
||||
template<typename FPType, typename BinIdxType>
|
||||
::sycl::event BuildHistDispatchKernel(
|
||||
::sycl::queue qu,
|
||||
const USMVector<GradientPair, MemoryType::on_device>& gpair_device,
|
||||
const RowSetCollection::Elem& row_indices,
|
||||
const GHistIndexMatrix& gmat,
|
||||
GHistRow<FPType, MemoryType::on_device>* hist,
|
||||
bool isDense,
|
||||
GHistRow<FPType, MemoryType::on_device>* hist_buffer,
|
||||
::sycl::event events_priv,
|
||||
bool force_atomic_use) {
|
||||
const size_t size = row_indices.Size();
|
||||
const size_t n_columns = isDense ? gmat.nfeatures : gmat.row_stride;
|
||||
const size_t nbins = gmat.nbins;
|
||||
|
||||
// max cycle size, while atomics are still effective
|
||||
const size_t max_cycle_size_atomics = nbins;
|
||||
const size_t cycle_size = size;
|
||||
|
||||
// TODO(razdoburdin): replace the add-hock dispatching criteria by more sutable one
|
||||
bool use_atomic = (size < nbins) || (gmat.max_num_bins == gmat.nbins / n_columns);
|
||||
|
||||
// force_atomic_use flag is used only for testing
|
||||
use_atomic = use_atomic || force_atomic_use;
|
||||
if (!use_atomic) {
|
||||
if (isDense) {
|
||||
return BuildHistKernel<FPType, BinIdxType, true>(qu, gpair_device, row_indices,
|
||||
gmat, hist, hist_buffer,
|
||||
events_priv);
|
||||
} else {
|
||||
return BuildHistKernel<FPType, uint32_t, false>(qu, gpair_device, row_indices,
|
||||
gmat, hist, hist_buffer,
|
||||
events_priv);
|
||||
}
|
||||
} else {
|
||||
if (isDense) {
|
||||
return BuildHistKernel<FPType, BinIdxType, true>(qu, gpair_device, row_indices,
|
||||
gmat, hist, events_priv);
|
||||
} else {
|
||||
return BuildHistKernel<FPType, uint32_t, false>(qu, gpair_device, row_indices,
|
||||
gmat, hist, events_priv);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename FPType>
|
||||
::sycl::event BuildHistKernel(::sycl::queue qu,
|
||||
const USMVector<GradientPair, MemoryType::on_device>& gpair_device,
|
||||
const RowSetCollection::Elem& row_indices,
|
||||
const GHistIndexMatrix& gmat, const bool isDense,
|
||||
GHistRow<FPType, MemoryType::on_device>* hist,
|
||||
GHistRow<FPType, MemoryType::on_device>* hist_buffer,
|
||||
::sycl::event event_priv,
|
||||
bool force_atomic_use) {
|
||||
const bool is_dense = isDense;
|
||||
switch (gmat.index.GetBinTypeSize()) {
|
||||
case BinTypeSize::kUint8BinsTypeSize:
|
||||
return BuildHistDispatchKernel<FPType, uint8_t>(qu, gpair_device, row_indices,
|
||||
gmat, hist, is_dense, hist_buffer,
|
||||
event_priv, force_atomic_use);
|
||||
break;
|
||||
case BinTypeSize::kUint16BinsTypeSize:
|
||||
return BuildHistDispatchKernel<FPType, uint16_t>(qu, gpair_device, row_indices,
|
||||
gmat, hist, is_dense, hist_buffer,
|
||||
event_priv, force_atomic_use);
|
||||
break;
|
||||
case BinTypeSize::kUint32BinsTypeSize:
|
||||
return BuildHistDispatchKernel<FPType, uint32_t>(qu, gpair_device, row_indices,
|
||||
gmat, hist, is_dense, hist_buffer,
|
||||
event_priv, force_atomic_use);
|
||||
break;
|
||||
default:
|
||||
CHECK(false); // no default behavior
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GradientSumT>
|
||||
::sycl::event GHistBuilder<GradientSumT>::BuildHist(
|
||||
const USMVector<GradientPair, MemoryType::on_device>& gpair_device,
|
||||
const RowSetCollection::Elem& row_indices,
|
||||
const GHistIndexMatrix &gmat,
|
||||
GHistRowT<MemoryType::on_device>* hist,
|
||||
bool isDense,
|
||||
GHistRowT<MemoryType::on_device>* hist_buffer,
|
||||
::sycl::event event_priv,
|
||||
bool force_atomic_use) {
|
||||
return BuildHistKernel<GradientSumT>(qu_, gpair_device, row_indices, gmat,
|
||||
isDense, hist, hist_buffer, event_priv,
|
||||
force_atomic_use);
|
||||
}
|
||||
|
||||
template
|
||||
::sycl::event GHistBuilder<float>::BuildHist(
|
||||
const USMVector<GradientPair, MemoryType::on_device>& gpair_device,
|
||||
const RowSetCollection::Elem& row_indices,
|
||||
const GHistIndexMatrix& gmat,
|
||||
GHistRow<float, MemoryType::on_device>* hist,
|
||||
bool isDense,
|
||||
GHistRow<float, MemoryType::on_device>* hist_buffer,
|
||||
::sycl::event event_priv,
|
||||
bool force_atomic_use);
|
||||
template
|
||||
::sycl::event GHistBuilder<double>::BuildHist(
|
||||
const USMVector<GradientPair, MemoryType::on_device>& gpair_device,
|
||||
const RowSetCollection::Elem& row_indices,
|
||||
const GHistIndexMatrix& gmat,
|
||||
GHistRow<double, MemoryType::on_device>* hist,
|
||||
bool isDense,
|
||||
GHistRow<double, MemoryType::on_device>* hist_buffer,
|
||||
::sycl::event event_priv,
|
||||
bool force_atomic_use);
|
||||
|
||||
template<typename GradientSumT>
|
||||
void GHistBuilder<GradientSumT>::SubtractionTrick(GHistRowT<MemoryType::on_device>* self,
|
||||
const GHistRowT<MemoryType::on_device>& sibling,
|
||||
const GHistRowT<MemoryType::on_device>& parent) {
|
||||
const size_t size = self->Size();
|
||||
CHECK_EQ(sibling.Size(), size);
|
||||
CHECK_EQ(parent.Size(), size);
|
||||
|
||||
SubtractionHist(qu_, self, parent, sibling, size, ::sycl::event());
|
||||
}
|
||||
template
|
||||
void GHistBuilder<float>::SubtractionTrick(GHistRow<float, MemoryType::on_device>* self,
|
||||
const GHistRow<float, MemoryType::on_device>& sibling,
|
||||
const GHistRow<float, MemoryType::on_device>& parent);
|
||||
template
|
||||
void GHistBuilder<double>::SubtractionTrick(GHistRow<double, MemoryType::on_device>* self,
|
||||
const GHistRow<double, MemoryType::on_device>& sibling,
|
||||
const GHistRow<double, MemoryType::on_device>& parent);
|
||||
} // namespace common
|
||||
} // namespace sycl
|
||||
} // namespace xgboost
|
||||
89
plugin/sycl/common/hist_util.h
Normal file
89
plugin/sycl/common/hist_util.h
Normal file
@@ -0,0 +1,89 @@
|
||||
/*!
|
||||
* Copyright 2017-2023 by Contributors
|
||||
* \file hist_util.h
|
||||
*/
|
||||
#ifndef PLUGIN_SYCL_COMMON_HIST_UTIL_H_
|
||||
#define PLUGIN_SYCL_COMMON_HIST_UTIL_H_
|
||||
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
#include <memory>
|
||||
|
||||
#include "../data.h"
|
||||
#include "row_set.h"
|
||||
|
||||
#include "../../src/common/hist_util.h"
|
||||
#include "../data/gradient_index.h"
|
||||
|
||||
#include <CL/sycl.hpp>
|
||||
|
||||
namespace xgboost {
|
||||
namespace sycl {
|
||||
namespace common {
|
||||
|
||||
template<typename GradientSumT, MemoryType memory_type = MemoryType::shared>
|
||||
using GHistRow = USMVector<xgboost::detail::GradientPairInternal<GradientSumT>, memory_type>;
|
||||
|
||||
using BinTypeSize = ::xgboost::common::BinTypeSize;
|
||||
|
||||
class ColumnMatrix;
|
||||
|
||||
/*!
|
||||
* \brief Fill histogram with zeroes
|
||||
*/
|
||||
template<typename GradientSumT>
|
||||
void InitHist(::sycl::queue qu,
|
||||
GHistRow<GradientSumT, MemoryType::on_device>* hist,
|
||||
size_t size, ::sycl::event* event);
|
||||
|
||||
/*!
|
||||
* \brief Compute subtraction: dst = src1 - src2
|
||||
*/
|
||||
template<typename GradientSumT>
|
||||
::sycl::event SubtractionHist(::sycl::queue qu,
|
||||
GHistRow<GradientSumT, MemoryType::on_device>* dst,
|
||||
const GHistRow<GradientSumT, MemoryType::on_device>& src1,
|
||||
const GHistRow<GradientSumT, MemoryType::on_device>& src2,
|
||||
size_t size, ::sycl::event event_priv);
|
||||
|
||||
/*!
|
||||
* \brief Builder for histograms of gradient statistics
|
||||
*/
|
||||
template<typename GradientSumT>
|
||||
class GHistBuilder {
|
||||
public:
|
||||
template<MemoryType memory_type = MemoryType::shared>
|
||||
using GHistRowT = GHistRow<GradientSumT, memory_type>;
|
||||
|
||||
GHistBuilder() = default;
|
||||
GHistBuilder(::sycl::queue qu, uint32_t nbins) : qu_{qu}, nbins_{nbins} {}
|
||||
|
||||
// Construct a histogram via histogram aggregation
|
||||
::sycl::event BuildHist(const USMVector<GradientPair, MemoryType::on_device>& gpair_device,
|
||||
const RowSetCollection::Elem& row_indices,
|
||||
const GHistIndexMatrix& gmat,
|
||||
GHistRowT<MemoryType::on_device>* HistCollection,
|
||||
bool isDense,
|
||||
GHistRowT<MemoryType::on_device>* hist_buffer,
|
||||
::sycl::event event,
|
||||
bool force_atomic_use = false);
|
||||
|
||||
// Construct a histogram via subtraction trick
|
||||
void SubtractionTrick(GHistRowT<MemoryType::on_device>* self,
|
||||
const GHistRowT<MemoryType::on_device>& sibling,
|
||||
const GHistRowT<MemoryType::on_device>& parent);
|
||||
|
||||
uint32_t GetNumBins() const {
|
||||
return nbins_;
|
||||
}
|
||||
|
||||
private:
|
||||
/*! \brief Number of all bins over all features */
|
||||
uint32_t nbins_ { 0 };
|
||||
|
||||
::sycl::queue qu_;
|
||||
};
|
||||
} // namespace common
|
||||
} // namespace sycl
|
||||
} // namespace xgboost
|
||||
#endif // PLUGIN_SYCL_COMMON_HIST_UTIL_H_
|
||||
55
plugin/sycl/tree/updater_quantile_hist.cc
Normal file
55
plugin/sycl/tree/updater_quantile_hist.cc
Normal file
@@ -0,0 +1,55 @@
|
||||
/*!
|
||||
* Copyright 2017-2024 by Contributors
|
||||
* \file updater_quantile_hist.cc
|
||||
*/
|
||||
#include <vector>
|
||||
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
|
||||
#pragma GCC diagnostic ignored "-W#pragma-messages"
|
||||
#include "xgboost/tree_updater.h"
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
#include "xgboost/logging.h"
|
||||
|
||||
#include "updater_quantile_hist.h"
|
||||
#include "../data.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace sycl {
|
||||
namespace tree {
|
||||
|
||||
DMLC_REGISTRY_FILE_TAG(updater_quantile_hist_sycl);
|
||||
|
||||
DMLC_REGISTER_PARAMETER(HistMakerTrainParam);
|
||||
|
||||
void QuantileHistMaker::Configure(const Args& args) {
|
||||
const DeviceOrd device_spec = ctx_->Device();
|
||||
qu_ = device_manager.GetQueue(device_spec);
|
||||
|
||||
param_.UpdateAllowUnknown(args);
|
||||
hist_maker_param_.UpdateAllowUnknown(args);
|
||||
}
|
||||
|
||||
void QuantileHistMaker::Update(xgboost::tree::TrainParam const *param,
|
||||
linalg::Matrix<GradientPair>* gpair,
|
||||
DMatrix *dmat,
|
||||
xgboost::common::Span<HostDeviceVector<bst_node_t>> out_position,
|
||||
const std::vector<RegTree *> &trees) {
|
||||
LOG(FATAL) << "Not Implemented yet";
|
||||
}
|
||||
|
||||
bool QuantileHistMaker::UpdatePredictionCache(const DMatrix* data,
|
||||
linalg::MatrixView<float> out_preds) {
|
||||
LOG(FATAL) << "Not Implemented yet";
|
||||
}
|
||||
|
||||
XGBOOST_REGISTER_TREE_UPDATER(QuantileHistMaker, "grow_quantile_histmaker_sycl")
|
||||
.describe("Grow tree using quantized histogram with SYCL.")
|
||||
.set_body(
|
||||
[](Context const* ctx, ObjInfo const * task) {
|
||||
return new QuantileHistMaker(ctx, task);
|
||||
});
|
||||
} // namespace tree
|
||||
} // namespace sycl
|
||||
} // namespace xgboost
|
||||
91
plugin/sycl/tree/updater_quantile_hist.h
Normal file
91
plugin/sycl/tree/updater_quantile_hist.h
Normal file
@@ -0,0 +1,91 @@
|
||||
/*!
|
||||
* Copyright 2017-2024 by Contributors
|
||||
* \file updater_quantile_hist.h
|
||||
*/
|
||||
#ifndef PLUGIN_SYCL_TREE_UPDATER_QUANTILE_HIST_H_
|
||||
#define PLUGIN_SYCL_TREE_UPDATER_QUANTILE_HIST_H_
|
||||
|
||||
#include <dmlc/timer.h>
|
||||
#include <xgboost/tree_updater.h>
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "../data/gradient_index.h"
|
||||
#include "../common/hist_util.h"
|
||||
#include "../common/row_set.h"
|
||||
#include "../common/partition_builder.h"
|
||||
#include "split_evaluator.h"
|
||||
#include "../device_manager.h"
|
||||
|
||||
#include "xgboost/data.h"
|
||||
#include "xgboost/json.h"
|
||||
#include "../../src/tree/constraints.h"
|
||||
#include "../../src/common/random.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace sycl {
|
||||
namespace tree {
|
||||
|
||||
// training parameters specific to this algorithm
|
||||
struct HistMakerTrainParam
|
||||
: public XGBoostParameter<HistMakerTrainParam> {
|
||||
bool single_precision_histogram = false;
|
||||
// declare parameters
|
||||
DMLC_DECLARE_PARAMETER(HistMakerTrainParam) {
|
||||
DMLC_DECLARE_FIELD(single_precision_histogram).set_default(false).describe(
|
||||
"Use single precision to build histograms.");
|
||||
}
|
||||
};
|
||||
|
||||
/*! \brief construct a tree using quantized feature values with SYCL backend*/
|
||||
class QuantileHistMaker: public TreeUpdater {
|
||||
public:
|
||||
QuantileHistMaker(Context const* ctx, ObjInfo const * task) :
|
||||
TreeUpdater(ctx), task_{task} {
|
||||
updater_monitor_.Init("SYCLQuantileHistMaker");
|
||||
}
|
||||
void Configure(const Args& args) override;
|
||||
|
||||
void Update(xgboost::tree::TrainParam const *param,
|
||||
linalg::Matrix<GradientPair>* gpair,
|
||||
DMatrix* dmat,
|
||||
xgboost::common::Span<HostDeviceVector<bst_node_t>> out_position,
|
||||
const std::vector<RegTree*>& trees) override;
|
||||
|
||||
bool UpdatePredictionCache(const DMatrix* data,
|
||||
linalg::MatrixView<float> out_preds) override;
|
||||
|
||||
void LoadConfig(Json const& in) override {
|
||||
auto const& config = get<Object const>(in);
|
||||
FromJson(config.at("train_param"), &this->param_);
|
||||
FromJson(config.at("sycl_hist_train_param"), &this->hist_maker_param_);
|
||||
}
|
||||
|
||||
void SaveConfig(Json* p_out) const override {
|
||||
auto& out = *p_out;
|
||||
out["train_param"] = ToJson(param_);
|
||||
out["sycl_hist_train_param"] = ToJson(hist_maker_param_);
|
||||
}
|
||||
|
||||
char const* Name() const override {
|
||||
return "grow_quantile_histmaker_sycl";
|
||||
}
|
||||
|
||||
protected:
|
||||
HistMakerTrainParam hist_maker_param_;
|
||||
// training parameter
|
||||
xgboost::tree::TrainParam param_;
|
||||
|
||||
xgboost::common::Monitor updater_monitor_;
|
||||
|
||||
::sycl::queue qu_;
|
||||
DeviceManager device_manager;
|
||||
ObjInfo const *task_{nullptr};
|
||||
};
|
||||
|
||||
|
||||
} // namespace tree
|
||||
} // namespace sycl
|
||||
} // namespace xgboost
|
||||
|
||||
#endif // PLUGIN_SYCL_TREE_UPDATER_QUANTILE_HIST_H_
|
||||
Reference in New Issue
Block a user