Implement unified update prediction cache for (gpu_)hist. (#6860)

* Implement utilities for linalg.
* Unify the update prediction cache functions.
* Implement update prediction cache for multi-class gpu hist.
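The linalg utilities themselves are not part of the hunks shown below; the following is a minimal standalone sketch of the strided-view idea they provide (a 2-D view plus a column view over flat storage). Matrix2DView, ColumnView, and Column are illustrative names, not the MatrixView/VectorView API added to xgboost/linalg.h.

// Minimal sketch of a strided 2-D view and a column view over flat storage.
// Illustrative only: Matrix2DView/ColumnView are hypothetical names, not the
// xgboost linalg.h API added by this PR.
#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

template <typename T>
struct Matrix2DView {
  T* data;                         // non-owning pointer into flat storage
  std::size_t rows, cols;          // shape
  std::size_t stride_r, stride_c;  // element strides for each dimension

  T& operator()(std::size_t r, std::size_t c) const {
    return data[r * stride_r + c * stride_c];
  }
};

// One column of the matrix seen as a 1-D strided vector.
template <typename T>
struct ColumnView {
  T* data;
  std::size_t size;
  std::size_t stride;

  T& operator[](std::size_t i) const { return data[i * stride]; }
};

template <typename T>
ColumnView<T> Column(Matrix2DView<T> m, std::size_t c) {
  return {m.data + c * m.stride_c, m.rows, m.stride_r};
}

int main() {
  // 4 rows x 3 groups stored row-major, like the prediction cache.
  std::size_t n_rows = 4, n_groups = 3;
  std::vector<float> flat(n_rows * n_groups, 0.0f);
  Matrix2DView<float> m{flat.data(), n_rows, n_groups, n_groups, 1};

  auto col = Column(m, 1);  // view of group 1
  for (std::size_t i = 0; i < col.size; ++i) {
    col[i] = static_cast<float>(i);  // writes flat[i * n_groups + 1]
  }
  assert(flat[1 * n_groups + 1] == 1.0f);
  std::cout << "column stride = " << col.stride << "\n";
  return 0;
}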
@@ -190,6 +190,32 @@ void GBTree::ConfigureUpdaters() {
   }
 }
 
+void GPUCopyGradient(HostDeviceVector<GradientPair> const *in_gpair,
+                     bst_group_t n_groups, bst_group_t group_id,
+                     HostDeviceVector<GradientPair> *out_gpair)
+#if defined(XGBOOST_USE_CUDA)
+;  // NOLINT
+#else
+{
+  common::AssertGPUSupport();
+}
+#endif
+
+void CopyGradient(HostDeviceVector<GradientPair> const *in_gpair,
+                  bst_group_t n_groups, bst_group_t group_id,
+                  HostDeviceVector<GradientPair> *out_gpair) {
+  if (in_gpair->DeviceIdx() != GenericParameter::kCpuId) {
+    GPUCopyGradient(in_gpair, n_groups, group_id, out_gpair);
+  } else {
+    std::vector<GradientPair> &tmp_h = out_gpair->HostVector();
+    auto nsize = static_cast<bst_omp_uint>(out_gpair->Size());
+    const auto &gpair_h = in_gpair->ConstHostVector();
+    common::ParallelFor(nsize, [&](bst_omp_uint i) {
+      tmp_h[i] = gpair_h[i * n_groups + group_id];
+    });
+  }
+}
+
 void GBTree::DoBoost(DMatrix* p_fmat,
                      HostDeviceVector<GradientPair>* in_gpair,
                      PredictionCacheEntry* predt) {
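The GPUCopyGradient shim added above uses a declaration-or-stub pattern: with CUDA enabled, the signature ends in a bare `;` and the real definition is compiled from a .cu file; without CUDA, the same signature gets a body that only asserts GPU support. A minimal sketch of that pattern, with the hypothetical names GpuScale, AssertGpuSupport, and EXAMPLE_USE_CUDA standing in for the real ones:

// Sketch of the declaration-or-stub pattern used for GPUCopyGradient above.
// GpuScale, AssertGpuSupport, and EXAMPLE_USE_CUDA are hypothetical stand-ins.
#include <stdexcept>
#include <vector>

inline void AssertGpuSupport() {
  throw std::runtime_error("Built without CUDA support.");
}

void GpuScale(std::vector<float>* data, float factor)
#if defined(EXAMPLE_USE_CUDA)
;  // declaration only: the real body lives in a .cu translation unit
#else
{  // CPU-only build: fail loudly if the GPU path is ever reached
  (void)data;
  (void)factor;
  AssertGpuSupport();
}
#endif

int main() {
  std::vector<float> v{1.f, 2.f};
  try {
    GpuScale(&v, 2.f);  // without EXAMPLE_USE_CUDA this throws
  } catch (std::runtime_error const&) {
    // expected on a CPU-only build
  }
  return 0;
}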
@@ -197,39 +223,44 @@ void GBTree::DoBoost(DMatrix* p_fmat,
   const int ngroup = model_.learner_model_param->num_output_group;
   ConfigureWithKnownData(this->cfg_, p_fmat);
   monitor_.Start("BoostNewTrees");
-  auto* out = &predt->predictions;
+  // Weird case that tree method is cpu-based but gpu_id is set. Ideally we should let
+  // `gpu_id` be the single source of determining what algorithms to run, but that will
+  // break a lot of existing code.
+  auto device = tparam_.tree_method != TreeMethod::kGPUHist
+                    ? GenericParameter::kCpuId
+                    : in_gpair->DeviceIdx();
+  auto out = MatrixView<float>(
+      &predt->predictions,
+      {p_fmat->Info().num_row_, static_cast<size_t>(ngroup)}, device);
   CHECK_NE(ngroup, 0);
   if (ngroup == 1) {
-    std::vector<std::unique_ptr<RegTree> > ret;
+    std::vector<std::unique_ptr<RegTree>> ret;
     BoostNewTrees(in_gpair, p_fmat, 0, &ret);
     const size_t num_new_trees = ret.size();
     new_trees.push_back(std::move(ret));
-    if (updaters_.size() > 0 && num_new_trees == 1 && out->Size() > 0 &&
-        updaters_.back()->UpdatePredictionCache(p_fmat, out)) {
+    auto v_predt = VectorView<float>{out, 0};
+    if (updaters_.size() > 0 && num_new_trees == 1 &&
+        predt->predictions.Size() > 0 &&
+        updaters_.back()->UpdatePredictionCache(p_fmat, v_predt)) {
       predt->Update(1);
     }
   } else {
     CHECK_EQ(in_gpair->Size() % ngroup, 0U)
         << "must have exactly ngroup * nrow gpairs";
-    // TODO(canonizer): perform this on GPU if HostDeviceVector has device set.
     HostDeviceVector<GradientPair> tmp(in_gpair->Size() / ngroup,
                                        GradientPair(),
                                        in_gpair->DeviceIdx());
-    const auto& gpair_h = in_gpair->ConstHostVector();
-    auto nsize = static_cast<bst_omp_uint>(tmp.Size());
     bool update_predict = true;
     for (int gid = 0; gid < ngroup; ++gid) {
-      std::vector<GradientPair>& tmp_h = tmp.HostVector();
-      common::ParallelFor(nsize, [&](bst_omp_uint i) {
-        tmp_h[i] = gpair_h[i * ngroup + gid];
-      });
+      CopyGradient(in_gpair, ngroup, gid, &tmp);
       std::vector<std::unique_ptr<RegTree> > ret;
       BoostNewTrees(&tmp, p_fmat, gid, &ret);
       const size_t num_new_trees = ret.size();
       new_trees.push_back(std::move(ret));
-      auto* out = &predt->predictions;
-      if (!(updaters_.size() > 0 && out->Size() > 0 && num_new_trees == 1 &&
-            updaters_.back()->UpdatePredictionCacheMulticlass(p_fmat, out, gid, ngroup))) {
+      auto v_predt = VectorView<float>{out, static_cast<size_t>(gid)};
+      if (!(updaters_.size() > 0 && predt->predictions.Size() > 0 &&
+            num_new_trees == 1 &&
+            updaters_.back()->UpdatePredictionCache(p_fmat, v_predt))) {
         update_predict = false;
       }
     }
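With the prediction cache wrapped in a row-major [num_row x ngroup] view, both branches of DoBoost can call one UpdatePredictionCache(p_fmat, VectorView<float>) overload: the single-output path passes column 0, the multi-class path passes column gid, and update_predict records whether every group's updater managed to refresh its column in place. A rough CPU-only sketch of that flow; Updater, Column, and ColumnOf are hypothetical stand-ins, not the real TreeUpdater interface:

// Rough sketch of the unified per-column prediction-cache update in DoBoost.
// Updater, Column, and ColumnOf are hypothetical illustrations.
#include <cstddef>
#include <iostream>
#include <vector>

struct Column {
  float* data;
  std::size_t size;
  std::size_t stride;
  float& operator[](std::size_t i) const { return data[i * stride]; }
};

// Column `g` of a row-major [n_rows x n_groups] cache.
Column ColumnOf(std::vector<float>* cache, std::size_t n_groups, std::size_t g) {
  return {cache->data() + g, cache->size() / n_groups, n_groups};
}

struct Updater {
  // Returns true if it could refresh the cached predictions for this column.
  bool UpdatePredictionCache(Column out) const {
    for (std::size_t i = 0; i < out.size; ++i) {
      out[i] += 0.1f;  // stand-in for adding the new tree's leaf values
    }
    return true;
  }
};

int main() {
  std::size_t n_rows = 4, n_groups = 3;
  std::vector<float> cache(n_rows * n_groups, 0.5f);
  Updater updater;

  bool update_predict = true;
  for (std::size_t gid = 0; gid < n_groups; ++gid) {
    // One boosting round per group; each updater only touches its own column.
    if (!updater.UpdatePredictionCache(ColumnOf(&cache, n_groups, gid))) {
      update_predict = false;  // fall back to a full re-prediction later
    }
  }
  std::cout << "cache refreshed in place: " << std::boolalpha << update_predict << "\n";
  return 0;
}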
@@ -2,10 +2,28 @@
  * Copyright 2021 by Contributors
  */
 #include "xgboost/span.h"
+#include "xgboost/generic_parameters.h"
+#include "xgboost/linalg.h"
 #include "../common/device_helpers.cuh"
 
 namespace xgboost {
 namespace gbm {
 
+void GPUCopyGradient(HostDeviceVector<GradientPair> const *in_gpair,
+                     bst_group_t n_groups, bst_group_t group_id,
+                     HostDeviceVector<GradientPair> *out_gpair) {
+  MatrixView<GradientPair const> in{
+      in_gpair,
+      {n_groups, 1ul},
+      {in_gpair->Size() / n_groups, static_cast<size_t>(n_groups)},
+      in_gpair->DeviceIdx()};
+  auto v_in = VectorView<GradientPair const>{in, group_id};
+  out_gpair->Resize(v_in.Size());
+  auto d_out = out_gpair->DeviceSpan();
+  dh::LaunchN(dh::CurrentDevice(), v_in.Size(),
+              [=] __device__(size_t i) { d_out[i] = v_in[i]; });
+}
+
 void GPUDartPredictInc(common::Span<float> out_predts,
                        common::Span<float> predts, float tree_w, size_t n_rows,
                        bst_group_t n_groups, bst_group_t group) {
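The device lambda above copies one group's gradients through the strided VectorView, so d_out[i] = v_in[i] performs the same index mapping as the host branch of CopyGradient (tmp_h[i] = gpair_h[i * n_groups + group_id]). A CPU-only sketch of that equivalence, with launch_n as an illustrative stand-in for dh::LaunchN:

// CPU-only sketch: the strided-view gather done by GPUCopyGradient's kernel is
// the same index mapping as the host-side loop in CopyGradient.
// launch_n is an illustrative stand-in for dh::LaunchN.
#include <cassert>
#include <cstddef>
#include <vector>

template <typename Fn>
void launch_n(std::size_t n, Fn&& fn) {
  for (std::size_t i = 0; i < n; ++i) {  // serial here; a parallel grid on GPU
    fn(i);
  }
}

int main() {
  std::size_t n_rows = 5, n_groups = 3, group_id = 2;
  std::vector<float> gpair(n_rows * n_groups);
  for (std::size_t i = 0; i < gpair.size(); ++i) gpair[i] = static_cast<float>(i);

  // Treat the interleaved buffer as [n_rows x n_groups] and copy one column.
  std::vector<float> out(n_rows);
  launch_n(n_rows, [&](std::size_t i) { out[i] = gpair[i * n_groups + group_id]; });

  for (std::size_t i = 0; i < n_rows; ++i) {
    assert(out[i] == gpair[i * n_groups + group_id]);
  }
  return 0;
}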