/**
 * Copyright 2022-2023 by XGBoost Contributors
 */
#include <thrust/find.h>                           // thrust::find_if
#include <thrust/iterator/counting_iterator.h>     // thrust::make_counting_iterator
#include <thrust/iterator/permutation_iterator.h>  // thrust::make_permutation_iterator
#include <thrust/scan.h>                           // thrust::inclusive_scan
#include <thrust/sort.h>                           // thrust::stable_sort_by_key

#include <cstdint>  // std::int32_t
#include <vector>   // std::vector

#include <cub/cub.cuh>  // NOLINT

#include "../common/cuda_context.cuh"  // CUDAContext
#include "../common/device_helpers.cuh"
#include "../common/stats.cuh"
#include "adaptive.h"
#include "xgboost/context.h"

namespace xgboost {
namespace obj {
namespace detail {
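// Build a CSR-like encoding of the leaf assignment produced by the tree updater:
// `p_ridx` receives the row indices sorted by leaf, `p_nptr` the segment offsets
// into it, and `p_nidx` the leaf (node) index owning each segment.  Rows with a
// negative position (rows the updater left out, e.g. via sampling) are excluded.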
void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> position,
                          dh::device_vector<size_t>* p_ridx, HostDeviceVector<size_t>* p_nptr,
                          HostDeviceVector<bst_node_t>* p_nidx, RegTree const& tree) {
  // copy position to buffer
  dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
  auto cuctx = ctx->CUDACtx();
  size_t n_samples = position.size();
  dh::device_vector<bst_node_t> sorted_position(position.size());
  dh::safe_cuda(cudaMemcpyAsync(sorted_position.data().get(), position.data(),
                                position.size_bytes(), cudaMemcpyDeviceToDevice, cuctx->Stream()));

  p_ridx->resize(position.size());
  dh::Iota(dh::ToSpan(*p_ridx));
  // sort row index according to node index
  thrust::stable_sort_by_key(cuctx->TP(), sorted_position.begin(),
                             sorted_position.begin() + n_samples, p_ridx->begin());
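  // Rows with a negative position sort to the front, so the first non-negative
  // entry marks the beginning of the rows that actually landed in a leaf.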
  size_t beg_pos =
      thrust::find_if(cuctx->CTP(), sorted_position.cbegin(), sorted_position.cend(),
                      [] XGBOOST_DEVICE(bst_node_t nidx) { return nidx >= 0; }) -
      sorted_position.cbegin();
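  // Every row was left out: report the leaves only and leave `p_nptr` empty for
  // the caller to detect.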
  if (beg_pos == sorted_position.size()) {
    auto& leaf = p_nidx->HostVector();
    tree.WalkTree([&](bst_node_t nidx) {
      if (tree[nidx].IsLeaf()) {
        leaf.push_back(nidx);
      }
      return true;
    });
    return;
  }

  size_t n_leaf = tree.GetNumLeaves();
  size_t max_n_unique = n_leaf;

  dh::caching_device_vector<size_t> counts_out(max_n_unique + 1, 0);
  auto d_counts_out = dh::ToSpan(counts_out).subspan(0, max_n_unique);
  auto d_num_runs_out = dh::ToSpan(counts_out).subspan(max_n_unique, 1);
  dh::caching_device_vector<bst_node_t> unique_out(max_n_unique, 0);
  auto d_unique_out = dh::ToSpan(unique_out);

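  // cub follows a two-phase convention: the first call with a null temporary-storage
  // pointer only computes the required buffer size (`nbytes`); the second call does
  // the actual run-length encoding over the sorted node indices.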
  size_t nbytes{0};
  auto begin_it = sorted_position.begin() + beg_pos;
  dh::safe_cuda(cub::DeviceRunLengthEncode::Encode(
      nullptr, nbytes, begin_it, unique_out.data().get(), counts_out.data().get(),
      d_num_runs_out.data(), n_samples - beg_pos, ctx->CUDACtx()->Stream()));
  dh::TemporaryArray<char> temp(nbytes);
  dh::safe_cuda(cub::DeviceRunLengthEncode::Encode(
      temp.data().get(), nbytes, begin_it, unique_out.data().get(), counts_out.data().get(),
      d_num_runs_out.data(), n_samples - beg_pos, ctx->CUDACtx()->Stream()));

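  // Copy the two scalars needed on the host through pinned memory on a separate
  // stream; the event makes the copy stream wait until the encoding above is done,
  // so the copy can overlap with the kernels launched below.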
  dh::PinnedMemory pinned_pool;
  auto pinned = pinned_pool.GetSpan<char>(sizeof(size_t) + sizeof(bst_node_t));
  dh::CUDAStream copy_stream;
  size_t* h_num_runs = reinterpret_cast<size_t*>(pinned.subspan(0, sizeof(size_t)).data());

  dh::CUDAEvent e;
  e.Record(cuctx->Stream());
  copy_stream.View().Wait(e);
  // flag for whether there's ignored position
  bst_node_t* h_first_unique =
      reinterpret_cast<bst_node_t*>(pinned.subspan(sizeof(size_t), sizeof(bst_node_t)).data());
  dh::safe_cuda(cudaMemcpyAsync(h_num_runs, d_num_runs_out.data(), sizeof(size_t),
                                cudaMemcpyDeviceToHost, copy_stream.View()));
  dh::safe_cuda(cudaMemcpyAsync(h_first_unique, d_unique_out.data(), sizeof(bst_node_t),
                                cudaMemcpyDeviceToHost, copy_stream.View()));

  /**
   * copy node index (leaf index)
   */
  auto& nidx = *p_nidx;
  auto& nptr = *p_nptr;
  nidx.SetDevice(ctx->gpu_id);
  nidx.Resize(n_leaf);
  auto d_node_idx = nidx.DeviceSpan();

  nptr.SetDevice(ctx->gpu_id);
  nptr.Resize(n_leaf + 1, 0);
  auto d_node_ptr = nptr.DeviceSpan();

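  // Scatter the RLE result into the output: one thread per leaf writes the leaf's
  // node index and its sample count; the counts are turned into offsets by the
  // inclusive scan that follows.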
  dh::LaunchN(n_leaf, [=] XGBOOST_DEVICE(size_t i) {
    if (i >= d_num_runs_out[0]) {
      // d_num_runs_out <= max_n_unique
      // this omits all the leaves that are empty. A leaf can be empty when there's
      // missing data, which can be caused by sparse input and distributed training.
      return;
    }
    d_node_idx[i] = d_unique_out[i];
    d_node_ptr[i + 1] = d_counts_out[i];
    if (i == 0) {
      d_node_ptr[0] = beg_pos;
    }
  });
  // turn the per-leaf counts into cumulative segment offsets
  thrust::inclusive_scan(cuctx->CTP(), dh::tbegin(d_node_ptr), dh::tend(d_node_ptr),
                         dh::tbegin(d_node_ptr));
  copy_stream.View().Sync();  // make h_num_runs valid on the host
  CHECK_GT(*h_num_runs, 0);
  CHECK_LE(*h_num_runs, n_leaf);

  if (*h_num_runs < n_leaf) {
    // shrink to omit the leaves that received no sample.
    nptr.Resize(*h_num_runs + 1);
    nidx.Resize(*h_num_runs);

    std::vector<bst_node_t> leaves;
    tree.WalkTree([&](bst_node_t nidx) {
      if (tree[nidx].IsLeaf()) {
        leaves.push_back(nidx);
      }
      return true;
    });
    CHECK_EQ(leaves.size(), n_leaf);
    // Fill all the leaves that don't have any sample. This is hacky and inefficient. An
    // alternative is to leave the objective to handle missing leaf, which is more messy
    // as we need to take other distributed workers into account.
    auto& h_nidx = nidx.HostVector();
    auto& h_nptr = nptr.HostVector();
    FillMissingLeaf(leaves, &h_nidx, &h_nptr);
    // copy the modified host vectors back to the device
    nidx.DevicePointer();
    nptr.DevicePointer();
  }
  CHECK_EQ(nidx.Size(), n_leaf);
  CHECK_EQ(nptr.Size(), n_leaf + 1);
}

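// Update the leaf values of `p_tree` in place, replacing each leaf's value with the
// alpha-quantile of the residuals `label - prediction` of the rows that fall into
// that leaf (weighted when sample weights are present).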
void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> position,
                          std::int32_t group_idx, MetaInfo const& info,
                          HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree) {
  dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
  dh::device_vector<size_t> ridx;
  HostDeviceVector<size_t> nptr;
  HostDeviceVector<bst_node_t> nidx;

  EncodeTreeLeafDevice(ctx, position, &ridx, &nptr, &nidx, *p_tree);

  if (nptr.Empty()) {
    // no valid sample on this worker; let UpdateLeafValues handle the empty result,
    // then return early so the quantile computation below doesn't index an empty nptr.
    std::vector<float> quantiles;
    UpdateLeafValues(&quantiles, nidx.ConstHostVector(), p_tree);
    return;
  }

  HostDeviceVector<float> quantiles;
  predt.SetDevice(ctx->gpu_id);

  auto d_predt = linalg::MakeTensorView(predt.ConstDeviceSpan(),
                                        {info.num_row_, predt.Size() / info.num_row_}, ctx->gpu_id);
  CHECK_LT(group_idx, d_predt.Shape(1));
  auto t_predt = d_predt.Slice(linalg::All(), group_idx);
  auto d_labels = info.labels.View(ctx->gpu_id).Slice(linalg::All(), IdxY(info, group_idx));

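  // Iterate over the residual `y - p` of each row, visited in leaf order through
  // `d_row_index` so that rows belonging to the same leaf form one contiguous
  // segment delimited by `nptr`.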
  auto d_row_index = dh::ToSpan(ridx);
  auto seg_beg = nptr.DevicePointer();
  auto seg_end = seg_beg + nptr.Size();
  auto val_beg = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
                                                  [=] XGBOOST_DEVICE(size_t i) {
                                                    float p = t_predt(d_row_index[i]);
                                                    auto y = d_labels(d_row_index[i]);
                                                    return y - p;
                                                  });
  CHECK_EQ(d_labels.Shape(0), position.size());
  auto val_end = val_beg + d_labels.Shape(0);
  CHECK_EQ(nidx.Size() + 1, nptr.Size());
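  // Compute the alpha-quantile of the residuals within each leaf segment,
  // weighted by the sample weights when they are present.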
  if (info.weights_.Empty()) {
    common::SegmentedQuantile(ctx, alpha, seg_beg, seg_end, val_beg, val_end, &quantiles);
  } else {
    info.weights_.SetDevice(ctx->gpu_id);
    auto d_weights = info.weights_.ConstDeviceSpan();
    CHECK_EQ(d_weights.size(), d_row_index.size());
    auto w_it = thrust::make_permutation_iterator(dh::tcbegin(d_weights), dh::tcbegin(d_row_index));
    common::SegmentedWeightedQuantile(ctx, alpha, seg_beg, seg_end, val_beg, val_end, w_it,
                                      w_it + d_weights.size(), &quantiles);
  }

  UpdateLeafValues(&quantiles.HostVector(), nidx.ConstHostVector(), p_tree);
}
}  // namespace detail
}  // namespace obj
}  // namespace xgboost