/**
 * Copyright 2022-2023 by XGBoost Contributors
 */
#include <thrust/sort.h>

#include <cstdint>     // std::int32_t
#include <cub/cub.cuh> // NOLINT

#include "../common/cuda_context.cuh"  // CUDAContext
#include "../common/device_helpers.cuh"
#include "../common/stats.cuh"
#include "adaptive.h"
#include "xgboost/context.h"

namespace xgboost {
namespace obj {
namespace detail {
/**
 * \brief Group sample rows by the tree leaf they landed in.
 *
 * Sorts row indices by leaf (node) index and run-length-encodes the sorted leaf
 * column so that `[p_nptr[i], p_nptr[i+1])` is the segment of `p_ridx` belonging to
 * leaf `p_nidx[i]`.  Rows with negative position (sampled-out rows) are skipped via
 * `beg_pos`.  If every position is negative, only `p_nidx` is filled (with all leaf
 * ids from the tree) and `p_nptr` is left empty — callers must handle that case.
 *
 * All device work is issued on the context's stream; a secondary stream is used only
 * to overlap the small device-to-host copies with the leaf-pointer computation.
 */
void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> position,
                          dh::device_vector<size_t>* p_ridx, HostDeviceVector<size_t>* p_nptr,
                          HostDeviceVector<bst_node_t>* p_nidx, RegTree const& tree) {
  // copy position to buffer
  dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
  auto cuctx = ctx->CUDACtx();
  size_t n_samples = position.size();
  dh::device_vector<bst_node_t> sorted_position(position.size());
  dh::safe_cuda(cudaMemcpyAsync(sorted_position.data().get(), position.data(),
                                position.size_bytes(), cudaMemcpyDeviceToDevice,
                                cuctx->Stream()));

  p_ridx->resize(position.size());
  dh::Iota(dh::ToSpan(*p_ridx));
  // sort row index according to node index
  thrust::stable_sort_by_key(cuctx->TP(), sorted_position.begin(),
                             sorted_position.begin() + n_samples, p_ridx->begin());
  // Negative positions mark rows dropped by sampling; after the sort they form a
  // prefix.  beg_pos is the first row that actually belongs to a leaf.
  size_t beg_pos =
      thrust::find_if(cuctx->CTP(), sorted_position.cbegin(), sorted_position.cend(),
                      [] XGBOOST_DEVICE(bst_node_t nidx) { return nidx >= 0; }) -
      sorted_position.cbegin();
  if (beg_pos == sorted_position.size()) {
    // All rows were sampled out: report every leaf of the tree, leave nptr empty.
    auto& leaf = p_nidx->HostVector();
    tree.WalkTree([&](bst_node_t nidx) {
      if (tree[nidx].IsLeaf()) {
        leaf.push_back(nidx);
      }
      return true;
    });
    return;
  }

  size_t n_leaf = tree.GetNumLeaves();
  size_t max_n_unique = n_leaf;

  // One buffer backs both the per-run counts and the single num_runs scalar.
  dh::caching_device_vector<size_t> counts_out(max_n_unique + 1, 0);
  auto d_counts_out = dh::ToSpan(counts_out).subspan(0, max_n_unique);
  auto d_num_runs_out = dh::ToSpan(counts_out).subspan(max_n_unique, 1);
  dh::caching_device_vector<bst_node_t> unique_out(max_n_unique, 0);
  auto d_unique_out = dh::ToSpan(unique_out);

  // CUB two-phase call: first query the workspace size, then run the encode.
  size_t nbytes{0};
  auto begin_it = sorted_position.begin() + beg_pos;
  dh::safe_cuda(cub::DeviceRunLengthEncode::Encode(
      nullptr, nbytes, begin_it, unique_out.data().get(), counts_out.data().get(),
      d_num_runs_out.data(), n_samples - beg_pos, ctx->CUDACtx()->Stream()));
  dh::TemporaryArray<char> temp(nbytes);
  dh::safe_cuda(cub::DeviceRunLengthEncode::Encode(
      temp.data().get(), nbytes, begin_it, unique_out.data().get(), counts_out.data().get(),
      d_num_runs_out.data(), n_samples - beg_pos, ctx->CUDACtx()->Stream()));

  // Copy num_runs (and the first unique node id) back on a side stream that waits on
  // the main stream, so the copy overlaps with the LaunchN/scan below.
  dh::PinnedMemory pinned_pool;
  auto pinned = pinned_pool.GetSpan<char>(sizeof(size_t) + sizeof(bst_node_t));
  dh::CUDAStream copy_stream;
  size_t* h_num_runs = reinterpret_cast<size_t*>(pinned.subspan(0, sizeof(size_t)).data());
  dh::CUDAEvent e;
  e.Record(cuctx->Stream());
  copy_stream.View().Wait(e);
  // flag for whether there's ignored position
  // NOTE(review): h_first_unique is copied back but not read afterwards in this
  // function — kept for parity with the original code.
  bst_node_t* h_first_unique =
      reinterpret_cast<bst_node_t*>(pinned.subspan(sizeof(size_t), sizeof(bst_node_t)).data());
  dh::safe_cuda(cudaMemcpyAsync(h_num_runs, d_num_runs_out.data(), sizeof(size_t),
                                cudaMemcpyDeviceToHost, copy_stream.View()));
  dh::safe_cuda(cudaMemcpyAsync(h_first_unique, d_unique_out.data(), sizeof(bst_node_t),
                                cudaMemcpyDeviceToHost, copy_stream.View()));

  /**
   * copy node index (leaf index)
   */
  auto& nidx = *p_nidx;
  auto& nptr = *p_nptr;
  nidx.SetDevice(ctx->gpu_id);
  nidx.Resize(n_leaf);
  auto d_node_idx = nidx.DeviceSpan();

  nptr.SetDevice(ctx->gpu_id);
  nptr.Resize(n_leaf + 1, 0);
  auto d_node_ptr = nptr.DeviceSpan();

  dh::LaunchN(n_leaf, [=] XGBOOST_DEVICE(size_t i) {
    if (i >= d_num_runs_out[0]) {
      // d_num_runs_out <= max_n_unique
      // this omits all the leaf that are empty. A leaf can be empty when there's
      // missing data, which can be caused by sparse input and distributed training.
      return;
    }
    d_node_idx[i] = d_unique_out[i];
    d_node_ptr[i + 1] = d_counts_out[i];
    if (i == 0) {
      // The first segment starts after the sampled-out (negative position) prefix.
      d_node_ptr[0] = beg_pos;
    }
  });
  // Turn per-leaf counts into CSR-style segment offsets.
  thrust::inclusive_scan(cuctx->CTP(), dh::tbegin(d_node_ptr), dh::tend(d_node_ptr),
                         dh::tbegin(d_node_ptr));
  copy_stream.View().Sync();  // make h_num_runs/h_first_unique visible on host
  CHECK_GT(*h_num_runs, 0);
  CHECK_LE(*h_num_runs, n_leaf);

  if (*h_num_runs < n_leaf) {
    // shrink to omit the sampled nodes.
    nptr.Resize(*h_num_runs + 1);
    nidx.Resize(*h_num_runs);

    std::vector<bst_node_t> leaves;
    tree.WalkTree([&](bst_node_t nidx) {
      if (tree[nidx].IsLeaf()) {
        leaves.push_back(nidx);
      }
      return true;
    });
    CHECK_EQ(leaves.size(), n_leaf);
    // Fill all the leaves that don't have any sample. This is hacky and inefficient. An
    // alternative is to leave the objective to handle missing leaf, which is more messy
    // as we need to take other distributed workers into account.
    auto& h_nidx = nidx.HostVector();
    auto& h_nptr = nptr.HostVector();
    FillMissingLeaf(leaves, &h_nidx, &h_nptr);
    // Touch the device pointers so the host mutation is pushed back to device.
    nidx.DevicePointer();
    nptr.DevicePointer();
  }
  CHECK_EQ(nidx.Size(), n_leaf);
  CHECK_EQ(nptr.Size(), n_leaf + 1);
}

/**
 * \brief Recompute leaf values as (weighted) quantiles of the residual `y - predt`
 *        over the rows assigned to each leaf (GPU implementation).
 *
 * \param ctx        Context holding the target device and streams.
 * \param position   Per-row leaf position from the updater; negative => sampled out.
 * \param group_idx  Output group (class/target column) this tree belongs to.
 * \param info       Meta info providing labels and optional sample weights.
 * \param predt      Current model predictions, shape (num_row, n_groups) row-major.
 * \param alpha      Quantile level in (0, 1).
 * \param p_tree     Tree whose leaf values are updated in place.
 */
void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> position,
                          std::int32_t group_idx, MetaInfo const& info,
                          HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree) {
  dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
  dh::device_vector<size_t> ridx;
  HostDeviceVector<size_t> nptr;
  HostDeviceVector<bst_node_t> nidx;

  EncodeTreeLeafDevice(ctx, position, &ridx, &nptr, &nidx, *p_tree);

  if (nptr.Empty()) {
    // No row reached any leaf (everything was sampled out).  Update with empty
    // quantiles and bail out: the segmented-quantile path below requires non-empty
    // segment pointers, and CHECK_EQ(nidx.Size() + 1, nptr.Size()) would fail.
    std::vector<float> quantiles;
    UpdateLeafValues(&quantiles, nidx.ConstHostVector(), p_tree);
    return;  // bug fix: the original fell through into the quantile computation
  }

  HostDeviceVector<float> quantiles;
  predt.SetDevice(ctx->gpu_id);
  // View predictions as (num_row, n_groups) and slice out this group's column.
  auto d_predt = linalg::MakeTensorView(predt.ConstDeviceSpan(),
                                        {info.num_row_, predt.Size() / info.num_row_},
                                        ctx->gpu_id);
  CHECK_LT(group_idx, d_predt.Shape(1));
  auto t_predt = d_predt.Slice(linalg::All(), group_idx);
  auto d_labels = info.labels.View(ctx->gpu_id).Slice(linalg::All(), IdxY(info, group_idx));

  auto d_row_index = dh::ToSpan(ridx);
  auto seg_beg = nptr.DevicePointer();
  auto seg_end = seg_beg + nptr.Size();
  // Residual y - predt, gathered through the leaf-sorted row index.
  auto val_beg = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
                                                  [=] XGBOOST_DEVICE(size_t i) {
                                                    float p = t_predt(d_row_index[i]);
                                                    auto y = d_labels(d_row_index[i]);
                                                    return y - p;
                                                  });
  CHECK_EQ(d_labels.Shape(0), position.size());
  auto val_end = val_beg + d_labels.Shape(0);
  CHECK_EQ(nidx.Size() + 1, nptr.Size());
  if (info.weights_.Empty()) {
    common::SegmentedQuantile(ctx, alpha, seg_beg, seg_end, val_beg, val_end, &quantiles);
  } else {
    info.weights_.SetDevice(ctx->gpu_id);
    auto d_weights = info.weights_.ConstDeviceSpan();
    CHECK_EQ(d_weights.size(), d_row_index.size());
    // Weights must be gathered through the same permutation as the residuals.
    auto w_it = thrust::make_permutation_iterator(dh::tcbegin(d_weights),
                                                  dh::tcbegin(d_row_index));
    common::SegmentedWeightedQuantile(ctx, alpha, seg_beg, seg_end, val_beg, val_end, w_it,
                                      w_it + d_weights.size(), &quantiles);
  }

  UpdateLeafValues(&quantiles.HostVector(), nidx.ConstHostVector(), p_tree);
}
}  // namespace detail
}  // namespace obj
}  // namespace xgboost