- CUDA implementation.
- Extract the broadcasting logic; we will need the context parameter after revamping the collective implementation.
- Some changes to the event loop to fix a deadlock in CI.
- Move argsort into algorithm.cuh, add CUDA stream support.
/**
 * Copyright 2023, XGBoost Contributors
 */
#include <thrust/shuffle.h>  // for shuffle

#include <memory>  // for shared_ptr

#include "algorithm.cuh"       // for ArgSort
#include "cuda_context.cuh"    // for CUDAContext
#include "device_helpers.cuh"
#include "random.h"
#include "xgboost/base.h"                // for bst_feature_t
#include "xgboost/context.h"             // for Context
#include "xgboost/host_device_vector.h"  // for HostDeviceVector

namespace xgboost::common::cuda_impl {
// GPU implementation for sampling without replacement, see the CPU version for references.
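// The sampling keys below follow the exponential-keys scheme: item i draws
// u_i ~ U(0, 1) and receives the key k_i = log(u_i) / w_i.  Keeping the
// |results| items with the largest keys is equivalent to ranking u_i^{1/w_i},
// i.e. the A-ES algorithm of Efraimidis & Spirakis (2006) for weighted
// sampling without replacement.  (Attribution inferred from the key formula;
// this file only points at the CPU version for references.)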
void WeightedSamplingWithoutReplacement(Context const *ctx, common::Span<bst_feature_t const> array,
                                        common::Span<float const> weights,
                                        common::Span<bst_feature_t> results,
                                        HostDeviceVector<bst_feature_t> *sorted_idx,
                                        GlobalRandomEngine *grng) {
  CUDAContext const *cuctx = ctx->CUDACtx();
  CHECK_EQ(array.size(), weights.size());
  // Sampling keys
  dh::caching_device_vector<float> keys(weights.size());
  auto d_keys = dh::ToSpan(keys);

  auto seed = (*grng)();
  constexpr auto kEps = kRtEps;  // avoid CUDA compilation error
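  // All elements share a single engine stream: each lambda instance seeds the
  // engine identically and discards i states, so element i reads the i-th
  // variate of the common stream.  One draw per element keeps the uniforms
  // distinct without storing per-thread RNG state.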
  thrust::for_each_n(cuctx->CTP(), thrust::make_counting_iterator(0ul), array.size(),
                     [=] XGBOOST_DEVICE(std::size_t i) {
                       thrust::default_random_engine rng;
                       rng.seed(seed);
                       rng.discard(i);
                       thrust::uniform_real_distribution<float> dist;

                       auto w = std::max(weights[i], kEps);
                       auto u = dist(rng);
                       auto k = std::log(u) / w;
                       d_keys[i] = k;
                     });
  // Allocate buffer for sorted index.
  auto d_idx = dh::LazyResize(ctx, sorted_idx, keys.size());

  ArgSort<false>(ctx, d_keys, d_idx);
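  // Assuming ArgSort<false> sorts in descending order, d_idx now ranks the
  // items from largest to smallest key; the head of the order is the sample.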

  // Filter the result according to sorted index.
  auto it = thrust::make_permutation_iterator(dh::tbegin(array), dh::tbegin(d_idx));
  // |array| == |weights| == |keys| == |sorted_idx| >= |results|
  for (auto size : {array.size(), weights.size(), keys.size()}) {
    CHECK_EQ(size, d_idx.size());
  }
  CHECK_GE(array.size(), results.size());
  thrust::copy_n(cuctx->CTP(), it, results.size(), dh::tbegin(results));
}

void SampleFeature(Context const *ctx, bst_feature_t n_features,
                   std::shared_ptr<HostDeviceVector<bst_feature_t>> p_features,
                   std::shared_ptr<HostDeviceVector<bst_feature_t>> p_new_features,
                   HostDeviceVector<float> const &feature_weights,
                   HostDeviceVector<float> *weight_buffer,
                   HostDeviceVector<bst_feature_t> *idx_buffer, GlobalRandomEngine *grng) {
  CUDAContext const *cuctx = ctx->CUDACtx();
  auto &new_features = *p_new_features;
  new_features.SetDevice(ctx->Device());
  p_features->SetDevice(ctx->Device());
  CHECK_LE(n_features, p_features->Size());

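  // Two paths: weighted sampling without replacement when feature weights are
  // provided, otherwise a uniform shuffle-and-truncate.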
  if (!feature_weights.Empty()) {
    CHECK_LE(p_features->Size(), feature_weights.Size());
    idx_buffer->SetDevice(ctx->Device());
    feature_weights.SetDevice(ctx->Device());

    auto d_old_features = p_features->DeviceSpan();
    auto d_weight_buffer = dh::LazyResize(ctx, weight_buffer, d_old_features.size());
    // Filter weights according to the existing feature index.
    auto d_feature_weight = feature_weights.ConstDeviceSpan();
    auto it = thrust::make_permutation_iterator(dh::tcbegin(d_feature_weight),
                                                dh::tcbegin(d_old_features));
    thrust::copy_n(cuctx->CTP(), it, d_old_features.size(), dh::tbegin(d_weight_buffer));
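    // d_weight_buffer[i] now holds the weight of feature d_old_features[i];
    // feature_weights is indexed by the global feature index.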
    new_features.Resize(n_features);
    WeightedSamplingWithoutReplacement(ctx, d_old_features, d_weight_buffer,
                                       new_features.DeviceSpan(), idx_buffer, grng);
  } else {
    new_features.Resize(p_features->Size());
    new_features.Copy(*p_features);
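    // Shuffle a copy of the full feature set and keep the first n_features
    // entries: equivalent to a uniform draw without replacement.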
    auto d_feat = new_features.DeviceSpan();
    thrust::default_random_engine rng;
    rng.seed((*grng)());
    thrust::shuffle(cuctx->CTP(), dh::tbegin(d_feat), dh::tend(d_feat), rng);
    new_features.Resize(n_features);
  }

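  // Keep the sampled features sorted; downstream consumers are assumed to
  // rely on ordered feature indices (e.g. binary search for membership).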
  auto d_new_features = new_features.DeviceSpan();
  thrust::sort(cuctx->CTP(), dh::tbegin(d_new_features), dh::tend(d_new_features));
}

void InitFeatureSet(Context const *ctx,
                    std::shared_ptr<HostDeviceVector<bst_feature_t>> p_features) {
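  // Fill the feature set with the identity permutation {0, 1, ..., n - 1}.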
  CUDAContext const *cuctx = ctx->CUDACtx();
  auto d_features = p_features->DeviceSpan();
  thrust::sequence(cuctx->CTP(), dh::tbegin(d_features), dh::tend(d_features), 0);
}
}  // namespace xgboost::common::cuda_impl