Implement column sampler in CUDA. (#9785)

- CUDA implementation.
- Extract the broadcasting logic; we will need the context parameter after revamping the collective implementation.
- Some changes to the event loop for fixing a deadlock in CI.
- Move argsort into algorithms.cuh, add support for cuda stream.
This commit is contained in:
Jiaming Yuan
2023-11-17 04:29:08 +08:00
committed by GitHub
parent 178cfe70a8
commit fedd9674c8
20 changed files with 447 additions and 232 deletions

106
src/common/random.cu Normal file
View File

@@ -0,0 +1,106 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#include <thrust/shuffle.h> // for shuffle
#include <memory> // for shared_ptr
#include "algorithm.cuh" // for ArgSort
#include "cuda_context.cuh" // for CUDAContext
#include "device_helpers.cuh"
#include "random.h"
#include "xgboost/base.h" // for bst_feature_t
#include "xgboost/context.h" // for Context
#include "xgboost/host_device_vector.h" // for HostDeviceVector
namespace xgboost::common::cuda_impl {
/**
 * @brief GPU implementation of weighted sampling without replacement, see the CPU
 *        version for references.
 *
 * A key `log(u) / max(w, kRtEps)` is computed for every candidate, where `u` comes from a
 * thrust uniform real distribution and `w` is the candidate's weight.  The keys are
 * arg-sorted with `ArgSort<false>` (presumably descending, i.e. largest key first --
 * confirm against algorithm.cuh) and the first `results.size()` candidates in that order
 * are written to `results`.
 *
 * @param ctx        Context providing the CUDA execution policy.
 * @param array      Candidates to sample from.
 * @param weights    Weight of each candidate, must have the same length as `array`.
 * @param results    Output; its length is the number of samples and must not exceed
 *                   the length of `array`.
 * @param sorted_idx Buffer that receives the arg-sorted indices.
 * @param grng       Host random engine, invoked once to obtain the seed used on device.
 */
void WeightedSamplingWithoutReplacement(Context const *ctx, common::Span<bst_feature_t const> array,
                                        common::Span<float const> weights,
                                        common::Span<bst_feature_t> results,
                                        HostDeviceVector<bst_feature_t> *sorted_idx,
                                        GlobalRandomEngine *grng) {
  CUDAContext const *cuda_ctx = ctx->CUDACtx();
  CHECK_EQ(array.size(), weights.size());

  // Storage for one sampling key per candidate.
  dh::caching_device_vector<float> key_storage(weights.size());
  auto d_keys = dh::ToSpan(key_storage);
  auto seed = (*grng)();
  // Local constexpr copy of kRtEps; the original code notes this avoids a CUDA
  // compilation error.
  constexpr auto kMinWeight = kRtEps;

  // Functor producing the key of the i-th candidate.  Every invocation seeds its own
  // engine with the same seed and discards `i` values before drawing.
  auto make_key = [=] XGBOOST_DEVICE(std::size_t i) {
    thrust::default_random_engine engine;
    engine.seed(seed);
    engine.discard(i);
    thrust::uniform_real_distribution<float> uniform;
    // Weights are clamped from below by kMinWeight before being used as a divisor.
    auto const clamped_weight = std::max(weights[i], kMinWeight);
    auto const draw = uniform(engine);
    d_keys[i] = std::log(draw) / clamped_weight;
  };
  thrust::for_each_n(cuda_ctx->CTP(), thrust::make_counting_iterator(0ul), array.size(),
                     make_key);

  // Obtain the buffer for the sorted index and arg-sort the keys into it.
  auto d_idx = dh::LazyResize(ctx, sorted_idx, key_storage.size());
  ArgSort<false>(ctx, d_keys, d_idx);

  // View of `array` reordered by the sorted index.
  auto sampled_it = thrust::make_permutation_iterator(dh::tbegin(array), dh::tbegin(d_idx));

  // |array| == |weights| == |keys| == |sorted_idx| >= |results|
  for (auto size : {array.size(), weights.size(), key_storage.size()}) {
    CHECK_EQ(size, d_idx.size());
  }
  CHECK_GE(array.size(), results.size());

  // Keep only the leading `results.size()` candidates.
  thrust::copy_n(cuda_ctx->CTP(), sampled_it, results.size(), dh::tbegin(results));
}
/**
 * @brief Sample `n_features` feature indices from `p_features` into `p_new_features`
 *        on the device of `ctx`.
 *
 * If `feature_weights` is non-empty, the weight of every existing feature is gathered
 * and `WeightedSamplingWithoutReplacement` picks the new features.  Otherwise the
 * existing features are copied, shuffled and truncated to `n_features`.  In both cases
 * the resulting indices are sorted with `thrust::sort` before returning.
 *
 * @param ctx             Context providing the device and the CUDA execution policy.
 * @param n_features      Number of features to sample, must not exceed the size of
 *                        `p_features`.
 * @param p_features      Existing feature indices; only read by this function.
 * @param p_new_features  Output vector, resized to `n_features`.
 * @param feature_weights Optional weights indexed by feature index; when non-empty it
 *                        must be at least as long as `p_features`.
 * @param weight_buffer   Scratch buffer handed to `dh::LazyResize` to hold the gathered
 *                        weights.
 * @param idx_buffer      Scratch buffer forwarded to `WeightedSamplingWithoutReplacement`
 *                        as its sorted-index buffer.
 * @param grng            Host random engine, invoked once to seed the device-side sampling.
 */
void SampleFeature(Context const *ctx, bst_feature_t n_features,
                   std::shared_ptr<HostDeviceVector<bst_feature_t>> p_features,
                   std::shared_ptr<HostDeviceVector<bst_feature_t>> p_new_features,
                   HostDeviceVector<float> const &feature_weights,
                   HostDeviceVector<float> *weight_buffer,
                   HostDeviceVector<bst_feature_t> *idx_buffer, GlobalRandomEngine *grng) {
  CUDAContext const *cuctx = ctx->CUDACtx();
  auto &new_features = *p_new_features;
  new_features.SetDevice(ctx->Device());
  p_features->SetDevice(ctx->Device());
  CHECK_LE(n_features, p_features->Size());

  if (!feature_weights.Empty()) {
    CHECK_LE(p_features->Size(), feature_weights.Size());
    idx_buffer->SetDevice(ctx->Device());
    feature_weights.SetDevice(ctx->Device());

    // The existing features are only read here, so request a read-only view instead of
    // a mutable one.  NOTE(review): with HostDeviceVector a mutable DeviceSpan() is
    // expected to take write access and invalidate the host copy -- see
    // host_device_vector.h.
    auto d_old_features = p_features->ConstDeviceSpan();
    auto d_weight_buffer = dh::LazyResize(ctx, weight_buffer, d_old_features.size());
    // Filter weights according to the existing feature index.
    auto d_feature_weight = feature_weights.ConstDeviceSpan();
    auto it = thrust::make_permutation_iterator(dh::tcbegin(d_feature_weight),
                                                dh::tcbegin(d_old_features));
    thrust::copy_n(cuctx->CTP(), it, d_old_features.size(), dh::tbegin(d_weight_buffer));

    // The size of the output span determines how many features are sampled.
    new_features.Resize(n_features);
    WeightedSamplingWithoutReplacement(ctx, d_old_features, d_weight_buffer,
                                       new_features.DeviceSpan(), idx_buffer, grng);
  } else {
    // Unweighted: shuffle a full copy of the existing features, then keep the leading
    // `n_features` entries.
    new_features.Resize(p_features->Size());
    new_features.Copy(*p_features);
    auto d_feat = new_features.DeviceSpan();
    thrust::default_random_engine rng;
    rng.seed((*grng)());
    thrust::shuffle(cuctx->CTP(), dh::tbegin(d_feat), dh::tend(d_feat), rng);
    new_features.Resize(n_features);
  }

  // Re-acquire the span after the resize and sort the sampled indices.
  auto d_new_features = new_features.DeviceSpan();
  thrust::sort(cuctx->CTP(), dh::tbegin(d_new_features), dh::tend(d_new_features));
}
/**
 * @brief Fill the feature set with consecutive indices starting from 0, on device.
 *
 * @param ctx        Context providing the device and the CUDA execution policy.
 * @param p_features Feature index vector, overwritten in place; its current size
 *                   determines how many indices are written.
 */
void InitFeatureSet(Context const *ctx,
                    std::shared_ptr<HostDeviceVector<bst_feature_t>> p_features) {
  CUDAContext const *cuctx = ctx->CUDACtx();
  // Bind the vector to the context's device before asking for a device span, the same
  // way SampleFeature does for its inputs, so this function does not rely on the caller
  // having done it.
  p_features->SetDevice(ctx->Device());
  auto d_features = p_features->DeviceSpan();
  thrust::sequence(cuctx->CTP(), dh::tbegin(d_features), dh::tend(d_features), 0);
}
} // namespace xgboost::common::cuda_impl