Move thread local entry into Learner. (#5396)

* Move thread local entry into Learner.

This is an attempt to workaround CUDA context issue in static variable, where
the CUDA context can be released before device vector.

* Add PredictionEntry to thread local entry.

This eliminates one copy of prediction vector.

* Don't define CUDA C API in a namespace.
This commit is contained in:
Jiaming Yuan
2020-03-07 15:37:39 +08:00
committed by GitHub
parent 1ba6706167
commit 0dd97c206b
6 changed files with 87 additions and 63 deletions

View File

@@ -105,6 +105,17 @@ class Transform {
return Span<T const> {_vec->ConstHostPointer(),
static_cast<typename Span<T>::index_type>(_vec->Size())};
}
// Recursive sync host
template <typename T>
void SyncHost(const HostDeviceVector<T> *_vector) const {
_vector->ConstHostPointer();
}
template <typename Head, typename... Rest>
void SyncHost(const HostDeviceVector<Head> *_vector,
const HostDeviceVector<Rest> *... _vectors) const {
_vector->ConstHostPointer();
SyncHost(_vectors...);
}
// Recursive unpack for Shard.
template <typename T>
void UnpackShard(int device, const HostDeviceVector<T> *vector) const {
@@ -154,6 +165,7 @@ class Transform {
void LaunchCPU(Functor func, HDV*... vectors) const {
omp_ulong end = static_cast<omp_ulong>(*(range_.end()));
dmlc::OMPException omp_exc;
SyncHost(vectors...);
#pragma omp parallel for schedule(static)
for (omp_ulong idx = 0; idx < end; ++idx) {
omp_exc.Run(func, idx, UnpackHDV(vectors)...);