Move thread local entry into Learner. (#5396)

* Move thread local entry into Learner. This is an attempt to work around a CUDA context issue with static variables, where the CUDA context can be released before the device vector.
* Add PredictionEntry to thread local entry. This eliminates one copy of the prediction vector.
* Don't define CUDA C API in a namespace.
2020-03-07 15:37:39 +08:00
parent 1ba6706167
commit 0dd97c206b
6 changed files with 87 additions and 63 deletions
--- a/src/common/transform.h
+++ b/src/common/transform.h
@@ -105,6 +105,17 @@ class Transform {
      return Span<T const> {_vec->ConstHostPointer(),
            static_cast<typename Span<T>::index_type>(_vec->Size())};
    }
+    // Recursive host sync: calls ConstHostPointer() on each vector; run by LaunchCPU before its loop.
+    template <typename T>
+    void SyncHost(const HostDeviceVector<T> *_vector) const {
+      _vector->ConstHostPointer();  // base case: one vector left; the returned pointer is unused
+    }
+    template <typename Head, typename... Rest>
+    void SyncHost(const HostDeviceVector<Head> *_vector,
+                  const HostDeviceVector<Rest> *... _vectors) const {
+      _vector->ConstHostPointer();  // head; presumably syncs data to host -- TODO confirm
+      SyncHost(_vectors...);  // recurse on the tail until the single-vector overload applies
+    }
    // Recursive unpack for Shard.
    template <typename T>
    void UnpackShard(int device, const HostDeviceVector<T> *vector) const {
@@ -154,6 +165,7 @@ class Transform {
    void LaunchCPU(Functor func, HDV*... vectors) const {
      omp_ulong end = static_cast<omp_ulong>(*(range_.end()));
      dmlc::OMPException omp_exc;
+      SyncHost(vectors...);
 #pragma omp parallel for schedule(static)
      for (omp_ulong idx = 0; idx < end; ++idx) {
        omp_exc.Run(func, idx, UnpackHDV(vectors)...);