From 3ee725e3bb345e31793ec84551911a89f546d30e Mon Sep 17 00:00:00 2001
From: Rory Mitchell <r.a.mitchell.nz@gmail.com>
Date: Thu, 17 May 2018 10:59:22 +1200
Subject: [PATCH] Add cuda forwards compatibility (#3316)

---
 CMakeLists.txt                          | 13 +++++--------
 cmake/Utils.cmake                       | 15 +++++++++++++++
 tests/cpp/common/test_device_helpers.cu | 25 -------------------------
 3 files changed, 20 insertions(+), 33 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 53ddae1c0..39811f10b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,8 +14,8 @@ option(USE_NCCL "Build using NCCL for multi-GPU. Also requires USE_CUDA")
 option(JVM_BINDINGS "Build JVM bindings" OFF)
 option(GOOGLE_TEST "Build google tests" OFF)
 option(R_LIB "Build shared library for R package" OFF)
-set(GPU_COMPUTE_VER 35;50;52;60;61 CACHE STRING
-  "Space separated list of compute versions to be built against")
+set(GPU_COMPUTE_VER "" CACHE STRING
+  "Semicolon separated list of compute versions to be built against, e.g. '35;61'")
 
 # Deprecation warning
 if(PLUGIN_UPDATER_GPU)
@@ -122,16 +122,13 @@ if(USE_CUDA)
     add_definitions(-DXGBOOST_USE_NCCL)
   endif()
 
-  if((CUDA_VERSION_MAJOR EQUAL 9) OR (CUDA_VERSION_MAJOR GREATER 9))
-    message("CUDA 9.0 detected, adding Volta compute capability (7.0).")
-    set(GPU_COMPUTE_VER "${GPU_COMPUTE_VER};70")
-  endif()
-  
   set(GENCODE_FLAGS "")
   format_gencode_flags("${GPU_COMPUTE_VER}" GENCODE_FLAGS)
+  message("cuda architecture flags: ${GENCODE_FLAGS}")
+
   set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};--expt-extended-lambda;--expt-relaxed-constexpr;${GENCODE_FLAGS};-lineinfo;")
   if(NOT MSVC)
-    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC; -std=c++11")
+    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC; -Xcompiler -Werror; -std=c++11")
   endif()
 
   if(USE_NCCL)
diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake
index 7919c1959..c0fcb9391 100644
--- a/cmake/Utils.cmake
+++ b/cmake/Utils.cmake
@@ -54,10 +54,25 @@ function(set_default_configuration_release)
 	endif()
 endfunction(set_default_configuration_release)
 
+# Generate nvcc -gencode flags for a space or semicolon separated list of
+# compute versions, plus PTX for the last one listed (forwards compatibility)
 function(format_gencode_flags flags out)
+  # Normalise the documented space separated form ("35 61") into a CMake list
+  string(STRIP "${flags}" flags)
+  string(REGEX REPLACE " +" ";" flags "${flags}")
+  if(NOT flags)  # nothing requested: fall back to the default architectures
+    set(flags "35;50;52;60;61")
+    if((CUDA_VERSION_MAJOR EQUAL 9) OR (CUDA_VERSION_MAJOR GREATER 9))
+      list(APPEND flags 70)
+    endif()
+  endif()
+  # Generate SASS
   foreach(ver ${flags})
     set(${out} "${${out}}-gencode arch=compute_${ver},code=sm_${ver};")
   endforeach()
+  # Generate PTX for last architecture
+  list(GET flags -1 ver)
+  set(${out} "${${out}}-gencode arch=compute_${ver},code=compute_${ver};")
   set(${out} "${${out}}" PARENT_SCOPE)
 endfunction(format_gencode_flags flags)
 
diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu
index 64146b2b3..cb9bc1d46 100644
--- a/tests/cpp/common/test_device_helpers.cu
+++ b/tests/cpp/common/test_device_helpers.cu
@@ -25,31 +25,6 @@ void CreateTestData(xgboost::bst_uint num_rows, int max_row_size,
   }
 }
 
-void SpeedTest() {
-  int num_rows = 1000000;
-  int max_row_size = 100;
-  dh::CubMemory temp_memory;
-  thrust::host_vector<int> h_row_ptr;
-  thrust::host_vector<xgboost::bst_uint> h_rows;
-  CreateTestData(num_rows, max_row_size, &h_row_ptr, &h_rows);
-  thrust::device_vector<int> row_ptr = h_row_ptr;
-  thrust::device_vector<int> output_row(h_rows.size());
-  auto d_output_row = output_row.data();
-
-  xgboost::common::Timer t;
-  dh::TransformLbs(
-      0, &temp_memory, h_rows.size(), dh::Raw(row_ptr), row_ptr.size() - 1,
-      false,
-      [=] __device__(size_t idx, size_t ridx) { d_output_row[idx] = ridx; });
-
-  dh::safe_cuda(cudaDeviceSynchronize());
-  double time = t.ElapsedSeconds();
-  const int mb_size = 1048576;
-  size_t size = (sizeof(int) * h_rows.size()) / mb_size;
-  printf("size: %llumb, time: %fs, bandwidth: %fmb/s\n", size, time,
-         size / time);
-}
-
 void TestLbs() {
   srand(17);
   dh::CubMemory temp_memory;