Fix non-openmp build. (#5566)

* Add test to Jenkins. * Fix threading utils tests. * Require thread library.
2020-04-20 12:16:38 +08:00 · 2020-04-20 12:16:38 +08:00 · ccd30e4491
commit ccd30e4491
parent b2827a80e1
9 changed files with 52 additions and 24 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -95,6 +95,8 @@ if (USE_CUDA)
  message(STATUS "CUDA GEN_CODE: ${GEN_CODE}")
 endif (USE_CUDA)

+find_package(Threads REQUIRED)
+
 if (USE_OPENMP)
  if (APPLE)
    # Require CMake 3.16+ on Mac OSX, as previous versions of CMake had trouble locating
@ -128,7 +130,7 @@ foreach(lib rabit rabit_base rabit_empty rabit_mock rabit_mock_static)
  # Explicitly link dmlc to rabit, so that configured header (build_config.h)
  # from dmlc is correctly applied to rabit.
  if (TARGET ${lib})
-    target_link_libraries(${lib} dmlc)
+    target_link_libraries(${lib} dmlc ${CMAKE_THREAD_LIBS_INIT})
  endif (TARGET ${lib})
 endforeach()

@ -138,6 +140,7 @@ if (R_LIB)
 endif (R_LIB)

 # core xgboost
+list(APPEND LINKED_LIBRARIES_PRIVATE Threads::Threads ${CMAKE_THREAD_LIBS_INIT})
 add_subdirectory(${xgboost_SOURCE_DIR}/plugin)
 add_subdirectory(${xgboost_SOURCE_DIR}/src)
 target_link_libraries(objxgboost PUBLIC dmlc)
--- a/19
+++ b/19
@ -63,6 +63,7 @@ pipeline {
          parallel ([
            'build-cpu': { BuildCPU() },
            'build-cpu-rabit-mock': { BuildCPUMock() },
+            'build-cpu-non-omp': { BuildCPUNonOmp() },
            'build-gpu-cuda9.0': { BuildCUDA(cuda_version: '9.0') },
            'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0') },
            'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') },
@ -221,6 +222,22 @@ def BuildCPUMock() {
  }
 }

+def BuildCPUNonOmp() {
+  node('linux && cpu') {
+    unstash name: 'srcs'
+    echo "Build CPU without OpenMP"
+    def container_type = "cpu"
+    def docker_binary = "docker"
+    sh """
+    ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_via_cmake.sh -DUSE_OPENMP=OFF
+    """
+    echo "Running Non-OpenMP C++ test..."
+    sh """
+    ${dockerRun} ${container_type} ${docker_binary} build/testxgboost
+    """
+    deleteDir()
+  }
+}

 def BuildCUDA(args) {
  node('linux && cpu') {
@ -259,7 +276,7 @@ def BuildJVMPackages(args) {
    ${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_jvm_packages.sh ${args.spark_version}
    """
    echo 'Stashing XGBoost4J JAR...'
-    stash name: 'xgboost4j_jar', includes: 'jvm-packages/xgboost4j/target/*.jar,jvm-packages/xgboost4j-spark/target/*.jar,jvm-packages/xgboost4j-example/target/*.jar'
+    stash name: 'xgboost4j_jar', includes: "jvm-packages/xgboost4j/target/*.jar,jvm-packages/xgboost4j-spark/target/*.jar,jvm-packages/xgboost4j-example/target/*.jar"
    deleteDir()
  }
 }
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -88,12 +88,15 @@ if (XGBOOST_BUILTIN_PREFETCH_PRESENT)
    -DXGBOOST_BUILTIN_PREFETCH_PRESENT=1)
 endif (XGBOOST_BUILTIN_PREFETCH_PRESENT)

+find_package(Threads REQUIRED)
+list(APPEND SRC_LIBS Threads::Threads ${CMAKE_THREAD_LIBS_INIT})
+
 if (USE_OPENMP OR USE_CUDA)  # CUDA requires OpenMP
  find_package(OpenMP REQUIRED)
  list(APPEND SRC_LIBS OpenMP::OpenMP_CXX)
-  set(LINKED_LIBRARIES_PRIVATE "${LINKED_LIBRARIES_PRIVATE};${SRC_LIBS}" PARENT_SCOPE)
  target_link_libraries(objxgboost PRIVATE OpenMP::OpenMP_CXX)
 endif (USE_OPENMP OR USE_CUDA)
+set(LINKED_LIBRARIES_PRIVATE "${LINKED_LIBRARIES_PRIVATE};${SRC_LIBS}" PARENT_SCOPE)

 # For MSVC: Call msvc_use_static_runtime() once again to completely
 # replace /MD with /MT. See https://github.com/dmlc/xgboost/issues/4462
--- a/src/common/column_matrix.h
+++ b/src/common/column_matrix.h
@ -227,8 +227,7 @@ class ColumnMatrix {
    /* missing values make sense only for column with type kDenseColumn,
       and if no missing values were observed it could be handled much faster. */
    if (noMissingValues) {
-      const int32_t nthread = omp_get_max_threads();  // NOLINT
-      #pragma omp parallel for num_threads(nthread)
+#pragma omp parallel for num_threads(omp_get_max_threads())
      for (omp_ulong rid = 0; rid < nrow; ++rid) {
        const size_t ibegin = rid*nfeature;
        const size_t iend = (rid+1)*nfeature;
--- a/src/common/threading_utils.h
+++ b/src/common/threading_utils.h
@ -110,10 +110,12 @@ class BlockedSpace2d {

 // Wrapper to implement nested parallelism with simple omp parallel for
 template<typename Func>
-void ParallelFor2d(const BlockedSpace2d& space, const int nthreads, Func func) {
+void ParallelFor2d(const BlockedSpace2d& space, int nthreads, Func func) {
  const size_t num_blocks_in_space = space.Size();
+  nthreads = std::min(nthreads, omp_get_max_threads());
+  nthreads = std::max(nthreads, 1);

-  #pragma omp parallel num_threads(nthreads)
+#pragma omp parallel num_threads(nthreads)
  {
    size_t tid = omp_get_thread_num();
    size_t chunck_size = num_blocks_in_space / nthreads + !!(num_blocks_in_space % nthreads);
--- a/src/metric/rank_metric.cc
+++ b/src/metric/rank_metric.cc
@ -189,12 +189,7 @@ struct EvalAuc : public Metric {
          rec[j - gptr[group_id]] = {h_preds[j], j};
        }

-        if (omp_in_parallel()) {
-          std::stable_sort(rec.begin(), rec.end(), common::CmpFirst);
-        } else {
        XGBOOST_PARALLEL_SORT(rec.begin(), rec.end(), common::CmpFirst);
-        }
-
        // calculate AUC
        double sum_pospair = 0.0;
        double sum_npos = 0.0, sum_nneg = 0.0, buf_pos = 0.0, buf_neg = 0.0;
@ -557,11 +552,7 @@ struct EvalAucPR : public Metric {
          continue;
        }

-        if (omp_in_parallel()) {
-          std::stable_sort(rec.begin(), rec.end(), common::CmpFirst);
-        } else {
        XGBOOST_PARALLEL_SORT(rec.begin(), rec.end(), common::CmpFirst);
-        }

        // calculate AUC
        double tp = 0.0, prevtp = 0.0, fp = 0.0, prevfp = 0.0, h = 0.0, a = 0.0, b = 0.0;
--- a/tests/cpp/CMakeLists.txt
+++ b/tests/cpp/CMakeLists.txt
@ -69,8 +69,8 @@ set_target_properties(
 target_link_libraries(testxgboost
  PRIVATE
  ${GTEST_LIBRARIES}
-  ${LINKED_LIBRARIES_PRIVATE}
-  OpenMP::OpenMP_CXX)
+  ${LINKED_LIBRARIES_PRIVATE})
+
 target_compile_definitions(testxgboost PRIVATE ${XGBOOST_DEFINITIONS})
 set_output_directory(testxgboost ${xgboost_BINARY_DIR})

--- a/tests/cpp/common/test_threading_utils.cc
+++ b/tests/cpp/common/test_threading_utils.cc
@ -38,7 +38,10 @@ TEST(ParallelFor2d, Test) {
      return kDim2;
  }, kGrainSize);

-  ParallelFor2d(space, 4, [&](size_t i, Range1d r) {
+  auto old = omp_get_max_threads();
+  omp_set_num_threads(4);
+
+  ParallelFor2d(space, omp_get_max_threads(), [&](size_t i, Range1d r) {
    for (auto j = r.begin(); j < r.end(); ++j) {
      matrix[i*kDim2 + j] += 1;
    }
@ -47,12 +50,17 @@ TEST(ParallelFor2d, Test) {
  for (size_t i = 0; i < kDim1 * kDim2; i++) {
    ASSERT_EQ(matrix[i], 1);
  }
+
+  omp_set_num_threads(old);
 }

 TEST(ParallelFor2dNonUniform, Test) {
  constexpr size_t kDim1 = 5;
  constexpr size_t kGrainSize = 256;

+  auto old = omp_get_max_threads();
+  omp_set_num_threads(4);
+
  // here are quite non-uniform distribution in space
  // but ParallelFor2d should split them by blocks with max size = kGrainSize
  // and process in balanced manner (optimal performance)
@ -66,7 +74,7 @@ TEST(ParallelFor2dNonUniform, Test) {
    working_space[i].resize(dim2[i], 0);
  }

-  ParallelFor2d(space, 4, [&](size_t i, Range1d r) {
+  ParallelFor2d(space, omp_get_max_threads(), [&](size_t i, Range1d r) {
    for (auto j = r.begin(); j < r.end(); ++j) {
      working_space[i][j] += 1;
    }
@ -77,6 +85,8 @@ TEST(ParallelFor2dNonUniform, Test) {
      ASSERT_EQ(working_space[i][j], 1);
    }
  }
+
+  omp_set_num_threads(old);
 }

 }  // namespace common
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@ -301,9 +301,12 @@ std::unique_ptr<DMatrix> CreateSparsePageDMatrix(
    batch_count++;
    row_count += batch.Size();
  }
+#if defined(_OPENMP)
  EXPECT_GE(batch_count, 2);
  EXPECT_EQ(row_count, dmat->Info().num_row_);
-
+#else
+#warning "External memory doesn't work with Non-OpenMP build "
+#endif  // defined(_OPENMP)
  return dmat;
 }