diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh
index 7271dcb5e..2a7034706 100644
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -1004,7 +1004,7 @@ class AllReducer {
 
 template <typename T, typename FunctionT>
 void ExecuteShards(std::vector<T> *shards, FunctionT f) {
-#pragma omp parallel for schedule(static, 1)
+#pragma omp parallel for schedule(static, 1) if (shards->size() > 1)
   for (int shard = 0; shard < shards->size(); ++shard) {
     f(shards->at(shard));
   }
@@ -1023,7 +1023,7 @@ void ExecuteShards(std::vector<T> *shards, FunctionT f) {
 
 template <typename T, typename FunctionT>
 void ExecuteIndexShards(std::vector<T> *shards, FunctionT f) {
-#pragma omp parallel for schedule(static, 1)
+#pragma omp parallel for schedule(static, 1) if (shards->size() > 1)
   for (int shard = 0; shard < shards->size(); ++shard) {
     f(shard, shards->at(shard));
   }
@@ -1045,7 +1045,7 @@ void ExecuteIndexShards(std::vector<T> *shards, FunctionT f) {
 template <typename ReduceT,typename T, typename FunctionT>
 ReduceT ReduceShards(std::vector<T> *shards, FunctionT f) {
   std::vector<ReduceT> sums(shards->size());
-#pragma omp parallel for schedule(static, 1)
+#pragma omp parallel for schedule(static, 1) if (shards->size() > 1)
   for (int shard = 0; shard < shards->size(); ++shard) {
     sums[shard] = f(shards->at(shard));
   }
diff --git a/src/objective/regression_obj.cc b/src/objective/regression_obj.cc
index 75af216a1..98d7b9147 100644
--- a/src/objective/regression_obj.cc
+++ b/src/objective/regression_obj.cc
@@ -55,9 +55,7 @@ class RegLossObj : public ObjFunction {
     avx::Float8 scale(param_.scale_pos_weight);
 
     const omp_ulong remainder = n % 8;
-    int nthread = omp_get_max_threads();
-    // Use a maximum of 8 threads
-#pragma omp parallel for schedule(static) num_threads(std::min(8, nthread))
+#pragma omp parallel for schedule(static)
     for (omp_ulong i = 0; i < n - remainder; i += 8) {
       avx::Float8 y(&info.labels_[i]);
       avx::Float8 p = Loss::PredTransform(avx::Float8(&preds_h[i]));
@@ -77,9 +75,6 @@ class RegLossObj : public ObjFunction {
       gpair[i] = GradientPair(Loss::FirstOrderGradient(p, y) * w,
                            Loss::SecondOrderGradient(p, y) * w);
     }
-
-    // Reset omp max threads
-    omp_set_num_threads(nthread);
   }
   const char *DefaultEvalMetric() const override {
     return Loss::DefaultEvalMetric();
diff --git a/src/objective/regression_obj_gpu.cu b/src/objective/regression_obj_gpu.cu
index 2dfc691e3..5d7c21ebd 100644
--- a/src/objective/regression_obj_gpu.cu
+++ b/src/objective/regression_obj_gpu.cu
@@ -136,7 +136,7 @@ class GPURegLossObj : public ObjFunction {
     }
 
     // run the kernel
-#pragma omp parallel for schedule(static, 1)
+#pragma omp parallel for schedule(static, 1) if (devices_.Size() > 1)
     for (int i = 0; i < devices_.Size(); ++i) {
       int d = devices_[i];
       dh::safe_cuda(cudaSetDevice(d));
@@ -173,7 +173,7 @@ class GPURegLossObj : public ObjFunction {
   }
 
   void PredTransformDevice(HostDeviceVector<float>* preds) {
-#pragma omp parallel for schedule(static, 1)
+#pragma omp parallel for schedule(static, 1) if (devices_.Size() > 1)
     for (int i = 0; i < devices_.Size(); ++i) {
       int d = devices_[i];
       dh::safe_cuda(cudaSetDevice(d));