From 7fb3fbf577be7d00c5b020a5cab42f27143ad255 Mon Sep 17 00:00:00 2001
From: Nan Zhu
Date: Wed, 31 Aug 2016 07:34:10 -0400
Subject: [PATCH] impose shuffle when creating training RDD (#1531)

---
 .../main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
index 6cbfbf72c..8ebf080fd 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
@@ -70,12 +70,9 @@ object XGBoost extends Serializable {
       useExternalMemory: Boolean, missing: Float = Float.NaN): RDD[Booster] = {
     import DataUtils._
     val partitionedData = {
-      if (numWorkers > trainingData.partitions.length) {
+      if (numWorkers != trainingData.partitions.length) {
         logger.info(s"repartitioning training set to $numWorkers partitions")
         trainingData.repartition(numWorkers)
-      } else if (numWorkers < trainingData.partitions.length) {
-        logger.info(s"repartitioning training set to $numWorkers partitions")
-        trainingData.coalesce(numWorkers)
       } else {
         trainingData
       }