[jvm-packages] enable deterministic repartitioning when checkpoint is enabled (#4807)

* do repartitioning in DataUtil

* keep previous behavior of partitioning without checkpoint

* deterministic repartitioning

* change
This commit is contained in:
Nan Zhu
2019-09-19 15:21:05 -07:00
committed by GitHub
parent 277e25797b
commit fc8c9b0521
6 changed files with 204 additions and 28 deletions

View File

@@ -0,0 +1,82 @@
/*
Copyright (c) 2014 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package ml.dmlc.xgboost4j.scala.spark
import org.scalatest.FunSuite
import org.apache.spark.sql.functions._
/**
 * Tests for deterministic repartitioning.
 *
 * The first two tests check that `needDeterministicRepartitioning` is reported by the
 * classifier and the regressor when both `checkpoint_path` and `checkpoint_interval` are
 * supplied. The last test checks that converting differently pre-shuffled copies of the same
 * DataFrame with `deterministicPartition = true` places the same points in the same
 * partitions.
 */
class DeterministicPartitioningSuite extends FunSuite with TmpFolderPerSuite with PerTest {

  /**
   * Builds the training parameters shared by the checkpoint tests.
   *
   * @param checkpointPath directory passed as `checkpoint_path`
   * @return a parameter map with both checkpoint settings present
   */
  private def checkpointedParams(checkpointPath: String): Map[String, Any] = Map(
    "eta" -> "1",
    "max_depth" -> 2,
    "objective" -> "binary:logistic",
    "checkpoint_path" -> checkpointPath,
    "checkpoint_interval" -> 2,
    "num_workers" -> numWorkers)

  test("perform deterministic partitioning when checkpointInterval and" +
    " checkpointPath is set (Classifier)") {
    val tmpPath = createTmpFolder("model1").toAbsolutePath.toString
    val xgbClassifier = new XGBoostClassifier(checkpointedParams(tmpPath))
    assert(xgbClassifier.needDeterministicRepartitioning)
  }

  test("perform deterministic partitioning when checkpointInterval and" +
    " checkpointPath is set (Regressor)") {
    val tmpPath = createTmpFolder("model1").toAbsolutePath.toString
    val xgbRegressor = new XGBoostRegressor(checkpointedParams(tmpPath))
    assert(xgbRegressor.needDeterministicRepartitioning)
  }

  test("deterministic partitioning takes effect with various parts of data") {
    val trainingDF = buildDataFrame(Classification.train)
    // Build five copies of the training data that went through 1 to 5 chained repartitions.
    // The rows are the same in every copy, only their physical layout differs, so the
    // deterministic conversion below has to produce identical RDD partitions for all of them.
    val transformedDFs = (1 until 6).map { shuffleCount =>
      (0 until shuffleCount).foldLeft(trainingDF)((df, _) => df.repartition(numWorkers))
    }
    val transformedRDDs = transformedDFs.map(df =>
      DataUtils.convertDataFrameToXGBLabeledPointRDDs(
        col("label"),
        col("features"),
        lit(1.0),
        lit(Float.NaN),
        None,
        numWorkers,
        deterministicPartition = true,
        df
      ).head)
    // For every converted RDD: partition index -> points that landed in that partition.
    val resultsMaps = transformedRDDs.map(rdd => rdd.mapPartitionsWithIndex {
      case (partitionIndex, labelPoints) =>
        Iterator((partitionIndex, labelPoints.toList))
    }.collect().toMap)
    // Compare each result with its successor; equality of all neighbours implies that all
    // results are equal.
    resultsMaps.zip(resultsMaps.tail).foreach { case (map1, map2) =>
      assert(map1.keySet === map2.keySet)
      for ((parIdx, labeledPoints) <- map1) {
        // The order of points inside a partition is not part of the contract, so bring both
        // sides into a canonical order first. The key is derived from the feature values
        // instead of hashCode(), which is not guaranteed to be content-based for objects
        // that hold arrays (JVM arrays hash by identity). Comparing the rendered value
        // sequences also keeps the order and multiplicity of the values, which a Set-based
        // comparison would discard.
        val sortedA = labeledPoints.map(_.values.mkString(",")).sorted
        val sortedB = map2(parIdx).map(_.values.mkString(",")).sorted
        assert(sortedA.length === sortedB.length)
        assert(sortedA === sortedB)
      }
    }
  }
}