[jvm-packages] fix potential unit test suites aborted issue (#6373)

* fix race condition

* code cleaning

rm pom.xml-e

* clean again

* fix compilation issue

* recover

* avoid using getOrCreate

* interrupt zombie threads

* safe guard

* fix deadlock

* Update SparkParallelismTracker.scala
This commit is contained in:
Nan Zhu
2020-11-17 10:59:26 -08:00
committed by GitHub
parent e426b6e040
commit 4d1d5d4010
3 changed files with 28 additions and 5 deletions

View File

@@ -45,12 +45,26 @@ trait PerTest extends BeforeAndAfterEach { self: FunSuite =>
override def beforeEach(): Unit = getOrCreateSession
override def afterEach() {
synchronized {
TaskFailedListener.sparkContextShutdownLock.synchronized {
if (currentSession != null) {
// this synchronization is mostly for the tests involving SparkContext shutdown
// for unit tests involving SparkContext shutdown, there are two different event sequences
// 1. SparkContext killer is executed before afterEach, in this case, before SparkContext
// is fully stopped, afterEach() will block at the following code block
// 2. SparkContext killer is executed after afterEach; in this case, currentSession.stop() will
// block to wait for all msgs in ListenerBus get processed. Because currentSession.stop()
// has been called, SparkContext killer will not take effect
while (TaskFailedListener.killerStarted) {
TaskFailedListener.sparkContextShutdownLock.wait()
}
currentSession.stop()
cleanExternalCache(currentSession.sparkContext.appName)
currentSession = null
}
if (TaskFailedListener.sparkContextKiller != null) {
TaskFailedListener.sparkContextKiller.interrupt()
TaskFailedListener.sparkContextKiller = null
}
TaskFailedListener.killerStarted = false
}
}

View File

@@ -114,6 +114,7 @@ class XGBoostRabitRegressionSuite extends FunSuite with PerTest {
// assume all tasks throw exception almost same time
// 100ms should be enough to exhaust all retries
assert(waitAndCheckSparkShutdown(100) == true)
TaskFailedListener.killerStarted = false
}
}