[jvm-packages] fix potential unit test suites aborted issue (#6373)

* fix race condition

* code cleaning

rm pom.xml-e

* clean again

* fix compilation issue

* recover

* avoid using getOrCreate

* interrupt zombie threads

* safe guard

* fix deadlock

* Update SparkParallelismTracker.scala
Nan Zhu 2020-11-17 10:59:26 -08:00 committed by GitHub
parent e426b6e040
commit 4d1d5d4010
3 changed files with 28 additions and 5 deletions


@@ -146,22 +146,30 @@ class TaskFailedListener(killSparkContext: Boolean = true) extends SparkListener
 object TaskFailedListener {
 
-  var killerStarted = false
+  var killerStarted: Boolean = false
+
+  var sparkContextKiller: Thread = _
+
+  val sparkContextShutdownLock = new AnyRef
 
   private def startedSparkContextKiller(): Unit = this.synchronized {
     if (!killerStarted) {
-      killerStarted = true
       // Spark does not allow ListenerThread to shutdown SparkContext so that we have to do it
       // in a separate thread
-      val sparkContextKiller = new Thread() {
+      sparkContextKiller = new Thread() {
         override def run(): Unit = {
           LiveListenerBus.withinListenerThread.withValue(false) {
-            SparkContext.getOrCreate().stop()
+            sparkContextShutdownLock.synchronized {
+              SparkContext.getActive.foreach(_.stop())
+              killerStarted = false
+              sparkContextShutdownLock.notify()
+            }
           }
         }
       }
       sparkContextKiller.setDaemon(true)
       sparkContextKiller.start()
+      killerStarted = true
     }
   }
 }
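For readers not steeped in the listener internals, the hunk above boils down to a start-once daemon killer plus a wait/notify handshake on sparkContextShutdownLock. Below is a minimal, self-contained Scala sketch of that pattern with Spark stripped out; ShutdownCoordinator, stopTarget, startKillerOnce and awaitShutdown are illustrative names, not XGBoost APIs, and the sketch flips the flag before start() (the commit flips it just after) purely to keep the toy version race-free.

    object ShutdownCoordinator {
      val shutdownLock = new AnyRef
      @volatile var killerStarted: Boolean = false
      var killer: Thread = _

      // stands in for SparkContext.getActive.foreach(_.stop()); just burns a little time
      private def stopTarget(): Unit = Thread.sleep(100)

      def startKillerOnce(): Unit = this.synchronized {
        if (!killerStarted) {
          killerStarted = true        // raise the flag before the thread can clear it
          killer = new Thread() {
            override def run(): Unit = shutdownLock.synchronized {
              stopTarget()
              killerStarted = false   // tell waiters the shutdown has finished
              shutdownLock.notify()
            }
          }
          killer.setDaemon(true)
          killer.start()
        }
      }

      // what the new afterEach does: block until the killer has finished (or was never started)
      def awaitShutdown(): Unit = shutdownLock.synchronized {
        while (killerStarted) shutdownLock.wait()
      }
    }

In the real code, TaskFailedListener plays the startKillerOnce role and PerTest.afterEach (next hunk) plays the awaitShutdown role.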


@@ -45,12 +45,26 @@ trait PerTest extends BeforeAndAfterEach { self: FunSuite =>
   override def beforeEach(): Unit = getOrCreateSession
 
   override def afterEach() {
-    synchronized {
+    TaskFailedListener.sparkContextShutdownLock.synchronized {
       if (currentSession != null) {
+        // this synchronization is mostly for the tests involving SparkContext shutdown.
+        // For unit tests involving SparkContext shutdown there are two possible event sequences:
+        // 1. the SparkContext killer runs before afterEach; in this case afterEach() blocks in the
+        //    loop below until the SparkContext is fully stopped
+        // 2. the SparkContext killer runs after afterEach; in this case currentSession.stop() blocks
+        //    until all msgs in the ListenerBus are processed, and because currentSession.stop() has
+        //    already been called, the SparkContext killer will not take effect
+        while (TaskFailedListener.killerStarted) {
+          TaskFailedListener.sparkContextShutdownLock.wait()
+        }
         currentSession.stop()
         cleanExternalCache(currentSession.sparkContext.appName)
         currentSession = null
       }
+      if (TaskFailedListener.sparkContextKiller != null) {
+        TaskFailedListener.sparkContextKiller.interrupt()
+        TaskFailedListener.sparkContextKiller = null
+      }
+      TaskFailedListener.killerStarted = false
     }
   }
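The interrupt()/null-out step above is what the "interrupt zombie threads" commit message refers to: a killer thread left parked after its SparkContext is already gone must not survive into the next suite. As a hedged illustration of why a plain interrupt() is enough, the toy program below parks a daemon thread in Object.wait() and then interrupts it; ZombieKillerDemo and its names are made up for this sketch.

    object ZombieKillerDemo {
      def main(args: Array[String]): Unit = {
        val lock = new AnyRef
        val zombie = new Thread() {
          override def run(): Unit =
            // parked indefinitely, like a killer whose context is already stopped
            try lock.synchronized { lock.wait() }
            catch { case _: InterruptedException => println("zombie killer interrupted, exiting cleanly") }
        }
        zombie.setDaemon(true)
        zombie.start()
        Thread.sleep(50)     // give it time to park in wait()
        zombie.interrupt()   // what afterEach now does with TaskFailedListener.sparkContextKiller
        zombie.join(1000)
      }
    }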


@@ -114,6 +114,7 @@ class XGBoostRabitRegressionSuite extends FunSuite with PerTest {
       // assume all tasks throw exception almost same time
       // 100ms should be enough to exhaust all retries
       assert(waitAndCheckSparkShutdown(100) == true)
+      TaskFailedListener.killerStarted = false
     }
   }
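waitAndCheckSparkShutdown itself is not part of this diff, so the following is only a guess at its shape: a small polling helper that keeps checking for an active SparkContext until the millisecond budget is spent. Everything in the sketch is assumed rather than taken from the XGBoost test code, except SparkContext.getActive, which the production hunk above also uses.

    package org.apache.spark

    // Hypothetical sketch of a helper like waitAndCheckSparkShutdown (its real body is not shown
    // in this commit). SparkContext.getActive is package-private to org.apache.spark, hence the
    // package declaration; the tracker code above presumably lives in that package for the same reason.
    object ShutdownPoller {
      def waitAndCheckSparkShutdown(waitMillis: Int): Boolean = {
        var waited = 0
        while (waited < waitMillis && SparkContext.getActive.nonEmpty) {
          Thread.sleep(10)   // poll every 10 ms until the context is gone or the budget is spent
          waited += 10
        }
        SparkContext.getActive.isEmpty
      }
    }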