[jvm-packages] better fix for shutdown applications (#4108)
* intentionally failed task
* throw exception
* more
* stop SparkContext directly
* stop from another thread
* new scope
* use a new thread
* daemon threads
* don't join the killer thread
* remove injected errors
* add comments
This commit is contained in:
parent
017c97b8ce
commit
05243642bb
@@ -20,7 +20,7 @@ import java.net.URL
|
|||||||
|
|
||||||
import org.apache.commons.logging.LogFactory
|
import org.apache.commons.logging.LogFactory
|
||||||
|
|
||||||
import org.apache.spark.scheduler.{SparkListener, SparkListenerExecutorRemoved, SparkListenerTaskEnd}
|
import org.apache.spark.scheduler._
|
||||||
import org.codehaus.jackson.map.ObjectMapper
|
import org.codehaus.jackson.map.ObjectMapper
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
import scala.concurrent.ExecutionContext.Implicits.global
|
import scala.concurrent.ExecutionContext.Implicits.global
|
||||||
@@ -114,21 +114,26 @@ class SparkParallelismTracker(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Spark listener that shuts down the application when an XGBoost training task fails.
 *
 * The previous implementation threw a `ControlThrowable` (`ErrorInXGBoostTraining`)
 * from inside the listener callback, but Spark's listener bus swallows exceptions
 * raised by listeners, so the failure never propagated. Instead we now log the
 * failure and stop the SparkContext explicitly.
 */
private[spark] class TaskFailedListener extends SparkListener {

  private[this] val logger = LogFactory.getLog("XGBoostTaskFailedListener")

  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = {
    taskEnd.reason match {
      case taskEndReason: TaskFailedReason =>
        logger.error(s"Training Task Failed during XGBoost Training: " +
          s"$taskEndReason, stopping SparkContext")
        // Spark does not allow ListenerThread to shutdown SparkContext so that we have to do it
        // in a separate thread
        val sparkContextKiller = new Thread() {
          override def run(): Unit = {
            // withinListenerThread is a DynamicVariable; clearing it here lets stop()
            // proceed even though this thread was spawned from the listener callback
            LiveListenerBus.withinListenerThread.withValue(false) {
              SparkContext.getOrCreate().stop()
            }
          }
        }
        // daemon + no join: the killer thread must never block JVM shutdown
        sparkContextKiller.setDaemon(true)
        sparkContextKiller.start()
      case _ =>
    }
  }
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user