@@ -32,6 +32,7 @@ import scala.concurrent.{ExecutionContext, Future}
 import org.apache.spark.{SparkEnv, SparkException}
 import org.apache.spark.deploy.k8s.config._
 import org.apache.spark.deploy.k8s.constants._
+import org.apache.spark.internal.config._
 import org.apache.spark.rpc.{RpcAddress, RpcCallContext, RpcEndpointAddress, RpcEnv}
 import org.apache.spark.scheduler.{ExecutorExited, SlaveLost, TaskSchedulerImpl}
 import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.{RetrieveSparkAppConfig, SparkAppConfig}
@@ -54,6 +55,8 @@ private[spark] class KubernetesClusterSchedulerBackend(
   private val RUNNING_EXECUTOR_PODS_LOCK = new Object
   // Indexed by executor IDs and guarded by RUNNING_EXECUTOR_PODS_LOCK.
   private val runningExecutorsToPods = new mutable.HashMap[String, Pod]
+  // Names of executor pods that have failed, guarded by RUNNING_EXECUTOR_PODS_LOCK.
+  private val failedExecutors = new mutable.HashSet[String]
   // Indexed by executor pod names and guarded by RUNNING_EXECUTOR_PODS_LOCK.
   private val runningPodsToExecutors = new mutable.HashMap[String, String]
   private val executorPodsByIPs = new ConcurrentHashMap[String, Pod]()
@@ -114,19 +117,20 @@ private[spark] class KubernetesClusterSchedulerBackend(
     override def run(): Unit = {
       handleDisconnectedExecutors()
       RUNNING_EXECUTOR_PODS_LOCK.synchronized {
-        if (totalRegisteredExecutors.get() < runningExecutorsToPods.size) {
+        if (totalRegisteredExecutors.get() < runningExecutorSize()) {
           logDebug("Waiting for pending executors before scaling")
-        } else if (totalExpectedExecutors.get() <= runningExecutorsToPods.size) {
+        } else if (totalExpectedExecutors.get() <= runningExecutorSize()) {
           logDebug("Maximum allowed executor limit reached. Not scaling up further.")
         } else {
           val nodeToLocalTaskCount = getNodesWithLocalTaskCounts
           for (i <- 0 until math.min(
-            totalExpectedExecutors.get - runningExecutorsToPods.size, podAllocationSize)) {
+            totalExpectedExecutors.get - runningExecutorSize(), podAllocationSize)) {
             val (executorId, pod) = allocateNewExecutorPod(nodeToLocalTaskCount)
             runningExecutorsToPods.put(executorId, pod)
             runningPodsToExecutors.put(pod.getMetadata.getName, executorId)
             logInfo(
-              s"Requesting a new executor, total executors is now ${runningExecutorsToPods.size}")
+              s"Requesting a new executor $executorId, total executors is now " +
+                s"${runningExecutorSize()} (${failedExecutors.size} failed)")
           }
         }
       }
@@ -172,9 +176,33 @@ private[spark] class KubernetesClusterSchedulerBackend(
         runningExecutorsToPods.remove(executorId).map { pod =>
           kubernetesClient.pods().delete(pod)
           runningPodsToExecutors.remove(pod.getMetadata.getName)
+          failedExecutors -= pod.getMetadata.getName
         }.getOrElse(logWarning(s"Unable to remove pod for unknown executor $executorId"))
       }
     }
+
+    // The number of executors created so far, excluding failed ones. To avoid creating
+    // too many failed executors, the number of failures we account for is capped at
+    // maxNumExecutorFailures, so after totalExpectedExecutors + maxNumExecutorFailures
+    // executors have been created we stop creating more even if all of them have failed.
+    def runningExecutorSize(): Int = runningExecutorsToPods.size -
+      math.min(failedExecutors.size, maxNumExecutorFailures)
+
+    // Default to twice the number of executors (twice the maximum number of executors if dynamic
+    // allocation is enabled), with a minimum of 3.
+    val maxNumExecutorFailures = {
+      val effectiveNumExecutors =
+        if (Utils.isDynamicAllocationEnabled(conf)) {
+          conf.get(DYN_ALLOCATION_MAX_EXECUTORS)
+        } else {
+          conf.get(EXECUTOR_INSTANCES).getOrElse(0)
+        }
+      // By default, effectiveNumExecutors is Int.MaxValue if dynamic allocation is enabled.
+      // We need to avoid integer overflow here.
+      math.max(3,
+        if (effectiveNumExecutors > Int.MaxValue / 2) Int.MaxValue else 2 * effectiveNumExecutors)
+    }
   }
 
   private def getInitialTargetExecutorNumber(defaultNumExecutors: Int = 1): Int = {
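
To make the accounting above easier to check, here is a minimal standalone sketch (not part of the patch) that mirrors the two formulas; the executor counts passed in are illustrative assumptions rather than values read from a real SparkConf.

```scala
object FailureAccountingSketch {
  // Cap on counted failures: twice the (maximum) number of executors, minimum 3,
  // with a guard against Int overflow when dynamic allocation leaves the max unbounded.
  def maxNumExecutorFailures(effectiveNumExecutors: Int): Int =
    math.max(3,
      if (effectiveNumExecutors > Int.MaxValue / 2) Int.MaxValue
      else 2 * effectiveNumExecutors)

  // Effective running size: created pods minus failed ones, where failures only
  // count up to the cap, so the allocator eventually stops replacing failing pods.
  def runningExecutorSize(createdPods: Int, failedPods: Int, cap: Int): Int =
    createdPods - math.min(failedPods, cap)

  def main(args: Array[String]): Unit = {
    val cap = maxNumExecutorFailures(effectiveNumExecutors = 5)                 // 10
    println(runningExecutorSize(createdPods = 5, failedPods = 2, cap = cap))    // 3, keep scaling
    println(runningExecutorSize(createdPods = 15, failedPods = 15, cap = cap))  // 5, at the limit
  }
}
```

With 5 expected executors the cap is 10, so once 15 pods have been created and all of them have failed, runningExecutorSize() reports 5 and the allocator stops requesting replacements.
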
@@ -233,6 +261,7 @@ private[spark] class KubernetesClusterSchedulerBackend(
       runningExecutorsToPods.values.foreach(kubernetesClient.pods().delete(_))
       runningExecutorsToPods.clear()
       runningPodsToExecutors.clear()
+      failedExecutors.clear()
     }
     executorPodsByIPs.clear()
     val resource = executorWatchResource.getAndSet(null)
@@ -311,6 +340,7 @@ private[spark] class KubernetesClusterSchedulerBackend(
           kubernetesClient.pods().delete(executorPod)
           disconnectedPodsByExecutorIdPendingRemoval.put(executor, executorPod)
           runningPodsToExecutors.remove(executorPod.getMetadata.getName)
+          failedExecutors -= executorPod.getMetadata.getName
         }
         if (maybeRemovedExecutor.isEmpty) {
           logWarning(s"Unable to remove pod for unknown executor $executor")
@@ -354,6 +384,10 @@ private[spark] class KubernetesClusterSchedulerBackend(
           logInfo(s"Received delete pod $podName event. Reason: " + pod.getStatus.getReason)
           handleDeletedPod(pod)
         }
+      } else if (action == Action.MODIFIED && pod.getStatus.getPhase == "Failed") {
+        logError(s"Executor pod ${pod.getMetadata.getName} failed with container status " +
+          s"${pod.getStatus.getContainerStatuses}")
+        handleFailedPod(pod)
       }
     }
 
@@ -407,6 +441,13 @@ private[spark] class KubernetesClusterSchedulerBackend(
       podsWithKnownExitReasons.put(pod.getMetadata.getName, exitReason)
     }
 
+    def handleFailedPod(pod: Pod): Unit = {
+      RUNNING_EXECUTOR_PODS_LOCK.synchronized {
+        failedExecutors += pod.getMetadata.getName
+      }
+      handleErroredPod(pod)
+    }
+
     def handleDeletedPod(pod: Pod): Unit = {
       val exitMessage = if (isPodAlreadyReleased(pod)) {
         s"Container in pod ${pod.getMetadata.getName} exited from explicit termination request."
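
For completeness, a rough self-contained mock of how a MODIFIED event whose pod phase is "Failed" is routed through the new handler before falling back to the existing errored-pod handling. Assumptions: the fabric8 client is on the classpath, and the pod built below stands in for a real watch event; the real backend takes RUNNING_EXECUTOR_PODS_LOCK where this mock does not.

```scala
import io.fabric8.kubernetes.api.model.{Pod, PodBuilder}
import io.fabric8.kubernetes.client.Watcher.Action

import scala.collection.mutable

object FailedPodRoutingSketch {
  private val failedExecutors = new mutable.HashSet[String]

  // Stand-in for the existing errored-pod handling.
  private def handleErroredPod(pod: Pod): Unit =
    println(s"errored: ${pod.getMetadata.getName}, status ${pod.getStatus.getContainerStatuses}")

  // Mirrors the new handler: record the failed pod name, then delegate.
  private def handleFailedPod(pod: Pod): Unit = {
    failedExecutors += pod.getMetadata.getName
    handleErroredPod(pod)
  }

  // Only the new branch of the watcher dispatch is reproduced here.
  def eventReceived(action: Action, pod: Pod): Unit =
    if (action == Action.MODIFIED && pod.getStatus.getPhase == "Failed") {
      handleFailedPod(pod)
    }

  def main(args: Array[String]): Unit = {
    val pod = new PodBuilder()
      .withNewMetadata().withName("spark-exec-1").endMetadata()
      .withNewStatus().withPhase("Failed").endStatus()
      .build()
    eventReceived(Action.MODIFIED, pod)
    println(s"failed executors tracked: $failedExecutors")
  }
}
```
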