parthchandra commented on code in PR #53840:
URL: https://github.com/apache/spark/pull/53840#discussion_r2836918757
##########
resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/AbstractPodsAllocator.scala:
##########
@@ -35,6 +35,20 @@ import org.apache.spark.resource.ResourceProfile
*/
@DeveloperApi
abstract class AbstractPodsAllocator {
+ /*
+ * Optional lifecycle manager for tracking executor pod lifecycle events.
+ * Set via setExecutorPodsLifecycleManager for backward compatibility.
+ */
+ protected var executorPodsLifecycleManager: ExecutorPodsLifecycleManager = _
+
+ /*
+ * Set the lifecycle manager for tracking executor pod lifecycle events.
+ * This method is optional and may not exist in custom implementations based
on older versions.
+ */
+ def setExecutorPodsLifecycleManager(manager: ExecutorPodsLifecycleManager):
Unit = {
+ executorPodsLifecycleManager = manager
Review Comment:
Done
##########
resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala:
##########
@@ -459,32 +462,61 @@ class ExecutorPodsAllocator(
.build()
val resources = replacePVCsIfNeeded(
podWithAttachedContainer,
resolvedExecutorSpec.executorKubernetesResources, reusablePVCs)
- val createdExecutorPod =
-
kubernetesClient.pods().inNamespace(namespace).resource(podWithAttachedContainer).create()
- try {
- addOwnerReference(createdExecutorPod, resources)
- resources
- .filter(_.getKind == "PersistentVolumeClaim")
- .foreach { resource =>
- if (conf.get(KUBERNETES_DRIVER_OWN_PVC) && driverPod.nonEmpty) {
- addOwnerReference(driverPod.get, Seq(resource))
- }
- val pvc = resource.asInstanceOf[PersistentVolumeClaim]
- logInfo(log"Trying to create PersistentVolumeClaim " +
- log"${MDC(LogKeys.PVC_METADATA_NAME, pvc.getMetadata.getName)}
with " +
- log"StorageClass ${MDC(LogKeys.CLASS_NAME,
pvc.getSpec.getStorageClassName)}")
-
kubernetesClient.persistentVolumeClaims().inNamespace(namespace).resource(pvc).create()
- PVC_COUNTER.incrementAndGet()
- }
- newlyCreatedExecutors(newExecutorId) = (resourceProfileId,
clock.getTimeMillis())
- logDebug(s"Requested executor with id $newExecutorId from Kubernetes.")
+ val optCreatedExecutorPod = try {
+ Some(kubernetesClient
+ .pods()
+ .inNamespace(namespace)
+ .resource(podWithAttachedContainer)
+ .create())
} catch {
case NonFatal(e) =>
- kubernetesClient.pods()
- .inNamespace(namespace)
- .resource(createdExecutorPod)
- .delete()
- throw e
+ // Register failure with global tracker if lifecycle manager is
available
+ val failureCount = totalFailedPodCreations.incrementAndGet()
+ if (executorPodsLifecycleManager != null) {
+ executorPodsLifecycleManager.registerPodCreationFailure()
+ }
Review Comment:
Done
##########
resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala:
##########
@@ -184,13 +188,30 @@ private[spark] class KubernetesClusterManager extends
ExternalClusterManager wit
classOf[SparkConf], classOf[org.apache.spark.SecurityManager],
classOf[KubernetesExecutorBuilder], classOf[KubernetesClient],
classOf[ExecutorPodsSnapshotsStore], classOf[Clock])
- cstr.newInstance(
+ val allocatorInstance = cstr.newInstance(
sc.conf,
sc.env.securityManager,
new KubernetesExecutorBuilder(),
kubernetesClient,
snapshotsStore,
new SystemClock())
+
+ // Try to set the lifecycle manager using reflection for backward
compatibility
+ // with custom allocators that may not have this method
+ lifecycleManager.foreach { manager =>
+ try {
+ val setLifecycleManagerMethod = cls.getMethod(
+ "setExecutorPodsLifecycleManager",
+ classOf[ExecutorPodsLifecycleManager])
+ setLifecycleManagerMethod.invoke(allocatorInstance, manager)
+ } catch {
+ case _: NoSuchMethodException =>
+ logInfo("Allocator does not support setExecutorPodsLifecycleManager
method. " +
+ "Pod creation failures will not be tracked.")
Review Comment:
I was following the previous comment about needing to use reflection to
maintain backwards compatibility. But you're right, reflection is not needed.
Removed.
##########
resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala:
##########
@@ -459,32 +462,61 @@ class ExecutorPodsAllocator(
.build()
val resources = replacePVCsIfNeeded(
podWithAttachedContainer,
resolvedExecutorSpec.executorKubernetesResources, reusablePVCs)
- val createdExecutorPod =
-
kubernetesClient.pods().inNamespace(namespace).resource(podWithAttachedContainer).create()
- try {
- addOwnerReference(createdExecutorPod, resources)
- resources
- .filter(_.getKind == "PersistentVolumeClaim")
- .foreach { resource =>
- if (conf.get(KUBERNETES_DRIVER_OWN_PVC) && driverPod.nonEmpty) {
- addOwnerReference(driverPod.get, Seq(resource))
- }
- val pvc = resource.asInstanceOf[PersistentVolumeClaim]
- logInfo(log"Trying to create PersistentVolumeClaim " +
- log"${MDC(LogKeys.PVC_METADATA_NAME, pvc.getMetadata.getName)}
with " +
- log"StorageClass ${MDC(LogKeys.CLASS_NAME,
pvc.getSpec.getStorageClassName)}")
-
kubernetesClient.persistentVolumeClaims().inNamespace(namespace).resource(pvc).create()
- PVC_COUNTER.incrementAndGet()
- }
- newlyCreatedExecutors(newExecutorId) = (resourceProfileId,
clock.getTimeMillis())
- logDebug(s"Requested executor with id $newExecutorId from Kubernetes.")
+ val optCreatedExecutorPod = try {
+ Some(kubernetesClient
+ .pods()
+ .inNamespace(namespace)
+ .resource(podWithAttachedContainer)
+ .create())
} catch {
case NonFatal(e) =>
- kubernetesClient.pods()
- .inNamespace(namespace)
- .resource(createdExecutorPod)
- .delete()
- throw e
+ // Register failure with global tracker if lifecycle manager is
available
+ val failureCount = totalFailedPodCreations.incrementAndGet()
+ if (executorPodsLifecycleManager != null) {
+ executorPodsLifecycleManager.registerPodCreationFailure()
+ }
+ logError(log"Failed to create executor pod
${MDC(LogKeys.EXECUTOR_ID, newExecutorId)}. " +
+ log"Total failures: ${MDC(LogKeys.TOTAL, failureCount)}", e)
Review Comment:
Done
##########
resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala:
##########
@@ -459,32 +462,53 @@ class ExecutorPodsAllocator(
.build()
val resources = replacePVCsIfNeeded(
podWithAttachedContainer,
resolvedExecutorSpec.executorKubernetesResources, reusablePVCs)
- val createdExecutorPod =
-
kubernetesClient.pods().inNamespace(namespace).resource(podWithAttachedContainer).create()
- try {
- addOwnerReference(createdExecutorPod, resources)
- resources
- .filter(_.getKind == "PersistentVolumeClaim")
- .foreach { resource =>
- if (conf.get(KUBERNETES_DRIVER_OWN_PVC) && driverPod.nonEmpty) {
- addOwnerReference(driverPod.get, Seq(resource))
- }
- val pvc = resource.asInstanceOf[PersistentVolumeClaim]
- logInfo(log"Trying to create PersistentVolumeClaim " +
- log"${MDC(LogKeys.PVC_METADATA_NAME, pvc.getMetadata.getName)}
with " +
- log"StorageClass ${MDC(LogKeys.CLASS_NAME,
pvc.getSpec.getStorageClassName)}")
-
kubernetesClient.persistentVolumeClaims().inNamespace(namespace).resource(pvc).create()
- PVC_COUNTER.incrementAndGet()
- }
- newlyCreatedExecutors(newExecutorId) = (resourceProfileId,
clock.getTimeMillis())
- logDebug(s"Requested executor with id $newExecutorId from Kubernetes.")
+ val optCreatedExecutorPod = try {
+ Some(kubernetesClient
+ .pods()
+ .inNamespace(namespace)
+ .resource(podWithAttachedContainer)
+ .create())
} catch {
case NonFatal(e) =>
- kubernetesClient.pods()
- .inNamespace(namespace)
- .resource(createdExecutorPod)
- .delete()
- throw e
+ // Register failure with global tracker if lifecycle manager is
available
+ val failureCount = totalFailedPodCreations.incrementAndGet()
+ if (executorPodsLifecycleManager != null) {
+ executorPodsLifecycleManager.registerPodCreationFailure()
+ }
+ logError(log"Failed to create executor pod
${MDC(LogKeys.EXECUTOR_ID, newExecutorId)}. " +
+ log"Total failures: ${MDC(LogKeys.TOTAL, failureCount)}", e)
+ None
+ }
+ optCreatedExecutorPod.foreach { createdExecutorPod =>
+ try {
Review Comment:
I think the exception being thrown is fine. It is propagated up through
`requestNewExecutors() → onNewSnapshots()`. The exception is caught at
`ExecutorPodsSnapshotStoreImpl.processSnapshotsInternal` and logged as a
warning.
The test `test("SPARK-41410: An exception during PVC creation should not
increase PVC counter")` is testing explicitly for an exception to be thrown in
this case. I modified the code to throw an exception and the test passed (as
long as the exception is a KubernetesClientException).
##########
resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/AbstractPodsAllocator.scala:
##########
@@ -35,6 +35,20 @@ import org.apache.spark.resource.ResourceProfile
*/
@DeveloperApi
abstract class AbstractPodsAllocator {
+ /*
+ * Optional lifecycle manager for tracking executor pod lifecycle events.
+ * Set via setExecutorPodsLifecycleManager for backward compatibility.
+ */
+ protected var executorPodsLifecycleManager: ExecutorPodsLifecycleManager = _
Review Comment:
Done
##########
resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala:
##########
@@ -75,6 +75,14 @@ private[spark] class ExecutorPodsLifecycleManager(
protected[spark] def getNumExecutorsFailed: Int =
failureTracker.numFailedExecutors
+ /**
+ * Register a pod creation failure. This increments the global executor
failure count
+ * which is checked against spark.executor.maxNumFailures.
+ */
+ protected[spark] def registerPodCreationFailure(): Unit = {
Review Comment:
Changed
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]