parthchandra commented on PR #53840: URL: https://github.com/apache/spark/pull/53840#issuecomment-3856938549
Sorry for taking so long to get back on this. Here's an example of the use case we are talking about. In this case we have a notebook server configured by an administrator for a maximum of 16 pods. The user requests a notebook with a Spark configuration with 32 executors. Because this exceeds the 'quota', we get pod creation failures and Spark keeps trying to request pods. Setting `spark.kubernetes.allocation.maxPendingPods` had no effect. The Splunk log for this has the following repeated for every attempt: ``` timestamp="2026-02-05T00:54:20,029+0000",level="WARN",threadName="kubernetes-executor-snapshots-subscribers-0",appName="spark-driver",logger="org.apache.spark.scheduler.cluster.k8s.ExecutorPodsSnapshotsStoreImpl",jobName="notebooks-spark-job",sourceId="91xvti54j6iq",jobInstanceId="1770251444978-uiqn8uubzrtjq1t1f38rkpk00m33z",organizationName="Default",instanceName="SparkDriver",version="04106e4c-0ef4-4db4-addb-2de65a0d7c17",attemptId="1",message="Exception when notifying snapshot subscriber.",exception="io.fabric8.kubernetes.client.KubernetesClientException: Failure executing: POST at: https://*************** Message: Unauthorized. Received status: Status(apiVersion=v1, code=401, details=null, kind=Status, message=Unauthorized, metadata=ListMeta(_continue=null, remainingItemCount=null, resourceVersion=null, selfLink=null, additionalProperties={}), reason=Unauthorized, status=Failure, additionalProperties={}). 
at io.fabric8.kubernetes.client.KubernetesClientException.copyAsCause(KubernetesClientException.java:238) at io.fabric8.kubernetes.client.dsl.internal.OperationSupport.waitForResult(OperationSupport.java:507) at io.fabric8.kubernetes.client.dsl.internal.OperationSupport.handleResponse(OperationSupport.java:524) at io.fabric8.kubernetes.client.dsl.internal.OperationSupport.handleCreate(OperationSupport.java:340) at io.fabric8.kubernetes.client.dsl.internal.BaseOperation.handleCreate(BaseOperation.java:754) at io.fabric8.kubernetes.client.dsl.internal.BaseOperation.handleCreate(BaseOperation.java:98) at io.fabric8.kubernetes.client.dsl.internal.CreateOnlyResourceOperation.create(CreateOnlyResourceOperation.java:42) at io.fabric8.kubernetes.client.dsl.internal.BaseOperation.create(BaseOperation.java:1155) at io.fabric8.kubernetes.client.dsl.internal.BaseOperation.create(BaseOperation.java:98) at org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator.$anonfun$requestNewExecutors$1(ExecutorPodsAllocator.scala:440) at scala.collection.immutable.Range.foreach$mVc$sp(Range.scala:190) at org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator.requestNewExecutors(ExecutorPodsAllocator.scala:417) at org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator.$anonfun$onNewSnapshots$36(ExecutorPodsAllocator.scala:370) at org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator.$anonfun$onNewSnapshots$36$adapted(ExecutorPodsAllocator.scala:363) at scala.collection.immutable.List.foreach(List.scala:334) at org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator.onNewSnapshots(ExecutorPodsAllocator.scala:363) at org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator.$anonfun$start$3(ExecutorPodsAllocator.scala:134) at org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator.$anonfun$start$3$adapted(ExecutorPodsAllocator.scala:134) at 
org.apache.spark.scheduler.cluster.k8s.ExecutorPodsSnapshotsStoreImpl$SnapshotsSubscriber.org$apache$spark$scheduler$cluster$k8s$ExecutorPodsSnapshotsStoreImpl$SnapshotsSubscriber$$processSnapshotsInternal(ExecutorPodsSnapshotsStoreImpl.scala:143) at org.apache.spark.scheduler.cluster.k8s.ExecutorPodsSnapshotsStoreImpl$SnapshotsSubscriber.processSnapshots(ExecutorPodsSnapshotsStoreImpl.scala:131) at org.apache.spark.scheduler.cluster.k8s.ExecutorPodsSnapshotsStoreImpl.$anonfun$addSubscriber$1(ExecutorPodsSnapshotsStoreImpl.scala:85) at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:539) at java.base/java.util.concurrent.FutureTask.runAndReset(FutureTask.java:305) at java.base/java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:305) at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136) at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635) at java.base/java.lang.Thread.run(Thread.java:840) Caused by: io.fabric8.kubernetes.client.KubernetesClientException: Failure executing: POST at: https://192.168.0.1:443/************. Message: Unauthorized. Received status: Status(apiVersion=v1, code=401, details=null, kind=Status, message=Unauthorized, metadata=ListMeta(_continue=null, remainingItemCount=null, resourceVersion=null, selfLink=null, additionalProperties={}), reason=Unauthorized, status=Failure, additionalProperties={}). 
at io.fabric8.kubernetes.client.dsl.internal.OperationSupport.requestFailure(OperationSupport.java:660) at io.fabric8.kubernetes.client.dsl.internal.OperationSupport.requestFailure(OperationSupport.java:640) at io.fabric8.kubernetes.client.dsl.internal.OperationSupport.assertResponseCode(OperationSupport.java:589) at io.fabric8.kubernetes.client.dsl.internal.OperationSupport.lambda$handleResponse$0(OperationSupport.java:549) at java.base/java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:646) at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:510) at java.base/java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:2147) at io.fabric8.kubernetes.client.http.StandardHttpClient.lambda$completeOrCancel$10(StandardHttpClient.java:142) at java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:863) at java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:841) at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:510) at java.base/java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:2147) at io.fabric8.kubernetes.client.http.ByteArrayBodyHandler.onBodyDone(ByteArrayBodyHandler.java:51) at java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:863) at java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:841) at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:510) at java.base/java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:2147) at io.fabric8.kubernetes.client.okhttp.OkHttpClientImpl$OkHttpAsyncBody.doConsume(OkHttpClientImpl.java:136) ... 
3 more" timestamp="2026-02-05T00:54:20,010+0000",level="INFO",threadName="kubernetes-executor-snapshots-subscribers-0",appName="spark-driver",logger="org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator",jobName="notebooks-spark-job",sourceId="91xvti54j6iq",jobInstanceId="1770251444978-uiqn8uubzrtjq1t1f38rkpk00m33z",organizationName="Default",instanceName="SparkDriver",version="04106e4c-0ef4-4db4-addb-2de65a0d7c17",attemptId="1",message="Cannot list PVC resources. Please check account permissions." timestamp="2026-02-05T00:54:19,996+0000",level="INFO",threadName="kubernetes-executor-snapshots-subscribers-0",appName="spark-driver",logger="org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator",jobName="notebooks-spark-job",sourceId="91xvti54j6iq",jobInstanceId="1770251444978-uiqn8uubzrtjq1t1f38rkpk00m33z",organizationName="Default",instanceName="SparkDriver",version="04106e4c-0ef4-4db4-addb-2de65a0d7c17",attemptId="1",message="Going to request 30 executors from Kubernetes for ResourceProfile Id: 0, target: 32, known: 0, sharedSlotFromPendingPods: 64." timestamp="2026-02-05T00:54:19,996+0000",level="DEBUG",threadName="kubernetes-executor-snapshots-subscribers-0",appName="spark-driver",logger="org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator",jobName="notebooks-spark-job",sourceId="91xvti54j6iq",jobInstanceId="1770251444978-uiqn8uubzrtjq1t1f38rkpk00m33z",organizationName="Default",instanceName="SparkDriver",version="04106e4c-0ef4-4db4-addb-2de65a0d7c17",attemptId="1",message="ResourceProfile Id: 0 (pod allocation status: 0 running, 0 unknown pending, 0 scheduler backend known pending, 0 unknown newly created, 0 scheduler backend known newly created)" ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
