dongjoon-hyun commented on code in PR #39825:
URL: https://github.com/apache/spark/pull/39825#discussion_r1099652817
##########
resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala:
##########
@@ -186,7 +186,107 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter {
rpb.require(ereq).require(treq)
val rp = rpb.build()
- val confWithLowMaxPendingPods = conf.clone.set(KUBERNETES_MAX_PENDING_PODS.key, "3")
+ val confWithMediumMaxPendingPods = conf.clone.set(
+ KUBERNETES_MAX_PENDING_PODS.key, "20").set(
+ KUBERNETES_ALLOCATION_BLOCK_ON_SNAPSHOT.key, "false").set(
+ KUBERNETES_ALLOCATION_BATCH_SIZE.key, "20")
+ assert(podsAllocatorUnderTest.stalledStartTime == null)
+ podsAllocatorUnderTest = new ExecutorPodsAllocator(confWithMediumMaxPendingPods, secMgr,
+ executorBuilder, kubernetesClient, snapshotsStore, waitForExecutorPodsClock)
+ podsAllocatorUnderTest.start(TEST_SPARK_APP_ID, schedulerBackend)
+
+ podsAllocatorUnderTest.setTotalExpectedExecutors(Map(defaultProfile -> 2, rp -> 3))
+ assert(podsAllocatorUnderTest.numOutstandingPods.get() == 5)
+ // We should have allocated something in each RPI so we don't count as "stalled."
+ assert(podsAllocatorUnderTest.stalledStartTime == null)
+
+ // We should not yet have stalled.
+ podsAllocatorUnderTest.setTotalExpectedExecutors(Map(rp -> 100))
+ assert(podsAllocatorUnderTest.numOutstandingPods.get() == 20)
+ assert(podsAllocatorUnderTest.stalledStartTime == null)
+
+ // And now we stall.
+ podsAllocatorUnderTest.setTotalExpectedExecutors(Map(rp -> 200))
+ assert(podsAllocatorUnderTest.numOutstandingPods.get() == 20)
+ assert(podsAllocatorUnderTest.stalledStartTime != null)
+ }
+
+ test("SPARK-42261: Don't allow allocations without snapshot by default
(except new rpID)") {
+ when(podOperations
+ .withField("status.phase", "Pending"))
+ .thenReturn(podOperations)
+ when(podOperations
+ .withLabel(SPARK_APP_ID_LABEL, TEST_SPARK_APP_ID))
+ .thenReturn(podOperations)
+ when(podOperations
+ .withLabel(SPARK_ROLE_LABEL, SPARK_POD_EXECUTOR_ROLE))
+ .thenReturn(podOperations)
+ when(podOperations
+ .withLabelIn(meq(SPARK_EXECUTOR_ID_LABEL), any()))
+ .thenReturn(podOperations)
+
+ val startTime = Instant.now.toEpochMilli
+ waitForExecutorPodsClock.setTime(startTime)
+
+ val rpb = new ResourceProfileBuilder()
+ val ereq = new ExecutorResourceRequests()
+ val treq = new TaskResourceRequests()
+ ereq.cores(4).memory("2g")
+ treq.cpus(2)
+ rpb.require(ereq).require(treq)
+ val rp = rpb.build()
+
+ val confWithMediumMaxPendingPods = conf.clone.set(
+ KUBERNETES_MAX_PENDING_PODS.key, "20").set(
+ KUBERNETES_ALLOCATION_BATCH_SIZE.key, "20")
+ assert(podsAllocatorUnderTest.stalledStartTime == null)
+ podsAllocatorUnderTest = new ExecutorPodsAllocator(confWithMediumMaxPendingPods, secMgr,
+ executorBuilder, kubernetesClient, snapshotsStore, waitForExecutorPodsClock)
+ podsAllocatorUnderTest.start(TEST_SPARK_APP_ID, schedulerBackend)
+
+ podsAllocatorUnderTest.setTotalExpectedExecutors(Map(defaultProfile -> 2, rp -> 3))
+ assert(podsAllocatorUnderTest.numOutstandingPods.get() == 5)
+ // We should have allocated something in each RPI so we don't count as "stalled."
+ assert(podsAllocatorUnderTest.stalledStartTime == null)
+
+ podsAllocatorUnderTest.setTotalExpectedExecutors(Map(rp -> 100))
+ assert(podsAllocatorUnderTest.numOutstandingPods.get() == 5)
+ assert(podsAllocatorUnderTest.stalledStartTime == null)
+
+ podsAllocatorUnderTest.setTotalExpectedExecutors(Map(rp -> 200))
+ assert(podsAllocatorUnderTest.numOutstandingPods.get() == 5)
+ assert(podsAllocatorUnderTest.stalledStartTime == null)
+ }
+
+ test("SPARK-36052: pending pod limit with multiple resource profiles &
SPARK-42261") {
Review Comment:
Oh, is this the original test case? Then, could you move the following test cases after the `SPARK-36052` test case?
```
test("SPARK-42261: Allow allocations without snapshot up to min of max
pending & alloc size.") {
test("SPARK-42261: Don't allow allocations without snapshot by default
(except new rpID)") {
```
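For clarity, here is a rough sketch of the ordering being asked for. The test names are taken from this diff; the test bodies, imports, and the rest of the suite are elided, and nothing else about the file layout is implied:
```scala
// Hypothetical ordering sketch: the pre-existing SPARK-36052 test stays
// where it is, and the two new SPARK-42261 tests from this PR are
// declared after it.
class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter {

  test("SPARK-36052: pending pod limit with multiple resource profiles & SPARK-42261") {
    // original test body (unchanged)
  }

  test("SPARK-42261: Allow allocations without snapshot up to min of max pending & alloc size.") {
    // new test body from this PR
  }

  test("SPARK-42261: Don't allow allocations without snapshot by default (except new rpID)") {
    // new test body from this PR
  }
}
```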
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]