[ 
https://issues.apache.org/jira/browse/SPARK-35304?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Keunhyun Oh updated SPARK-35304:
--------------------------------
    Description: 
Even after the job finishes, the driver pod keeps running indefinitely.

Executors are terminated. However, the driver status is not changed to 
succeeded.

This does not occur with Spark 2 on k8s.

It only appears with Spark 3.

 

The JVM thread dump of the driver is as follows:
{code:java}
2021-05-04 15:11:37
Full thread dump OpenJDK 64-Bit Server VM (25.252-b09 mixed mode):

"Attach Listener" #182 daemon prio=9 os_prio=0 tid=0x00007f02bc001000 nid=0x106 
waiting on condition [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

   Locked ownable synchronizers:
        - None

"DestroyJavaVM" #179 prio=5 os_prio=0 tid=0x00007f0fe0017000 nid=0x35 waiting 
on condition [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

   Locked ownable synchronizers:
        - None

"s3a-transfer-unbounded-pool2-t1" #172 daemon prio=5 os_prio=0 
tid=0x00007f025d98d000 nid=0xe5 waiting on condition [0x00007f01f86f3000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00007f0353681b38> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039)
        at 
java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442)
        at 
java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1074)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)

   Locked ownable synchronizers:
        - None

"java-sdk-progress-listener-callback-thread" #169 daemon prio=5 os_prio=0 
tid=0x00007f002000f000 nid=0xe2 waiting on condition [0x00007f004f7f6000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00007f0bdb1ba7c0> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039)
        at 
java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442)
        at 
java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1074)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)

   Locked ownable synchronizers:
        - None

"pool-26-thread-1" #72 prio=5 os_prio=0 tid=0x00007f025c829000 nid=0x80 waiting 
on condition [0x00007f01ba931000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00007f0bfdeaa8f0> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039)
        at 
java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442)
        at 
java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1074)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)

   Locked ownable synchronizers:
        - None

"java-sdk-http-connection-reaper" #56 daemon prio=5 os_prio=0 
tid=0x00007f025d818000 nid=0x6e waiting on condition [0x00007f01fb9fe000]
   java.lang.Thread.State: TIMED_WAITING (sleeping)
        at java.lang.Thread.sleep(Native Method)
        at 
com.amazonaws.http.IdleConnectionReaper.run(IdleConnectionReaper.java:188)

   Locked ownable synchronizers:
        - None

"Timer for 's3a-file-system' metrics system" #55 daemon prio=5 os_prio=0 
tid=0x00007f0fe19e6800 nid=0x6d in Object.wait() [0x00007f029c1d8000]
   java.lang.Thread.State: TIMED_WAITING (on object monitor)
        at java.lang.Object.wait(Native Method)
        at java.util.TimerThread.mainLoop(Timer.java:552)
        - locked <0x00007f0353383bd0> (a java.util.TaskQueue)
        at java.util.TimerThread.run(Timer.java:505)

   Locked ownable synchronizers:
        - None

"MutableQuantiles-0" #54 daemon prio=5 os_prio=0 tid=0x00007f025d78b800 
nid=0x6c runnable [0x00007f029c2d9000]
   java.lang.Thread.State: TIMED_WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00007f0351a09dd8> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at 
java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
        at 
java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(ScheduledThreadPoolExecutor.java:1093)
        at 
java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(ScheduledThreadPoolExecutor.java:809)
        at 
java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1074)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)

   Locked ownable synchronizers:
        - None

"org.apache.hadoop.fs.FileSystem$Statistics$StatisticsDataReferenceCleaner" #13 
daemon prio=5 os_prio=0 tid=0x00007f0fe1182000 nid=0x45 in Object.wait() 
[0x00007f02c50d7000]
   java.lang.Thread.State: WAITING (on object monitor)
        at java.lang.Object.wait(Native Method)
        - waiting on <0x00007f0350ea4390> (a java.lang.ref.ReferenceQueue$Lock)
        at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:144)
        - locked <0x00007f0350ea4390> (a java.lang.ref.ReferenceQueue$Lock)
        at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:165)
        at 
org.apache.hadoop.fs.FileSystem$Statistics$StatisticsDataReferenceCleaner.run(FileSystem.java:3839)
        at java.lang.Thread.run(Thread.java:748)

   Locked ownable synchronizers:
        - None

"Service Thread" #7 daemon prio=9 os_prio=0 tid=0x00007f0fe00e2000 nid=0x3f 
runnable [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

   Locked ownable synchronizers:
        - None

"C1 CompilerThread1" #6 daemon prio=9 os_prio=0 tid=0x00007f0fe00c7800 nid=0x3e 
waiting on condition [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

   Locked ownable synchronizers:
        - None

"C2 CompilerThread0" #5 daemon prio=9 os_prio=0 tid=0x00007f0fe00c4800 nid=0x3d 
waiting on condition [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

   Locked ownable synchronizers:
        - None

"Signal Dispatcher" #4 daemon prio=9 os_prio=0 tid=0x00007f0fe00c2800 nid=0x3c 
runnable [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

   Locked ownable synchronizers:
        - None

"Finalizer" #3 daemon prio=8 os_prio=0 tid=0x00007f0fe0090000 nid=0x3b in 
Object.wait() [0x00007f033fffe000]
   java.lang.Thread.State: WAITING (on object monitor)
        at java.lang.Object.wait(Native Method)
        at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:144)
        - locked <0x00007f03504176d8> (a java.lang.ref.ReferenceQueue$Lock)
        at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:165)
        at java.lang.ref.Finalizer$FinalizerThread.run(Finalizer.java:216)

   Locked ownable synchronizers:
        - None

"Reference Handler" #2 daemon prio=10 os_prio=0 tid=0x00007f0fe008b800 nid=0x3a 
in Object.wait() [0x00007f034416a000]
   java.lang.Thread.State: WAITING (on object monitor)
        at java.lang.Object.wait(Native Method)
        at java.lang.Object.wait(Object.java:502)
        at java.lang.ref.Reference.tryHandlePending(Reference.java:191)
        - locked <0x00007f0350424c20> (a java.lang.ref.Reference$Lock)
        at java.lang.ref.Reference$ReferenceHandler.run(Reference.java:153)

   Locked ownable synchronizers:
        - None

"VM Thread" os_prio=0 tid=0x00007f0fe0082000 nid=0x39 runnable 

"GC task thread#0 (ParallelGC)" os_prio=0 tid=0x00007f0fe002c000 nid=0x36 
runnable 

"GC task thread#1 (ParallelGC)" os_prio=0 tid=0x00007f0fe002d800 nid=0x37 
runnable 

"GC task thread#2 (ParallelGC)" os_prio=0 tid=0x00007f0fe002f800 nid=0x38 
runnable 

"VM Periodic Task Thread" os_prio=0 tid=0x00007f0fe00ec800 nid=0x40 waiting on 
condition 

JNI global references: 6244
 {code}

  was:
Even after the job finishes, the driver pod keeps running indefinitely.

Executors are terminated. However, the driver status is not changed to 
succeeded.

This does not occur with Spark 2 on k8s.

It only appears with Spark 3.


> [k8s] Though finishing a job, the driver pod is running infinitely
> ------------------------------------------------------------------
>
>                 Key: SPARK-35304
>                 URL: https://issues.apache.org/jira/browse/SPARK-35304
>             Project: Spark
>          Issue Type: Bug
>          Components: Kubernetes
>    Affects Versions: 3.0.1, 3.0.2
>            Reporter: Keunhyun Oh
>            Priority: Major
>
> Even after the job finishes, the driver pod keeps running indefinitely.
> Executors are terminated. However, the driver status is not changed to 
> succeeded.
> This does not occur with Spark 2 on k8s.
> It only appears with Spark 3.
>  
> The JVM thread dump of the driver is as follows:
> {code:java}
> 2021-05-04 15:11:37
> Full thread dump OpenJDK 64-Bit Server VM (25.252-b09 mixed mode):
> "Attach Listener" #182 daemon prio=9 os_prio=0 tid=0x00007f02bc001000 
> nid=0x106 waiting on condition [0x0000000000000000]
>    java.lang.Thread.State: RUNNABLE
>    Locked ownable synchronizers:
>       - None
> "DestroyJavaVM" #179 prio=5 os_prio=0 tid=0x00007f0fe0017000 nid=0x35 waiting 
> on condition [0x0000000000000000]
>    java.lang.Thread.State: RUNNABLE
>    Locked ownable synchronizers:
>       - None
> "s3a-transfer-unbounded-pool2-t1" #172 daemon prio=5 os_prio=0 
> tid=0x00007f025d98d000 nid=0xe5 waiting on condition [0x00007f01f86f3000]
>    java.lang.Thread.State: WAITING (parking)
>       at sun.misc.Unsafe.park(Native Method)
>       - parking to wait for  <0x00007f0353681b38> (a 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
>       at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
>       at 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039)
>       at 
> java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442)
>       at 
> java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1074)
>       at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134)
>       at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>       at java.lang.Thread.run(Thread.java:748)
>    Locked ownable synchronizers:
>       - None
> "java-sdk-progress-listener-callback-thread" #169 daemon prio=5 os_prio=0 
> tid=0x00007f002000f000 nid=0xe2 waiting on condition [0x00007f004f7f6000]
>    java.lang.Thread.State: WAITING (parking)
>       at sun.misc.Unsafe.park(Native Method)
>       - parking to wait for  <0x00007f0bdb1ba7c0> (a 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
>       at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
>       at 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039)
>       at 
> java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442)
>       at 
> java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1074)
>       at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134)
>       at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>       at java.lang.Thread.run(Thread.java:748)
>    Locked ownable synchronizers:
>       - None
> "pool-26-thread-1" #72 prio=5 os_prio=0 tid=0x00007f025c829000 nid=0x80 
> waiting on condition [0x00007f01ba931000]
>    java.lang.Thread.State: WAITING (parking)
>       at sun.misc.Unsafe.park(Native Method)
>       - parking to wait for  <0x00007f0bfdeaa8f0> (a 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
>       at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
>       at 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039)
>       at 
> java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442)
>       at 
> java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1074)
>       at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134)
>       at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>       at java.lang.Thread.run(Thread.java:748)
>    Locked ownable synchronizers:
>       - None
> "java-sdk-http-connection-reaper" #56 daemon prio=5 os_prio=0 
> tid=0x00007f025d818000 nid=0x6e waiting on condition [0x00007f01fb9fe000]
>    java.lang.Thread.State: TIMED_WAITING (sleeping)
>       at java.lang.Thread.sleep(Native Method)
>       at 
> com.amazonaws.http.IdleConnectionReaper.run(IdleConnectionReaper.java:188)
>    Locked ownable synchronizers:
>       - None
> "Timer for 's3a-file-system' metrics system" #55 daemon prio=5 os_prio=0 
> tid=0x00007f0fe19e6800 nid=0x6d in Object.wait() [0x00007f029c1d8000]
>    java.lang.Thread.State: TIMED_WAITING (on object monitor)
>       at java.lang.Object.wait(Native Method)
>       at java.util.TimerThread.mainLoop(Timer.java:552)
>       - locked <0x00007f0353383bd0> (a java.util.TaskQueue)
>       at java.util.TimerThread.run(Timer.java:505)
>    Locked ownable synchronizers:
>       - None
> "MutableQuantiles-0" #54 daemon prio=5 os_prio=0 tid=0x00007f025d78b800 
> nid=0x6c runnable [0x00007f029c2d9000]
>    java.lang.Thread.State: TIMED_WAITING (parking)
>       at sun.misc.Unsafe.park(Native Method)
>       - parking to wait for  <0x00007f0351a09dd8> (a 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
>       at 
> java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
>       at 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
>       at 
> java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(ScheduledThreadPoolExecutor.java:1093)
>       at 
> java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(ScheduledThreadPoolExecutor.java:809)
>       at 
> java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1074)
>       at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134)
>       at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>       at java.lang.Thread.run(Thread.java:748)
>    Locked ownable synchronizers:
>       - None
> "org.apache.hadoop.fs.FileSystem$Statistics$StatisticsDataReferenceCleaner" 
> #13 daemon prio=5 os_prio=0 tid=0x00007f0fe1182000 nid=0x45 in Object.wait() 
> [0x00007f02c50d7000]
>    java.lang.Thread.State: WAITING (on object monitor)
>       at java.lang.Object.wait(Native Method)
>       - waiting on <0x00007f0350ea4390> (a java.lang.ref.ReferenceQueue$Lock)
>       at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:144)
>       - locked <0x00007f0350ea4390> (a java.lang.ref.ReferenceQueue$Lock)
>       at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:165)
>       at 
> org.apache.hadoop.fs.FileSystem$Statistics$StatisticsDataReferenceCleaner.run(FileSystem.java:3839)
>       at java.lang.Thread.run(Thread.java:748)
>    Locked ownable synchronizers:
>       - None
> "Service Thread" #7 daemon prio=9 os_prio=0 tid=0x00007f0fe00e2000 nid=0x3f 
> runnable [0x0000000000000000]
>    java.lang.Thread.State: RUNNABLE
>    Locked ownable synchronizers:
>       - None
> "C1 CompilerThread1" #6 daemon prio=9 os_prio=0 tid=0x00007f0fe00c7800 
> nid=0x3e waiting on condition [0x0000000000000000]
>    java.lang.Thread.State: RUNNABLE
>    Locked ownable synchronizers:
>       - None
> "C2 CompilerThread0" #5 daemon prio=9 os_prio=0 tid=0x00007f0fe00c4800 
> nid=0x3d waiting on condition [0x0000000000000000]
>    java.lang.Thread.State: RUNNABLE
>    Locked ownable synchronizers:
>       - None
> "Signal Dispatcher" #4 daemon prio=9 os_prio=0 tid=0x00007f0fe00c2800 
> nid=0x3c runnable [0x0000000000000000]
>    java.lang.Thread.State: RUNNABLE
>    Locked ownable synchronizers:
>       - None
> "Finalizer" #3 daemon prio=8 os_prio=0 tid=0x00007f0fe0090000 nid=0x3b in 
> Object.wait() [0x00007f033fffe000]
>    java.lang.Thread.State: WAITING (on object monitor)
>       at java.lang.Object.wait(Native Method)
>       at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:144)
>       - locked <0x00007f03504176d8> (a java.lang.ref.ReferenceQueue$Lock)
>       at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:165)
>       at java.lang.ref.Finalizer$FinalizerThread.run(Finalizer.java:216)
>    Locked ownable synchronizers:
>       - None
> "Reference Handler" #2 daemon prio=10 os_prio=0 tid=0x00007f0fe008b800 
> nid=0x3a in Object.wait() [0x00007f034416a000]
>    java.lang.Thread.State: WAITING (on object monitor)
>       at java.lang.Object.wait(Native Method)
>       at java.lang.Object.wait(Object.java:502)
>       at java.lang.ref.Reference.tryHandlePending(Reference.java:191)
>       - locked <0x00007f0350424c20> (a java.lang.ref.Reference$Lock)
>       at java.lang.ref.Reference$ReferenceHandler.run(Reference.java:153)
>    Locked ownable synchronizers:
>       - None
> "VM Thread" os_prio=0 tid=0x00007f0fe0082000 nid=0x39 runnable 
> "GC task thread#0 (ParallelGC)" os_prio=0 tid=0x00007f0fe002c000 nid=0x36 
> runnable 
> "GC task thread#1 (ParallelGC)" os_prio=0 tid=0x00007f0fe002d800 nid=0x37 
> runnable 
> "GC task thread#2 (ParallelGC)" os_prio=0 tid=0x00007f0fe002f800 nid=0x38 
> runnable 
> "VM Periodic Task Thread" os_prio=0 tid=0x00007f0fe00ec800 nid=0x40 waiting 
> on condition 
> JNI global references: 6244
>  {code}



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to