[
https://issues.apache.org/jira/browse/HIVE-26459?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
zhangbutao updated HIVE-26459:
------------------------------
Description:
Cluster environment: Haoop 3.1.0, Hive 3.1.0, Tez 0.9.2
I often see some tez jobs stuck for a long time. I find partial tez reduce
tasks seems to wait all inputs ready, and if inputs are not ready, the reduce
tasks will wait forever.
I think we can invoke the timeout version of *_waitForAllInputsReady,_* and
this optimization was also mentioned in {*}HIVE-14042{*}. So, let's move
forward with this optimization.
{code:java}
"TezChild" #32 daemon prio=5 os_prio=0 tid=0x0000fffc0a7cb800 nid=0xd012
waiting on condition [0x0000fffbb9b5c000]
java.lang.Thread.State: WAITING (parking)
at sun.misc.Unsafe.park(Native Method)
- parking to wait for <0x000000068c08cd60> (a
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
at
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039)
at
org.apache.tez.runtime.InputReadyTracker$InputReadyMonitor.awaitCondition(InputReadyTracker.java:147)
at
org.apache.tez.runtime.InputReadyTracker.waitForAllInputsReady(InputReadyTracker.java:107)
at
org.apache.tez.runtime.api.impl.TezProcessorContextImpl.waitForAllInputsReady(TezProcessorContextImpl.java:141)
at
org.apache.tez.runtime.api.impl.TezProcessorContextImpl.waitForAllInputsReady(TezProcessorContextImpl.java:136)
at
org.apache.hadoop.hive.ql.exec.tez.ReduceRecordProcessor.init(ReduceRecordProcessor.java:122)
at
org.apache.hadoop.hive.ql.exec.tez.TezProcessor.initializeAndRunProcessor(TezProcessor.java:266)
at
org.apache.hadoop.hive.ql.exec.tez.TezProcessor.run(TezProcessor.java:250)
at
org.apache.tez.runtime.LogicalIOProcessorRuntimeTask.run(LogicalIOProcessorRuntimeTask.java:374)
at
org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:73)
at
org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:61)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1699)
at
org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:61)
at
org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:37)
at org.apache.tez.common.CallableWithNdc.call(CallableWithNdc.java:36)
at
com.google.common.util.concurrent.TrustedListenableFutureTask$TrustedFutureInterruptibleTask.runInterruptibly(TrustedListenableFutureTask.java:108)
at
com.google.common.util.concurrent.InterruptibleTask.run(InterruptibleTask.java:41)
at
com.google.common.util.concurrent.TrustedListenableFutureTask.run(TrustedListenableFutureTask.java:77)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748) {code}
was:
Cluster environment: Haoop 3.1.0, Hive 3.1.0, Tez 0.9.2
I often see some tez jobs stuck for a long time. I find partial tez reduce
tasks seems to wait all inputs ready, and if inputs are not ready, the reduce
tasks will wait forever.
I think we can invoke the timeout version of *_waitForAllInputsReady,_* and __
this optimization was also mentioned in {*}HIVE-14042{*}. So, let's move
forward with this optimization.
{code:java}
"TezChild" #32 daemon prio=5 os_prio=0 tid=0x0000fffc0a7cb800 nid=0xd012
waiting on condition [0x0000fffbb9b5c000]
java.lang.Thread.State: WAITING (parking)
at sun.misc.Unsafe.park(Native Method)
- parking to wait for <0x000000068c08cd60> (a
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
at
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039)
at
org.apache.tez.runtime.InputReadyTracker$InputReadyMonitor.awaitCondition(InputReadyTracker.java:147)
at
org.apache.tez.runtime.InputReadyTracker.waitForAllInputsReady(InputReadyTracker.java:107)
at
org.apache.tez.runtime.api.impl.TezProcessorContextImpl.waitForAllInputsReady(TezProcessorContextImpl.java:141)
at
org.apache.tez.runtime.api.impl.TezProcessorContextImpl.waitForAllInputsReady(TezProcessorContextImpl.java:136)
at
org.apache.hadoop.hive.ql.exec.tez.ReduceRecordProcessor.init(ReduceRecordProcessor.java:122)
at
org.apache.hadoop.hive.ql.exec.tez.TezProcessor.initializeAndRunProcessor(TezProcessor.java:266)
at
org.apache.hadoop.hive.ql.exec.tez.TezProcessor.run(TezProcessor.java:250)
at
org.apache.tez.runtime.LogicalIOProcessorRuntimeTask.run(LogicalIOProcessorRuntimeTask.java:374)
at
org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:73)
at
org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:61)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1699)
at
org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:61)
at
org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:37)
at org.apache.tez.common.CallableWithNdc.call(CallableWithNdc.java:36)
at
com.google.common.util.concurrent.TrustedListenableFutureTask$TrustedFutureInterruptibleTask.runInterruptibly(TrustedListenableFutureTask.java:108)
at
com.google.common.util.concurrent.InterruptibleTask.run(InterruptibleTask.java:41)
at
com.google.common.util.concurrent.TrustedListenableFutureTask.run(TrustedListenableFutureTask.java:77)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748) {code}
> ReduceRecordProcessor: move to using a timeout version of
> waitForAllInputsReady(TEZ-3302)
> -----------------------------------------------------------------------------------------
>
> Key: HIVE-26459
> URL: https://issues.apache.org/jira/browse/HIVE-26459
> Project: Hive
> Issue Type: Improvement
> Components: HiveServer2
> Affects Versions: 4.0.0-alpha-2
> Reporter: zhangbutao
> Assignee: zhangbutao
> Priority: Major
> Labels: pull-request-available
> Time Spent: 10m
> Remaining Estimate: 0h
>
> Cluster environment: Haoop 3.1.0, Hive 3.1.0, Tez 0.9.2
> I often see some tez jobs stuck for a long time. I find partial tez reduce
> tasks seems to wait all inputs ready, and if inputs are not ready, the reduce
> tasks will wait forever.
> I think we can invoke the timeout version of *_waitForAllInputsReady,_* and
> this optimization was also mentioned in {*}HIVE-14042{*}. So, let's move
> forward with this optimization.
>
> {code:java}
> "TezChild" #32 daemon prio=5 os_prio=0 tid=0x0000fffc0a7cb800 nid=0xd012
> waiting on condition [0x0000fffbb9b5c000]
> java.lang.Thread.State: WAITING (parking)
> at sun.misc.Unsafe.park(Native Method)
> - parking to wait for <0x000000068c08cd60> (a
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
> at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
> at
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039)
> at
> org.apache.tez.runtime.InputReadyTracker$InputReadyMonitor.awaitCondition(InputReadyTracker.java:147)
> at
> org.apache.tez.runtime.InputReadyTracker.waitForAllInputsReady(InputReadyTracker.java:107)
> at
> org.apache.tez.runtime.api.impl.TezProcessorContextImpl.waitForAllInputsReady(TezProcessorContextImpl.java:141)
> at
> org.apache.tez.runtime.api.impl.TezProcessorContextImpl.waitForAllInputsReady(TezProcessorContextImpl.java:136)
> at
> org.apache.hadoop.hive.ql.exec.tez.ReduceRecordProcessor.init(ReduceRecordProcessor.java:122)
> at
> org.apache.hadoop.hive.ql.exec.tez.TezProcessor.initializeAndRunProcessor(TezProcessor.java:266)
> at
> org.apache.hadoop.hive.ql.exec.tez.TezProcessor.run(TezProcessor.java:250)
> at
> org.apache.tez.runtime.LogicalIOProcessorRuntimeTask.run(LogicalIOProcessorRuntimeTask.java:374)
> at
> org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:73)
> at
> org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:61)
> at java.security.AccessController.doPrivileged(Native Method)
> at javax.security.auth.Subject.doAs(Subject.java:422)
> at
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1699)
> at
> org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:61)
> at
> org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:37)
> at org.apache.tez.common.CallableWithNdc.call(CallableWithNdc.java:36)
> at
> com.google.common.util.concurrent.TrustedListenableFutureTask$TrustedFutureInterruptibleTask.runInterruptibly(TrustedListenableFutureTask.java:108)
> at
> com.google.common.util.concurrent.InterruptibleTask.run(InterruptibleTask.java:41)
> at
> com.google.common.util.concurrent.TrustedListenableFutureTask.run(TrustedListenableFutureTask.java:77)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748) {code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)