[ 
https://issues.apache.org/jira/browse/HAWQ-978?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Ming LI updated HAWQ-978:
-------------------------
    Description: 
One backend process on the master has been running for several days and
can't be terminated.
The session is idle on all segments, but not on the master instance.

pstack/strace/backtrace output of the backend process:

{code}
[gpadmin@alpmdwgp1prd ~]$ pstack 423984
Thread 2 (Thread 0x7f0457844700 (LWP 424026)):
#0  0x00007f04756670d3 in poll () from /lib64/libc.so.6
#1  0x0000000000b90114 in rxThreadFunc ()
#2  0x00007f0475e889d1 in start_thread () from /lib64/libpthread.so.0
#3  0x00007f04756708fd in clone () from /lib64/libc.so.6
Thread 1 (Thread 0x7f047862b720 (LWP 423984)):
#0  0x00007f047568005e in __lll_lock_wait_private () from /lib64/libc.so.6
#1  0x00007f0475604dc0 in _L_lock_5199 () from /lib64/libc.so.6
#2  0x00007f047560071b in _int_free () from /lib64/libc.so.6
#3  0x0000000000b1be91 in gp_free2 ()
#4  0x0000000000b10acc in AllocSetDelete ()
#5  0x0000000000b1468b in MemoryContextDeleteImpl ()
#6  0x0000000000aaf0f1 in RelationDestroyRelation ()
#7  0x0000000000ab60f2 in RelationCacheInvalidate ()
#8  0x0000000000aa9453 in InvalidateSystemCaches ()
#9  0x0000000000937eeb in ReceiveSharedInvalidMessages ()
#10 0x000000000093c295 in LockRelationOid ()
#11 0x00000000004d8afd in heap_open ()
#12 0x0000000000aa46d4 in SearchCatCache ()
#13 0x00000000005c6512 in caql_getnext ()
#14 0x0000000000749153 in sql_exec_error_callback ()
#15 0x0000000000ad6e5a in errfinish ()
#16 0x0000000000ad8ed9 in elog_finish ()
#17 0x0000000000944e6b in handle_sig_alarm ()
#18 <signal handler called>
#19 0x00007f047560168f in _int_malloc () from /lib64/libc.so.6
#20 0x00007f04756026b1 in malloc () from /lib64/libc.so.6
#21 0x0000000000b1c2c1 in gp_malloc ()
#22 0x0000000000b1259c in AllocSetAlloc ()
#23 0x0000000000b15f5d in MemoryContextAllocZeroImpl ()
#24 0x0000000000b6cb4f in initMotionLayerStructs ()
#25 0x00000000007275e0 in ExecutorStart ()
#26 0x0000000000749a2e in fmgr_sql ()
#27 0x000000000072e316 in ExecMakeFunctionResultNoSets ()
#28 0x000000000072e129 in ExecMakeFunctionResultNoSets ()
#29 0x0000000000733312 in ExecProject ()
#30 0x00000000007602c7 in ExecHashJoin ()
#31 0x000000000072ca84 in ExecProcNode ()
#32 0x000000000076bf38 in ExecSort ()
#33 0x000000000072caa6 in ExecProcNode ()
#34 0x000000000072199c in ExecutePlan ()
#35 0x00000000007221a8 in ExecutorRun ()
#36 0x0000000000971e09 in PortalRun ()
#37 0x0000000000966968 in exec_simple_query ()
#38 0x0000000000969ab9 in PostgresMain ()
#39 0x00000000008c707e in ServerLoop ()
#40 0x00000000008c9e20 in PostmasterMain ()
#41 0x00000000007c85af in main ()
{code}

  was:
One backend process on the master has been running for several days and
can't be terminated.
The session is idle on all segments, but not on the master instance.

pstack/strace/backtrace output of the backend process:

{code}
[gpadmin@avw7hdm2p1 ~]$ pstack 431263
Thread 2 (Thread 0x7f4c93aa2700 (LWP 431264)):
#0  0x00007f4c9013f0d3 in poll () from /lib64/libc.so.6
#1  0x0000000000ba8294 in rxThreadFunc ()
#2  0x00007f4c9101f9d1 in start_thread () from /lib64/libpthread.so.0
#3  0x00007f4c901488fd in clone () from /lib64/libc.so.6
Thread 1 (Thread 0x7f4c93af48e0 (LWP 431263)):
#0  0x00007f4c9015805e in __lll_lock_wait_private () from /lib64/libc.so.6
#1  0x00007f4c900dd16b in _L_lock_9503 () from /lib64/libc.so.6
#2  0x00007f4c900da6a6 in malloc () from /lib64/libc.so.6
#3  0x00007f4c9008fb39 in _nl_make_l10nflist () from /lib64/libc.so.6
#4  0x00007f4c9008ddf5 in _nl_find_domain () from /lib64/libc.so.6
#5  0x00007f4c9008d6e0 in __dcigettext () from /lib64/libc.so.6
#6  0x00007f4c6fabcfe3 in Rf_onsigusr1 () from /usr/local/lib64/R/lib/libR.so
#7  <signal handler called>
#8  0x00007f4c9014079a in brk () from /lib64/libc.so.6
#9  0x00007f4c90140845 in sbrk () from /lib64/libc.so.6
#10 0x00007f4c900dd769 in __default_morecore () from /lib64/libc.so.6
#11 0x00007f4c900d87a2 in _int_free () from /lib64/libc.so.6
#12 0x0000000000b3ff24 in gp_free2 ()
#13 0x0000000000b356fc in AllocSetDelete ()
#14 0x0000000000b38391 in MemoryContextDeleteImpl ()
#15 0x000000000077c851 in ExecEndAgg ()
#16 0x00000000007592ad in ExecEndNode ()
#17 0x000000000075186c in ExecEndPlan ()
#18 0x000000000079dffa in ExecEndSubqueryScan ()
#19 0x000000000075921d in ExecEndNode ()
#20 0x000000000075186c in ExecEndPlan ()
#21 0x0000000000752565 in ExecutorEnd ()
#22 0x00000000006dd9bd in PortalCleanup ()
#23 0x0000000000b3f077 in AtCommit_Portals ()
#24 0x000000000051abe5 in CommitTransaction ()
#25 0x000000000051f1d5 in CommitTransactionCommand ()
#26 0x000000000099809e in PostgresMain ()
#27 0x00000000008f1031 in BackendStartup ()
#28 0x00000000008f70e0 in PostmasterMain ()
#29 0x00000000007f63da in main ()
[gpadmin@avw7hdm2p1 ~]$


[gpadmin@avw7hdm2p1 ~]$ strace -p 431263
Process 431263 attached - interrupt to quit
futex(0x7f4c903efe80, FUTEX_WAIT_PRIVATE, 2, NULL^C <unfinished ...>
Process 431263 detached
[gpadmin@avw7hdm2p1 ~]$



(gdb) thread apply all bt

Thread 2 (Thread 0x7f4c93af48e0 (LWP 431263)):
#0  0x00007f4c9015805e in __lll_lock_wait_private () from /lib64/libc.so.6
#1  0x00007f4c900dd16b in _L_lock_9503 () from /lib64/libc.so.6
#2  0x00007f4c900da6a6 in malloc () from /lib64/libc.so.6
#3  0x00007f4c9008fb39 in _nl_make_l10nflist () from /lib64/libc.so.6
#4  0x00007f4c9008ddf5 in _nl_find_domain () from /lib64/libc.so.6
#5  0x00007f4c9008d6e0 in __dcigettext () from /lib64/libc.so.6
#6  0x00007f4c6fabcfe3 in Rf_onsigusr1 (dummy=<value optimized out>) at 
errors.c:178
#7  <signal handler called>
#8  0x00007f4c9014079a in brk () from /lib64/libc.so.6
#9  0x00007f4c90140845 in sbrk () from /lib64/libc.so.6
#10 0x00007f4c900dd769 in __default_morecore () from /lib64/libc.so.6
#11 0x00007f4c900d87a2 in _int_free () from /lib64/libc.so.6
#12 0x0000000000b3ff24 in gp_free2 (ptr=0x191c3b000, sz=0) at memprot.c:808
#13 0x0000000000b356fc in AllocSetDelete (context=<value optimized out>) at 
aset.c:981
#14 0x0000000000b38391 in MemoryContextDeleteImpl (context=0x4a46da0, 
sfile=0x0, func=<value optimized out>, sline=-1) at mcxt.c:232
#15 MemoryContextDeleteChildren (context=0x4a46da0, sfile=0x0, func=<value 
optimized out>, sline=-1) at mcxt.c:251
#16 MemoryContextDeleteImpl (context=0x4a46da0, sfile=0x0, func=<value 
optimized out>, sline=-1) at mcxt.c:205
#17 0x000000000077c851 in ExecEndAgg (node=0x325eb00) at nodeAgg.c:2641
#18 0x00000000007592ad in ExecEndNode (node=0x325eb00) at execProcnode.c:1687
#19 0x000000000075186c in ExecEndPlan (planstate=0x325eb00, estate=0x323f9e8) 
at execMain.c:2825
#20 0x000000000079dffa in ExecEndSubqueryScan (node=0x325cd20) at 
nodeSubqueryscan.c:294
#21 0x000000000075921d in ExecEndNode (node=0x325cd20) at execProcnode.c:1638
#22 0x000000000075186c in ExecEndPlan (planstate=0x325cd20, estate=0x323f010) 
at execMain.c:2825
#23 0x0000000000752565 in ExecutorEnd (queryDesc=<value optimized out>) at 
execMain.c:1321
#24 0x00000000006dd9bd in PortalCleanupHelper (portal=<value optimized out>) at 
portalcmds.c:366
#25 PortalCleanup (portal=<value optimized out>) at portalcmds.c:302
#26 0x0000000000b3f077 in PortalDrop () at portalmem.c:402
#27 AtCommit_Portals () at portalmem.c:643
#28 0x000000000051abe5 in CommitTransaction () at xact.c:3379
#29 0x000000000051f1d5 in CommitTransactionCommand () at xact.c:4535
#30 0x000000000099809e in finish_xact_command (argc=<value optimized out>, 
argv=<value optimized out>, username=<value optimized out>) at postgres.c:3180
#31 PostgresMain (argc=<value optimized out>, argv=<value optimized out>, 
username=<value optimized out>) at postgres.c:5260
#32 0x00000000008f1031 in BackendRun (port=0x2aa5520) at postmaster.c:6811
#33 BackendStartup (port=0x2aa5520) at postmaster.c:6408
#34 0x00000000008f70e0 in ServerLoop (argc=<value optimized out>, argv=<value 
optimized out>) at postmaster.c:2350
#35 PostmasterMain (argc=<value optimized out>, argv=<value optimized out>) at 
postmaster.c:1556
#36 0x00000000007f63da in main (argc=18, argv=0x2aa1270) at main.c:217

Thread 1 (Thread 0x7f4c93aa2700 (LWP 431264)):
#0  0x00007f4c9013f0d3 in poll () from /lib64/libc.so.6
#1  0x0000000000ba8294 in rxThreadFunc (arg=<value optimized out>) at 
ic_udp.c:6263
#2  0x00007f4c9101f9d1 in start_thread () from /lib64/libpthread.so.0
#3  0x00007f4c901488fd in clone () from /lib64/libc.so.6
(gdb)
{code}


> Long-running query hangs on master and can't be terminated
> ----------------------------------------------------------
>
>                 Key: HAWQ-978
>                 URL: https://issues.apache.org/jira/browse/HAWQ-978
>             Project: Apache HAWQ
>          Issue Type: Bug
>            Reporter: Ming LI
>            Assignee: Lei Chang
>
> One backend process on the master has been running for several days and
> can't be terminated.
> The session is idle on all segments, but not on the master instance.
> pstack/strace/backtrace output of the backend process:
> {code}
> [gpadmin@alpmdwgp1prd ~]$ pstack 423984
> Thread 2 (Thread 0x7f0457844700 (LWP 424026)):
> #0  0x00007f04756670d3 in poll () from /lib64/libc.so.6
> #1  0x0000000000b90114 in rxThreadFunc ()
> #2  0x00007f0475e889d1 in start_thread () from /lib64/libpthread.so.0
> #3  0x00007f04756708fd in clone () from /lib64/libc.so.6
> Thread 1 (Thread 0x7f047862b720 (LWP 423984)):
> #0  0x00007f047568005e in __lll_lock_wait_private () from /lib64/libc.so.6
> #1  0x00007f0475604dc0 in _L_lock_5199 () from /lib64/libc.so.6
> #2  0x00007f047560071b in _int_free () from /lib64/libc.so.6
> #3  0x0000000000b1be91 in gp_free2 ()
> #4  0x0000000000b10acc in AllocSetDelete ()
> #5  0x0000000000b1468b in MemoryContextDeleteImpl ()
> #6  0x0000000000aaf0f1 in RelationDestroyRelation ()
> #7  0x0000000000ab60f2 in RelationCacheInvalidate ()
> #8  0x0000000000aa9453 in InvalidateSystemCaches ()
> #9  0x0000000000937eeb in ReceiveSharedInvalidMessages ()
> #10 0x000000000093c295 in LockRelationOid ()
> #11 0x00000000004d8afd in heap_open ()
> #12 0x0000000000aa46d4 in SearchCatCache ()
> #13 0x00000000005c6512 in caql_getnext ()
> #14 0x0000000000749153 in sql_exec_error_callback ()
> #15 0x0000000000ad6e5a in errfinish ()
> #16 0x0000000000ad8ed9 in elog_finish ()
> #17 0x0000000000944e6b in handle_sig_alarm ()
> #18 <signal handler called>
> #19 0x00007f047560168f in _int_malloc () from /lib64/libc.so.6
> #20 0x00007f04756026b1 in malloc () from /lib64/libc.so.6
> #21 0x0000000000b1c2c1 in gp_malloc ()
> #22 0x0000000000b1259c in AllocSetAlloc ()
> #23 0x0000000000b15f5d in MemoryContextAllocZeroImpl ()
> #24 0x0000000000b6cb4f in initMotionLayerStructs ()
> #25 0x00000000007275e0 in ExecutorStart ()
> #26 0x0000000000749a2e in fmgr_sql ()
> #27 0x000000000072e316 in ExecMakeFunctionResultNoSets ()
> #28 0x000000000072e129 in ExecMakeFunctionResultNoSets ()
> #29 0x0000000000733312 in ExecProject ()
> #30 0x00000000007602c7 in ExecHashJoin ()
> #31 0x000000000072ca84 in ExecProcNode ()
> #32 0x000000000076bf38 in ExecSort ()
> #33 0x000000000072caa6 in ExecProcNode ()
> #34 0x000000000072199c in ExecutePlan ()
> #35 0x00000000007221a8 in ExecutorRun ()
> #36 0x0000000000971e09 in PortalRun ()
> #37 0x0000000000966968 in exec_simple_query ()
> #38 0x0000000000969ab9 in PostgresMain ()
> #39 0x00000000008c707e in ServerLoop ()
> #40 0x00000000008c9e20 in PostmasterMain ()
> #41 0x00000000007c85af in main ()
> {code}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to