On Mon, Jun 29, 2015 at 11:28 PM, Jeff Janes <jeff.ja...@gmail.com> wrote:
> On Mon, Jun 29, 2015 at 5:55 PM, Peter Geoghegan <p...@heroku.com> wrote: > >> On Mon, Jun 29, 2015 at 5:37 PM, Jeff Janes <jeff.ja...@gmail.com> wrote: >> > Is there a way to use gdb to figure out who holds the lock they are >> waiting >> > for? >> >> Have you considered building with LWLOCK_STATS defined, and LOCK_DEBUG >> defined? That might do it. >> > > I hadn't planned on running into this problem, so had not compiled > accordingly. > > I thought LOCK_DEBUG would produce too much output, but now I see it > doesn't print anything unless Trace_lwlocks is on (but it still makes more > info available to gdb, as Amit mentioned), so I think that should be OK. > > Since I messed up the gdb session causing the postmaster to SIGKILL all > the children and destroy the evidence, I'll repeat the run compiled with > LOCK_DEBUG and see what it looks like in the morning. > I've gotten the LWLock deadlock again. User backend 24841 holds the WALInsertLocks 7 and is blocked attempting to acquire 6 . So it seems to be violating the lock ordering rules (although I don't see that rule spelled out in xlog.c) The Checkpoint process, meanwhile, has acquired WALInsertLocks 0 through 6 and is blocked on 7. I'm not sure where to go from here. 24841: (gdb) f 2 (gdb) p held_lwlocks $1 = {{lock = 0x7f70e11fae40, mode = LW_EXCLUSIVE}, {lock = 0x7f70e13df080, mode = LW_EXCLUSIVE}, {lock = 0x7f70e11d4280, mode = LW_EXCLUSIVE}, {lock = 0x7f70e11d4280, mode = LW_EXCLUSIVE}, {lock = 0x0, mode = LW_EXCLUSIVE} <repeats 196 times>} (gdb) p T_NAME((LWLock *) held_lwlocks[1]) $2 = 0x84cee1 "WALInsertLocks" (gdb) p T_ID((LWLock *) held_lwlocks[1]) (gdb) p lock $3 = (LWLock *) 0x7f70e13df000 (gdb) p T_NAME((LWLock *) lock) $7 = 0x84cee1 "WALInsertLocks" (gdb) p T_ID((LWLock *) lock) $8 = 6 (gdb) p *(lock) $1 = {mutex = 0 '\000', tranche = 1, state = {value = 1627389952}, nwaiters = {value = 2}, waiters = {head = {prev = 0x7f70e9e29e58, next = 0x7f70e9e2a140}}, owner = 0x7f70e9e2d260} (gdb) p *(lock->owner) $5 = {links = {prev = 0x0, next = 0x0}, sem = {semId = 539754537, semNum = 0}, waitStatus = 0, procLatch = {is_set = 0, is_shared = 1 '\001', owner_pid = 24820}, lxid = 0, pid = 24820, pgprocno = 112, backendId = -1, databaseId = 0, roleId = 0, recoveryConflictPending = 0 '\000', lwWaiting = 1 '\001', lwWaitMode = 0 '\000', lwWaitLink = { prev = 0x7f70e13df090, next = 0x7f70e13df090}, waitLock = 0x0, waitProcLock = 0x0, waitLockMode = 0, heldLocks = 0, waitLSN = 0, syncRepState = 0, syncRepLinks = {prev = 0x0, next = 0x0}, myProcLocks = {{prev = 0x7f70e9e2d2f0, next = 0x7f70e9e2d2f0}, {prev = 0x7f70e9e2d300, next = 0x7f70e9e2d300}, {prev = 0x7f70e9e2d310, next = 0x7f70e9e2d310}, { prev = 0x7f70e9e2d320, next = 0x7f70e9e2d320}, {prev = 0x7f70e9e2d330, next = 0x7f70e9e2d330}, {prev = 0x7f70e9e2d340, next = 0x7f70e9e2d340}, {prev = 0x7f70e9e2d350, next = 0x7f70e9e2d350}, {prev = 0x7f70e9e2d360, next = 0x7f70e9e2d360}, {prev = 0x7f70e9e2d370, next = 0x7f70e9e2d370}, {prev = 0x7f70e9e2d380, next = 0x7f70e9e2d380}, { prev = 0x7f70e9e2d390, next = 0x7f70e9e2d390}, {prev = 0x7f70e9e2d3a0, next = 0x7f70e9e2d3a0}, {prev = 0x7f70e9e2d3b0, next = 0x7f70e9e2d3b0}, {prev = 0x7f70e9e2d3c0, next = 0x7f70e9e2d3c0}, {prev = 0x7f70e9e2d3d0, next = 0x7f70e9e2d3d0}, {prev = 0x7f70e9e2d3e0, next = 0x7f70e9e2d3e0}}, subxids = {xids = {0 <repeats 64 times>}}, backendLock = 0x7f70e13dad00, fpLockBits = 0, fpRelId = {0 <repeats 16 times>}, fpVXIDLock = 0 '\000', fpLocalTransactionId = 0} #0 0x0000003dcb6eaf27 in semop () from /lib64/libc.so.6 #1 0x0000000000671aef in PGSemaphoreLock (sema=0x7f70e9e2a108) at pg_sema.c:387 #2 0x00000000006d79ac in LWLockWaitForVar (lock=0x7f70e13df000, valptr=0x7f70e13df028, oldval=0, newval=0x7fffb33cbd48) at lwlock.c:1406 #3 0x00000000004ee006 in WaitXLogInsertionsToFinish (upto=41590734848) at xlog.c:1576 #4 0x00000000004ee94b in AdvanceXLInsertBuffer (upto=41594920960, opportunistic=<value optimized out>) at xlog.c:1888 #5 0x00000000004f3c42 in GetXLogBuffer (ptr=41594920960) at xlog.c:1669 #6 0x00000000004f40e7 in CopyXLogRecordToWAL (rdata=0x19bc920, fpw_lsn=<value optimized out>) at xlog.c:1313 #7 XLogInsertRecord (rdata=0x19bc920, fpw_lsn=<value optimized out>) at xlog.c:1008 #8 0x00000000004f7c6c in XLogInsert (rmid=13 '\r', info=32 ' ') at xloginsert.c:453 #9 0x000000000047e210 in ginPlaceToPage (btree=0x7fffb33cc070, stack=0x1a5bfe0, insertdata=<value optimized out>, updateblkno=<value optimized out>, childbuf=0, buildStats=0x0) at ginbtree.c:418 #10 0x000000000047f50d in ginInsertValue (btree=0x7fffb33cc070, stack=0x1a5bfe0, insertdata=0x7fffb33cc0f0, buildStats=0x0) at ginbtree.c:748 #11 0x0000000000475deb in ginEntryInsert (ginstate=0x7fffb33cc470, attnum=5544, key=1, category=<value optimized out>, items=0x7f70e10d9140, nitem=51, buildStats=0x0) at gininsert.c:234 #12 0x000000000048602c in ginInsertCleanup (ginstate=0x7fffb33cc470, vac_delay=<value optimized out>, stats=0x0) at ginfast.c:843 #13 0x00000000004871b9 in ginHeapTupleFastInsert (ginstate=0x7fffb33cc470, collector=<value optimized out>) at ginfast.c:436 #14 0x000000000047625a in gininsert (fcinfo=<value optimized out>) at gininsert.c:531 #15 0x00000000007dc483 in FunctionCall6Coll (flinfo=<value optimized out>, collation=<value optimized out>, arg1=<value optimized out>, arg2=<value optimized out>, arg3=<value optimized out>, arg4=<value optimized out>, arg5=140122789534360, arg6=0) at fmgr.c:1436 #16 0x00000000004b2247 in index_insert (indexRelation=0x7f70e1190e60, values=0x7fffb33cef50, isnull=0x7fffb33cf050 "", heap_t_ctid=0x1a4a794, heapRelation=0x7f70e1185a98, checkUnique=UNIQUE_CHECK_NO) at indexam.c:226 #17 0x00000000005d2e67 in ExecInsertIndexTuples (slot=0x1a497f0, tupleid=0x1a4a794, estate=0x1a40768, noDupErr=0 '\000', specConflict=0x0, arbiterIndexes=0x0) at execIndexing.c:384 #18 0x00000000005f0161 in ExecUpdate (tupleid=0x7fffb33cf250, oldtuple=0x0, slot=0x1a497f0, planSlot=0x1a42498, epqstate=0x1a40a70, estate=0x1a40768, canSetTag=1 '\001') at nodeModifyTable.c:978 #19 0x00000000005f0b2a in ExecModifyTable (node=0x1a409d0) at nodeModifyTable.c:1436 #20 0x00000000005d6cc8 in ExecProcNode (node=0x1a409d0) at execProcnode.c:389 #21 0x00000000005d5402 in ExecutePlan (queryDesc=0x1a2b220, direction=<value optimized out>, count=0) at execMain.c:1549 #22 standard_ExecutorRun (queryDesc=0x1a2b220, direction=<value optimized out>, count=0) at execMain.c:337 #23 0x00007f70ea39af3b in pgss_ExecutorRun (queryDesc=0x1a2b220, direction=ForwardScanDirection, count=0) at pg_stat_statements.c:881 #24 0x00000000006ea38f in ProcessQuery (plan=0x1a2b3d8, sourceText=0x1a2af68 "update foo set count=count+1 where text_array @> $1", params=0x1a2afe8, dest=<value optimized out>, completionTag=0x7fffb33cf660 "") at pquery.c:185 #25 0x00000000006ea5ec in PortalRunMulti (portal=0x19a6128, isTopLevel=1 '\001', dest=0xc55020, altdest=0xc55020, completionTag=0x7fffb33cf660 "") at pquery.c:1279 #26 0x00000000006eadb3 in PortalRun (portal=0x19a6128, count=9223372036854775807, isTopLevel=1 '\001', dest=0x1990630, altdest=0x1990630, completionTag=0x7fffb33cf660 "") at pquery.c:816 #27 0x00000000006e6ffb in exec_execute_message (portal_name=0x1990218 "", max_rows=9223372036854775807) at postgres.c:1988 #28 0x00000000006e8c15 in PostgresMain (argc=<value optimized out>, argv=<value optimized out>, dbname=0x19a3210 "jjanes", username=<value optimized out>) at postgres.c:4088 #29 0x00000000006855dd in BackendRun (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:4159 #30 BackendStartup (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:3835 #31 ServerLoop (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:1609 #32 PostmasterMain (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:1254 #33 0x0000000000610ab0 in main (argc=4, argv=0x1976cf0) at main.c:221 24820: (gdb) p held_lwlocks $1 = {{lock = 0x7f70e11d4340, mode = LW_EXCLUSIVE}, {lock = 0x7f70e13ded00, mode = LW_EXCLUSIVE}, {lock = 0x7f70e13ded80, mode = LW_EXCLUSIVE}, {lock = 0x7f70e13dee00, mode = LW_EXCLUSIVE}, {lock = 0x7f70e13dee80, mode = LW_EXCLUSIVE}, {lock = 0x7f70e13def00, mode = LW_EXCLUSIVE}, {lock = 0x7f70e13def80, mode = LW_EXCLUSIVE}, { lock = 0x7f70e13df000, mode = LW_EXCLUSIVE}, {lock = 0x0, mode = LW_EXCLUSIVE} <repeats 192 times>} (gdb) p lock $3 = (LWLock *) 0x7f70e13df080 (gdb) p *lock $4 = {mutex = 0 '\000', tranche = 1, state = {value = 1627389952}, nwaiters = {value = 1}, waiters = {head = {prev = 0x7f70e9e2d2a8, next = 0x7f70e9e2d2a8}}, owner = 0x7f70e9e2a0f8} (gdb) p *(lock->owner) $5 = {links = {prev = 0x0, next = 0x0}, sem = {semId = 539688999, semNum = 15}, waitStatus = 0, procLatch = {is_set = 1, is_shared = 1 '\001', owner_pid = 24841}, lxid = 5523, pid = 24841, pgprocno = 95, backendId = 9, databaseId = 16384, roleId = 10, recoveryConflictPending = 0 '\000', lwWaiting = 1 '\001', lwWaitMode = 2 '\002', lwWaitLink = { prev = 0x7f70e13df010, next = 0x7f70e9e29e58}, waitLock = 0x0, waitProcLock = 0x0, waitLockMode = 0, heldLocks = 0, waitLSN = 0, syncRepState = 0, syncRepLinks = {prev = 0x0, next = 0x0}, myProcLocks = {{prev = 0x7f70e9e2a188, next = 0x7f70e9e2a188}, {prev = 0x7f70e9e2a198, next = 0x7f70e9e2a198}, {prev = 0x7f70e9e2a1a8, next = 0x7f70e9e2a1a8}, { prev = 0x7f70e9bfe060, next = 0x7f70e9bfe060}, {prev = 0x7f70e9e2a1c8, next = 0x7f70e9e2a1c8}, {prev = 0x7f70e9e2a1d8, next = 0x7f70e9e2a1d8}, {prev = 0x7f70e9e2a1e8, next = 0x7f70e9e2a1e8}, {prev = 0x7f70e9e2a1f8, next = 0x7f70e9e2a1f8}, {prev = 0x7f70e9e2a208, next = 0x7f70e9e2a208}, {prev = 0x7f70e9e2a218, next = 0x7f70e9e2a218}, { prev = 0x7f70e9e2a228, next = 0x7f70e9e2a228}, {prev = 0x7f70e9e2a238, next = 0x7f70e9e2a238}, {prev = 0x7f70e9e2a248, next = 0x7f70e9e2a248}, {prev = 0x7f70e9e2a258, next = 0x7f70e9e2a258}, {prev = 0x7f70e9e2a268, next = 0x7f70e9e2a268}, {prev = 0x7f70e9e2a278, next = 0x7f70e9e2a278}}, subxids = {xids = {0 <repeats 64 times>}}, backendLock = 0x7f70e13da8c0, fpLockBits = 160528697655296, fpRelId = {0, 0, 0, 0, 0, 0, 0, 0, 0, 2679, 2610, 16414, 16412, 16416, 16415, 16409}, fpVXIDLock = 1 '\001', fpLocalTransactionId = 5523} (gdb) bt #0 0x0000003dcb6eaf27 in semop () from /lib64/libc.so.6 #1 0x0000000000671aef in PGSemaphoreLock (sema=0x7f70e9e2d270) at pg_sema.c:387 #2 0x00000000006d70ac in LWLockAcquireCommon (lock=0x7f70e13df080, mode=LW_EXCLUSIVE, valptr=0x7f70e13df0a8, val=0) at lwlock.c:1042 #3 0x00000000004e6dab in WALInsertLockAcquireExclusive () at xlog.c:1449 #4 0x00000000004ef025 in CreateCheckPoint (flags=64) at xlog.c:8177 #5 0x00000000006789d3 in CheckpointerMain () at checkpointer.c:493 #6 0x00000000004fecec in AuxiliaryProcessMain (argc=2, argv=0x7fffb33cf1d0) at bootstrap.c:428 #7 0x0000000000680697 in StartChildProcess (type=CheckpointerProcess) at postmaster.c:5143 #8 0x0000000000682b7a in reaper (postgres_signal_arg=<value optimized out>) at postmaster.c:2656 #9 <signal handler called> #10 0x0000003dcb6e1353 in __select_nocancel () from /lib64/libc.so.6 #11 0x00000000006849bf in ServerLoop (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:1573 #12 PostmasterMain (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:1254 #13 0x0000000000610ab0 in main (argc=4, argv=0x1976cf0) at main.c:221 > > Thanks, > > Jeff > > >