Hi hackers!

Some time ago I've seen a hanging logical replication that was trying to send 
transaction commit after doing table pg_repack.
I understand that those things do not mix well. Yet walsender was ignoring 
pg_terminate_backend() and I think this worth fixing.
Can we add CHECK_FOR_INTERRUPTS(); somewhere in this backtrace?  Full session 
is attaches as file.

#0  pfree (pointer=0x561850bbee40) at 
./build/../src/backend/utils/mmgr/mcxt.c:1032
#1  0x00005617712530d6 in ReorderBufferReturnTupleBuf (tuple=<optimized out>, 
rb=<optimized out>) at 
./build/../src/backend/replication/logical/reorderbuffer.c:469
#2  ReorderBufferReturnChange (rb=<optimized out>, change=0x561772456048) at 
./build/../src/backend/replication/logical/reorderbuffer.c:398
#3  0x0000561771253da1 in ReorderBufferRestoreChanges 
(rb=rb@entry=0x561771c14e10, txn=0x561771c0b078, 
file=file@entry=0x561771c15168, segno=segno@entry=0x561771c15178) at 
./build/../src/backend/replication/logical/reorderbuffer.c:2570
#4  0x00005617712553ba in ReorderBufferIterTXNNext (state=0x561771c15130, 
rb=0x561771c14e10) at 
./build/../src/backend/replication/logical/reorderbuffer.c:1146
#5  ReorderBufferCommit (rb=0x561771c14e10, xid=xid@entry=2976347782, 
commit_lsn=79160378448744, end_lsn=<optimized out>, 
commit_time=commit_time@entry=686095734290578, origin_id=origin_id@entry=0, 
origin_lsn=0) at ./build/../src/backend/replication/logical/reorderbuffer.c:1523
#6  0x000056177124a30a in DecodeCommit (xid=2976347782, parsed=0x7ffc3cb4c240, 
buf=0x7ffc3cb4c400, ctx=0x561771b10850) at 
./build/../src/backend/replication/logical/decode.c:640
#7  DecodeXactOp (ctx=0x561771b10850, buf=buf@entry=0x7ffc3cb4c400) at 
./build/../src/backend/replication/logical/decode.c:248
#8  0x000056177124a6a9 in LogicalDecodingProcessRecord (ctx=0x561771b10850, 
record=0x561771b10ae8) at 
./build/../src/backend/replication/logical/decode.c:117
#9  0x000056177125d1e5 in XLogSendLogical () at 
./build/../src/backend/replication/walsender.c:2893
#10 0x000056177125f5f2 in WalSndLoop (send_data=send_data@entry=0x56177125d180 
<XLogSendLogical>) at ./build/../src/backend/replication/walsender.c:2242
#11 0x0000561771260125 in StartLogicalReplication (cmd=<optimized out>) at 
./build/../src/backend/replication/walsender.c:1179
#12 exec_replication_command (cmd_string=cmd_string@entry=0x561771abe590 
"START_REPLICATION SLOT dttsjtaa66crdhbm015h LOGICAL 0/0 ( 
\"include-timestamp\" '1', \"include-types\" '1', \"include-xids\" '1', 
\"write-in-chunks\" '1', \"add-tables\" '/* sanitized 
*/.claim_audit,public.__consu"...) at 
./build/../src/backend/replication/walsender.c:1612
#13 0x00005617712b2334 in PostgresMain (argc=<optimized out>, 
argv=argv@entry=0x561771b2a438, dbname=<optimized out>, username=<optimized 
out>) at ./build/../src/backend/tcop/postgres.c:4267
#14 0x000056177123857c in BackendRun (port=0x561771b0d7a0, port=0x561771b0d7a0) 
at ./build/../src/backend/postmaster/postmaster.c:4484
#15 BackendStartup (port=0x561771b0d7a0) at 
./build/../src/backend/postmaster/postmaster.c:4167
#16 ServerLoop () at ./build/../src/backend/postmaster/postmaster.c:1725
#17 0x000056177123954b in PostmasterMain (argc=9, argv=0x561771ab70e0) at 
./build/../src/backend/postmaster/postmaster.c:1398
#18 0x0000561770fae8b6 in main (argc=9, argv=0x561771ab70e0) at 
./build/../src/backend/main/main.c:228

What do you think?

Thank you!


Best regards, Andrey Borodin.

Attachment: check_for_interrupts.diff
Description: Binary data


Time: 0.731 ms
sas-/* sanitized *//postgres M # select * from pg_replication_slots ;
             slot_name              |  plugin  | slot_type | datoid |   
database   | temporary | active | active_pid |    xmin    | catalog_xmin |  
restart_lsn  | confirmed_flush_lsn 
------------------------------------+----------+-----------+--------+--------------+-----------+--------+------------+------------+--------------+---------------+---------------------
 /* sanitized */ | [null]   | physical  | [null] | [null]       | f         | t 
     |    1719461 | 2980736032 |       [null] | 4818/908C9B98 | [null]
 vla_/* sanitized */_db_yandex_net | [null]   | physical  | [null] | [null]     
  | f         | t      |    1719460 | 2980736032 |       [null] | 4818/908DE000 
| [null]
 dttsjtaa66crdhbm015h               | wal2json | logical   |  16441 | /* 
sanitized */ | f         | t      |    3697390 |     [null] |   2976347004 | 
47F6/3FFE5DA0 | 47FE/F588E800
(3 rows)

Time: 0.454 ms
sas-/* sanitized *//postgres M # select pg_terminate_backend(3697390);
 pg_terminate_backend 
----------------------
 t
(1 row)

Time: 0.189 ms
sas-/* sanitized *//postgres M # select * from pg_replication_slots ;
             slot_name              |  plugin  | slot_type | datoid |   
database   | temporary | active | active_pid |    xmin    | catalog_xmin |  
restart_lsn  | confirmed_flush_lsn 
------------------------------------+----------+-----------+--------+--------------+-----------+--------+------------+------------+--------------+---------------+---------------------
 man_/* sanitized */_db_yandex_net | [null]   | physical  | [null] | [null]     
  | f         | t      |    1719461 | 2980737826 |       [null] | 4818/92EC5638 
| [null]
 vla_/* sanitized */_db_yandex_net | [null]   | physical  | [null] | [null]     
  | f         | t      |    1719460 | 2980737682 |       [null] | 4818/92ED3C18 
| [null]
 dttsjtaa66crdhbm015h               | wal2json | logical   |  16441 | /* 
sanitized */ | f         | t      |    3697390 |     [null] |   2976347004 | 
47F6/3FFE5DA0 | 47FE/F588E800
(3 rows)

Time: 0.371 ms
sas-/* sanitized *//postgres M # select pg_terminate_backend(3697390);
 pg_terminate_backend 
----------------------
 t
(1 row)

Time: 0.165 ms
sas-/* sanitized *//postgres M # select * from pg_replication_slots ;
             slot_name              |  plugin  | slot_type | datoid |   
database   | temporary | active | active_pid |    xmin    | catalog_xmin |  
restart_lsn  | confirmed_flush_lsn 
------------------------------------+----------+-----------+--------+--------------+-----------+--------+------------+------------+--------------+---------------+---------------------
 man_/* sanitized */_db_yandex_net | [null]   | physical  | [null] | [null]     
  | f         | t      |    1719461 | 2980741177 |       [null] | 4818/9A95E000 
| [null]
 vla_/* sanitized */_db_yandex_net | [null]   | physical  | [null] | [null]     
  | f         | t      |    1719460 | 2980741253 |       [null] | 4818/9A978108 
| [null]
 dttsjtaa66crdhbm015h               | wal2json | logical   |  16441 | /* 
sanitized */ | f         | t      |    3697390 |     [null] |   2976347004 | 
47F6/3FFE5DA0 | 47FE/F588E800
(3 rows)

Time: 0.578 ms
sas-/* sanitized *//postgres M # select * from pg_stat_activity where pid = 
'3697390';
-[ RECORD 1 
]----+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
datid            | 16441
datname          | /* sanitized */
pid              | 3697390
usesysid         | 4819584
usename          | data_transfer
application_name | 
client_addr      | 127.0.0.1
client_hostname  | localhost
client_port      | 45966
backend_start    | 2021-09-28 11:24:28.812267+03
xact_start       | [null]
query_start      | 2021-09-28 11:39:24.415415+03
state_change     | 2021-09-28 11:39:24.415438+03
wait_event_type  | IO
wait_event       | ReorderBufferRead
state            | active
backend_xid      | [null]
backend_xmin     | [null]
query            | START_REPLICATION SLOT dttsjtaa66crdhbm015h LOGICAL 0/0 ( 
"include-timestamp" '1', "include-types" '1', "include-xids" '1', 
"write-in-chunks" '1', "add-tables" '/* sanitized */./* sanitized */,public./* 
sanitized */', "filter-tables" 'public.repl_mon' )
backend_type     | walsender

Time: 3.155 ms
sas-/* sanitized *//postgres M # select * from pg_stat_activity where pid = 
'3697390';
-[ RECORD 1 
]----+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
datid            | 16441
datname          | /* sanitized */
pid              | 3697390
usesysid         | 4819584
usename          | data_transfer
application_name | 
client_addr      | 127.0.0.1
client_hostname  | localhost
client_port      | 45966
backend_start    | 2021-09-28 11:24:28.812267+03
xact_start       | [null]
query_start      | 2021-09-28 11:39:24.415415+03
state_change     | 2021-09-28 11:39:24.415438+03
wait_event_type  | IO
wait_event       | ReorderBufferRead
state            | active
backend_xid      | [null]
backend_xmin     | [null]
query            | START_REPLICATION SLOT dttsjtaa66crdhbm015h LOGICAL 0/0 ( 
"include-timestamp" '1', "include-types" '1', "include-xids" '1', 
"write-in-chunks" '1', "add-tables" '/* sanitized */./* sanitized */,public./* 
sanitized */', "filter-tables" 'public.repl_mon' )
backend_type     | walsender

Time: 2.931 ms
sas-/* sanitized *//postgres M # select 
pg_drop_replication_slot('dttsjtaa66crdhbm015h');
ERROR:  55006: replication slot "dttsjtaa66crdhbm015h" is active for PID 3697390
LOCATION:  ReplicationSlotAcquire, slot.c:399
Time: 0.309 ms
sas-/* sanitized *//postgres M # ^C
sas-/* sanitized *//postgres M # \q
[PROD]root@sas-/* sanitized */ ~ # bt -p 3697390
Tue Sep 28 12:16:16 2021
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
pfree (pointer=0x561850bbee40) at ./build/../src/backend/utils/mmgr/mcxt.c:1032
#0  pfree (pointer=0x561850bbee40) at 
./build/../src/backend/utils/mmgr/mcxt.c:1032
#1  0x00005617712530d6 in ReorderBufferReturnTupleBuf (tuple=<optimized out>, 
rb=<optimized out>) at 
./build/../src/backend/replication/logical/reorderbuffer.c:469
#2  ReorderBufferReturnChange (rb=<optimized out>, change=0x561772456048) at 
./build/../src/backend/replication/logical/reorderbuffer.c:398
#3  0x0000561771253da1 in ReorderBufferRestoreChanges 
(rb=rb@entry=0x561771c14e10, txn=0x561771c0b078, 
file=file@entry=0x561771c15168, segno=segno@entry=0x561771c15178) at 
./build/../src/backend/replication/logical/reorderbuffer.c:2570
#4  0x00005617712553ba in ReorderBufferIterTXNNext (state=0x561771c15130, 
rb=0x561771c14e10) at 
./build/../src/backend/replication/logical/reorderbuffer.c:1146
#5  ReorderBufferCommit (rb=0x561771c14e10, xid=xid@entry=2976347782, 
commit_lsn=79160378448744, end_lsn=<optimized out>, 
commit_time=commit_time@entry=686095734290578, origin_id=origin_id@entry=0, 
origin_lsn=0) at ./build/../src/backend/replication/logical/reorderbuffer.c:1523
#6  0x000056177124a30a in DecodeCommit (xid=2976347782, parsed=0x7ffc3cb4c240, 
buf=0x7ffc3cb4c400, ctx=0x561771b10850) at 
./build/../src/backend/replication/logical/decode.c:640
#7  DecodeXactOp (ctx=0x561771b10850, buf=buf@entry=0x7ffc3cb4c400) at 
./build/../src/backend/replication/logical/decode.c:248
#8  0x000056177124a6a9 in LogicalDecodingProcessRecord (ctx=0x561771b10850, 
record=0x561771b10ae8) at 
./build/../src/backend/replication/logical/decode.c:117
#9  0x000056177125d1e5 in XLogSendLogical () at 
./build/../src/backend/replication/walsender.c:2893
#10 0x000056177125f5f2 in WalSndLoop (send_data=send_data@entry=0x56177125d180 
<XLogSendLogical>) at ./build/../src/backend/replication/walsender.c:2242
#11 0x0000561771260125 in StartLogicalReplication (cmd=<optimized out>) at 
./build/../src/backend/replication/walsender.c:1179
#12 exec_replication_command (cmd_string=cmd_string@entry=0x561771abe590 
"START_REPLICATION SLOT dttsjtaa66crdhbm015h LOGICAL 0/0 ( 
\"include-timestamp\" '1', \"include-types\" '1', \"include-xids\" '1', 
\"write-in-chunks\" '1', \"add-tables\" '/* sanitized 
*/.claim_audit,public.__consu"...) at 
./build/../src/backend/replication/walsender.c:1612
#13 0x00005617712b2334 in PostgresMain (argc=<optimized out>, 
argv=argv@entry=0x561771b2a438, dbname=<optimized out>, username=<optimized 
out>) at ./build/../src/backend/tcop/postgres.c:4267
#14 0x000056177123857c in BackendRun (port=0x561771b0d7a0, port=0x561771b0d7a0) 
at ./build/../src/backend/postmaster/postmaster.c:4484
#15 BackendStartup (port=0x561771b0d7a0) at 
./build/../src/backend/postmaster/postmaster.c:4167
#16 ServerLoop () at ./build/../src/backend/postmaster/postmaster.c:1725
#17 0x000056177123954b in PostmasterMain (argc=9, argv=0x561771ab70e0) at 
./build/../src/backend/postmaster/postmaster.c:1398
#18 0x0000561770fae8b6 in main (argc=9, argv=0x561771ab70e0) at 
./build/../src/backend/main/main.c:228

Reply via email to