Hi Hackers, We encounter one problem happened while we try to promote standby postgres(version 9.6.9) server to active. From the trace(we triggered the process abort). We can see the process was hang in syslog() while handling SIGTERM. According to below article. Looks it is risky to write syslog in signal handling. Any idea to avoid it?
https://cwe.mitre.org/data/definitions/479.html [New LWP 7741] b[Thread debugging using libthread_db enabled] Using host libthread_db library "/lib64/libthread_db.so.1". Core was generated by `postgres: wal receiver process '. Program terminated with signal SIGABRT, Aborted. #0 0x00007ff25122d8db in __lll_lock_wait_private () from /lib64/libc.so.6 Missing separate debuginfos, use: dnf debuginfo-install postgresql-server-9.6.9-3.wf28.x86_64 (gdb) bt full #0 0x00007ff25122d8db in __lll_lock_wait_private () from /lib64/libc.so.6 No symbol table info available. #1 0x00007ff25121aa7b in __vsyslog_chk () from /lib64/libc.so.6 No symbol table info available. #2 0x00007ff25121ae5c in syslog () from /lib64/libc.so.6 No symbol table info available. #3 0x0000000000705e88 in write_syslog (level=3, line=0x2b19cd8 "FATAL: terminating walreceiver process due to administrator command\n") at elog.c:2033 buf = "FATAL: terminating walreceiver process due to administrator command\000\200\310\a\000\000\000\000\000U\211*Q\362\177\000\000\000\000\000\000\000\000\000\000P\317\033\373\377\177", '\000' <repeats 42 times>, "\020\272\261\002\000\000\000\000\020", '\000' <repeats 11 times>, "\001", '\000' <repeats 35 times>, "\240\311\033"... buflen = 68 i = <optimized out> chunk_nr = <optimized out> seq = 4 len = 69 nlpos = 0x2b19d1c "\n" #4 0x0000000000707105 in send_message_to_server_log (edata=0xb4ded8 <errordata+184>) at elog.c:3003 syslog_level = <optimized out> buf = {data = 0x2b19cd8 "FATAL: terminating walreceiver process due to administrator command\n", len = 69, maxlen = 1024, cursor = 0} buf = <optimized out> syslog_level = <optimized out> #5 EmitErrorReport () at elog.c:1479 edata = 0xb4ded8 <errordata+184> oldcontext = <optimized out> #6 0x0000000000707d8f in errfinish (dummy=<optimized out>) at elog.c:483 edata = 0xb4ded8 <errordata+184> elevel = 21 oldcontext = <optimized out> econtext = 0x0 __func__ = "errfinish" #7 0x0000000000622e58 in ProcessWalRcvInterrupts () at walreceiver.c:171 No locals. #8 0x0000000000622e9f in WalRcvShutdownHandler (postgres_signal_arg=<optimized out>) at walreceiver.c:821 save_errno = 0 #9 <signal handler called> No symbol table info available. #10 0x00007ff2512218fb in send () from /lib64/libc.so.6 No symbol table info available. #11 0x00007ff25121ab72 in __vsyslog_chk () from /lib64/libc.so.6 No symbol table info available. #12 0x00007ff25121ae5c in syslog () from /lib64/libc.so.6 No symbol table info available. #13 0x0000000000705e88 in write_syslog (level=3, line=0x2b19929 "\t\tIs the server running on host \"192.168.1.8\" and accepting\n\t\tTCP/IP connections on port 5432?\n\t\n") at elog.c:2033 buf = "\t\tIs the server running on host \"192.168.1.8\" and accepting\000onnect to server: Connection refused\000\377\177\000\000x\376\263\002", '\000' <repeats 12 times>, "\003\000\000\000\000\000\000\000\000\300\rR\362\177\000\000\377\377\377\377\000\000\000\000\343\030bm\225k\330\001\270\267\337E\362\177\000\000\360\372\263\002\000\000\000\000@\324\033\373\377\177\000\000\000\267\261\030\275\036y=`\324\033\373\377\177\000\000`\324\033\373\377\177\000\000\220\325\033"... buflen = 59 i = <optimized out> chunk_nr = <optimized out> seq = 4 len = 97 nlpos = 0x2b19964 "\n\t\tTCP/IP connections on port 5432?\n\t\n" #14 0x0000000000707105 in send_message_to_server_log (edata=0xb4de20 <errordata>) at elog.c:3003 syslog_level = <optimized out> buf = {data = 0x2b198c8 "terminating walreceiver process due to administrator command", len = 194, maxlen = 1024, cursor = 0} buf = <optimized out> syslog_level = <optimized out> #15 EmitErrorReport () at elog.c:1479 edata = 0xb4de20 <errordata> oldcontext = <optimized out> #16 0x0000000000707d8f in errfinish (dummy=<optimized out>) at elog.c:483 edata = 0xb4de20 <errordata> elevel = 21 oldcontext = <optimized out> econtext = 0x0 __func__ = "errfinish" #17 0x00007ff245dfcc90 in libpqrcv_connect () from /usr/lib/libpqwalreceiver.so No symbol table info available. #18 0x000000000062352b in WalReceiverMain () at walreceiver.c:314 conninfo = "host=192.168.1.8 port=5432 user=_nokfssystestpostgres\000ION\000\354Q\362\177\000\000\001", '\000' <repeats 16 times>, "[\fR\362\177\000\000\250o\023Q\362\177\000\000\060\335\033\373\377\177\000\000:\274\354Q\362\177\000\000x", '\000' <repeats 16 times>, "\020NQ\362\177\000\000x\000\000\000\000\000\000\000\222\335\033\373\377\177\000\000=\245*Q\362\177\000\000\222\335\033\373\377\177\000\000:\274\354Q\362\177\000\000=\036", '\000' <repeats 14 times>, "\001\000\000\000\000\000\000\000"... tmp_conninfo = <optimized out> slotname = "\000\000\000\000\000\000\000\000?\002", '\000' <repeats 12 times>, " \264MQ\362\177\000\000\t\000\000\000\000\000\000\000\260\335\033\373\377\177\000\000\060?\002\000\000\000\000\316l\031Q\362\177\000" startpoint = 578813952 startpointTLI = 55 primaryTLI = 1360676704 first_stream = <optimized out> walrcv = 0x7ff24fc0a500 last_recv_timestamp = <optimized out> now = 582800572427736 ping_sent = <optimized out> __func__ = "WalReceiverMain" #19 0x00000000004d56f7 in AuxiliaryProcessMain (argc=argc@entry=2, argv=argv@entry=0x7ffffb1bddb0) at bootstrap.c:440 progname = 0x73f59e "postgres" flag = -1 userDoption = 0x0 __func__ = "AuxiliaryProcessMain" #20 0x00000000006096ea in StartChildProcess (type=WalReceiverProcess) at postmaster.c:5300 pid = <optimized out> av = {0x73f59e "postgres", 0x7ffffb1bdd90 "-x6", 0x0, 0x1dc0 <error: Cannot access memory at address 0x1dc0>, 0x2b3c850 "\220\061\264\002", 0x7ff2520c5b00 "", 0x4 <error: Cannot access memory at address 0x4>, 0x0, 0x2b3c830 "\220\061\264\002", 0x60a781 <reaper+1490> "E\211,$H\203\304\020[]A\\A]A^?\005\310DW"} ac = 2 typebuf = "-x6", '\000' <repeats 21 times>, "\306jp\000\000\000\000" save_errno = <optimized out> #21 0x0000000000609e3f in MaybeStartWalReceiver () at postmaster.c:5447 No locals. #22 0x000000000060aae3 in sigusr1_handler (postgres_signal_arg=<optimized out>) at postmaster.c:5084 save_errno = 4 __func__ = "sigusr1_handler" #23 <signal handler called> No symbol table info available. #24 0x00007ff251217f57 in select () from /lib64/libc.so.6 No symbol table info available. #25 0x000000000046ec0a in ServerLoop () at postmaster.c:1685 timeout = {tv_sec = 58, tv_usec = 886165} rmask = {fds_bits = {40, 0 <repeats 15 times>}} selres = <optimized out> now = <optimized out> readmask = {fds_bits = {40, 0 <repeats 15 times>}} nSockets = <optimized out> last_lockfile_recheck_time = 1529485360 last_touch_time = 1529485360 __func__ = "ServerLoop" #26 0x000000000060b94b in PostmasterMain (argc=argc@entry=3, argv=argv@entry=0x2b176f0) at postmaster.c:1329 opt = <optimized out> status = <optimized out> userDoption = <optimized out> listen_addr_saved = <optimized out> i = <optimized out> output_config_variable = <optimized out> __func__ = "PostmasterMain" #27 0x000000000047001e in main (argc=3, argv=0x2b176f0) at main.c:228 No locals. Br. :) Chen Yan(Jack)