We've been running across a fair number of haproxy processes lately that
won't shut down. We're currently using 1.7.5, but have also experienced
the issue with earlier versions, 1.7.2 for sure, but likely back even
further.
The processes are signaled to shut down by the
haproxy-systemd-wrapper after the wrapper receives a SIGHUP.
The last things logged by the process were the "Stopping frontend",
"Stopping backend", and "Proxy XXX stopped" messages.

When I do an `lsof -p XXX` I get:

# lsof -p 28856
COMMAND   PID USER   FD      TYPE             DEVICE SIZE/OFF       NODE
NAME
haproxy 28856 root  cwd       DIR              253,0     4096        128 /
haproxy 28856 root  rtd       DIR              253,0     4096        128 /
haproxy 28856 root  txt       REG              253,0  1562240   25168059
/usr/sbin/haproxy
haproxy 28856 root  DEL       REG                0,4           420037375
/dev/zero
haproxy 28856 root  mem       REG              253,0    62184   26659777
/usr/lib64/libnss_files-2.17.so
haproxy 28856 root  mem       REG              253,0   155744   25213445
/usr/lib64/libselinux.so.1
haproxy 28856 root  mem       REG              253,0   111080   26659787
/usr/lib64/libresolv-2.17.so
haproxy 28856 root  mem       REG              253,0    15688   25315637
/usr/lib64/libkeyutils.so.1.5
haproxy 28856 root  mem       REG              253,0    62744   25394528
/usr/lib64/libkrb5support.so.0.1
haproxy 28856 root  mem       REG              253,0   143944   26659785
/usr/lib64/libpthread-2.17.so
haproxy 28856 root  mem       REG              253,0   202568   25300495
/usr/lib64/libk5crypto.so.3.1
haproxy 28856 root  mem       REG              253,0    15848   25213462
/usr/lib64/libcom_err.so.2.1
haproxy 28856 root  mem       REG              253,0   959008   25394526
/usr/lib64/libkrb5.so.3.3
haproxy 28856 root  mem       REG              253,0   324888   25300491
/usr/lib64/libgssapi_krb5.so.2.2
haproxy 28856 root  mem       REG              253,0    11384   25167850
/usr/lib64/libfreebl3.so
haproxy 28856 root  mem       REG              253,0  2118128   25167885
/usr/lib64/libc-2.17.so
haproxy 28856 root  mem       REG              253,0   398264   25195400
/usr/lib64/libpcre.so.1.2.0
haproxy 28856 root  mem       REG              253,0    11112   25195408
/usr/lib64/libpcreposix.so.0.0.1
haproxy 28856 root  mem       REG              253,0  1141928   26148751
/usr/lib64/libm-2.17.so
haproxy 28856 root  mem       REG              253,0  2025472   25300659
/usr/lib64/libcrypto.so.1.0.1e
haproxy 28856 root  mem       REG              253,0   454024   25300661
/usr/lib64/libssl.so.1.0.1e
haproxy 28856 root  mem       REG              253,0    19776   26148750
/usr/lib64/libdl-2.17.so
haproxy 28856 root  mem       REG              253,0    90664   25213451
/usr/lib64/libz.so.1.2.7
haproxy 28856 root  mem       REG              253,0    41080   25167891
/usr/lib64/libcrypt-2.17.so
haproxy 28856 root  mem       REG              253,0   155464   26148745
/usr/lib64/ld-2.17.so
haproxy 28856 root    0u  a_inode                0,9        0       5823
[eventpoll]
haproxy 28856 root    1u     IPv4          420797940      0t0        TCP
10.0.33.145:35754->10.0.33.147:1029 (CLOSE_WAIT)
haproxy 28856 root    2u     IPv4          420266351      0t0        TCP
10.0.33.145:52898->10.0.33.147:1029 (CLOSE_WAIT)
haproxy 28856 root    3r      REG                0,3        0 4026531956 net
haproxy 28856 root    4u     IPv4          422150834      0t0        TCP
10.0.33.145:38874->10.0.33.147:1029 (CLOSE_WAIT)
haproxy 28856 root    5r      REG                0,3        0 4026532437 net
haproxy 28856 root    6r      REG                0,3        0 4026531956 net
haproxy 28856 root   13u     unix 0xffff88009af6e800      0t0  420037384
socket

All those sockets have been sitting there like that for a long time.
The :1029 sockets are "peer" sync connections.
File descriptor 13 is likely one of:
* The syslog connection to /dev/log
* A dead connection from an SSL worker process. We use nbproc>1 with
dedicated processes handling SSL termination, and then unix domain
sockets to forward to the main haproxy process. PID 28856 is the main
process, not an SSL terminator. The SSL terminator processes are already
shut down, so there's nothing on the other end of that socket.
I'm not sure what the other "net" entries (file descriptors 3, 5 and 6) are.



When I `strace -p XXX` I get:

# strace -p 28856
Process 28856 attached
epoll_wait(0, {}, 200, 319)             = 0
epoll_wait(0, {}, 200, 0)               = 0
epoll_wait(0, {}, 200, 362)             = 0
epoll_wait(0, {}, 200, 0)               = 0
epoll_wait(0, {}, 200, 114)             = 0
epoll_wait(0, {}, 200, 0)               = 0
epoll_wait(0, {}, 200, 203)             = 0
epoll_wait(0, {}, 200, 0)               = 0
epoll_wait(0, {}, 200, 331)             = 0
epoll_wait(0, {}, 200, 0)               = 0



When I do `bt full` in gdb I get:

(gdb) bt full
#0  0x00007f5f3efdacf3 in __epoll_wait_nocancel () from /lib64/libc.so.6
No symbol table info available.
#1  0x00007f5f409a4c7c in _do_poll (p=<optimized out>, exp=910827830) at
src/ev_epoll.c:125
        status = <optimized out>
        eo = <optimized out>
        fd = <optimized out>
        opcode = <optimized out>
        count = <optimized out>
        updt_idx = <optimized out>
        wait_time = 831
#2  0x00007f5f409052d8 in run_poll_loop () at src/haproxy.c:1741
        next = <optimized out>
#3  0x00007f5f409014fd in main (argc=<optimized out>, argv=<optimized
out>) at src/haproxy.c:2104
        err = <optimized out>
        retry = <optimized out>
        limit = {rlim_cur = 131149, rlim_max = 131149}
        errmsg =
"\000\000\000\000\000\000\000\000\274/\366>_\177\000\000[\001\000\000\000\000\000\000\030\000\000\000\000\000\000\000nÚ·?_\177\000\000\223\065\247?_\177\000\000\020\006\006@_\177\000\000`\212\305@_\177\000\000\020\006\006@_\177\000\000\260\340\305@_\177\000\000\300L\274\230\374\177\000\000{\353\256?_\177\000\000>\001\000\024"
        pidfd = <optimized out>



When I look at the /proc/XXX/fdinfo/0 (the epoll file descriptor) I get:

# cat /proc/28856/fdinfo/0
pos:    0
flags:    02
mnt_id:    10

Note that there are no file descriptors listed, so the epoll handle is
empty.

I can provide the config if desired, but it's very large, and I'll have
to strip info out of it.


-Patrick

Reply via email to