Hi again,
I verified that local DoS did not cause that high cpu usage. I'm upgrading
from 1.8.19 to 1.9.6.
Please look at strace:
of process with 100% cpu usage:
strace -fp 9790 -c
Process 9790 attached
% time seconds usecs/call calls errors syscall
------ ----------- ----------- --------- --------- ----------------
27.51 0.009501 0 942536 clock_gettime
13.65 0.004716 0 471268 epoll_wait
12.09 0.004176 0 305211 157644 recvfrom
11.09 0.003831 0 109003 64867 connect
9.04 0.003122 0 141908 6479 sendto
6.55 0.002264 0 193372 epoll_ctl
5.05 0.001746 0 57430 close
4.46 0.001539 0 118067 setsockopt
3.05 0.001053 0 94756 1860 getsockopt
2.55 0.000882 0 50009 socket
1.43 0.000493 0 50009 fcntl
1.32 0.000457 0 14947 7353 accept4
0.89 0.000306 0 11660 1 shutdown
0.58 0.000199 0 12172 3226 read
0.47 0.000164 0 3838 279 write
0.21 0.000074 0 6640 bind
0.06 0.000019 0 1860 getsockname
0.00 0.000000 0 4 brk
0.00 0.000000 0 24 sendmsg
------ ----------- ----------- --------- --------- ----------------
100.00 0.034542 2584714 241709 total
fragment of strace:
[...]
epoll_ctl(3, EPOLL_CTL_ADD, 2020, {EPOLLOUT, {u32=2020, u64=2020}}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 87792876}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 87804387}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 87811541}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 87823913}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 87830810}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 87842796}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 87849448}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 87859750}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 87866006}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 87877936}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 87885329}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 87897045}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 87904108}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 87915961}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 87922580}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 87933941}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 87940258}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 87952045}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 87959070}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 87971104}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 87978136}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 87990210}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 87998045}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88009585}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88015976}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88027627}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88034407}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88046698}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88053419}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88065381}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88071551}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88082999}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88089594}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88112393}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88118910}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88130295}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88136707}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88148429}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88155219}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88167019}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88173660}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88185332}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88191732}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88203660}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88210542}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88223145}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88230090}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88242627}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88250786}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88262976}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88270439}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88282001}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88288596}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88299666}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88305992}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88316967}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88324161}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88335865}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88342721}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88354743}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88361932}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88385864}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88392851}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88405104}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88411895}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88423943}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88430569}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88442977}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88450051}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88462006}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88468393}) = 0
epoll_wait(3, {}, 200, 0) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88479825}) = 0
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88497719}) = 0
epoll_wait(3, {{EPOLLIN, {u32=1142, u64=1142}}}, 200, 0) = 1
clock_gettime(CLOCK_THREAD_CPUTIME_ID, {848, 88510556}) = 0
[...]
After some time I have 2 then 3 haproxy processes with 100% cpu.
This is strace of process with normal 10% cpu usage:
strace -fp 9809 -c
Process 9809 attached
% time seconds usecs/call calls errors syscall
------ ----------- ----------- --------- --------- ----------------
38.20 0.020981 0 615163 epoll_wait
15.60 0.008565 0 1230326 clock_gettime
10.32 0.005667 0 110707 66703 connect
8.51 0.004673 0 391704 201233 recvfrom
7.82 0.004292 0 14450 7157 accept4
7.48 0.004106 0 183790 6254 sendto
3.48 0.001909 0 57090 close
3.02 0.001658 0 240248 epoll_ctl
1.66 0.000914 0 50016 socket
1.43 0.000786 0 117936 setsockopt
1.21 0.000666 0 97594 1660 getsockopt
0.51 0.000281 0 50016 fcntl
0.42 0.000229 0 12275 shutdown
0.22 0.000119 0 13236 3752 read
0.08 0.000043 0 4169 280 write
0.03 0.000016 0 6313 bind
0.02 0.000012 0 1660 getsockname
0.00 0.000000 0 4 brk
------ ----------- ----------- --------- --------- ----------------
100.00 0.054917 3196697 287039 total
Any guidance how to debug it further would be very helpful.
czw., 4 kwi 2019 o 14:56 Maciej Zdeb <[email protected]> napisał(a):
> I'm very sorry, it is probably "real" traffic that haproxy needs to
> handle. Some local DDoS occurred just in time when I upgraded to 1.9 (what
> terribly bad luck).
>
> Please ignore previous email! Sorry for wasting your time.
>
> czw., 4 kwi 2019 o 14:36 Maciej Zdeb <[email protected]> napisał(a):
>
>> Hi,
>>
>> After haproxy starts everything is fine, but after some time one of
>> haproxy processes uses 100% of CPU. I've managed to dump some "show"
>> commands (I have also "show fd", "show sess all" and "strace" but if needed
>> I would prefer to send it privately). If you could give any advice howto
>> debug it further it would be great.
>>
>> ps aux | grep haprox
>> haproxy 99023 97.5 0.6 3139384 623476 ? Rs 14:07 16:38
>> /usr/sbin/haproxy -f /etc/haproxy/haproxy.cfg -p /var/run/haproxy.pid -D
>> -sf 15434 15443 15444 15445 15454 15455 15456 15457
>> haproxy 99024 13.0 0.6 3138964 623096 ? Ss 14:07 2:13
>> /usr/sbin/haproxy -f /etc/haproxy/haproxy.cfg -p /var/run/haproxy.pid -D
>> -sf 15434 15443 15444 15445 15454 15455 15456 15457
>> haproxy 99025 14.1 0.6 3139044 623160 ? Rs 14:07 2:25
>> /usr/sbin/haproxy -f /etc/haproxy/haproxy.cfg -p /var/run/haproxy.pid -D
>> -sf 15434 15443 15444 15445 15454 15455 15456 15457
>> haproxy 99034 11.9 0.6 3139024 623496 ? Ss 14:07 2:02
>> /usr/sbin/haproxy -f /etc/haproxy/haproxy.cfg -p /var/run/haproxy.pid -D
>> -sf 15434 15443 15444 15445 15454 15455 15456 15457
>> haproxy 99035 16.3 0.6 3139184 623116 ? Ss 14:07 2:47
>> /usr/sbin/haproxy -f /etc/haproxy/haproxy.cfg -p /var/run/haproxy.pid -D
>> -sf 15434 15443 15444 15445 15454 15455 15456 15457
>> haproxy 99037 13.0 0.6 3138724 623076 ? Ss 14:07 2:13
>> /usr/sbin/haproxy -f /etc/haproxy/haproxy.cfg -p /var/run/haproxy.pid -D
>> -sf 15434 15443 15444 15445 15454 15455 15456 15457
>> haproxy 99045 16.3 0.6 3139332 623216 ? Ss 14:07 2:47
>> /usr/sbin/haproxy -f /etc/haproxy/haproxy.cfg -p /var/run/haproxy.pid -D
>> -sf 15434 15443 15444 15445 15454 15455 15456 15457
>> haproxy 99046 12.0 0.6 3137792 622208 ? Rs 14:07 2:03
>> /usr/sbin/haproxy -f /etc/haproxy/haproxy.cfg -p /var/run/haproxy.pid -D
>> -sf 15434 15443 15444 15445 15454 15455 15456 15457
>>
>> socat /var/run/haproxy/haproxy1.sock - <<< "show info"
>> Name: HAProxy
>> Version: 1.9.6
>> Release_date: 2019/03/29
>> Nbthread: 1
>> Nbproc: 8
>> Process_num: 1
>> Pid: 99023
>> Uptime: 0d 0h12m26s
>> Uptime_sec: 746
>> Memmax_MB: 0
>> PoolAlloc_MB: 8
>> PoolUsed_MB: 8
>> PoolFailed: 0
>> Ulimit-n: 1000000
>> Maxsock: 4012387
>> Maxconn: 2000000
>> Hard_maxconn: 2000000
>> CurrConns: 850
>> CumConns: 114038
>> CumReq: 179023
>> MaxSslConns: 0
>> CurrSslConns: 4
>> CumSslConns: 6225
>> Maxpipes: 0
>> PipesUsed: 0
>> PipesFree: 0
>> ConnRate: 161
>> ConnRateLimit: 0
>> MaxConnRate: 366
>> SessRate: 161
>> SessRateLimit: 0
>> MaxSessRate: 366
>> SslRate: 6
>> SslRateLimit: 0
>> MaxSslRate: 18
>> SslFrontendKeyRate: 3
>> SslFrontendMaxKeyRate: 7
>> SslFrontendSessionReuse_pct: 50
>> SslBackendKeyRate: 0
>> SslBackendMaxKeyRate: 10
>> SslCacheLookups: 1054
>> SslCacheMisses: 103
>> CompressBpsIn: 0
>> CompressBpsOut: 0
>> CompressBpsRateLim: 0
>> ZlibMemUsage: 0
>> MaxZlibMemUsage: 0
>> Tasks: 26283
>> Run_queue: 3
>> Idle_pct: 39
>> node: XXXX
>> Stopping: 0
>> Jobs: 7181
>> Unstoppable Jobs: 0
>> Listeners: 6330
>> ActivePeers: 0
>> ConnectedPeers: 0
>> DroppedLogs: 6
>> BusyPolling: 0
>>
>> socat /var/run/haproxy/haproxy1.sock - <<< "show activity"
>> thread_id: 0
>> date_now: 1554380334.734878
>> loops: 355064374
>> wake_cache: 2498224
>> wake_tasks: 352539647
>> wake_signal: 0
>> poll_exp: 355037871
>> poll_drop: 194560
>> poll_dead: 0
>> poll_skip: 0
>> fd_skip: 0
>> fd_lock: 0
>> fd_del: 0
>> conn_dead: 0
>> stream: 710586336
>> empty_rq: 15313
>> long_rq: 57
>> cpust_ms_tot: 747
>> cpust_ms_1s: 0
>> cpust_ms_15s: 0
>> avg_loop_us: 1
>>
>> haproxy -vv
>> HA-Proxy version 1.9.6 2019/03/29 - https://haproxy.org/
>> Build options :
>> TARGET = linux2628
>> CPU = generic
>> CC = gcc
>> CFLAGS = -O2 -g -fno-strict-aliasing -Wdeclaration-after-statement
>> -fwrapv -Wno-unused-label -Wno-sign-compare -Wno-unused-parameter
>> -Wno-old-style-declaration -Wno-ignored-qualifiers -Wno-clobbered
>> -Wno-missing-field-initializers -Wtype-limits -DIP_BIND_ADDRESS_NO_PORT=24
>> OPTIONS = USE_GETADDRINFO=1 USE_ZLIB=1 USE_DL=1 USE_OPENSSL=1 USE_LUA=1
>> USE_PCRE=1 USE_PCRE_JIT=1
>>
>> Default settings :
>> maxconn = 2000, bufsize = 16384, maxrewrite = 1024, maxpollevents = 200
>>
>> Built with OpenSSL version : OpenSSL 1.1.1b 26 Feb 2019
>> Running on OpenSSL version : OpenSSL 1.1.1b 26 Feb 2019
>> OpenSSL library supports TLS extensions : yes
>> OpenSSL library supports SNI : yes
>> OpenSSL library supports : TLSv1.0 TLSv1.1 TLSv1.2 TLSv1.3
>> Built with Lua version : Lua 5.3.5
>> Built with transparent proxy support using: IP_TRANSPARENT
>> IPV6_TRANSPARENT IP_FREEBIND
>> Built with zlib version : 1.2.8
>> Running on zlib version : 1.2.8
>> Compression algorithms supported : identity("identity"),
>> deflate("deflate"), raw-deflate("deflate"), gzip("gzip")
>> Built with PCRE version : 8.42 2018-03-20
>> Running on PCRE version : 8.42 2018-03-20
>> PCRE library supports JIT : yes
>> Encrypted password support via crypt(3): yes
>> Built with multi-threading support.
>>
>> Available polling systems :
>> epoll : pref=300, test result OK
>> poll : pref=200, test result OK
>> select : pref=150, test result OK
>> Total: 3 (3 usable), will use epoll.
>>
>> Available multiplexer protocols :
>> (protocols marked as <default> cannot be specified using 'proto' keyword)
>> h2 : mode=HTX side=FE|BE
>> h2 : mode=HTTP side=FE
>> <default> : mode=HTX side=FE|BE
>> <default> : mode=TCP|HTTP side=FE|BE
>>
>> Available filters :
>> [SPOE] spoe
>> [COMP] compression
>> [CACHE] cache
>> [TRACE] trace
>>
>