alptugay commented on issue #9775:
URL: https://github.com/apache/apisix/issues/9775#issuecomment-1677392307

   We have encountered the issue once again. Our current findings are:
   
   1) 4 out of 6 APISIX instances in the same environment were affected at **different times** on the **same day**. These are identical machines running with the same configs.
   
   APISIX INSTANCE 1
   ```
   2023/08/08 15:30:09 [error] 480719#480719: *24417911412 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313:target_lock:XXX.XXX.XXX.92:31125':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.193, server: 0.0.0.0:80
   2023/08/08 15:30:10 [error] 480724#480724: *24417921777 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313:target_lock:XXX.XXX.XXX.92:31125':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.193, server: 0.0.0.0:80
   2023/08/08 15:30:11 [error] 480739#480739: *24417932673 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313:target_lock:XXX.XXX.XXX.92:31125':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.193, server: 0.0.0.0:80
   2023/08/08 15:30:12 [error] 480716#480716: *24417943934 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313:target_lock:XXX.XXX.XXX.92:31125':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.193, server: 0.0.0.0:80
   2023/08/08 15:38:39 [error] 480728#480728: *24422048257 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313:target_lock:XXX.XXX.XXX.194:32708':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.193, server: 0.0.0.0:80
   2023/08/08 15:38:40 [error] 480735#480735: *24422057094 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313:target_lock:XXX.XXX.XXX.194:32708':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.193, server: 0.0.0.0:80
   2023/08/08 15:38:41 [error] 480723#480723: *24422065842 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313:target_lock:XXX.XXX.XXX.194:32708':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.193, server: 0.0.0.0:80
   2023/08/08 15:38:42 [error] 480733#480733: *24422074126 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313:target_lock:XXX.XXX.XXX.194:32708':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.193, server: 0.0.0.0:80
   2023/08/08 15:38:43 [error] 480736#480736: *24422083755 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313:target_lock:XXX.XXX.XXX.194:32708':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.193, server: 0.0.0.0:80
   
   ```
   APISIX INSTANCE 2
   ```
   2023/08/08 14:23:39 [error] 608073#608073: *24408524288 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313:target_lock:XXX.XXX.XXX.194:31125':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.193, server: 0.0.0.0:80
   2023/08/08 14:23:41 [error] 608090#608090: *24408534735 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313:target_lock:XXX.XXX.XXX.194:31125':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.193, server: 0.0.0.0:80
   2023/08/08 14:23:42 [error] 608078#608078: *24408545713 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313:target_lock:XXX.XXX.XXX.194:31125':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.193, server: 0.0.0.0:80
   2023/08/08 14:23:43 [error] 608087#608087: *24408556591 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313:target_lock:XXX.XXX.XXX.194:31125':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.193, server: 0.0.0.0:80
   
   ```
   APISIX INSTANCE 3
   ```
   2023/08/08 16:29:39 [error] 67061#67061: *27466062548 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313:target_lock:XXX.XXX.XXX.94:31725':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.193, server: 0.0.0.0:80
   2023/08/08 16:29:40 [error] 67045#67045: *27466073872 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313:target_lock:XXX.XXX.XXX.94:31725':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.193, server: 0.0.0.0:80
   2023/08/08 16:29:41 [error] 67063#67063: *27466084337 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313:target_lock:XXX.XXX.XXX.94:31725':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.193, server: 0.0.0.0:80
   2023/08/08 16:29:42 [error] 67041#67041: *27466094834 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313:target_lock:XXX.XXX.XXX.94:31725':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.193, server: 0.0.0.0:80
   2023/08/08 16:34:33 [error] 67043#67043: *27468414132 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab:target_lock:XXX.XXX.XXX.91:30238':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.191, server: 0.0.0.0:80
   2023/08/08 16:34:34 [error] 67050#67050: *27468422768 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab:target_lock:XXX.XXX.XXX.91:30238':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.94, server: 0.0.0.0:80
   2023/08/08 16:34:35 [error] 67051#67051: *27468432136 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab:target_lock:XXX.XXX.XXX.91:30238':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.94, server: 0.0.0.0:80
   2023/08/08 16:34:36 [error] 67042#67042: *27468440315 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab:target_lock:XXX.XXX.XXX.91:30238':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.197, server: 0.0.0.0:80
   2023/08/08 17:13:03 [error] 291549#291549: *27482520502 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab:target_lock:XXX.XXX.XXX.92:30057':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.196, server: 0.0.0.0:80
   2023/08/08 17:13:04 [error] 67057#67057: *27482527129 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab:target_lock:XXX.XXX.XXX.92:30057':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.198, server: 0.0.0.0:80
   2023/08/08 17:13:05 [error] 67046#67046: *27482533105 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab:target_lock:XXX.XXX.XXX.92:30057':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.197, server: 0.0.0.0:80
   2023/08/08 17:13:06 [error] 291659#291659: *27482539195 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab:target_lock:XXX.XXX.XXX.92:30057':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.198, server: 0.0.0.0:80
   2023/08/08 17:22:33 [error] 291921#291921: *27485105465 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab:target_lock:XXX.XXX.XXX.95:32190':
 unlocked, context: ngx.timer, client: 10.212.200.129, server: 0.0.0.0:443
   2023/08/08 17:22:34 [error] 67058#67058: *27485109964 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab:target_lock:XXX.XXX.XXX.95:32190':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.197, server: 0.0.0.0:80
   2023/08/08 17:22:36 [error] 291582#291582: *27485115892 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab:target_lock:XXX.XXX.XXX.95:32190':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.96, server: 0.0.0.0:80
   2023/08/08 17:22:37 [error] 67072#67072: *27485120591 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab:target_lock:XXX.XXX.XXX.95:32190':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.198, server: 0.0.0.0:80
   
   ```
   APISIX INSTANCE 4
   ```
   2023/08/08 15:54:03 [error] 65587#65587: *27451715285 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab:target_lock:XXX.XXX.XXX.93:32496':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.197, server: 0.0.0.0:80
   2023/08/08 15:54:04 [error] 65602#65602: *27451726740 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab:target_lock:XXX.XXX.XXX.93:32496':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.197, server: 0.0.0.0:80
   2023/08/08 15:54:05 [error] 65606#65606: *27451738230 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab:target_lock:XXX.XXX.XXX.93:32496':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.197, server: 0.0.0.0:80
   2023/08/08 15:54:06 [error] 65596#65596: *27451750351 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab:target_lock:XXX.XXX.XXX.93:32496':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.191, server: 0.0.0.0:80
   ```
   
   2) As can be seen in the logs, the errors were related to two upstreams: `58e0a3b27292c0d9267ce78a7dfa45ab` and `f51cbfa1fac84b2110ba93f947c93313`.
   
   3) We found that even though the backends of these upstreams were serving over plain HTTP, we had mistakenly set the scheme of the upstreams to HTTPS.
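   
   For reference, this is roughly how the scheme can be corrected through the Admin API (a minimal sketch; the admin address and API key below are placeholders, not our real values):
   
   ```python
# Minimal sketch: switch the two affected upstreams back to plain HTTP via the
# Admin API. The admin address and API key are placeholders for our setup.
import requests

ADMIN = "http://127.0.0.1:9180/apisix/admin"   # assumption: default admin listen address
API_KEY = "replace-with-admin-key"             # assumption: placeholder admin key

for upstream_id in ("58e0a3b27292c0d9267ce78a7dfa45ab",
                    "f51cbfa1fac84b2110ba93f947c93313"):
    resp = requests.patch(
        f"{ADMIN}/upstreams/{upstream_id}",
        headers={"X-API-KEY": API_KEY},
        json={"scheme": "http"},               # backends actually speak plain HTTP
        timeout=5,
    )
    print(upstream_id, resp.status_code)
   ```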
   
   4) Also, there are duplicate IP-port pairs in these upstream definitions. Below you can see the content of these upstreams; a small sketch for spotting such duplicates follows the two definitions.
   
   For example, this pair appears more than once in `58e0a3b27292c0d9267ce78a7dfa45ab`:
   ```
    "port": 32330,
    "host": "XXX.XXX.XXX.30"
   ```
   
   Upstream: 58e0a3b27292c0d9267ce78a7dfa45ab
   ```
       {
         "value": {
           "type": "roundrobin",
           "create_time": 1684840040,
           "timeout": {
             "connect": 300,
             "read": 300,
             "send": 300
           },
           "name": "thanos-query-store-api",
           "id": "58e0a3b27292c0d9267ce78a7dfa45ab",
           "hash_on": "vars",
           "desc": "source:consul",
           "pass_host": "pass",
           "update_time": 1691489116,
           "labels": {
             "source": "consul"
           },
           "scheme": "https",
           "nodes": [
             {
               "priority": 0,
               "port": 32347,
               "weight": 1,
               "host": "XXX.XXX.XXX.33"
             },
             {
               "priority": 0,
               "port": 32337,
               "weight": 1,
               "host": "XXX.XXX.XXX.29"
             },
             {
               "priority": 0,
               "port": 32337,
               "weight": 1,
               "host": "XXX.XXX.XXX.31"
             },
             {
               "priority": 0,
               "port": 32330,
               "weight": 1,
               "host": "XXX.XXX.XXX.30"
             },
             {
               "priority": 0,
               "port": 32496,
               "weight": 1,
               "host": "XXX.XXX.XXX.93"
             },
             {
               "priority": 0,
               "port": 32347,
               "weight": 1,
               "host": "XXX.XXX.XXX.31"
             },
             {
               "priority": 0,
               "port": 31141,
               "weight": 1,
               "host": "XXX.XXX.XXX.33"
             },
             {
               "priority": 0,
               "port": 32496,
               "weight": 1,
               "host": "XXX.XXX.XXX.198"
             },
             {
               "priority": 0,
               "port": 32190,
               "weight": 1,
               "host": "XXX.XXX.XXX.195"
             },
             {
               "priority": 0,
               "port": 30057,
               "weight": 1,
               "host": "XXX.XXX.XXX.92"
             },
             {
               "priority": 0,
               "port": 30238,
               "weight": 1,
               "host": "XXX.XXX.XXX.91"
             },
             {
               "priority": 0,
               "port": 31141,
               "weight": 1,
               "host": "XXX.XXX.XXX.30"
             },
             {
               "priority": 0,
               "port": 32330,
               "weight": 1,
               "host": "XXX.XXX.XXX.31"
             },
             {
               "priority": 0,
               "port": 32190,
               "weight": 1,
               "host": "XXX.XXX.XXX.95"
             },
             {
               "priority": 0,
               "port": 32330,
               "weight": 1,
               "host": "XXX.XXX.XXX.30"
             },
             {
               "priority": 0,
               "port": 30057,
               "weight": 1,
               "host": "XXX.XXX.XXX.92"
             },
             {
               "priority": 0,
               "port": 32330,
               "weight": 1,
               "host": "XXX.XXX.XXX.30"
             },
             {
               "priority": 0,
               "port": 32330,
               "weight": 1,
               "host": "XXX.XXX.XXX.31"
             },
             {
               "priority": 0,
               "port": 30238,
               "weight": 1,
               "host": "XXX.XXX.XXX.191"
             }
           ],
           "checks": {
             "active": {
               "unhealthy": {
                 "timeouts": 3,
                 "http_statuses": [
                   429,
                   404,
                   500,
                   501,
                   502,
                   503,
                   504,
                   505
                 ],
                 "interval": 1,
                 "http_failures": 5,
                 "tcp_failures": 2
               },
               "type": "tcp",
               "http_path": "/",
               "healthy": {
                 "successes": 2,
                 "http_statuses": [
                   200,
                   302
                 ],
                 "interval": 1
               },
               "timeout": 1,
               "concurrency": 10,
               "https_verify_certificate": true
             }
           }
         },
         "modifiedIndex": 1695071,
         "createdIndex": 1165,
         "key": "/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab"
       }
   ```
   Upstream: f51cbfa1fac84b2110ba93f947c93313
   
   ```
   {
         "value": {
           "type": "roundrobin",
           "create_time": 1684840040,
           "timeout": {
             "connect": 300,
             "read": 300,
             "send": 300
           },
           "nodes": [
             {
               "priority": 0,
               "port": 30249,
               "weight": 1,
               "host": "XXX.XXX.XXX.33"
             },
             {
               "priority": 0,
               "port": 31725,
               "weight": 1,
               "host": "XXX.XXX.XXX.94"
             },
             {
               "priority": 0,
               "port": 31914,
               "weight": 1,
               "host": "XXX.XXX.XXX.96"
             },
             {
               "priority": 0,
               "port": 31125,
               "weight": 1,
               "host": "XXX.XXX.XXX.194"
             },
             {
               "priority": 0,
               "port": 31569,
               "weight": 1,
               "host": "XXX.XXX.XXX.30"
             },
             {
               "priority": 0,
               "port": 31500,
               "weight": 1,
               "host": "XXX.XXX.XXX.33"
             },
             {
               "priority": 0,
               "port": 30593,
               "weight": 1,
               "host": "XXX.XXX.XXX.30"
             },
             {
               "priority": 0,
               "port": 32708,
               "weight": 1,
               "host": "XXX.XXX.XXX.194"
             },
             {
               "priority": 0,
               "port": 31125,
               "weight": 1,
               "host": "XXX.XXX.XXX.92"
             },
             {
               "priority": 0,
               "port": 30593,
               "weight": 1,
               "host": "XXX.XXX.XXX.33"
             },
             {
               "priority": 0,
               "port": 31725,
               "weight": 1,
               "host": "XXX.XXX.XXX.93"
             },
             {
               "priority": 0,
               "port": 31569,
               "weight": 1,
               "host": "XXX.XXX.XXX.33"
             },
             {
               "priority": 0,
               "port": 31914,
               "weight": 1,
               "host": "XXX.XXX.XXX.94"
             },
             {
               "priority": 0,
               "port": 32708,
               "weight": 1,
               "host": "XXX.XXX.XXX.95"
             },
             {
               "priority": 0,
               "port": 31500,
               "weight": 1,
               "host": "XXX.XXX.XXX.31"
             },
             {
               "priority": 0,
               "port": 30249,
               "weight": 1,
               "host": "XXX.XXX.XXX.31"
             }
           ],
           "pass_host": "pass",
           "hash_on": "vars",
           "desc": "source:consul",
           "id": "f51cbfa1fac84b2110ba93f947c93313",
           "update_time": 1689321033,
           "labels": {
             "source": "consul"
           },
           "scheme": "https",
           "name": "thanos-queryfrontend-api",
           "checks": {
             "active": {
               "unhealthy": {
                 "timeouts": 3,
                 "interval": 1,
                 "http_statuses": [
                   429,
                   404,
                   500,
                   501,
                   502,
                   503,
                   504,
                   505
                 ],
                 "http_failures": 5,
                 "tcp_failures": 2
               },
               "type": "tcp",
               "http_path": "/",
               "healthy": {
                 "successes": 2,
                 "http_statuses": [
                   200,
                   302
                 ],
                 "interval": 1
               },
               "timeout": 1,
               "concurrency": 10,
               "https_verify_certificate": true
             }
           }
         },
         "modifiedIndex": 1239801,
         "createdIndex": 1181,
         "key": "/apisix/upstreams/f51cbfa1fac84b2110ba93f947c93313"
       }
   ```
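   
   To spot the duplicate node entries mentioned in finding 4, something along these lines works (a minimal sketch; `upstream.json` is a placeholder for one of the etcd entries shown above):
   
   ```python
# Minimal sketch: count duplicate host:port pairs in an exported upstream object.
# "upstream.json" is a placeholder for one of the etcd entries shown above.
import json
from collections import Counter

with open("upstream.json") as f:
    upstream = json.load(f)

pairs = Counter(
    (node["host"], node["port"]) for node in upstream["value"]["nodes"]
)

for (host, port), count in pairs.items():
    if count > 1:
        print(f"{host}:{port} appears {count} times")
   ```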
   5) There is only one entry in the output of `ss -t` that relates to the upstream `58e0a3b27292c0d9267ce78a7dfa45ab`:
   
   `2023/08/08 17:22:37 [error] 67072#67072: *27485120591 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab:target_lock:XXX.XXX.XXX.95:32190':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.198, server: 0.0.0.0:80`
   
   `SYN-SENT   0       1               YYY.YYY.YYY.YYY:42790            
XXX.XXX.XXX.29:32337`
   
   `XXX.XXX.XXX.29:32337` is one of the upstream nodes of `58e0a3b27292c0d9267ce78a7dfa45ab`.
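   
   To cross-check the `ss -t` output against the upstream nodes, we use something along these lines (a rough sketch; `ss.txt` and `upstream.json` are placeholder file names):
   
   ```python
# Minimal sketch: print saved `ss -t` lines whose peer address matches one of
# the upstream's nodes. File names are placeholders for our captured data.
import json

with open("upstream.json") as f:
    nodes = {f'{n["host"]}:{n["port"]}' for n in json.load(f)["value"]["nodes"]}

with open("ss.txt") as f:
    for line in f:
        if any(field in nodes for field in line.split()):
            print(line.rstrip())
   ```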
   
   
   6) I am posting some logs from just before we start receiving the "failed to release lock" entries.
   
   You can ignore the `worker process **** exited on signal 9` logs; we have a script that kills the workers as soon as it detects "failed to release lock" in the error log (a simplified sketch of that watcher appears at the end of this finding).
   
   
   ```
   2023/08/08 17:22:27 [error] 291898#291898: *27485100457 [lua] 
events.lua:273: post(): worker-events: failed posting event "mostly_healthy" by 
"lua-resty-healthcheck 
[upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab]"; no memory while 
connect
   ing to upstream, client: XXX.XXX.XXX.191, server: _, request: "GET / 
HTTP/1.1", upstream: "https://XXX.XXX.XXX.91:30238/";, host: 
"thanos-query-store-api.AAAA.BBBB.com"
   2023/08/08 17:22:27 [error] 291898#291898: *27485100457 SSL_do_handshake() 
failed (SSL: error:1408F10B:SSL routines:ssl3_get_record:wrong version number) 
while SSL handshaking to upstream, client: XXX.XXX.XXX.191, server: _, request: 
"GET / HTTP/1.1", upstream: "https://XXX.XXX.XXX.95:32190/";, host: 
"thanos-query-store-api.AAAA.BBBB.com"
   2023/08/08 17:22:28 [error] 291695#291695: *27485100523 [lua] 
events.lua:364: poll(): worker-events: dropping event; waiting for event data 
timed out, id: 10019295, context: ngx.timer
   2023/08/08 17:22:28 [error] 291632#291632: *27485100556 [lua] 
events.lua:364: poll(): worker-events: dropping event; waiting for event data 
timed out, id: 10019295, context: ngx.timer
   2023/08/08 17:22:28 [error] 67058#67058: *27485100578 [lua] events.lua:364: 
poll(): worker-events: dropping event; waiting for event data timed out, id: 
10019295, context: ngx.timer
   2023/08/08 17:22:28 [error] 291940#291940: *27485100654 [lua] 
events.lua:364: poll(): worker-events: dropping event; waiting for event data 
timed out, id: 10019295, context: ngx.timer
   2023/08/08 17:22:28 [error] 67044#67044: *27485100658 [lua] events.lua:364: 
poll(): worker-events: dropping event; waiting for event data timed out, id: 
10019295, context: ngx.timer
   2023/08/08 17:22:28 [error] 291520#291520: *27485100669 [lua] 
events.lua:364: poll(): worker-events: dropping event; waiting for event data 
timed out, id: 10019295, context: ngx.timer
   2023/08/08 17:22:28 [error] 291921#291921: *27485100780 [lua] 
events.lua:364: poll(): worker-events: dropping event; waiting for event data 
timed out, id: 10019295, context: ngx.timer
   2023/08/08 17:22:28 [error] 291682#291682: *27485100840 [lua] 
events.lua:364: poll(): worker-events: dropping event; waiting for event data 
timed out, id: 10019295, context: ngx.timer
   2023/08/08 17:22:28 [error] 291582#291582: *27485100855 [lua] 
events.lua:364: poll(): worker-events: dropping event; waiting for event data 
timed out, id: 10019295, context: ngx.timer
   2023/08/08 17:22:28 [error] 67077#67077: *27485100854 [lua] events.lua:364: 
poll(): worker-events: dropping event; waiting for event data timed out, id: 
10019295, context: ngx.timer
   2023/08/08 17:22:28 [error] 291877#291877: *27485100859 [lua] 
events.lua:364: poll(): worker-events: dropping event; waiting for event data 
timed out, id: 10019295, context: ngx.timer
   2023/08/08 17:22:28 [error] 67072#67072: *27485100860 [lua] events.lua:364: 
poll(): worker-events: dropping event; waiting for event data timed out, id: 
10019295, context: ngx.timer
   2023/08/08 17:22:33 [error] 291520#291520: *27485126279 SSL_do_handshake() 
failed (SSL: error:1408F10B:SSL routines:ssl3_get_record:wrong version number) 
while SSL handshaking to upstream, client: XXX.XXX.XXX.191, server: _, request: 
"GET / HTTP/1.1", upstream: "https://XXX.XXX.XXX.94:31725/";, host: 
"thanos-queryfrontend-api.AAAA.BBBB.com"
   2023/08/08 17:22:33 [error] 291520#291520: *27485126279 SSL_do_handshake() 
failed (SSL: error:1408F10B:SSL routines:ssl3_get_record:wrong version number) 
while SSL handshaking to upstream, client: XXX.XXX.XXX.191, server: _, request: 
"GET / HTTP/1.1", upstream: "https://XXX.XXX.XXX.93:31725/";, host: 
"thanos-queryfrontend-api.AAAA.BBBB.com"
   2023/08/08 17:22:33 [error] 291520#291520: *27485126279 SSL_do_handshake() 
failed (SSL: error:1408F10B:SSL routines:ssl3_get_record:wrong version number) 
while SSL handshaking to upstream, client: XXX.XXX.XXX.191, server: _, request: 
"GET / HTTP/1.1", upstream: "https://XXX.XXX.XXX.194:31125/";, host: 
"thanos-queryfrontend-api.AAAA.BBBB.com"
   2023/08/08 17:22:33 [error] 291520#291520: *27485126279 SSL_do_handshake() 
failed (SSL: error:1408F10B:SSL routines:ssl3_get_record:wrong version number) 
while SSL handshaking to upstream, client: XXX.XXX.XXX.191, server: _, request: 
"GET / HTTP/1.1", upstream: "https://XXX.XXX.XXX.194:32708/";, host: 
"thanos-queryfrontend-api.AAAA.BBBB.com"
   2023/08/08 17:22:33 [error] 291520#291520: *27485126279 SSL_do_handshake() 
failed (SSL: error:1408F10B:SSL routines:ssl3_get_record:wrong version number) 
while SSL handshaking to upstream, client: XXX.XXX.XXX.191, server: _, request: 
"GET / HTTP/1.1", upstream: "https://XXX.XXX.XXX.92:31125/";, host: 
"thanos-queryfrontend-api.AAAA.BBBB.com"
   2023/08/08 17:22:33 [error] 291520#291520: *27485126279 SSL_do_handshake() 
failed (SSL: error:1408F10B:SSL routines:ssl3_get_record:wrong version number) 
while SSL handshaking to upstream, client: XXX.XXX.XXX.191, server: _, request: 
"GET / HTTP/1.1", upstream: "https://XXX.XXX.XXX.96:31914/";, host: 
"thanos-queryfrontend-api.AAAA.BBBB.com"
   2023/08/08 17:22:33 [error] 291520#291520: *27485126279 SSL_do_handshake() 
failed (SSL: error:1408F10B:SSL routines:ssl3_get_record:wrong version number) 
while SSL handshaking to upstream, client: XXX.XXX.XXX.191, server: _, request: 
"GET / HTTP/1.1", upstream: "https://XXX.XXX.XXX.94:31914/";, host: 
"thanos-queryfrontend-api.AAAA.BBBB.com"
   2023/08/08 17:22:33 [error] 291520#291520: *27485126279 SSL_do_handshake() 
failed (SSL: error:1408F10B:SSL routines:ssl3_get_record:wrong version number) 
while SSL handshaking to upstream, client: XXX.XXX.XXX.191, server: _, request: 
"GET / HTTP/1.1", upstream: "https://XXX.XXX.XXX.95:32708/";, host: 
"thanos-queryfrontend-api.AAAA.BBBB.com"
   2023/08/08 17:22:33 [error] 291520#291520: *27485126279 [lua] 
balancer.lua:362: run(): failed to pick server: failed to find valid upstream 
server, all upstream servers tried while connecting to upstream, client: 
XXX.XXX.XXX.191, server: _, request: "GET / HTTP/1.1", upstream: 
"https://XXX.XXX.XXX.95:32708/";, host: "thanos-queryfrontend-api.AAAA.BBBB.com"
   2023/08/08 17:22:33 [error] 291921#291921: *27485105465 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab:target_lock:XXX.XXX.XXX.95:32190':
 unlocked, context: ngx.timer, client: YYY.YYY.YYY.129, server: 0.0.0.0:443
   2023/08/08 17:22:34 [alert] 27489#27489: worker process 291921 exited on 
signal 9
   2023/08/08 17:22:34 [error] 67058#67058: *27485109964 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab:target_lock:XXX.XXX.XXX.95:32190':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.197, server: 0.0.0.0:80
   2023/08/08 17:22:35 [alert] 27489#27489: worker process 67058 exited on 
signal 9
   2023/08/08 17:22:36 [error] 291582#291582: *27485115892 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab:target_lock:XXX.XXX.XXX.95:32190':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.96, server: 0.0.0.0:80
   2023/08/08 17:22:37 [alert] 27489#27489: worker process 291582 exited on 
signal 9
   2023/08/08 17:22:37 [error] 67072#67072: *27485120591 [lua] 
healthcheck.lua:1150: log(): [healthcheck] 
(upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab) failed to release 
lock 
'lua-resty-healthcheck:upstream#/apisix/upstreams/58e0a3b27292c0d9267ce78a7dfa45ab:target_lock:XXX.XXX.XXX.95:32190':
 unlocked, context: ngx.timer, client: XXX.XXX.XXX.198, server: 0.0.0.0:80
   2023/08/08 17:22:38 [alert] 27489#27489: worker process 67072 exited on 
signal 9
   ```
   I think the SSL errors are related to our finding **number 3**.
   The worker-events shared dict size is 1024m.
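   
   As mentioned above, the watcher that kills affected workers is essentially the following (a simplified sketch; the error log path is the APISIX default in our setup):
   
   ```python
# Simplified sketch of our watcher: tail error.log and SIGKILL any worker that
# reports "failed to release lock". The log path is specific to our setup.
import os
import re
import signal
import time

LOG_PATH = "/usr/local/apisix/logs/error.log"   # assumption: default APISIX error log path
PATTERN = re.compile(r"\[error\] (\d+)#\d+:.*failed to release lock")

with open(LOG_PATH) as f:
    f.seek(0, os.SEEK_END)                      # start tailing from the end of the file
    while True:
        line = f.readline()
        if not line:
            time.sleep(0.5)
            continue
        m = PATTERN.search(line)
        if m:
            pid = int(m.group(1))
            print(f"killing worker {pid}")
            try:
                os.kill(pid, signal.SIGKILL)
            except ProcessLookupError:
                pass                            # worker already gone
   ```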
   
   7) We also have an interesting finding.
   
   This is a normal access log entry for the related service (captured after we killed the affected workers); it is expected that all upstreams return 502 because of our finding **number 3**:
   
   `{"@timestamp":"2023-08-08T17:56:03.484664+03:00", "_team":"traffic", 
"_app":"loadbalancer", "_federation":"AAA", "_region":"BBB", 
"_environment":"prod", "_cluster":"internal", "_hostname":"Apisix01", 
"_service":"loadbalancer", "tag":"lb_access_non_2xx3xx:", "severity":"6", 
"facility":"23", "severity_label":"info", "facility_label":"local7", 
"cf_ipcountry":"", "http_x_client_ip":"", "http_True_Client_IP":"", 
"upstream_http_X_Proxy_Cache":"", 
"request_id":"5364e8127a236e359699bffe80851093", "route_id":"", 
"request_length":"120", "remote_addr":"XXX.XXX.XXX.196", "remote_port":"10506", 
"request":"GET / HTTP/1.1", "args":"", "uri":"/", "status":"502", 
"bytes_sent":"754", "http_user_agent":"Go-http-client/1.1", 
"http_x_forwarded_for":"", 
"http_host":"thanos-queryfrontend-api.aaaa.bbbb.com", "server_name":"_", 
"request_time":"0.012", "upstream":"XXX.XXX.XXX.95:32708,  
XXX.XXX.XXX.31:31500,  XXX.XXX.XXX.31:30249,  XXX.XXX.XXX.33:30249,  
XXX.XXX.XXX.30:31569,  XXX.XXX.XXX.33:31500,  XXX.X
 XX.XXX.30:30593,  XXX.XXX.XXX.33:30593,  XXX.XXX.XXX.33:31569,  
XXX.XXX.XXX.94:31725,  XXX.XXX.XXX.96:31914,  XXX.XXX.XXX.194:31125,  
XXX.XXX.XXX.194:32708,  XXX.XXX.XXX.92:31125,  XXX.XXX.XXX.93:31725,  
XXX.XXX.XXX.94:31914", "upstream_connect_time":"-,  -,  -,  -,  -,  -,  -,  -,  
-,  -,  -,  -,  -,  -,  -,  -", "upstream_status":"502,  502,  502,  502,  502, 
 502,  502,  502,  502,  502,  502,  502,  502,  502,  502,  502", 
"upstream_response_time":"0.002,  0.000,  0.000,  0.001,  0.000,  0.000,  
0.001,  0.000,  0.000,  0.002,  0.000,  0.001,  0.001,  0.001,  0.001,  0.001", 
"upstream_cache_status":"", "ssl_protocol":"", "ssl_cipher":"", 
"scheme":"http", "server_port":"80", "request_method":"GET", 
"server_protocol":"HTTP/1.1", "http_cf_ray":"", "ty_lb_waf_id":"", 
"ty_lb_cc":"", "ty_lb_asn":""}`
   
   But the following is an unusual log; during the problem, all the logs related to this service looked like this:
   
   `{"@timestamp":"2023-08-08T17:37:03.482507+03:00", "_team":"traffic", 
"_app":"loadbalancer", "_federation":"AAA", "_region":"BBB", 
"_environment":"prod", "_cluster":"internal", "_hostname":"Apisix01", 
"_service":"loadbalancer", "tag":"lb_access_non_2xx3xx:", "severity":"6", 
"facility":"23", "severity_label":"info", "facility_label":"local7", 
"cf_ipcountry":"", "http_x_client_ip":"", "http_True_Client_IP":"", 
"upstream_http_X_Proxy_Cache":"", 
"request_id":"ef16786dd332690d1fd06228bb08788c", "route_id":"", 
"request_length":"120", "remote_addr":"XXX.XXX.XXX.94", "remote_port":"61553", 
"request":"GET / HTTP/1.1", "args":"", "uri":"/", "status":"502", 
"bytes_sent":"680", "http_user_agent":"Go-http-client/1.1", 
"http_x_forwarded_for":"", 
"http_host":"thanos-queryfrontend-api.aaaa.bbbb.com", "server_name":"_", 
"request_time":"0.014", "upstream":"XXX.XXX.XXX.96:31914,  
XXX.XXX.XXX.194:31125,  XXX.XXX.XXX.194:32708,  XXX.XXX.XXX.92:31125,  
XXX.XXX.XXX.93:31725,  XXX.XXX.XXX.94:31914,  XXX.
 XXX.XXX.95:32708,  XXX.XXX.XXX.94:31725 : ", "upstream_connect_time":"-,  -,  
-,  -,  -,  -,  -,  - : ", "upstream_status":"502,  502,  502,  502,  502,  
502,  502,  502 : ", "upstream_response_time":"0.002,  0.001,  0.005,  0.001,  
0.001,  0.001,  0.002,  0.001 : ", "upstream_cache_status":"", 
"ssl_protocol":"", "ssl_cipher":"", "scheme":"http", "server_port":"80", 
"request_method":"GET", "server_protocol":"HTTP/1.1", "http_cf_ray":"", 
"ty_lb_waf_id":"", "ty_lb_cc":"", "ty_lb_asn":""}`
   
   Please look at the **upstream, upstream_status, upstream_connect_time, upstream_response_time** fields. They look incomplete: they all end with a colon character (` :`).
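   
   This is a small sketch of how we flag these incomplete entries when scanning the access logs (assuming one JSON object per line; `access.json` is a placeholder file name):
   
   ```python
# Minimal sketch: flag access-log entries whose upstream fields end with the
# stray " : " suffix seen above. "access.json" is a placeholder file name.
import json

FIELDS = ("upstream", "upstream_status",
          "upstream_connect_time", "upstream_response_time")

with open("access.json") as f:
    for line in f:
        entry = json.loads(line)
        if any(entry.get(field, "").rstrip().endswith(":") for field in FIELDS):
            print(entry["@timestamp"], entry["request_id"])
   ```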
   

