hnlq715 commented on a change in pull request #3300:
URL: https://github.com/apache/apisix/pull/3300#discussion_r559884772
##########
File path: apisix/balancer/ewma.lua
##########
```diff
@@ -3,105 +3,125 @@
 -- Inspiration drawn from:
 -- https://github.com/twitter/finagle/blob/1bc837c4feafc0096e43c0e98516a8e1c50c4421
 -- /finagle-core/src/main/scala/com/twitter/finagle/loadbalancer/PeakEwma.scala
-
 local core = require("apisix.core")
+local nkeys = require("core.table.nkeys")
+local resty_lock = require("resty.lock")
+
 local ngx = ngx
 local ngx_shared = ngx.shared
 local ngx_now = ngx.now
 local math = math
 local pairs = pairs
 local next = next
+local error = error
 
-local _M = {}
 local DECAY_TIME = 10 -- this value is in seconds
+local LOCK_KEY = ":ewma_key"
 
 local shm_ewma = ngx_shared.balancer_ewma
-local shm_last_touched_at= ngx_shared.balancer_ewma_last_touched_at
+local shm_last_touched_at = ngx_shared.balancer_ewma_last_touched_at
+
+local lrucache_addr = core.lrucache.new({ttl = 300, count = 1024})
+local lrucache_trans_format = core.lrucache.new({ttl = 300, count = 256})
+
+local ewma_lock, ewma_lock_err = resty_lock:new("balancer_ewma_locks",
+                                                {timeout = 0, exptime = 0.1})
+
+local _M = {name = "ewma"}
+
+local function lock(upstream)
+    local _, err = ewma_lock:lock(upstream .. LOCK_KEY)
+    if err then
+        if err ~= "timeout" then
+            core.log.error("EWMA Balancer failed to lock: ", err)
+        end
+    end
+
+    return err
+end
 
-local lrucache_addr = core.lrucache.new({
-    ttl = 300, count = 1024
-})
-local lrucache_trans_format = core.lrucache.new({
-    ttl = 300, count = 256
-})
+local function unlock()
+    local ok, err = ewma_lock:unlock()
+    if not ok then
+        core.log.error("EWMA Balancer failed to unlock: ", err)
+    end
+    return err
+end
 
 local function decay_ewma(ewma, last_touched_at, rtt, now)
     local td = now - last_touched_at
-    td = math.max(td, 0)
+    td = (td > 0) and td or 0
     local weight = math.exp(-td / DECAY_TIME)
 
     ewma = ewma * weight + rtt * (1.0 - weight)
     return ewma
 end
 
-
 local function store_stats(upstream, ewma, now)
     local success, err, forcible = shm_last_touched_at:set(upstream, now)
     if not success then
-        core.log.error("balancer_ewma_last_touched_at:set failed ", err)
+        core.log.warn("shm_last_touched_at:set failed: ", err)
     end
 
     if forcible then
-        core.log.warn("balancer_ewma_last_touched_at:set valid items forcibly overwritten")
+        core.log
+            .warn("shm_last_touched_at:set valid items forcibly overwritten")
     end
 
     success, err, forcible = shm_ewma:set(upstream, ewma)
     if not success then
-        core.log.error("balancer_ewma:set failed ", err)
+        core.log.warn("shm_ewma:set failed: ", err)
     end
 
     if forcible then
         core.log.warn("balancer_ewma:set valid items forcibly overwritten")
     end
 end
 
-
 local function get_or_update_ewma(upstream, rtt, update)
+    local lock_err = nil
+    if update then
```

Review comment:

> Not locking for `get` without update will result in potentially incorrect behaviour. Imagine at line 86 you fetch the current `ewma` value, then another worker updates it and its `last_touched_at` value before you retrieve `last_touched_at` at line 92. Then you will end up treating the old ewma as a new one. I'm not sure if in practice it would make a big difference, but it is definitely not correct behaviour.

@ElvinEfendi You're right, this is not strictly correct behavior, but it may still be a reasonable solution. In the situation you mentioned, the two `last_touched_at` values should differ very little (the writes happen at almost the same moment), and we avoid locking every `get` operation, which is quite heavy.
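To make the trade-off concrete, here is a minimal sketch of the lock-on-update-only path being discussed (an illustration of the idea, not necessarily the exact code in this PR; it reuses `lock`, `unlock`, `decay_ewma`, `store_stats`, and the two shared dicts from the diff above):

```lua
-- Sketch: the read path takes no lock; only writers are serialized.
-- The two shared-dict reads below are not atomic as a pair, so another
-- worker may run store_stats() between them, and a lock-free reader can
-- pair an old ewma with a newer last_touched_at (the race described in
-- the quoted comment).
local function get_or_update_ewma(upstream, rtt, update)
    if update then
        local lock_err = lock(upstream)   -- serialize writers only
        if lock_err ~= nil then
            return 0, lock_err
        end
    end

    local ewma = shm_ewma:get(upstream) or 0
    local now = ngx_now()
    local last_touched_at = shm_last_touched_at:get(upstream) or 0
    ewma = decay_ewma(ewma, last_touched_at, rtt, now)

    if update then
        store_stats(upstream, ewma, now)  -- both keys written under the lock
        unlock()
    end

    return ewma, nil
end
```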
> I think the only way of getting rid of locking for "get" operations is to combine ewma and timestamp in the same value and store them under a single key. But then you would need to do encoding and decoding every time you set and fetch it. It can be interesting to try that and see the performance.

My first implementation stored these stats in a per-worker process cache, which avoids locking entirely and skips the frequent encoding and decoding, so it performs better; but it raised other concerns, and the details can be found in #3211. With shared memory, we have to trade performance against correctness. (A rough sketch of the single-key encoding you describe appears after the benchmark results below.)

I benchmarked the two variants:

lock get and update
---

```
+ wrk -d 5 -c 16 http://127.0.0.1:9080/hello
Running 5s test @ http://127.0.0.1:9080/hello
  2 threads and 16 connections
  Thread Stats   Avg      Stdev     Max   +/- Stdev
    Latency     1.62ms  282.29us   7.64ms   90.45%
    Req/Sec     4.98k   511.87     9.62k    97.03%
  50013 requests in 5.10s, 199.42MB read
Requests/sec:   9806.24
Transfer/sec:     39.10MB
+ sleep 1
+ wrk -d 5 -c 16 http://127.0.0.1:9080/hello
Running 5s test @ http://127.0.0.1:9080/hello
  2 threads and 16 connections
  Thread Stats   Avg      Stdev     Max   +/- Stdev
    Latency     1.69ms  288.94us   7.25ms   89.40%
    Req/Sec     4.77k   285.81     5.28k    63.73%
  48370 requests in 5.10s, 192.87MB read
Requests/sec:   9484.82
Transfer/sec:     37.82MB
```

lock update only
---

```
+ wrk -d 5 -c 16 http://127.0.0.1:9080/hello
Running 5s test @ http://127.0.0.1:9080/hello
  2 threads and 16 connections
  Thread Stats   Avg      Stdev     Max   +/- Stdev
    Latency     1.57ms  289.91us   7.23ms   89.61%
    Req/Sec     5.14k   584.14    10.43k    96.04%
  51652 requests in 5.10s, 205.95MB read
Requests/sec:  10128.09
Transfer/sec:     40.38MB
+ sleep 1
+ wrk -d 5 -c 16 http://127.0.0.1:9080/hello
Running 5s test @ http://127.0.0.1:9080/hello
  2 threads and 16 connections
  Thread Stats   Avg      Stdev     Max   +/- Stdev
    Latency     1.55ms  255.99us   6.55ms   89.96%
    Req/Sec     5.18k   539.62     9.77k    95.05%
  52008 requests in 5.10s, 207.37MB read
Requests/sec:  10198.48
Transfer/sec:     40.66MB
```

Locking only on update comes out roughly 3-7% ahead in requests/sec in these runs.
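For reference, the single-key alternative quoted above could look roughly like this (a hypothetical sketch: the `"%.17g|%.17g"` packing format and the `set_packed`/`get_packed` names are my own illustration, not code from this PR):

```lua
local ngx_shared = ngx.shared

-- One key holds both the ewma and its timestamp, so a lock-free get()
-- always sees a consistent pair written by a single atomic set().
local shm_packed = ngx_shared.balancer_ewma  -- reusing an existing dict for illustration

local function set_packed(upstream, ewma, now)
    -- encode as "ewma|timestamp"; setting one shared-dict key is atomic
    return shm_packed:set(upstream, string.format("%.17g|%.17g", ewma, now))
end

local function get_packed(upstream)
    local packed = shm_packed:get(upstream)
    if not packed then
        return 0, 0
    end
    -- decode; both numbers come from the same write, so no lock is needed
    local sep = packed:find("|", 1, true)
    return tonumber(packed:sub(1, sep - 1)), tonumber(packed:sub(sep + 1))
end
```

This removes the read race without any locking, but it pays the `string.format`/`tonumber` cost on every request, which is exactly the encoding/decoding overhead the quoted comment suggests benchmarking.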
