nic-6443 commented on code in PR #12261:
URL: https://github.com/apache/apisix/pull/12261#discussion_r3393123830
##########
apisix/balancer/least_conn.lua:
##########
@@ -109,5 +258,67 @@ function _M.new(up_nodes, upstream)
}
end
+local function cleanup_all_conn_counts()
+ if not conn_count_dict then
+ conn_count_dict = ngx_shared[CONN_COUNT_DICT_NAME]
+ end
+
+ if not conn_count_dict then
+ -- No shared dict available, nothing to cleanup
+ return
+ end
+
+ -- Use batched cleanup to avoid performance issues with large dictionaries
+ -- Process keys in batches to limit memory usage and execution time
+ local batch_size = 100 -- Process 100 keys at a time
+ local total_cleaned = 0
+ local processed_count = 0
+ local has_more = true
+
+ while has_more do
+ local keys, err = conn_count_dict:get_keys(batch_size)
+ if err then
+ core.log.error("failed to get keys from shared dict during
cleanup: ", err)
+ return
+ end
+
+ if not keys or #keys == 0 then
+ break
+ end
+
+ local batch_cleaned = 0
+ for _, key in ipairs(keys) do
+ processed_count = processed_count + 1
+ if core.string.has_prefix(key, "conn_count:") then
+ local ok, delete_err = conn_count_dict:delete(key)
+ if not ok and delete_err then
+ core.log.warn("failed to delete connection count key
during cleanup: ",
+ key, ", error: ", delete_err)
+ else
+ batch_cleaned = batch_cleaned + 1
+ total_cleaned = total_cleaned + 1
+ end
+ end
+ end
+
+ -- Yield control periodically to avoid blocking
+ if processed_count % 1000 == 0 then
+ ngx.sleep(0.001) -- 1ms yield
+ end
+
+ -- If we got fewer keys than batch_size, we're done
+ if #keys < batch_size then
+ has_more = false
+ end
+ end
+
+ if total_cleaned > 0 then
+ core.log.info("cleaned up ", total_cleaned, " connection count entries
from shared dict")
+ end
+end
+
+function _M.cleanup_all()
Review Comment:
`cleanup_all()` is never called from anywhere — neither this PR nor existing
code wires it up, so the "global cleanup" escape hatch referenced in the
comments and docs does not exist in practice. That makes missed decrements
permanent: if a worker crashes with in-flight connections, the counts for those
still-live servers stay inflated forever — `cleanup_stale_conn_counts` only
deletes entries that are exactly 0, and the shared dict survives `apisix
reload`. A node that leaks +N gets deprioritized indefinitely until a full
restart wipes the dict. I think the feature needs a real reconciliation story
(e.g. scope keys by a generation/epoch invalidated on init_worker, or
periodically rebuild counts from actual state) rather than trusting raw
counters to stay accurate forever.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]