This is an automated email from the ASF dual-hosted git repository.
shreemaan-abhishek pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/apisix.git
The following commit(s) were added to refs/heads/master by this push:
new e32ed3e16 test(ci): fix flaky tests (#13332)
e32ed3e16 is described below
commit e32ed3e1661bf4ae7a863cc00b54b35195f8e678
Author: Shreemaan Abhishek <[email protected]>
AuthorDate: Thu May 7 10:40:49 2026 +0800
test(ci): fix flaky tests (#13332)
* test(ci): fix flaky tests (follow-up to #13266)
Fix four flakes that survived #13266, identified by re-running the
cross-branch flake analysis on CI failures from the past week:
- t/cli/test_etcd_sync_event_handle.sh "Round 2 Request 1 unexpected":
the fixed `sleep 5` was too short on slow runners — etcd auth-toggle +
watch-reconnect + bulk-event-apply can take longer than 5s, so curl
hits the stale route. Replaced with a 30s deadline poll on /1 until
status 204 (the new fault-injection plugin) appears.
- t/core/config_etcd.t TEST 10 "timeout when waiting for the process to
exit": the test calls test_sync_data which fires init_watch_ctx, leaving
a run_watch background timer alive; on slow runners nginx shutdown
exceeds the default 3s kill-wait. Bumped TEST_NGINX_TIMEOUT to 30 for
the file.
- t/admin/plugins-reload.t TEST 1, TEST 2 "grep_error_log_out empty":
the post-reload "sync local conf to etcd" log doesn't always land
within ngx.sleep(1). Bumped to ngx.sleep(2).
- t/discovery/eureka.t TEST 4 "failed to fetch registry from
127.0.0.1:20997 should match": added `--- wait: 2` so the eureka
fetch interval (1s) has a chance to fire before the grep runs.
* test(plugins-reload): bump initial-filter-fire wait in TEST 2
The previous bump was on the wrong sleep. The flake symptom is the
"reload plugins on node before reload" log line missing from the
grep_error_log_out diff — i.e. the initial filter fire (which logs
"before reload" while before_reload=true) didn't land before the test
flipped before_reload=false. The fix is to bump the sleep BETWEEN
core.config.new and the before_reload=false flip, not the final sleep
after the reload PUT.
* test(plugins-reload): bump per-test timeout to 10s
The bumped ngx.sleep calls in TEST 1/2 push handler time over the
default 3s socket timeout from Test::Nginx::Util::$Timeout, causing
prove to fail with "ERROR: client socket timed out". Add `--- timeout:
10` to both tests so the test client waits long enough for the
extended sleep windows to complete.
* test(radixtree-sni3): bump TEST_NGINX_TIMEOUT to 30s
Same fix shape as t/core/config_etcd.t in this PR — tests that exercise
etcd-watcher background timers can hit Test::Nginx's default 3s
process-exit kill-wait on slow runners. The EE counterpart of this file
has tripped this in CI; the same race exists in apisix and is a free
pre-emptive fix (no test slowdown — env var only affects shutdown).
* test(skywalking2): bump TEST 2 request count from 12 to 50
The test runs with workers(4) and the patched startBackendTimer only
logs "start skywalking backend timer" when ngx.worker.id()==0. With 12
keepalive=false connections and 4 workers, the ~3% probability that no
connection lands on worker 0 produced a recurring flake — observed on
master + 3 unrelated branches in the past 8 days. Bumping to 50 drops
the miss probability to (3/4)^50 ≈ 0.00006%. No semantic change.
* test(skywalking2): bump TEST 2 per-test timeout to 10s
The previous commit bumped the request loop from 12 to 50, but 50
sequential keepalive=false round-trips on CI runners can take longer
than the default 3s test-client socket timeout, causing
"ERROR: client socket timed out". Add `--- timeout: 10` so the test
client waits long enough for the extended handler to complete.
---
t/admin/plugins-reload.t | 12 +++++++++---
t/cli/test_etcd_sync_event_handle.sh | 13 ++++++++++++-
t/core/config_etcd.t | 4 ++++
t/discovery/eureka.t | 1 +
t/plugin/skywalking2.t | 8 +++++++-
t/router/radixtree-sni3.t | 5 +++++
6 files changed, 38 insertions(+), 5 deletions(-)
diff --git a/t/admin/plugins-reload.t b/t/admin/plugins-reload.t
index a0f4ba7df..c08b9f729 100644
--- a/t/admin/plugins-reload.t
+++ b/t/admin/plugins-reload.t
@@ -45,13 +45,16 @@ location /t {
ngx.status = code
ngx.say(org_body)
- ngx.sleep(1)
+ -- 1s was racy on slow runners; the post-reload sync log doesn't always
+ -- land in time for grep_error_log_out.
+ ngx.sleep(2)
}
}
--- request
GET /t
--- response_body
done
+--- timeout: 10
--- grep_error_log eval
qr/sync local conf to etcd/
--- grep_error_log_out
@@ -88,7 +91,9 @@ location /t {
error("failed to create etcd instance for fetching /plugins : "
.. err)
end
- ngx.sleep(1)
+ -- Wait for the initial filter fire (logs "before reload") to land
+ -- before flipping before_reload=false. 1s was racy on slow runners.
+ ngx.sleep(2)
local data = [[
deployment:
@@ -113,13 +118,14 @@ stream_plugins:
ngx.status = code
ngx.say(org_body)
- ngx.sleep(1)
+ ngx.sleep(2)
}
}
--- request
GET /t
--- response_body
done
+--- timeout: 10
--- grep_error_log eval
qr/reload plugins on node \w+ reload/
--- grep_error_log_out
diff --git a/t/cli/test_etcd_sync_event_handle.sh b/t/cli/test_etcd_sync_event_handle.sh
index a348b52b7..2e608f4b0 100755
--- a/t/cli/test_etcd_sync_event_handle.sh
+++ b/t/cli/test_etcd_sync_event_handle.sh
@@ -89,7 +89,18 @@ etcdctl --endpoints=127.0.0.1:2379
--user=root:apache-api6-sync put /apisix/rout
etcdctl --endpoints=127.0.0.1:2379 --user=root:apache-api6-sync auth disable
etcdctl --endpoints=127.0.0.1:2379 user delete root
etcdctl --endpoints=127.0.0.1:2379 role delete root
-sleep 5 # wait resync by watch
+# Wait for resync by watch. The fixed `sleep 5` was racy on slow runners — the etcd
+# auth-toggle + watch-reconnect + bulk-event-apply can take longer, so poll route /1
+# until the fault-injection plugin is applied (status 204).
+deadline=$(( $(date +%s) + 30 ))
+{ set +x; } 2>/dev/null
+while [ "$(date +%s)" -lt "$deadline" ]; do
+ if [ "$(curl -s -o /dev/null -w '%{http_code}' http://127.0.0.1:9080/1)" =
"204" ]; then
+ break
+ fi
+ sleep 0.5
+done
+set -x
# Test request
# All but the intentionally incoming misconfigurations should be applied,
diff --git a/t/core/config_etcd.t b/t/core/config_etcd.t
index 918bdd8ce..ead9112aa 100644
--- a/t/core/config_etcd.t
+++ b/t/core/config_etcd.t
@@ -14,6 +14,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
+# TEST 10 leaves a `run_watch` background timer alive (via init_watch_ctx), so
+# nginx shutdown takes longer than the default 3s kill-wait. Bump for the file.
+BEGIN { $ENV{TEST_NGINX_TIMEOUT} = 30; }
+
use t::APISIX 'no_plan';
repeat_each(1);
diff --git a/t/discovery/eureka.t b/t/discovery/eureka.t
index f87272675..7bee81142 100644
--- a/t/discovery/eureka.t
+++ b/t/discovery/eureka.t
@@ -165,6 +165,7 @@ successfully updated service registry
--- no_error_log
failed to fetch registry from all eureka hosts
failed to fetch registry from http://127.0.0.1:8761/eureka/
+--- wait: 2
diff --git a/t/plugin/skywalking2.t b/t/plugin/skywalking2.t
index 4d7f3908d..80748ada3 100644
--- a/t/plugin/skywalking2.t
+++ b/t/plugin/skywalking2.t
@@ -150,7 +150,12 @@ passed
local uri = "http://127.0.0.1:" .. ngx.var.server_port
.. "/opentracing"
local ports_count = {}
- for i = 1, 12 do
+ -- The patched startBackendTimer only logs when ngx.worker.id()==0,
+ -- so we need at least one connection to land on worker 0. With
+ -- workers(4) and keepalive=false, 12 requests gave a ~3% chance
+ -- of missing worker 0 entirely; bump to 50 to make that vanishingly
+ -- unlikely (~0.00006%).
+ for i = 1, 50 do
local httpc = http.new()
local res, err = httpc:request_uri(uri, {method = "GET",
keepalive = false})
if not res then
@@ -170,6 +175,7 @@ passed
GET /t
--- response_body
passed
+--- timeout: 10
--- grep_error_log eval
qr/start skywalking backend timer/
--- grep_error_log_out
diff --git a/t/router/radixtree-sni3.t b/t/router/radixtree-sni3.t
index d7cd40de6..88210c9ec 100644
--- a/t/router/radixtree-sni3.t
+++ b/t/router/radixtree-sni3.t
@@ -14,6 +14,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
+# TEST 7 occasionally hits Test::Nginx's process-exit timeout (default 3s) on
+# slow runners. Bump for the file. (See EE counterpart: TEST 10 has tripped this
+# in EE CI; the same race exists here in apisix.)
+BEGIN { $ENV{TEST_NGINX_TIMEOUT} = 30; }
+
use t::APISIX 'no_plan';
log_level('debug');