This is an automated email from the ASF dual-hosted git repository.
AlinsRan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/apisix.git
The following commit(s) were added to refs/heads/master by this push:
new 100736686 feat(ai-proxy-multi): add max_retries and
retry_on_failure_within_ms for fallback (#13495)
100736686 is described below
commit 100736686bde281a72b4c0fc9812b737fde5149f
Author: AlinsRan <[email protected]>
AuthorDate: Wed Jun 10 11:11:55 2026 +0800
feat(ai-proxy-multi): add max_retries and retry_on_failure_within_ms for
fallback (#13495)
---
apisix/plugins/ai-proxy-multi.lua | 28 ++++
apisix/plugins/ai-proxy/schema.lua | 21 +++
docs/en/latest/plugins/ai-proxy-multi.md | 2 +
docs/zh/latest/plugins/ai-proxy-multi.md | 2 +
t/plugin/ai-proxy-multi-retry.t | 216 +++++++++++++++++++++++++++++++
5 files changed, 269 insertions(+)
diff --git a/apisix/plugins/ai-proxy-multi.lua
b/apisix/plugins/ai-proxy-multi.lua
index 3c6ac9330..feffa77c5 100644
--- a/apisix/plugins/ai-proxy-multi.lua
+++ b/apisix/plugins/ai-proxy-multi.lua
@@ -28,6 +28,7 @@ local tonumber = tonumber
local pairs = pairs
local table_sort = table.sort
local math_random = math.random
+local ngx_now = ngx.now
local require = require
local pcall = pcall
@@ -532,6 +533,33 @@ local function retry_on_error(ctx, conf, code)
if (code == 429 and fallback_strategy_has(conf.fallback_strategy,
"http_429")) or
(code >= 500 and code < 600 and
fallback_strategy_has(conf.fallback_strategy, "http_5xx")) then
+ -- Slow-failure guard: only retry when the failed attempt finished within
+ -- retry_on_failure_within_ms. A slow failure (e.g. a 5xx returned after
+ -- minutes) is given back to the client directly, so fallback never doubles
+ -- the client's wait time. ctx.llm_request_start_time is reset by base
+ -- before_proxy at the start of every attempt, so this measures the elapsed
+ -- time of the attempt that just failed.
+ if conf.retry_on_failure_within_ms and ctx.llm_request_start_time then
+ local elapsed_ms = (ngx_now() - ctx.llm_request_start_time) * 1000
+ if elapsed_ms > conf.retry_on_failure_within_ms then
+ core.log.warn("ai instance failed after ", elapsed_ms,
+ "ms, exceeding retry_on_failure_within_ms ",
+ conf.retry_on_failure_within_ms, ", not
retrying")
+ return code
+ end
+ end
+
+ -- Cap the number of fallback retries so a single request does not exhaust
+ -- every instance when many are configured.
+ if conf.max_retries then
+ ctx.ai_retries = (ctx.ai_retries or 0) + 1
+ if ctx.ai_retries > conf.max_retries then
+ core.log.warn("reached max_retries ", conf.max_retries,
+ ", not retrying")
+ return code
+ end
+ end
+
local name, ai_instance, err = pick_ai_instance(ctx, conf)
if err then
core.log.error("failed to pick new AI instance: ", err)
diff --git a/apisix/plugins/ai-proxy/schema.lua
b/apisix/plugins/ai-proxy/schema.lua
index 79384bdcb..4cec19bb1 100644
--- a/apisix/plugins/ai-proxy/schema.lua
+++ b/apisix/plugins/ai-proxy/schema.lua
@@ -332,6 +332,27 @@ _M.ai_proxy_multi_schema = {
}
}
},
+ max_retries = {
+ type = "integer",
+ minimum = 0,
+ description = "Maximum number of fallback retries after the
initial "
+ .. "request fails. Bounds how many additional instances a
single "
+ .. "request tries, so it does not exhaust every configured "
+ .. "instance. Only takes effect together with
fallback_strategy. "
+ .. "Unset means no explicit cap (retry until an instance
succeeds "
+ .. "or all are tried).",
+ },
+ retry_on_failure_within_ms = {
+ type = "integer",
+ minimum = 1,
+ description = "Only fall back to another instance when the
upstream "
+ .. "fails within this many milliseconds. Fast failures (e.g. "
+ .. "connection errors, quick 429/5xx) are retried; a slow
failure "
+ .. "that takes longer than this is returned to the client
directly "
+ .. "to avoid doubling the wait time. Only takes effect
together "
+ .. "with fallback_strategy. Unset means retry regardless of
how "
+ .. "long the failed attempt took.",
+ },
timeout = {
type = "integer",
minimum = 1,
diff --git a/docs/en/latest/plugins/ai-proxy-multi.md
b/docs/en/latest/plugins/ai-proxy-multi.md
index 771801ba0..fe25e0d77 100644
--- a/docs/en/latest/plugins/ai-proxy-multi.md
+++ b/docs/en/latest/plugins/ai-proxy-multi.md
@@ -68,6 +68,8 @@ When an instance's `provider` is set to `bedrock`, the Plugin
expects requests i
| Name | Type | Required | Default
| Valid values | Description |
|------------------------------------|----------------|----------|-----------------------------------|--------------|-------------|
| fallback_strategy | string or array | False | |
string: "instance_health_and_rate_limiting", "http_429", "http_5xx"<br />array:
["rate_limiting", "http_429", "http_5xx"] | Fallback strategy. When set, the
Plugin will check whether the specified instance's token has been exhausted
when a request is forwarded. If so, forward the request to the next instance
regardless of the instance priority. When not set, the Plugin will not forward
the request to low prior [...]
+| max_retries | integer | False |
| greater than or equal to 0 | Maximum number of fallback
retries after the initial request fails. Bounds how many additional instances a
single request tries, so it does not exhaust every configured instance. Only
takes effect together with `fallback_strategy`. When unset, the Plugin retries
until an instance succeeds or all are tried. |
+| retry_on_failure_within_ms | integer | False |
| greater than or equal to 1 | Only fall back to another
instance when the upstream fails within this many milliseconds. Fast failures
(such as connection errors or quick `429`/`5xx`) are retried, while a slow
failure that takes longer than this is returned to the client directly to avoid
doubling the wait time. Only takes effect together with `fallback_strategy`.
When unset, the Plugin retrie [...]
| balancer | object | False |
| | Load balancing configurations. |
| balancer.algorithm | string | False | roundrobin
| [roundrobin, chash] | Load balancing algorithm. When set
to `roundrobin`, weighted round robin algorithm is used. When set to `chash`,
consistent hashing algorithm is used. |
| balancer.hash_on | string | False |
| [vars, headers, cookie, consumer, vars_combinations] |
Used when `type` is `chash`. Support hashing on [NGINX
variables](https://nginx.org/en/docs/varindex.html), headers, cookie, consumer,
or a combination of [NGINX variables](https://nginx.org/en/docs/varindex.html).
|
diff --git a/docs/zh/latest/plugins/ai-proxy-multi.md
b/docs/zh/latest/plugins/ai-proxy-multi.md
index 8148def3c..a1e59084e 100644
--- a/docs/zh/latest/plugins/ai-proxy-multi.md
+++ b/docs/zh/latest/plugins/ai-proxy-multi.md
@@ -68,6 +68,8 @@ import TabItem from '@theme/TabItem';
| 名称 | 类型 | 必选项 | 默认值
| 有效值 | 描述 |
|------------------------------------|----------------|----------|-----------------------------------|--------------|-------------|
| fallback_strategy | string 或 array | 否 | |
string: "instance_health_and_rate_limiting", "http_429", "http_5xx"<br />array:
["rate_limiting", "http_429", "http_5xx"] |
故障转移策略。设置后,插件将在转发请求时检查指定实例的令牌是否已耗尽。如果是,则无论实例优先级如何,都将请求转发到下一个实例。未设置时,当高优先级实例的令牌耗尽时,插件不会将请求转发到低优先级实例。
|
+| max_retries | integer | 否 |
| 大于或等于 0 |
初始请求失败后允许的最大故障转移重试次数。用于限制单个请求最多尝试多少个额外实例,避免穷举所有已配置的实例。仅在配置 `fallback_strategy`
时生效。未设置时,插件会持续重试直到某个实例成功或所有实例都已尝试。 |
+| retry_on_failure_within_ms | integer | 否 |
| 大于或等于 1 | 仅当上游在指定毫秒数内失败时才故障转移到其他实例。快速失败(如连接错误、快速返回的
`429`/`5xx`)会触发重试,而耗时超过该值的慢失败会直接将错误返回给客户端,避免客户端等待时间翻倍。仅在配置 `fallback_strategy`
时生效。未设置时,插件无论失败请求耗时多久都会重试。 |
| balancer | object | 否 |
| | 负载均衡配置。 |
| balancer.algorithm | string | 否 | roundrobin
| [roundrobin, chash] | 负载均衡算法。设置为 `roundrobin` 时,使用加权轮询算法。设置为
`chash` 时,使用一致性哈希算法。 |
| balancer.hash_on | string | 否 |
| [vars, headers, cookie, consumer, vars_combinations] | 当
`type` 为 `chash` 时使用。支持基于 [NGINX
变量](https://nginx.org/en/docs/varindex.html)、标头、cookie、消费者或 [NGINX
变量](https://nginx.org/en/docs/varindex.html)组合进行哈希。 |
diff --git a/t/plugin/ai-proxy-multi-retry.t b/t/plugin/ai-proxy-multi-retry.t
new file mode 100644
index 000000000..a60fd7e10
--- /dev/null
+++ b/t/plugin/ai-proxy-multi-retry.t
@@ -0,0 +1,216 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+use t::APISIX 'no_plan';
+
+log_level("info");
+repeat_each(1);
+no_long_string();
+no_root_location();
+
+
+add_block_preprocessor(sub {
+ my ($block) = @_;
+
+ if (!defined $block->request) {
+ $block->set_value("request", "GET /t");
+ }
+
+ my $user_yaml_config = <<_EOC_;
+plugins:
+ - ai-proxy-multi
+ - prometheus
+_EOC_
+ $block->set_value("extra_yaml_config", $user_yaml_config);
+
+ my $http_config = $block->http_config // <<_EOC_;
+ server {
+ server_name fast_internal_error;
+ default_type 'application/json';
+ listen 6731;
+ location / {
+ content_by_lua_block {
+ ngx.status = 500
+ ngx.say([[{ "error": {"message":"fast internal error"}}]])
+ return
+ }
+ }
+ }
+ server {
+ server_name slow_internal_error;
+ default_type 'application/json';
+ listen 6732;
+ location / {
+ content_by_lua_block {
+ ngx.sleep(0.5)
+ ngx.status = 500
+ ngx.say([[{ "error": {"message":"slow internal error"}}]])
+ return
+ }
+ }
+ }
+ server {
+ server_name success_instance;
+ default_type 'application/json';
+ listen 6733;
+ location / {
+ content_by_lua_block {
+ ngx.status = 200
+ ngx.print("success")
+ return
+ }
+ }
+ }
+_EOC_
+
+ $block->set_value("http_config", $http_config);
+});
+
+run_tests();
+
+__DATA__
+
+=== TEST 1: max_retries caps fallback so all instances are not exhausted
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/1',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/anything",
+ "plugins": {
+ "ai-proxy-multi": {
+ "fallback_strategy": ["http_5xx"],
+ "max_retries": 1,
+ "balancer": {
+ "algorithm": "roundrobin"
+ },
+ "instances": [
+
{"name":"err-1","provider":"openai-compatible","weight":1,"auth":{"header":{"Authorization":"Bearer
token"}},"options":{"model":"gpt-4"},"override":{"endpoint":"http://127.0.0.1:6731"}},
+
{"name":"err-2","provider":"openai-compatible","weight":1,"auth":{"header":{"Authorization":"Bearer
token"}},"options":{"model":"gpt-4"},"override":{"endpoint":"http://127.0.0.1:6731"}},
+
{"name":"err-3","provider":"openai-compatible","weight":1,"auth":{"header":{"Authorization":"Bearer
token"}},"options":{"model":"gpt-4"},"override":{"endpoint":"http://127.0.0.1:6731"}}
+ ],
+ "ssl_verify": false
+ }
+ }
+ }]]
+ )
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 2: request stops after max_retries and returns the upstream error
(500, not 502)
+--- request
+POST /anything
+{ "messages": [ { "role": "user", "content": "What is 1+1?"} ] }
+--- error_code: 500
+--- error_log
+reached max_retries 1
+
+
+
+=== TEST 3: fast failure within retry_on_failure_within_ms still triggers
fallback
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/1',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/anything",
+ "plugins": {
+ "ai-proxy-multi": {
+ "fallback_strategy": ["http_5xx"],
+ "retry_on_failure_within_ms": 5000,
+ "instances": [
+
{"name":"fast-err","provider":"openai-compatible","weight":1,"priority":10,"auth":{"header":{"Authorization":"Bearer
token"}},"options":{"model":"gpt-4"},"override":{"endpoint":"http://127.0.0.1:6731"}},
+
{"name":"success","provider":"openai-compatible","weight":1,"priority":0,"auth":{"header":{"Authorization":"Bearer
token"}},"options":{"model":"gpt-4"},"override":{"endpoint":"http://127.0.0.1:6733"}}
+ ],
+ "ssl_verify": false
+ }
+ }
+ }]]
+ )
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 4: fast failure falls back to the healthy instance
+--- request
+POST /anything
+{ "messages": [ { "role": "user", "content": "What is 1+1?"} ] }
+--- response_body chomp
+success
+--- error_code: 200
+
+
+
+=== TEST 5: slow failure beyond retry_on_failure_within_ms is returned directly
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/1',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/anything",
+ "plugins": {
+ "ai-proxy-multi": {
+ "fallback_strategy": ["http_5xx"],
+ "retry_on_failure_within_ms": 200,
+ "instances": [
+
{"name":"slow-err","provider":"openai-compatible","weight":1,"priority":10,"auth":{"header":{"Authorization":"Bearer
token"}},"options":{"model":"gpt-4"},"override":{"endpoint":"http://127.0.0.1:6732"}},
+
{"name":"success","provider":"openai-compatible","weight":1,"priority":0,"auth":{"header":{"Authorization":"Bearer
token"}},"options":{"model":"gpt-4"},"override":{"endpoint":"http://127.0.0.1:6733"}}
+ ],
+ "ssl_verify": false
+ }
+ }
+ }]]
+ )
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 6: slow failure does not fall back and returns the upstream error
+--- request
+POST /anything
+{ "messages": [ { "role": "user", "content": "What is 1+1?"} ] }
+--- error_code: 500
+--- error_log
+exceeding retry_on_failure_within_ms 200