(apisix) branch master updated: feat(ai-proxy-multi): add max_retries and retry_on_failure_within_ms for fallback (#13495)

alinsran Tue, 09 Jun 2026 20:12:16 -0700

This is an automated email from the ASF dual-hosted git repository.

AlinsRan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/apisix.git



The following commit(s) were added to refs/heads/master by this push:
     new 100736686 feat(ai-proxy-multi): add max_retries and retry_on_failure_within_ms for fallback (#13495)
100736686 is described below

commit 100736686bde281a72b4c0fc9812b737fde5149f
Author: AlinsRan <[email protected]>
AuthorDate: Wed Jun 10 11:11:55 2026 +0800

    feat(ai-proxy-multi): add max_retries and retry_on_failure_within_ms for fallback (#13495)
---
 apisix/plugins/ai-proxy-multi.lua        |  28 ++++
 apisix/plugins/ai-proxy/schema.lua       |  21 +++
 docs/en/latest/plugins/ai-proxy-multi.md |   2 +
 docs/zh/latest/plugins/ai-proxy-multi.md |   2 +
 t/plugin/ai-proxy-multi-retry.t          | 216 +++++++++++++++++++++++++++++++
 5 files changed, 269 insertions(+)

diff --git a/apisix/plugins/ai-proxy-multi.lua b/apisix/plugins/ai-proxy-multi.lua
index 3c6ac9330..feffa77c5 100644
--- a/apisix/plugins/ai-proxy-multi.lua
+++ b/apisix/plugins/ai-proxy-multi.lua
@@ -28,6 +28,7 @@ local tonumber = tonumber
 local pairs = pairs
 local table_sort = table.sort
 local math_random = math.random
+local ngx_now = ngx.now
 
 local require = require
 local pcall = pcall
@@ -532,6 +533,33 @@ local function retry_on_error(ctx, conf, code)
     if (code == 429 and fallback_strategy_has(conf.fallback_strategy, "http_429")) or
        (code >= 500 and code < 600 and
        fallback_strategy_has(conf.fallback_strategy, "http_5xx")) then
+        -- Slow-failure guard: only retry when the failed attempt finished within
+        -- retry_on_failure_within_ms. A slow failure (e.g. a 5xx returned after
+        -- minutes) is given back to the client directly, so fallback never doubles
+        -- the client's wait time. ctx.llm_request_start_time is reset by base
+        -- before_proxy at the start of every attempt, so this measures the elapsed
+        -- time of the attempt that just failed.
+        if conf.retry_on_failure_within_ms and ctx.llm_request_start_time then
+            local elapsed_ms = (ngx_now() - ctx.llm_request_start_time) * 1000
+            if elapsed_ms > conf.retry_on_failure_within_ms then
+                core.log.warn("ai instance failed after ", elapsed_ms,
+                              "ms, exceeding retry_on_failure_within_ms ",
+                              conf.retry_on_failure_within_ms, ", not retrying")
+                return code
+            end
+        end
+
+        -- Cap the number of fallback retries so a single request does not exhaust
+        -- every instance when many are configured.
+        if conf.max_retries then
+            ctx.ai_retries = (ctx.ai_retries or 0) + 1
+            if ctx.ai_retries > conf.max_retries then
+                core.log.warn("reached max_retries ", conf.max_retries,
+                              ", not retrying")
+                return code
+            end
+        end
+
         local name, ai_instance, err = pick_ai_instance(ctx, conf)
         if err then
             core.log.error("failed to pick new AI instance: ", err)
diff --git a/apisix/plugins/ai-proxy/schema.lua b/apisix/plugins/ai-proxy/schema.lua
index 79384bdcb..4cec19bb1 100644
--- a/apisix/plugins/ai-proxy/schema.lua
+++ b/apisix/plugins/ai-proxy/schema.lua
@@ -332,6 +332,27 @@ _M.ai_proxy_multi_schema = {
               }
             }
         },
+        max_retries = {
+            type = "integer",
+            minimum = 0,
+            description = "Maximum number of fallback retries after the initial "
+                .. "request fails. Bounds how many additional instances a single "
+                .. "request tries, so it does not exhaust every configured "
+                .. "instance. Only takes effect together with fallback_strategy. "
+                .. "Unset means no explicit cap (retry until an instance succeeds "
+                .. "or all are tried).",
+        },
+        retry_on_failure_within_ms = {
+            type = "integer",
+            minimum = 1,
+            description = "Only fall back to another instance when the upstream "
+                .. "fails within this many milliseconds. Fast failures (e.g. "
+                .. "connection errors, quick 429/5xx) are retried; a slow failure "
+                .. "that takes longer than this is returned to the client directly "
+                .. "to avoid doubling the wait time. Only takes effect together "
+                .. "with fallback_strategy. Unset means retry regardless of how "
+                .. "long the failed attempt took.",
+        },
         timeout = {
             type = "integer",
             minimum = 1,
diff --git a/docs/en/latest/plugins/ai-proxy-multi.md b/docs/en/latest/plugins/ai-proxy-multi.md
index 771801ba0..fe25e0d77 100644
--- a/docs/en/latest/plugins/ai-proxy-multi.md
+++ b/docs/en/latest/plugins/ai-proxy-multi.md
@@ -68,6 +68,8 @@ When an instance's `provider` is set to `bedrock`, the Plugin 
expects requests i
 | Name                               | Type            | Required | Default    
                       | Valid values | Description |
 
|------------------------------------|----------------|----------|-----------------------------------|--------------|-------------|
 | fallback_strategy                  | string or array         | False    |  | 
string: "instance_health_and_rate_limiting", "http_429", "http_5xx"<br />array: 
["rate_limiting", "http_429", "http_5xx"] | Fallback strategy. When set, the 
Plugin will check whether the specified instance's token has been exhausted 
when a request is forwarded. If so, forward the request to the next instance 
regardless of the instance priority. When not set, the Plugin will not forward 
the request to low prior [...]
+| max_retries                        | integer        | False    |             
                      | greater or equal to 0 | Maximum number of fallback 
retries after the initial request fails. Bounds how many additional instances a 
single request tries, so it does not exhaust every configured instance. Only 
takes effect together with `fallback_strategy`. When unset, the Plugin retries 
until an instance succeeds or all are tried. |
+| retry_on_failure_within_ms         | integer        | False    |             
                      | greater or equal to 1 | Only fall back to another 
instance when the upstream fails within this many milliseconds. Fast failures 
(such as connection errors or quick `429`/`5xx`) are retried, while a slow 
failure that takes longer than this is returned to the client directly to avoid 
doubling the wait time. Only takes effect together with `fallback_strategy`. 
When unset, the Plugin retrie [...]
 | balancer                           | object         | False    |             
                      |              | Load balancing configurations. |
 | balancer.algorithm                 | string         | False    | roundrobin  
                   | [roundrobin, chash] | Load balancing algorithm. When set 
to `roundrobin`, weighted round robin algorithm is used. When set to `chash`, 
consistent hashing algorithm is used. |
 | balancer.hash_on                   | string         | False    |             
                      | [vars, headers, cookie, consumer, vars_combinations] | 
Used when `type` is `chash`. Support hashing on [NGINX 
variables](https://nginx.org/en/docs/varindex.html), headers, cookie, consumer, 
or a combination of [NGINX variables](https://nginx.org/en/docs/varindex.html). 
|
diff --git a/docs/zh/latest/plugins/ai-proxy-multi.md b/docs/zh/latest/plugins/ai-proxy-multi.md
index 8148def3c..a1e59084e 100644
--- a/docs/zh/latest/plugins/ai-proxy-multi.md
+++ b/docs/zh/latest/plugins/ai-proxy-multi.md
@@ -68,6 +68,8 @@ import TabItem from '@theme/TabItem';
 | 名称                               | 类型            | 必选项 | 默认值                 
          | 有效值 | 描述 |
 
|------------------------------------|----------------|----------|-----------------------------------|--------------|-------------|
 | fallback_strategy                  | string 或 array         | 否    |  | 
string: "instance_health_and_rate_limiting", "http_429", "http_5xx"<br />array: 
["rate_limiting", "http_429", "http_5xx"] | 
故障转移策略。设置后，插件将在转发请求时检查指定实例的令牌是否已耗尽。如果是，则无论实例优先级如何，都将请求转发到下一个实例。未设置时，当高优先级实例的令牌耗尽时，插件不会将请求转发到低优先级实例。
 |
+| max_retries                        | integer        | 否    |                 
                  | 大于或等于 0 | 
初始请求失败后允许的最大故障转移重试次数。用于限制单个请求最多尝试多少个额外实例，避免穷举所有已配置的实例。仅在配置 `fallback_strategy` 
时生效。未设置时，插件会持续重试直到某个实例成功或所有实例都已尝试。 |
+| retry_on_failure_within_ms         | integer        | 否    |                 
                  | 大于或等于 1 | 仅当上游在指定毫秒数内失败时才故障转移到其他实例。快速失败（如连接错误、快速返回的 
`429`/`5xx`）会触发重试，而耗时超过该值的慢失败会直接将错误返回给客户端，避免客户端等待时间翻倍。仅在配置 `fallback_strategy` 
时生效。未设置时，插件无论失败请求耗时多久都会重试。 |
 | balancer                           | object         | 否    |                 
                  |              | 负载均衡配置。 |
 | balancer.algorithm                 | string         | 否    | roundrobin      
               | [roundrobin, chash] | 负载均衡算法。设置为 `roundrobin` 时，使用加权轮询算法。设置为 
`chash` 时，使用一致性哈希算法。 |
 | balancer.hash_on                   | string         | 否    |                 
                  | [vars, headers, cookie, consumer, vars_combinations] | 当 
`type` 为 `chash` 时使用。支持基于 [NGINX 
变量](https://nginx.org/en/docs/varindex.html)、标头、cookie、消费者或 [NGINX 
变量](https://nginx.org/en/docs/varindex.html)组合进行哈希。 |
diff --git a/t/plugin/ai-proxy-multi-retry.t b/t/plugin/ai-proxy-multi-retry.t
new file mode 100644
index 000000000..a60fd7e10
--- /dev/null
+++ b/t/plugin/ai-proxy-multi-retry.t
@@ -0,0 +1,216 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+use t::APISIX 'no_plan';
+
+log_level("info");
+repeat_each(1);
+no_long_string();
+no_root_location();
+
+
+add_block_preprocessor(sub {
+    my ($block) = @_;
+
+    if (!defined $block->request) {
+        $block->set_value("request", "GET /t");
+    }
+
+    my $user_yaml_config = <<_EOC_;
+plugins:
+  - ai-proxy-multi
+  - prometheus
+_EOC_
+    $block->set_value("extra_yaml_config", $user_yaml_config);
+
+    my $http_config = $block->http_config // <<_EOC_;
+        server {
+            server_name fast_internal_error;
+            default_type 'application/json';
+            listen 6731;
+            location / {
+              content_by_lua_block {
+                ngx.status = 500
+                ngx.say([[{ "error": {"message":"fast internal error"}}]])
+                return
+              }
+            }
+        }
+        server {
+            server_name slow_internal_error;
+            default_type 'application/json';
+            listen 6732;
+            location / {
+              content_by_lua_block {
+                ngx.sleep(0.5)
+                ngx.status = 500
+                ngx.say([[{ "error": {"message":"slow internal error"}}]])
+                return
+              }
+            }
+        }
+        server {
+            server_name success_instance;
+            default_type 'application/json';
+            listen 6733;
+            location / {
+              content_by_lua_block {
+                ngx.status = 200
+                ngx.print("success")
+                return
+              }
+            }
+        }
+_EOC_
+
+    $block->set_value("http_config", $http_config);
+});
+
+run_tests();
+
+__DATA__
+
+=== TEST 1: max_retries caps fallback so all instances are not exhausted
+--- config
+    location /t {
+        content_by_lua_block {
+            local t = require("lib.test_admin").test
+            local code, body = t('/apisix/admin/routes/1',
+                 ngx.HTTP_PUT,
+                 [[{
+                    "uri": "/anything",
+                    "plugins": {
+                        "ai-proxy-multi": {
+                            "fallback_strategy": ["http_5xx"],
+                            "max_retries": 1,
+                            "balancer": {
+                                "algorithm": "roundrobin"
+                            },
+                            "instances": [
+                                {"name":"err-1","provider":"openai-compatible","weight":1,"auth":{"header":{"Authorization":"Bearer token"}},"options":{"model":"gpt-4"},"override":{"endpoint":"http://127.0.0.1:6731"}},
+                                {"name":"err-2","provider":"openai-compatible","weight":1,"auth":{"header":{"Authorization":"Bearer token"}},"options":{"model":"gpt-4"},"override":{"endpoint":"http://127.0.0.1:6731"}},
+                                {"name":"err-3","provider":"openai-compatible","weight":1,"auth":{"header":{"Authorization":"Bearer token"}},"options":{"model":"gpt-4"},"override":{"endpoint":"http://127.0.0.1:6731"}}
+                            ],
+                            "ssl_verify": false
+                        }
+                    }
+                }]]
+            )
+            if code >= 300 then
+                ngx.status = code
+            end
+            ngx.say(body)
+        }
+    }
+--- response_body
+passed
+
+
+
+=== TEST 2: request stops after max_retries and returns the upstream error (500, not 502)
+--- request
+POST /anything
+{ "messages": [ { "role": "user", "content": "What is 1+1?"} ] }
+--- error_code: 500
+--- error_log
+reached max_retries 1
+
+
+
+=== TEST 3: fast failure within retry_on_failure_within_ms still triggers fallback
+--- config
+    location /t {
+        content_by_lua_block {
+            local t = require("lib.test_admin").test
+            local code, body = t('/apisix/admin/routes/1',
+                 ngx.HTTP_PUT,
+                 [[{
+                    "uri": "/anything",
+                    "plugins": {
+                        "ai-proxy-multi": {
+                            "fallback_strategy": ["http_5xx"],
+                            "retry_on_failure_within_ms": 5000,
+                            "instances": [
+                                {"name":"fast-err","provider":"openai-compatible","weight":1,"priority":10,"auth":{"header":{"Authorization":"Bearer token"}},"options":{"model":"gpt-4"},"override":{"endpoint":"http://127.0.0.1:6731"}},
+                                {"name":"success","provider":"openai-compatible","weight":1,"priority":0,"auth":{"header":{"Authorization":"Bearer token"}},"options":{"model":"gpt-4"},"override":{"endpoint":"http://127.0.0.1:6733"}}
+                            ],
+                            "ssl_verify": false
+                        }
+                    }
+                }]]
+            )
+            if code >= 300 then
+                ngx.status = code
+            end
+            ngx.say(body)
+        }
+    }
+--- response_body
+passed
+
+
+
+=== TEST 4: fast failure falls back to the healthy instance
+--- request
+POST /anything
+{ "messages": [ { "role": "user", "content": "What is 1+1?"} ] }
+--- response_body chomp
+success
+--- error_code: 200
+
+
+
+=== TEST 5: slow failure beyond retry_on_failure_within_ms is returned directly
+--- config
+    location /t {
+        content_by_lua_block {
+            local t = require("lib.test_admin").test
+            local code, body = t('/apisix/admin/routes/1',
+                 ngx.HTTP_PUT,
+                 [[{
+                    "uri": "/anything",
+                    "plugins": {
+                        "ai-proxy-multi": {
+                            "fallback_strategy": ["http_5xx"],
+                            "retry_on_failure_within_ms": 200,
+                            "instances": [
+                                {"name":"slow-err","provider":"openai-compatible","weight":1,"priority":10,"auth":{"header":{"Authorization":"Bearer token"}},"options":{"model":"gpt-4"},"override":{"endpoint":"http://127.0.0.1:6732"}},
+                                {"name":"success","provider":"openai-compatible","weight":1,"priority":0,"auth":{"header":{"Authorization":"Bearer token"}},"options":{"model":"gpt-4"},"override":{"endpoint":"http://127.0.0.1:6733"}}
+                            ],
+                            "ssl_verify": false
+                        }
+                    }
+                }]]
+            )
+            if code >= 300 then
+                ngx.status = code
+            end
+            ngx.say(body)
+        }
+    }
+--- response_body
+passed
+
+
+
+=== TEST 6: slow failure does not fall back and returns the upstream error
+--- request
+POST /anything
+{ "messages": [ { "role": "user", "content": "What is 1+1?"} ] }
+--- error_code: 500
+--- error_log
+exceeding retry_on_failure_within_ms 200

(apisix) branch master updated: feat(ai-proxy-multi): add max_retries and retry_on_failure_within_ms for fallback (#13495)

Reply via email to