This is an automated email from the ASF dual-hosted git repository.
spacewander pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/apisix.git
The following commit(s) were added to refs/heads/master by this push:
new 994f020 feat: enable etcd health-check (#4191)
994f020 is described below
commit 994f0209a17aa614dea7cb3d9695d20b3ed6bb4f
Author: Shuyang Wu <[email protected]>
AuthorDate: Wed Jun 30 09:16:03 2021 -0400
feat: enable etcd health-check (#4191)
---
apisix/cli/ngx_tpl.lua | 1 +
apisix/core/config_etcd.lua | 46 ++++++++++++++++-
conf/config-default.yaml | 1 +
rockspec/apisix-master-0.rockspec | 2 +-
t/APISIX.pm | 1 +
t/cli/docker-compose-etcd-cluster.yaml | 69 +++++++++++++++++++++++++
t/cli/test_etcd_healthcheck.sh | 92 ++++++++++++++++++++++++++++++++++
t/core/config_etcd.t | 16 +++---
8 files changed, 217 insertions(+), 11 deletions(-)
diff --git a/apisix/cli/ngx_tpl.lua b/apisix/cli/ngx_tpl.lua
index 3e44b53..ebe0da1 100644
--- a/apisix/cli/ngx_tpl.lua
+++ b/apisix/cli/ngx_tpl.lua
@@ -155,6 +155,7 @@ http {
lua_shared_dict plugin-limit-count-redis-cluster-slot-lock 1m;
lua_shared_dict tracing_buffer 10m; # plugin: skywalking
lua_shared_dict plugin-api-breaker 10m;
+ lua_shared_dict etcd_cluster_health_check 10m; # etcd health check
# for openid-connect and authz-keycloak plugin
lua_shared_dict discovery 1m; # cache for discovery metadata
documents
diff --git a/apisix/core/config_etcd.lua b/apisix/core/config_etcd.lua
index a9888a3..38a32e0 100644
--- a/apisix/core/config_etcd.lua
+++ b/apisix/core/config_etcd.lua
@@ -38,12 +38,15 @@ local tostring = tostring
local tonumber = tonumber
local xpcall = xpcall
local debug = debug
+local string = string
local error = error
local rand = math.random
local constants = require("apisix.constants")
+local health_check = require("resty.etcd.health_check")
local is_http = ngx.config.subsystem == "http"
+local err_etcd_unhealthy_all = "has no healthy etcd endpoint available"
local created_obj = {}
local loaded_configuration = {}
@@ -146,7 +149,11 @@ local function waitdir(etcd_cli, key, modified_index,
timeout)
end
if type(res.result) ~= "table" then
- return nil, "failed to wait etcd dir"
+ err = "failed to wait etcd dir"
+ if res.error and res.error.message then
+ err = err .. ": " .. res.error.message
+ end
+ return nil, err
end
return etcd_apisix.watch_format(res)
end
@@ -529,6 +536,18 @@ local function _automatic_fetch(premature, self)
return
end
+ if not health_check.conf then
+ local _, err = health_check.init({
+ shm_name = "etcd_cluster_health_check",
+ fail_timeout = self.health_check_timeout,
+ max_fails = 3,
+ retry = true,
+ })
+ if err then
+ log.warn("fail to create health_check: " .. err)
+ end
+ end
+
local i = 0
while not exiting() and self.running and i <= 32 do
i = i + 1
@@ -545,7 +564,25 @@ local function _automatic_fetch(premature, self)
local ok, err = sync_data(self)
if err then
- if err ~= "timeout" and err ~= "Key not found"
+ if string.find(err, err_etcd_unhealthy_all) then
+ local reconnected = false
+ while err and not reconnected and i <= 32 do
+ local backoff_duration, backoff_factor, backoff_step =
1, 2, 6
+ for _ = 1, backoff_step do
+ i = i + 1
+ ngx_sleep(backoff_duration)
+ _, err = sync_data(self)
+ if not err or not string.find(err,
err_etcd_unhealthy_all) then
+ log.warn("reconnected to etcd")
+ reconnected = true
+ break
+ end
+ backoff_duration = backoff_duration *
backoff_factor
+ log.error("no healthy etcd endpoint available,
next retry after "
+ .. backoff_duration .. "s")
+ end
+ end
+ elseif err ~= "timeout" and err ~= "Key not found"
and self.last_err ~= err then
log.error("failed to fetch data from etcd: ", err, ", ",
tostring(self))
@@ -594,6 +631,10 @@ function _M.new(key, opts)
if not resync_delay or resync_delay < 0 then
resync_delay = 5
end
+ local health_check_timeout = etcd_conf.health_check_timeout
+ if not health_check_timeout or health_check_timeout < 0 then
+ health_check_timeout = 10
+ end
local automatic = opts and opts.automatic
local item_schema = opts and opts.item_schema
@@ -618,6 +659,7 @@ function _M.new(key, opts)
last_err = nil,
last_err_time = nil,
resync_delay = resync_delay,
+ health_check_timeout = health_check_timeout,
timeout = timeout,
single_item = single_item,
filter = filter_fun,
diff --git a/conf/config-default.yaml b/conf/config-default.yaml
index 3065646..eedf77f 100644
--- a/conf/config-default.yaml
+++ b/conf/config-default.yaml
@@ -209,6 +209,7 @@ etcd:
prefix: "/apisix" # apisix configurations prefix
timeout: 30 # 30 seconds
#resync_delay: 5 # when sync failed and a rest is needed,
resync after the configured seconds plus 50% random jitter
+ #health_check_timeout: 10 # retry the unhealthy etcd nodes after the
configured number of seconds
#user: root # root username for etcd
#password: 5tHkHhYkjr6cQY # root password for etcd
tls:
diff --git a/rockspec/apisix-master-0.rockspec
b/rockspec/apisix-master-0.rockspec
index f91ba68..f9d73da 100644
--- a/rockspec/apisix-master-0.rockspec
+++ b/rockspec/apisix-master-0.rockspec
@@ -34,7 +34,7 @@ dependencies = {
"lua-resty-ctxdump = 0.1-0",
"lua-resty-dns-client = 5.2.0",
"lua-resty-template = 2.0",
- "lua-resty-etcd = 1.5.0",
+ "lua-resty-etcd = 1.5.3",
"lua-resty-balancer = 0.02rc5",
"lua-resty-ngxvar = 0.5.2",
"lua-resty-jit-uuid = 0.0.7",
diff --git a/t/APISIX.pm b/t/APISIX.pm
index c09f6cf..e0c97cd 100644
--- a/t/APISIX.pm
+++ b/t/APISIX.pm
@@ -430,6 +430,7 @@ _EOC_
lua_shared_dict discovery 1m; # plugin authz-keycloak
lua_shared_dict plugin-api-breaker 10m;
lua_capture_error_log 1m; # plugin error-log-logger
+ lua_shared_dict etcd_cluster_health_check 10m; # etcd health check
proxy_ssl_name \$upstream_host;
proxy_ssl_server_name on;
diff --git a/t/cli/docker-compose-etcd-cluster.yaml
b/t/cli/docker-compose-etcd-cluster.yaml
new file mode 100644
index 0000000..a2fcef7
--- /dev/null
+++ b/t/cli/docker-compose-etcd-cluster.yaml
@@ -0,0 +1,69 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+version: "3.7"
+
+services:
+ etcd0:
+ image: "gcr.io/etcd-development/etcd:v3.4.15"
+ container_name: etcd0
+ ports:
+ - "23800:2380"
+ - "23790:2379"
+ environment:
+ - ALLOW_NONE_AUTHENTICATION=yes
+ - ETCD_NAME=etcd0
+ - ETCD_LISTEN_PEER_URLS=http://0.0.0.0:2380
+ - ETCD_LISTEN_CLIENT_URLS=http://0.0.0.0:2379
+ - ETCD_ADVERTISE_CLIENT_URLS=http://127.0.0.1:23790
+ - ETCD_INITIAL_ADVERTISE_PEER_URLS=http://etcd0:2380
+ - ETCD_INITIAL_CLUSTER_TOKEN=etcd-cluster
+ -
ETCD_INITIAL_CLUSTER=etcd0=http://etcd0:2380,etcd1=http://etcd1:2380,etcd2=http://etcd2:2380
+ - ETCD_INITIAL_CLUSTER_STATE=new
+
+ etcd1:
+ image: "gcr.io/etcd-development/etcd:v3.4.15"
+ container_name: etcd1
+ ports:
+ - "23801:2380"
+ - "23791:2379"
+ environment:
+ - ALLOW_NONE_AUTHENTICATION=yes
+ - ETCD_NAME=etcd1
+ - ETCD_LISTEN_PEER_URLS=http://0.0.0.0:2380
+ - ETCD_LISTEN_CLIENT_URLS=http://0.0.0.0:2379
+ - ETCD_ADVERTISE_CLIENT_URLS=http://127.0.0.1:23791
+ - ETCD_INITIAL_ADVERTISE_PEER_URLS=http://etcd1:2380
+ - ETCD_INITIAL_CLUSTER_TOKEN=etcd-cluster
+ -
ETCD_INITIAL_CLUSTER=etcd0=http://etcd0:2380,etcd1=http://etcd1:2380,etcd2=http://etcd2:2380
+ - ETCD_INITIAL_CLUSTER_STATE=new
+
+ etcd2:
+ image: "gcr.io/etcd-development/etcd:v3.4.15"
+ container_name: etcd2
+ ports:
+ - "23802:2380"
+ - "23792:2379"
+ environment:
+ - ALLOW_NONE_AUTHENTICATION=yes
+ - ETCD_NAME=etcd2
+ - ETCD_LISTEN_PEER_URLS=http://0.0.0.0:2380
+ - ETCD_LISTEN_CLIENT_URLS=http://0.0.0.0:2379
+ - ETCD_ADVERTISE_CLIENT_URLS=http://127.0.0.1:23792
+ - ETCD_INITIAL_ADVERTISE_PEER_URLS=http://etcd2:2380
+ - ETCD_INITIAL_CLUSTER_TOKEN=etcd-cluster
+ -
ETCD_INITIAL_CLUSTER=etcd0=http://etcd0:2380,etcd1=http://etcd1:2380,etcd2=http://etcd2:2380
+ - ETCD_INITIAL_CLUSTER_STATE=new
diff --git a/t/cli/test_etcd_healthcheck.sh b/t/cli/test_etcd_healthcheck.sh
new file mode 100755
index 0000000..62498f1
--- /dev/null
+++ b/t/cli/test_etcd_healthcheck.sh
@@ -0,0 +1,92 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+. ./t/cli/common.sh
+
+# create a 3-node etcd cluster in docker
+ETCD_NAME_0=etcd0
+ETCD_NAME_1=etcd1
+ETCD_NAME_2=etcd2
+HEALTH_CHECK_RETRY_TIMEOUT=10
+
+echo '
+etcd:
+ host:
+ - "http://127.0.0.1:23790"
+ - "http://127.0.0.1:23791"
+ - "http://127.0.0.1:23792"
+ health_check_timeout: '"$HEALTH_CHECK_RETRY_TIMEOUT"'
+' > conf/config.yaml
+
+docker-compose -f ./t/cli/docker-compose-etcd-cluster.yaml up -d
+
+# Check that apisix is not affected when one etcd node is disconnected
+make init && make run
+
+docker stop ${ETCD_NAME_0}
+code=$(curl -o /dev/null -s -w %{http_code}
http://127.0.0.1:9080/apisix/admin/routes -H 'X-API-KEY:
edd1c9f034335f136f87ad84b625c8f1')
+if [ ! $code -eq 200 ]; then
+ echo "failed: apisix got effect when one etcd node out of a cluster
disconnected"
+ exit 1
+fi
+docker start ${ETCD_NAME_0}
+
+docker stop ${ETCD_NAME_1}
+code=$(curl -o /dev/null -s -w %{http_code}
http://127.0.0.1:9080/apisix/admin/routes -H 'X-API-KEY:
edd1c9f034335f136f87ad84b625c8f1')
+if [ ! $code -eq 200 ]; then
+ echo "failed: apisix got effect when one etcd node out of a cluster
disconnected"
+ exit 1
+fi
+docker start ${ETCD_NAME_1}
+
+make stop
+
+echo "passed: apisix not got effected when one etcd node disconnected"
+
+# Check that when all etcd nodes are disconnected, apisix retries with
backoff, and recovers successfully once the nodes are reachable again
+make init && make run
+
+docker stop ${ETCD_NAME_0} && docker stop ${ETCD_NAME_1} && docker stop
${ETCD_NAME_2}
+
+sleep_till=$(date +%s -d "$DATE + $HEALTH_CHECK_RETRY_TIMEOUT second")
+
+code=$(curl -o /dev/null -s -w %{http_code}
http://127.0.0.1:9080/apisix/admin/routes -H 'X-API-KEY:
edd1c9f034335f136f87ad84b625c8f1')
+if [ $code -eq 200 ]; then
+ echo "failed: apisix not got effect when all etcd nodes disconnected"
+ exit 1
+fi
+
+docker start ${ETCD_NAME_0} && docker start ${ETCD_NAME_1} && docker start
${ETCD_NAME_2}
+
+# sleep until the etcd health check retries the unhealthy nodes
+current_time=$(date +%s)
+sleep_seconds=$(( $sleep_till - $current_time ))
+if [ "$sleep_seconds" -gt 0 ]; then
+ sleep $sleep_seconds
+fi
+
+code=$(curl -o /dev/null -s -w %{http_code}
http://127.0.0.1:9080/apisix/admin/routes -H 'X-API-KEY:
edd1c9f034335f136f87ad84b625c8f1')
+if [ ! $code -eq 200 ]; then
+ echo "failed: apisix could not recover when etcd node recover"
+ exit 1
+fi
+
+make stop
+
+echo "passed: when all etcd nodes disconnected, apisix trying to reconnect
with backoff, and could successfully recover when reconnected"
diff --git a/t/core/config_etcd.t b/t/core/config_etcd.t
index fbffc9f..edab98f 100644
--- a/t/core/config_etcd.t
+++ b/t/core/config_etcd.t
@@ -44,9 +44,9 @@ etcd:
--- request
GET /t
--- grep_error_log eval
-qr{failed to fetch data from etcd: connection refused, etcd key: .*routes}
+qr{connection refused}
--- grep_error_log_out eval
-qr/(failed to fetch data from etcd: connection refused, etcd key:
.*routes\n){1,}/
+qr/(connection refused){1,}/
@@ -68,9 +68,9 @@ etcd:
--- request
GET /t
--- grep_error_log chop
-failed to fetch data from etcd: handshake failed
+handshake failed
--- grep_error_log_out eval
-qr/(failed to fetch data from etcd: handshake failed){1,}/
+qr/(handshake failed){1,}/
@@ -92,9 +92,9 @@ etcd:
--- request
GET /t
--- grep_error_log chop
-failed to fetch data from etcd: closed
+closed
--- grep_error_log_out eval
-qr/(failed to fetch data from etcd: closed){1,}/
+qr/(closed){1,}/
@@ -116,9 +116,9 @@ etcd:
--- request
GET /t
--- grep_error_log chop
-failed to fetch data from etcd: 18: self signed certificate
+18: self signed certificate
--- grep_error_log_out eval
-qr/(failed to fetch data from etcd: 18: self signed certificate){1,}/
+qr/(18: self signed certificate){1,}/