This is an automated email from the ASF dual-hosted git repository.

spacewander pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/apisix.git


The following commit(s) were added to refs/heads/master by this push:
     new 994f020  feat: enable etcd health-check (#4191)
994f020 is described below

commit 994f0209a17aa614dea7cb3d9695d20b3ed6bb4f
Author: Shuyang Wu <[email protected]>
AuthorDate: Wed Jun 30 09:16:03 2021 -0400

    feat: enable etcd health-check (#4191)
---
 apisix/cli/ngx_tpl.lua                 |  1 +
 apisix/core/config_etcd.lua            | 46 ++++++++++++++++-
 conf/config-default.yaml               |  1 +
 rockspec/apisix-master-0.rockspec      |  2 +-
 t/APISIX.pm                            |  1 +
 t/cli/docker-compose-etcd-cluster.yaml | 69 +++++++++++++++++++++++++
 t/cli/test_etcd_healthcheck.sh         | 92 ++++++++++++++++++++++++++++++++++
 t/core/config_etcd.t                   | 16 +++---
 8 files changed, 217 insertions(+), 11 deletions(-)

diff --git a/apisix/cli/ngx_tpl.lua b/apisix/cli/ngx_tpl.lua
index 3e44b53..ebe0da1 100644
--- a/apisix/cli/ngx_tpl.lua
+++ b/apisix/cli/ngx_tpl.lua
@@ -155,6 +155,7 @@ http {
     lua_shared_dict plugin-limit-count-redis-cluster-slot-lock 1m;
     lua_shared_dict tracing_buffer       10m; # plugin: skywalking
     lua_shared_dict plugin-api-breaker   10m;
+    lua_shared_dict etcd_cluster_health_check 10m; # etcd health check
 
     # for openid-connect and authz-keycloak plugin
     lua_shared_dict discovery             1m; # cache for discovery metadata documents
diff --git a/apisix/core/config_etcd.lua b/apisix/core/config_etcd.lua
index a9888a3..38a32e0 100644
--- a/apisix/core/config_etcd.lua
+++ b/apisix/core/config_etcd.lua
@@ -38,12 +38,15 @@ local tostring     = tostring
 local tonumber     = tonumber
 local xpcall       = xpcall
 local debug        = debug
+local string       = string
 local error        = error
 local rand         = math.random
 local constants    = require("apisix.constants")
+local health_check = require("resty.etcd.health_check")
 
 
 local is_http = ngx.config.subsystem == "http"
+local err_etcd_unhealthy_all = "has no healthy etcd endpoint available"
 local created_obj  = {}
 local loaded_configuration = {}
 
@@ -146,7 +149,11 @@ local function waitdir(etcd_cli, key, modified_index, timeout)
     end
 
     if type(res.result) ~= "table" then
-        return nil, "failed to wait etcd dir"
+        err = "failed to wait etcd dir"
+        if res.error and res.error.message then
+            err = err .. ": " .. res.error.message
+        end
+        return nil, err
     end
     return etcd_apisix.watch_format(res)
 end
@@ -529,6 +536,18 @@ local function _automatic_fetch(premature, self)
         return
     end
 
+    if not health_check.conf then
+        local _, err = health_check.init({
+            shm_name = "etcd_cluster_health_check",
+            fail_timeout = self.health_check_timeout,
+            max_fails = 3,
+            retry = true,
+        })
+        if err then
+            log.warn("fail to create health_check: " .. err)
+        end
+    end
+
     local i = 0
     while not exiting() and self.running and i <= 32 do
         i = i + 1
@@ -545,7 +564,25 @@ local function _automatic_fetch(premature, self)
 
             local ok, err = sync_data(self)
             if err then
-                if err ~= "timeout" and err ~= "Key not found"
+                if string.find(err, err_etcd_unhealthy_all) then
+                    local reconnected = false
+                    while err and not reconnected and i <= 32 do
+                        local backoff_duration, backoff_factor, backoff_step = 1, 2, 6
+                        for _ = 1, backoff_step do
+                            i = i + 1
+                            ngx_sleep(backoff_duration)
+                            _, err = sync_data(self)
+                            if not err or not string.find(err, err_etcd_unhealthy_all) then
+                                log.warn("reconnected to etcd")
+                                reconnected = true
+                                break
+                            end
+                            backoff_duration = backoff_duration * backoff_factor
+                            log.error("no healthy etcd endpoint available, next retry after "
+                                       .. backoff_duration .. "s")
+                        end
+                    end
+                elseif err ~= "timeout" and err ~= "Key not found"
                     and self.last_err ~= err then
                     log.error("failed to fetch data from etcd: ", err, ", ",
                               tostring(self))
@@ -594,6 +631,10 @@ function _M.new(key, opts)
     if not resync_delay or resync_delay < 0 then
         resync_delay = 5
     end
+    local health_check_timeout = etcd_conf.health_check_timeout
+    if not health_check_timeout or health_check_timeout < 0 then
+        health_check_timeout = 10
+    end
 
     local automatic = opts and opts.automatic
     local item_schema = opts and opts.item_schema
@@ -618,6 +659,7 @@ function _M.new(key, opts)
         last_err = nil,
         last_err_time = nil,
         resync_delay = resync_delay,
+        health_check_timeout = health_check_timeout,
         timeout = timeout,
         single_item = single_item,
         filter = filter_fun,
diff --git a/conf/config-default.yaml b/conf/config-default.yaml
index 3065646..eedf77f 100644
--- a/conf/config-default.yaml
+++ b/conf/config-default.yaml
@@ -209,6 +209,7 @@ etcd:
   prefix: "/apisix"               # apisix configurations prefix
   timeout: 30                     # 30 seconds
   #resync_delay: 5                # when sync failed and a rest is needed, resync after the configured seconds plus 50% random jitter
+  #health_check_timeout: 10       # etcd retry the unhealthy nodes after the configured seconds
   #user: root                     # root username for etcd
   #password: 5tHkHhYkjr6cQY       # root password for etcd
   tls:
diff --git a/rockspec/apisix-master-0.rockspec b/rockspec/apisix-master-0.rockspec
index f91ba68..f9d73da 100644
--- a/rockspec/apisix-master-0.rockspec
+++ b/rockspec/apisix-master-0.rockspec
@@ -34,7 +34,7 @@ dependencies = {
     "lua-resty-ctxdump = 0.1-0",
     "lua-resty-dns-client = 5.2.0",
     "lua-resty-template = 2.0",
-    "lua-resty-etcd = 1.5.0",
+    "lua-resty-etcd = 1.5.3",
     "lua-resty-balancer = 0.02rc5",
     "lua-resty-ngxvar = 0.5.2",
     "lua-resty-jit-uuid = 0.0.7",
diff --git a/t/APISIX.pm b/t/APISIX.pm
index c09f6cf..e0c97cd 100644
--- a/t/APISIX.pm
+++ b/t/APISIX.pm
@@ -430,6 +430,7 @@ _EOC_
     lua_shared_dict discovery             1m;    # plugin authz-keycloak
     lua_shared_dict plugin-api-breaker   10m;
     lua_capture_error_log                 1m;    # plugin error-log-logger
+    lua_shared_dict etcd_cluster_health_check 10m; # etcd health check
 
     proxy_ssl_name \$upstream_host;
     proxy_ssl_server_name on;
diff --git a/t/cli/docker-compose-etcd-cluster.yaml b/t/cli/docker-compose-etcd-cluster.yaml
new file mode 100644
index 0000000..a2fcef7
--- /dev/null
+++ b/t/cli/docker-compose-etcd-cluster.yaml
@@ -0,0 +1,69 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+version: "3.7"
+
+services:
+  etcd0:
+    image: "gcr.io/etcd-development/etcd:v3.4.15"
+    container_name: etcd0
+    ports:
+      - "23800:2380"
+      - "23790:2379"
+    environment:
+      - ALLOW_NONE_AUTHENTICATION=yes
+      - ETCD_NAME=etcd0
+      - ETCD_LISTEN_PEER_URLS=http://0.0.0.0:2380
+      - ETCD_LISTEN_CLIENT_URLS=http://0.0.0.0:2379
+      - ETCD_ADVERTISE_CLIENT_URLS=http://127.0.0.1:23790
+      - ETCD_INITIAL_ADVERTISE_PEER_URLS=http://etcd0:2380
+      - ETCD_INITIAL_CLUSTER_TOKEN=etcd-cluster
+      - ETCD_INITIAL_CLUSTER=etcd0=http://etcd0:2380,etcd1=http://etcd1:2380,etcd2=http://etcd2:2380
+      - ETCD_INITIAL_CLUSTER_STATE=new
+
+  etcd1:
+    image: "gcr.io/etcd-development/etcd:v3.4.15"
+    container_name: etcd1
+    ports:
+      - "23801:2380"
+      - "23791:2379"
+    environment:
+      - ALLOW_NONE_AUTHENTICATION=yes
+      - ETCD_NAME=etcd1
+      - ETCD_LISTEN_PEER_URLS=http://0.0.0.0:2380
+      - ETCD_LISTEN_CLIENT_URLS=http://0.0.0.0:2379
+      - ETCD_ADVERTISE_CLIENT_URLS=http://127.0.0.1:23791
+      - ETCD_INITIAL_ADVERTISE_PEER_URLS=http://etcd1:2380
+      - ETCD_INITIAL_CLUSTER_TOKEN=etcd-cluster
+      - ETCD_INITIAL_CLUSTER=etcd0=http://etcd0:2380,etcd1=http://etcd1:2380,etcd2=http://etcd2:2380
+      - ETCD_INITIAL_CLUSTER_STATE=new
+
+  etcd2:
+    image: "gcr.io/etcd-development/etcd:v3.4.15"
+    container_name: etcd2
+    ports:
+      - "23802:2380"
+      - "23792:2379"
+    environment:
+      - ALLOW_NONE_AUTHENTICATION=yes
+      - ETCD_NAME=etcd2
+      - ETCD_LISTEN_PEER_URLS=http://0.0.0.0:2380
+      - ETCD_LISTEN_CLIENT_URLS=http://0.0.0.0:2379
+      - ETCD_ADVERTISE_CLIENT_URLS=http://127.0.0.1:23792
+      - ETCD_INITIAL_ADVERTISE_PEER_URLS=http://etcd2:2380
+      - ETCD_INITIAL_CLUSTER_TOKEN=etcd-cluster
+      - ETCD_INITIAL_CLUSTER=etcd0=http://etcd0:2380,etcd1=http://etcd1:2380,etcd2=http://etcd2:2380
+      - ETCD_INITIAL_CLUSTER_STATE=new
diff --git a/t/cli/test_etcd_healthcheck.sh b/t/cli/test_etcd_healthcheck.sh
new file mode 100755
index 0000000..62498f1
--- /dev/null
+++ b/t/cli/test_etcd_healthcheck.sh
@@ -0,0 +1,92 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+. ./t/cli/common.sh
+
+# create 3 node etcd cluster in docker
+ETCD_NAME_0=etcd0
+ETCD_NAME_1=etcd1
+ETCD_NAME_2=etcd2
+HEALTH_CHECK_RETRY_TIMEOUT=10
+
+echo '
+etcd:
+  host:
+    - "http://127.0.0.1:23790"
+    - "http://127.0.0.1:23791"
+    - "http://127.0.0.1:23792"
+  health_check_timeout: '"$HEALTH_CHECK_RETRY_TIMEOUT"'
+' > conf/config.yaml
+
+docker-compose -f ./t/cli/docker-compose-etcd-cluster.yaml up -d
+
+# Check apisix not got effected when one etcd node disconnected
+make init && make run
+
+docker stop ${ETCD_NAME_0}
+code=$(curl -o /dev/null -s -w %{http_code} http://127.0.0.1:9080/apisix/admin/routes -H 'X-API-KEY: edd1c9f034335f136f87ad84b625c8f1')
+if [ ! $code -eq 200 ]; then
+    echo "failed: apisix got effect when one etcd node out of a cluster disconnected"
+    exit 1
+fi
+docker start ${ETCD_NAME_0}
+
+docker stop ${ETCD_NAME_1}
+code=$(curl -o /dev/null -s -w %{http_code} http://127.0.0.1:9080/apisix/admin/routes -H 'X-API-KEY: edd1c9f034335f136f87ad84b625c8f1')
+if [ ! $code -eq 200 ]; then
+    echo "failed: apisix got effect when one etcd node out of a cluster disconnected"
+    exit 1
+fi
+docker start ${ETCD_NAME_1}
+
+make stop
+
+echo "passed: apisix not got effected when one etcd node disconnected"
+
+# Check when all etcd nodes disconnected, apisix trying to reconnect with backoff, and could successfully recover when reconnected
+make init && make run
+
+docker stop ${ETCD_NAME_0} && docker stop ${ETCD_NAME_1} && docker stop ${ETCD_NAME_2}
+
+sleep_till=$(date +%s -d "$DATE + $HEALTH_CHECK_RETRY_TIMEOUT second")
+
+code=$(curl -o /dev/null -s -w %{http_code} http://127.0.0.1:9080/apisix/admin/routes -H 'X-API-KEY: edd1c9f034335f136f87ad84b625c8f1')
+if [ $code -eq 200 ]; then
+    echo "failed: apisix not got effect when all etcd nodes disconnected"
+    exit 1
+fi
+
+docker start ${ETCD_NAME_0} && docker start ${ETCD_NAME_1} && docker start ${ETCD_NAME_2}
+
+# sleep till etcd health check try to check again
+current_time=$(date +%s)
+sleep_seconds=$(( $sleep_till - $current_time ))
+if [ "$sleep_seconds" -gt 0 ]; then
+    sleep $sleep_seconds
+fi
+
+code=$(curl -o /dev/null -s -w %{http_code} http://127.0.0.1:9080/apisix/admin/routes -H 'X-API-KEY: edd1c9f034335f136f87ad84b625c8f1')
+if [ ! $code -eq 200 ]; then
+    echo "failed: apisix could not recover when etcd node recover"
+    exit 1
+fi
+
+make stop
+
+echo "passed: when all etcd nodes disconnected, apisix trying to reconnect with backoff, and could successfully recover when reconnected"
diff --git a/t/core/config_etcd.t b/t/core/config_etcd.t
index fbffc9f..edab98f 100644
--- a/t/core/config_etcd.t
+++ b/t/core/config_etcd.t
@@ -44,9 +44,9 @@ etcd:
 --- request
 GET /t
 --- grep_error_log eval
-qr{failed to fetch data from etcd: connection refused,  etcd key: .*routes}
+qr{connection refused}
 --- grep_error_log_out eval
-qr/(failed to fetch data from etcd: connection refused,  etcd key: .*routes\n){1,}/
+qr/(connection refused){1,}/
 
 
 
@@ -68,9 +68,9 @@ etcd:
 --- request
 GET /t
 --- grep_error_log chop
-failed to fetch data from etcd: handshake failed
+handshake failed
 --- grep_error_log_out eval
-qr/(failed to fetch data from etcd: handshake failed){1,}/
+qr/(handshake failed){1,}/
 
 
 
@@ -92,9 +92,9 @@ etcd:
 --- request
 GET /t
 --- grep_error_log chop
-failed to fetch data from etcd: closed
+closed
 --- grep_error_log_out eval
-qr/(failed to fetch data from etcd: closed){1,}/
+qr/(closed){1,}/
 
 
 
@@ -116,9 +116,9 @@ etcd:
 --- request
 GET /t
 --- grep_error_log chop
-failed to fetch data from etcd: 18: self signed certificate
+18: self signed certificate
 --- grep_error_log_out eval
-qr/(failed to fetch data from etcd: 18: self signed certificate){1,}/
+qr/(18: self signed certificate){1,}/
 
 
 

Reply via email to