This is an automated email from the ASF dual-hosted git repository.
tianxiaoliang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/servicecomb-service-center.git
The following commit(s) were added to refs/heads/master by this push:
new 2f25dc8 Fix health check bug (#685)
2f25dc8 is described below
commit 2f25dc83abf6f8705d77b5091e6a29d7d7435dcd
Author: humingcheng <[email protected]>
AuthorDate: Fri Sep 4 17:13:52 2020 +0800
Fix health check bug (#685)
---
server/plugin/registry/etcd/common.go | 3 +-
server/plugin/registry/etcd/etcd.go | 47 +++++++++++++++++---------------
server/plugin/registry/etcd/etcd_test.go | 1 -
3 files changed, 27 insertions(+), 24 deletions(-)
diff --git a/server/plugin/registry/etcd/common.go b/server/plugin/registry/etcd/common.go
index 12e6b59..b296c1f 100644
--- a/server/plugin/registry/etcd/common.go
+++ b/server/plugin/registry/etcd/common.go
@@ -23,7 +23,8 @@ import (
const (
// here will new an etcd connection after about 30s(=5s * 3 + (backoff:8s))
// when the connected etcd member was hung but tcp is still alive
- healthCheckTimeout = 5 * time.Second
+ healthCheckTimeout = 5 * time.Second
+ healthCheckRetryTimes = 3
// see google.golang.org/grpc/keepalive/keepalive.go
// after a duration of this time if the client doesn't see any activity
diff --git a/server/plugin/registry/etcd/etcd.go b/server/plugin/registry/etcd/etcd.go
index 6251e35..159204f 100644
--- a/server/plugin/registry/etcd/etcd.go
+++ b/server/plugin/registry/etcd/etcd.go
@@ -723,40 +723,43 @@ func (c *Client) HealthCheck() {
}
func (c *Client) healthCheckLoop(pctx context.Context) {
- retries, start := 0, time.Now()
d := c.AutoSyncInterval
for {
+ var healthCheckErr error
select {
case <-pctx.Done():
return
case <-time.After(d):
- var err error
- ctx, cancel := context.WithTimeout(c.Client.Ctx(),
healthCheckTimeout)
- defer cancel()
- if err = c.SyncMembers(ctx); err != nil {
- d := backoff.GetBackoff().Delay(retries)
- retries++
- log.Errorf(err, "retry to sync members from
etcd %s after %s", c.Endpoints, d)
+ for i := 0; i < healthCheckRetryTimes; i++ {
+ ctx, cancel :=
context.WithTimeout(c.Client.Ctx(), healthCheckTimeout)
+ healthCheckErr = c.SyncMembers(ctx)
+ cancel()
+ if healthCheckErr == nil {
+ break
+ }
+ d := backoff.GetBackoff().Delay(i)
+ log.Errorf(healthCheckErr, "retry to sync
members from etcd %s after %s", c.Endpoints, d)
select {
case <-pctx.Done():
return
- default:
- continue
- }
- } else {
- log.Info("sync members ok.")
- if err :=
alarm.Clear(alarm.IDBackendConnectionRefuse); err != nil {
- log.Error("", err)
- }
- if cerr := c.ReOpen(); cerr != nil {
- log.Errorf(cerr, "retry to health check
etcd %s after %s", c.Endpoints, c.AutoSyncInterval)
- } else {
- log.Infof("[%s]re-connected to etcd
%s", time.Since(start), c.Endpoints)
- continue
+ case <-time.After(d):
}
- return
}
+ }
+
+ var alarmErr error
+ if healthCheckErr != nil {
+ log.Error("etcd health check failed", healthCheckErr)
+ alarmErr = alarm.Raise(alarm.IDBackendConnectionRefuse,
alarm.AdditionalContext(healthCheckErr.Error()))
+ if err := c.ReOpen(); err != nil {
+ log.Error("re-connect to etcd failed", err)
+ }
+ } else {
+ alarmErr = alarm.Clear(alarm.IDBackendConnectionRefuse)
+ }
+ if alarmErr != nil {
+ log.Error("alarm failed", alarmErr)
}
}
}
diff --git a/server/plugin/registry/etcd/etcd_test.go b/server/plugin/registry/etcd/etcd_test.go
index 8edc293..bfdee00 100644
--- a/server/plugin/registry/etcd/etcd_test.go
+++ b/server/plugin/registry/etcd/etcd_test.go
@@ -565,7 +565,6 @@ func TestEtcdClient_HealthCheck(t *testing.T) {
etcdc.Endpoints = []string{endpoint}
- etcdc.Close()
ctx, _ = context.WithTimeout(context.Background(), 1*time.Second)
go etcdc.healthCheckLoop(ctx)
for {