The purpose of reconnect_deadline__() function is twofold:
1. Its result is used to tell if the state has to be changed right now
in reconnect_run().
2. Its result also used to determine when the process need to wake up
and call reconnect_run() for a next time, i.e. when the state may
need to be changed next time.
Since introduction of the 'receive-attempted' feature, the function
returns LLONG_MAX if the deadline is in the future. That works for
the first case, but doesn't for the second one, because we don't
really know when we need to call reconnect_run().
This is the problem for applications where jsonrpc connection is the
only source of wake ups, e.g. ovn-northd. When the network goes down
silently, e.g. server looses IP address due to DHCP failure, ovn-northd
will sleep in the poll loop indefinitely after being told that it
doesn't need to call reconnect_run() (deadline == LLONG_MAX).
Fixing that by actually returning the expected time if it is in the
future, so we will know when to wake up. In order to keep the
'receive-attempted' feature, returning 'now + 1' in case where the
time has already passed, but receive wasn't attempted. That will
trigger a fast wake up, so the application will be able to attempt the
receive even if there was no real events. In a correctly written
application we should not fall into this case more than once in a row.
'+ 1' ensures that we will not transition into a different state
prematurely, i.e. before the receive is actually attempted.
Fixes: 4241d652e465 ("jsonrpc: Avoid disconnecting prematurely due to long poll
intervals.")
Signed-off-by: Ilya Maximets <[email protected]>
---
CC: Wentao Jia
This might be the problem you tried to fix in northd by setting a probe
interval. Would be great, if you can test your case with this patch.
lib/reconnect.c | 29 +++++++++---
python/ovs/reconnect.py | 40 +++++++++++-----
tests/reconnect.at | 102 +++++++++++++++++++++++++++++++---------
3 files changed, 133 insertions(+), 38 deletions(-)
diff --git a/lib/reconnect.c b/lib/reconnect.c
index a929ddfd2..f6fd165e4 100644
--- a/lib/reconnect.c
+++ b/lib/reconnect.c
@@ -75,7 +75,8 @@ struct reconnect {
static void reconnect_transition__(struct reconnect *, long long int now,
enum state state);
-static long long int reconnect_deadline__(const struct reconnect *);
+static long long int reconnect_deadline__(const struct reconnect *,
+ long long int now);
static bool reconnect_may_retry(struct reconnect *);
static const char *
@@ -539,7 +540,7 @@ reconnect_transition__(struct reconnect *fsm, long long int
now,
}
static long long int
-reconnect_deadline__(const struct reconnect *fsm)
+reconnect_deadline__(const struct reconnect *fsm, long long int now)
{
ovs_assert(fsm->state_entered != LLONG_MIN);
switch (fsm->state) {
@@ -557,8 +558,18 @@ reconnect_deadline__(const struct reconnect *fsm)
if (fsm->probe_interval) {
long long int base = MAX(fsm->last_activity, fsm->state_entered);
long long int expiration = base + fsm->probe_interval;
- if (fsm->last_receive_attempt >= expiration) {
+ if (now < expiration || fsm->last_receive_attempt >= expiration) {
+ /* We still have time before the expiration or the time has
+ * already passed and there was no activity. In the first case
+ * we need to wait for the expiration, in the second - we're
+ * already past the deadline. */
return expiration;
+ } else {
+ /* Time has already passed, but we didn't attempt to receive
+ * anything. We need to wake up and try to receive even if
+ * nothing is pending, so we can update the expiration time
+ * or transition into S_IDLE state. */
+ return now + 1;
}
}
return LLONG_MAX;
@@ -566,8 +577,14 @@ reconnect_deadline__(const struct reconnect *fsm)
case S_IDLE:
if (fsm->probe_interval) {
long long int expiration = fsm->state_entered +
fsm->probe_interval;
- if (fsm->last_receive_attempt >= expiration) {
+ if (now < expiration || fsm->last_receive_attempt >= expiration) {
return expiration;
+ } else {
+ /* Time has already passed, but we didn't attempt to receive
+ * the echo reply. We need to wake up and try to receive even
+ * if nothing is pending, so we can transition into S_ACTIVE
+ * state or disconnect. */
+ return now + 1;
}
}
return LLONG_MAX;
@@ -618,7 +635,7 @@ reconnect_deadline__(const struct reconnect *fsm)
enum reconnect_action
reconnect_run(struct reconnect *fsm, long long int now)
{
- if (now >= reconnect_deadline__(fsm)) {
+ if (now >= reconnect_deadline__(fsm, now)) {
switch (fsm->state) {
case S_VOID:
return 0;
@@ -671,7 +688,7 @@ reconnect_wait(struct reconnect *fsm, long long int now)
int
reconnect_timeout(struct reconnect *fsm, long long int now)
{
- long long int deadline = reconnect_deadline__(fsm);
+ long long int deadline = reconnect_deadline__(fsm, now);
if (deadline != LLONG_MAX) {
long long int remaining = deadline - now;
return MAX(0, MIN(INT_MAX, remaining));
diff --git a/python/ovs/reconnect.py b/python/ovs/reconnect.py
index c4c6c87e9..312eef1d6 100644
--- a/python/ovs/reconnect.py
+++ b/python/ovs/reconnect.py
@@ -44,7 +44,7 @@ class Reconnect(object):
is_connected = False
@staticmethod
- def deadline(fsm):
+ def deadline(fsm, now):
return None
@staticmethod
@@ -56,7 +56,7 @@ class Reconnect(object):
is_connected = False
@staticmethod
- def deadline(fsm):
+ def deadline(fsm, now):
return None
@staticmethod
@@ -68,7 +68,7 @@ class Reconnect(object):
is_connected = False
@staticmethod
- def deadline(fsm):
+ def deadline(fsm, now):
return fsm.state_entered + fsm.backoff
@staticmethod
@@ -80,7 +80,7 @@ class Reconnect(object):
is_connected = False
@staticmethod
- def deadline(fsm):
+ def deadline(fsm, now):
return fsm.state_entered + max(1000, fsm.backoff)
@staticmethod
@@ -92,13 +92,24 @@ class Reconnect(object):
is_connected = True
@staticmethod
- def deadline(fsm):
+ def deadline(fsm, now):
if fsm.probe_interval:
base = max(fsm.last_activity, fsm.state_entered)
expiration = base + fsm.probe_interval
- if (fsm.last_receive_attempt is None or
+ if (now < expiration or
+ fsm.last_receive_attempt is None or
fsm.last_receive_attempt >= expiration):
+ # We still have time before the expiration or the time has
+ # already passed and there was no activity. In the first
+ # case we need to wait for the expiration, in the second -
+ # we're already past the deadline. */
return expiration
+ else:
+ # Time has already passed, but we didn't attempt to receive
+ # anything. We need to wake up and try to receive even if
+ # nothing is pending, so we can update the expiration time
+ # or transition into IDLE state. */
+ return now + 1
return None
@staticmethod
@@ -114,12 +125,19 @@ class Reconnect(object):
is_connected = True
@staticmethod
- def deadline(fsm):
+ def deadline(fsm, now):
if fsm.probe_interval:
expiration = fsm.state_entered + fsm.probe_interval
- if (fsm.last_receive_attempt is None or
+ if (now < expiration or
+ fsm.last_receive_attempt is None or
fsm.last_receive_attempt >= expiration):
return expiration
+ else:
+ # Time has already passed, but we didn't attempt to receive
+ # the echo reply. We need to wake up and try to receive
+ # even if nothing is pending, so we can transition into
+ # ACTIVE state or disconnect. */
+ return now + 1
return None
@staticmethod
@@ -134,7 +152,7 @@ class Reconnect(object):
is_connected = False
@staticmethod
- def deadline(fsm):
+ def deadline(fsm, now):
return fsm.state_entered
@staticmethod
@@ -545,7 +563,7 @@ class Reconnect(object):
returned if the "probe interval" is nonzero--see
self.set_probe_interval())."""
- deadline = self.state.deadline(self)
+ deadline = self.state.deadline(self, now)
if deadline is not None and now >= deadline:
return self.state.run(self, now)
else:
@@ -562,7 +580,7 @@ class Reconnect(object):
"""Returns the number of milliseconds after which self.run() should be
called if nothing else notable happens in the meantime, or None if this
is currently unnecessary."""
- deadline = self.state.deadline(self)
+ deadline = self.state.deadline(self, now)
if deadline is not None:
remaining = deadline - now
return max(0, remaining)
diff --git a/tests/reconnect.at b/tests/reconnect.at
index 0f74709f5..5bca84351 100644
--- a/tests/reconnect.at
+++ b/tests/reconnect.at
@@ -39,8 +39,19 @@ run
connected
# Try timeout without noting that we tried to receive.
-# (This does nothing since we never timeout in this case.)
+# Timeout should be scheduled to the next probe interval.
timeout
+run
+
+# Once we reached the timeout, it should not expire until the receive actually
+# attempted. However, we still need to wake up as soon as possible in order to
+# have a chance to mark the receive attempt even if nothing was received.
+timeout
+run
+
+# Short time advance past the original probe interval, but not expired still.
+timeout
+run
# Now disable the receive-attempted feature and timeout again.
receive-attempted LLONG_MAX
@@ -67,18 +78,37 @@ connected
last connected 0 ms ago, connected 0 ms total
# Try timeout without noting that we tried to receive.
-# (This does nothing since we never timeout in this case.)
-timeout
- no timeout
-
-# Now disable the receive-attempted feature and timeout again.
-receive-attempted LLONG_MAX
+# Timeout should be scheduled to the next probe interval.
timeout
advance 5000 ms
### t=6000 ###
in ACTIVE for 5000 ms (0 ms backoff)
run
+
+# Once we reached the timeout, it should not expire until the receive actually
+# attempted. However, we still need to wake up as soon as possible in order to
+# have a chance to mark the receive attempt even if nothing was received.
+timeout
+ advance 1 ms
+
+### t=6001 ###
+ in ACTIVE for 5001 ms (0 ms backoff)
+run
+
+# Short time advance past the original probe interval, but not expired still.
+timeout
+ advance 1 ms
+
+### t=6002 ###
+ in ACTIVE for 5002 ms (0 ms backoff)
+run
+
+# Now disable the receive-attempted feature and timeout again.
+receive-attempted LLONG_MAX
+timeout
+ advance 0 ms
+run
should send probe
in IDLE for 0 ms (0 ms backoff)
@@ -86,7 +116,7 @@ run
timeout
advance 5000 ms
-### t=11000 ###
+### t=11002 ###
in IDLE for 5000 ms (0 ms backoff)
run
should disconnect
@@ -94,7 +124,7 @@ disconnected
in BACKOFF for 0 ms (1000 ms backoff)
1 successful connections out of 1 attempts, seqno 2
disconnected
- disconnected at 11000 ms (0 ms ago)
+ disconnected at 11002 ms (0 ms ago)
])
######################################################################
@@ -111,8 +141,19 @@ run
connected
# Try timeout without noting that we tried to receive.
-# (This does nothing since we never timeout in this case.)
+# Timeout should be scheduled to the next probe interval.
+timeout
+run
+
+# Once we reached the timeout, it should not expire until the receive actually
+# attempted. However, we still need to wake up as soon as possible in order to
+# have a chance to mark the receive attempt even if nothing was received.
+timeout
+run
+
+# Short time advance past the original probe interval, but not expired still.
timeout
+run
# Now disable the receive-attempted feature and timeout again.
receive-attempted LLONG_MAX
@@ -148,18 +189,37 @@ connected
last connected 0 ms ago, connected 0 ms total
# Try timeout without noting that we tried to receive.
-# (This does nothing since we never timeout in this case.)
-timeout
- no timeout
-
-# Now disable the receive-attempted feature and timeout again.
-receive-attempted LLONG_MAX
+# Timeout should be scheduled to the next probe interval.
timeout
advance 5000 ms
### t=6500 ###
in ACTIVE for 5000 ms (0 ms backoff)
run
+
+# Once we reached the timeout, it should not expire until the receive actually
+# attempted. However, we still need to wake up as soon as possible in order to
+# have a chance to mark the receive attempt even if nothing was received.
+timeout
+ advance 1 ms
+
+### t=6501 ###
+ in ACTIVE for 5001 ms (0 ms backoff)
+run
+
+# Short time advance past the original probe interval, but not expired still.
+timeout
+ advance 1 ms
+
+### t=6502 ###
+ in ACTIVE for 5002 ms (0 ms backoff)
+run
+
+# Now disable the receive-attempted feature and timeout again.
+receive-attempted LLONG_MAX
+timeout
+ advance 0 ms
+run
should send probe
in IDLE for 0 ms (0 ms backoff)
@@ -167,7 +227,7 @@ run
timeout
advance 5000 ms
-### t=11500 ###
+### t=11502 ###
in IDLE for 5000 ms (0 ms backoff)
run
should disconnect
@@ -175,7 +235,7 @@ disconnected
in BACKOFF for 0 ms (1000 ms backoff)
1 successful connections out of 1 attempts, seqno 2
disconnected
- disconnected at 11500 ms (0 ms ago)
+ disconnected at 11502 ms (0 ms ago)
])
######################################################################
@@ -1271,14 +1331,14 @@ activity
created 1000, last activity 3000, last connected 2000
# Connection times out.
-timeout
- no timeout
-receive-attempted LLONG_MAX
timeout
advance 5000 ms
### t=8000 ###
in ACTIVE for 6000 ms (1000 ms backoff)
+receive-attempted LLONG_MAX
+timeout
+ advance 0 ms
run
should send probe
in IDLE for 0 ms (1000 ms backoff)
--
2.34.1
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev