On 2/21/22 15:16, Ilya Maximets wrote:
> The purpose of reconnect_deadline__() function is twofold:
>
> 1. Its result is used to tell if the state has to be changed right now
> in reconnect_run().
> 2. Its result also used to determine when the process need to wake up
> and call reconnect_run() for a next time, i.e. when the state may
> need to be changed next time.
>
> Since introduction of the 'receive-attempted' feature, the function
> returns LLONG_MAX if the deadline is in the future. That works for
> the first case, but doesn't for the second one, because we don't
> really know when we need to call reconnect_run().
>
> This is the problem for applications where jsonrpc connection is the
> only source of wake ups, e.g. ovn-northd. When the network goes down
> silently, e.g. server looses IP address due to DHCP failure, ovn-northd
> will sleep in the poll loop indefinitely after being told that it
> doesn't need to call reconnect_run() (deadline == LLONG_MAX).
>
> Fixing that by actually returning the expected time if it is in the
> future, so we will know when to wake up. In order to keep the
> 'receive-attempted' feature, returning 'now + 1' in case where the
> time has already passed, but receive wasn't attempted. That will
> trigger a fast wake up, so the application will be able to attempt the
> receive even if there was no real events. In a correctly written
> application we should not fall into this case more than once in a row.
> '+ 1' ensures that we will not transition into a different state
> prematurely, i.e. before the receive is actually attempted.
>
> Fixes: 4241d652e465 ("jsonrpc: Avoid disconnecting prematurely due to long
> poll intervals.")
> Signed-off-by: Ilya Maximets <[email protected]>
> ---
Hi Ilya,
This change looks good to me, I just have a couple minor comment-related
remarks; in any case:
Acked-by: Dumitru Ceara <[email protected]>
Regards,
Dumitru
>
> CC: Wentao Jia
> This might be the problem you tried to fix in northd by setting a probe
> interval. Would be great, if you can test your case with this patch.
>
> lib/reconnect.c | 29 +++++++++---
> python/ovs/reconnect.py | 40 +++++++++++-----
> tests/reconnect.at | 102 +++++++++++++++++++++++++++++++---------
> 3 files changed, 133 insertions(+), 38 deletions(-)
>
> diff --git a/lib/reconnect.c b/lib/reconnect.c
> index a929ddfd2..f6fd165e4 100644
> --- a/lib/reconnect.c
> +++ b/lib/reconnect.c
> @@ -75,7 +75,8 @@ struct reconnect {
>
> static void reconnect_transition__(struct reconnect *, long long int now,
> enum state state);
> -static long long int reconnect_deadline__(const struct reconnect *);
> +static long long int reconnect_deadline__(const struct reconnect *,
> + long long int now);
> static bool reconnect_may_retry(struct reconnect *);
>
> static const char *
> @@ -539,7 +540,7 @@ reconnect_transition__(struct reconnect *fsm, long long
> int now,
> }
>
> static long long int
> -reconnect_deadline__(const struct reconnect *fsm)
> +reconnect_deadline__(const struct reconnect *fsm, long long int now)
> {
> ovs_assert(fsm->state_entered != LLONG_MIN);
> switch (fsm->state) {
> @@ -557,8 +558,18 @@ reconnect_deadline__(const struct reconnect *fsm)
> if (fsm->probe_interval) {
> long long int base = MAX(fsm->last_activity, fsm->state_entered);
> long long int expiration = base + fsm->probe_interval;
> - if (fsm->last_receive_attempt >= expiration) {
> + if (now < expiration || fsm->last_receive_attempt >= expiration)
> {
> + /* We still have time before the expiration or the time has
> + * already passed and there was no activity. In the first
> case
> + * we need to wait for the expiration, in the second - we're
> + * already past the deadline. */
> return expiration;
> + } else {
> + /* Time has already passed, but we didn't attempt to receive
> + * anything. We need to wake up and try to receive even if
> + * nothing is pending, so we can update the expiration time
> + * or transition into S_IDLE state. */
> + return now + 1;
> }
> }
> return LLONG_MAX;
> @@ -566,8 +577,14 @@ reconnect_deadline__(const struct reconnect *fsm)
> case S_IDLE:
> if (fsm->probe_interval) {
> long long int expiration = fsm->state_entered +
> fsm->probe_interval;
> - if (fsm->last_receive_attempt >= expiration) {
> + if (now < expiration || fsm->last_receive_attempt >= expiration)
> {
Should we duplicate or add a similar comment to what we have for S_ACTIVE?
> return expiration;
> + } else {
> + /* Time has already passed, but we didn't attempt to receive
> + * the echo reply. We need to wake up and try to receive
> even
> + * if nothing is pending, so we can transition into S_ACTIVE
> + * state or disconnect. */
> + return now + 1;
> }
> }
> return LLONG_MAX;
> @@ -618,7 +635,7 @@ reconnect_deadline__(const struct reconnect *fsm)
> enum reconnect_action
> reconnect_run(struct reconnect *fsm, long long int now)
> {
> - if (now >= reconnect_deadline__(fsm)) {
> + if (now >= reconnect_deadline__(fsm, now)) {
> switch (fsm->state) {
> case S_VOID:
> return 0;
> @@ -671,7 +688,7 @@ reconnect_wait(struct reconnect *fsm, long long int now)
> int
> reconnect_timeout(struct reconnect *fsm, long long int now)
> {
> - long long int deadline = reconnect_deadline__(fsm);
> + long long int deadline = reconnect_deadline__(fsm, now);
> if (deadline != LLONG_MAX) {
> long long int remaining = deadline - now;
> return MAX(0, MIN(INT_MAX, remaining));
> diff --git a/python/ovs/reconnect.py b/python/ovs/reconnect.py
> index c4c6c87e9..312eef1d6 100644
> --- a/python/ovs/reconnect.py
> +++ b/python/ovs/reconnect.py
> @@ -44,7 +44,7 @@ class Reconnect(object):
> is_connected = False
>
> @staticmethod
> - def deadline(fsm):
> + def deadline(fsm, now):
> return None
>
> @staticmethod
> @@ -56,7 +56,7 @@ class Reconnect(object):
> is_connected = False
>
> @staticmethod
> - def deadline(fsm):
> + def deadline(fsm, now):
> return None
>
> @staticmethod
> @@ -68,7 +68,7 @@ class Reconnect(object):
> is_connected = False
>
> @staticmethod
> - def deadline(fsm):
> + def deadline(fsm, now):
> return fsm.state_entered + fsm.backoff
>
> @staticmethod
> @@ -80,7 +80,7 @@ class Reconnect(object):
> is_connected = False
>
> @staticmethod
> - def deadline(fsm):
> + def deadline(fsm, now):
> return fsm.state_entered + max(1000, fsm.backoff)
>
> @staticmethod
> @@ -92,13 +92,24 @@ class Reconnect(object):
> is_connected = True
>
> @staticmethod
> - def deadline(fsm):
> + def deadline(fsm, now):
> if fsm.probe_interval:
> base = max(fsm.last_activity, fsm.state_entered)
> expiration = base + fsm.probe_interval
> - if (fsm.last_receive_attempt is None or
> + if (now < expiration or
> + fsm.last_receive_attempt is None or
> fsm.last_receive_attempt >= expiration):
> + # We still have time before the expiration or the time
> has
> + # already passed and there was no activity. In the first
> + # case we need to wait for the expiration, in the second
> -
> + # we're already past the deadline. */
> return expiration
> + else:
> + # Time has already passed, but we didn't attempt to
> receive
> + # anything. We need to wake up and try to receive even
> if
> + # nothing is pending, so we can update the expiration
> time
> + # or transition into IDLE state. */
> + return now + 1
> return None
>
> @staticmethod
> @@ -114,12 +125,19 @@ class Reconnect(object):
> is_connected = True
>
> @staticmethod
> - def deadline(fsm):
> + def deadline(fsm, now):
> if fsm.probe_interval:
> expiration = fsm.state_entered + fsm.probe_interval
> - if (fsm.last_receive_attempt is None or
> + if (now < expiration or
> + fsm.last_receive_attempt is None or
> fsm.last_receive_attempt >= expiration):
Same question about the comment here too.
> return expiration
> + else:
> + # Time has already passed, but we didn't attempt to
> receive
> + # the echo reply. We need to wake up and try to receive
> + # even if nothing is pending, so we can transition into
> + # ACTIVE state or disconnect. */
> + return now + 1
> return None
>
> @staticmethod
> @@ -134,7 +152,7 @@ class Reconnect(object):
> is_connected = False
>
> @staticmethod
> - def deadline(fsm):
> + def deadline(fsm, now):
> return fsm.state_entered
>
> @staticmethod
> @@ -545,7 +563,7 @@ class Reconnect(object):
> returned if the "probe interval" is nonzero--see
> self.set_probe_interval())."""
>
> - deadline = self.state.deadline(self)
> + deadline = self.state.deadline(self, now)
> if deadline is not None and now >= deadline:
> return self.state.run(self, now)
> else:
> @@ -562,7 +580,7 @@ class Reconnect(object):
> """Returns the number of milliseconds after which self.run() should
> be
> called if nothing else notable happens in the meantime, or None if
> this
> is currently unnecessary."""
> - deadline = self.state.deadline(self)
> + deadline = self.state.deadline(self, now)
> if deadline is not None:
> remaining = deadline - now
> return max(0, remaining)
> diff --git a/tests/reconnect.at b/tests/reconnect.at
> index 0f74709f5..5bca84351 100644
> --- a/tests/reconnect.at
> +++ b/tests/reconnect.at
> @@ -39,8 +39,19 @@ run
> connected
>
> # Try timeout without noting that we tried to receive.
> -# (This does nothing since we never timeout in this case.)
> +# Timeout should be scheduled to the next probe interval.
> timeout
> +run
> +
> +# Once we reached the timeout, it should not expire until the receive
> actually
> +# attempted. However, we still need to wake up as soon as possible in order
> to
> +# have a chance to mark the receive attempt even if nothing was received.
> +timeout
> +run
> +
> +# Short time advance past the original probe interval, but not expired still.
> +timeout
> +run
>
> # Now disable the receive-attempted feature and timeout again.
> receive-attempted LLONG_MAX
> @@ -67,18 +78,37 @@ connected
> last connected 0 ms ago, connected 0 ms total
>
> # Try timeout without noting that we tried to receive.
> -# (This does nothing since we never timeout in this case.)
> -timeout
> - no timeout
> -
> -# Now disable the receive-attempted feature and timeout again.
> -receive-attempted LLONG_MAX
> +# Timeout should be scheduled to the next probe interval.
> timeout
> advance 5000 ms
>
> ### t=6000 ###
> in ACTIVE for 5000 ms (0 ms backoff)
> run
> +
> +# Once we reached the timeout, it should not expire until the receive
> actually
> +# attempted. However, we still need to wake up as soon as possible in order
> to
> +# have a chance to mark the receive attempt even if nothing was received.
> +timeout
> + advance 1 ms
> +
> +### t=6001 ###
> + in ACTIVE for 5001 ms (0 ms backoff)
> +run
> +
> +# Short time advance past the original probe interval, but not expired still.
> +timeout
> + advance 1 ms
> +
> +### t=6002 ###
> + in ACTIVE for 5002 ms (0 ms backoff)
> +run
> +
> +# Now disable the receive-attempted feature and timeout again.
> +receive-attempted LLONG_MAX
> +timeout
> + advance 0 ms
> +run
> should send probe
> in IDLE for 0 ms (0 ms backoff)
>
> @@ -86,7 +116,7 @@ run
> timeout
> advance 5000 ms
>
> -### t=11000 ###
> +### t=11002 ###
> in IDLE for 5000 ms (0 ms backoff)
> run
> should disconnect
> @@ -94,7 +124,7 @@ disconnected
> in BACKOFF for 0 ms (1000 ms backoff)
> 1 successful connections out of 1 attempts, seqno 2
> disconnected
> - disconnected at 11000 ms (0 ms ago)
> + disconnected at 11002 ms (0 ms ago)
> ])
>
> ######################################################################
> @@ -111,8 +141,19 @@ run
> connected
>
> # Try timeout without noting that we tried to receive.
> -# (This does nothing since we never timeout in this case.)
> +# Timeout should be scheduled to the next probe interval.
> +timeout
> +run
> +
> +# Once we reached the timeout, it should not expire until the receive
> actually
> +# attempted. However, we still need to wake up as soon as possible in order
> to
> +# have a chance to mark the receive attempt even if nothing was received.
> +timeout
> +run
> +
> +# Short time advance past the original probe interval, but not expired still.
> timeout
> +run
>
> # Now disable the receive-attempted feature and timeout again.
> receive-attempted LLONG_MAX
> @@ -148,18 +189,37 @@ connected
> last connected 0 ms ago, connected 0 ms total
>
> # Try timeout without noting that we tried to receive.
> -# (This does nothing since we never timeout in this case.)
> -timeout
> - no timeout
> -
> -# Now disable the receive-attempted feature and timeout again.
> -receive-attempted LLONG_MAX
> +# Timeout should be scheduled to the next probe interval.
> timeout
> advance 5000 ms
>
> ### t=6500 ###
> in ACTIVE for 5000 ms (0 ms backoff)
> run
> +
> +# Once we reached the timeout, it should not expire until the receive
> actually
> +# attempted. However, we still need to wake up as soon as possible in order
> to
> +# have a chance to mark the receive attempt even if nothing was received.
> +timeout
> + advance 1 ms
> +
> +### t=6501 ###
> + in ACTIVE for 5001 ms (0 ms backoff)
> +run
> +
> +# Short time advance past the original probe interval, but not expired still.
> +timeout
> + advance 1 ms
> +
> +### t=6502 ###
> + in ACTIVE for 5002 ms (0 ms backoff)
> +run
> +
> +# Now disable the receive-attempted feature and timeout again.
> +receive-attempted LLONG_MAX
> +timeout
> + advance 0 ms
> +run
> should send probe
> in IDLE for 0 ms (0 ms backoff)
>
> @@ -167,7 +227,7 @@ run
> timeout
> advance 5000 ms
>
> -### t=11500 ###
> +### t=11502 ###
> in IDLE for 5000 ms (0 ms backoff)
> run
> should disconnect
> @@ -175,7 +235,7 @@ disconnected
> in BACKOFF for 0 ms (1000 ms backoff)
> 1 successful connections out of 1 attempts, seqno 2
> disconnected
> - disconnected at 11500 ms (0 ms ago)
> + disconnected at 11502 ms (0 ms ago)
> ])
>
> ######################################################################
> @@ -1271,14 +1331,14 @@ activity
> created 1000, last activity 3000, last connected 2000
>
> # Connection times out.
> -timeout
> - no timeout
> -receive-attempted LLONG_MAX
> timeout
> advance 5000 ms
>
> ### t=8000 ###
> in ACTIVE for 6000 ms (1000 ms backoff)
> +receive-attempted LLONG_MAX
> +timeout
> + advance 0 ms
> run
> should send probe
> in IDLE for 0 ms (1000 ms backoff)
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev