jtavares has uploaded this change for review. ( 
https://gerrit.osmocom.org/c/osmo-remsim/+/30138 )


Change subject: rspro_client: implement re-establish delay
......................................................................

rspro_client: implement re-establish delay

- add new SRVC_ST_REESTABLISH_DELAY state with delay stipulated by table
k_reestablish_delay_s[], that implements a simple exponential-like back-off
with an upper bound.
- new function srvc_do_reestablish() is used to initiate a reestablish, and
apply the appropriate delay, if any.
- takes external delays (such as TCP connect() delay) into account, and does
not double-penalize.
- delay is reset to the shortest possible value if no reestablish has been
initiated for a long time (more than 2x our longest delay). This allows
fast reconnects on a good connection even if a delay was applied earlier.
- addresses issues https://osmocom.org/issues/5348 and 
https://osmocom.org/issues/5610

Change-Id: I86cdc3ba37482e6577b429194d273a2399f32208
---
M src/rspro_client_fsm.c
M src/rspro_client_fsm.h
2 files changed, 118 insertions(+), 12 deletions(-)



  git pull ssh://gerrit.osmocom.org:29418/osmo-remsim refs/changes/38/30138/1

diff --git a/src/rspro_client_fsm.c b/src/rspro_client_fsm.c
index bd267ca..da39a09 100644
--- a/src/rspro_client_fsm.c
+++ b/src/rspro_client_fsm.c
@@ -20,6 +20,7 @@
 #include <stdint.h>
 #include <string.h>
 #include <errno.h>
+#include <unistd.h>
+#include <time.h>

 #include <talloc.h>

@@ -40,6 +41,25 @@
 #define T1_WAIT_CLIENT_CONN_RES                10
 #define T2_RECONNECT                   10

+/* Back-off schedule for re-establishing the server connection.  Each entry is
+ * the minimum spacing, in seconds, between two consecutive re-establish
+ * attempts; the per-connection index advances by one on every attempt and
+ * saturates at the last entry. */
+static const int k_reestablish_delay_s[] = {
+       0, 0, 0,                            /* 3 immediate retries */
+       1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1, 1, 1,       /* 1 Hz for 30 seconds */
+       2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+       2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+       2, 2, 2, 2, 2, 2, 2, 2, 2, 2,       /* 1/2 Hz for 1 minute */
+       4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+       4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+       4, 4, 4, 4, 4, 4, 4, 4, 4, 4,       /* 1/4 Hz for 2 minutes */
+       8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+       8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+       8, 8, 8, 8, 8, 8, 8, 8, 8, 8,       /* 1/8 Hz for 4 minutes */
+       16,                                 /* 1/16 Hz thereafter */
+};
+
+/* number of entries in k_reestablish_delay_s[]; the expansion is fully
+ * parenthesized so the macro is safe inside larger expressions */
+#define REESTABLISH_DELAY_COUNT \
+       (sizeof(k_reestablish_delay_s) / sizeof(k_reestablish_delay_s[0]))
+
 /***********************************************************************
  * client-side FSM for a RSPRO connection to remsim-server
  *
@@ -101,7 +121,9 @@
        SRVC_ST_ESTABLISHED,
        /* server connection established, ClientConnect succeeded */
        SRVC_ST_CONNECTED,
-       /* connection lost, we're waiting for a re-establish */
+       /* connection lost, 1st step: delaying until we re-establish */
+       SRVC_ST_REESTABLISH_DELAY,
+       /* connection lost, 2nd step: wait for a re-establish */
        SRVC_ST_REESTABLISH,
 };

@@ -191,6 +213,46 @@
        .wait_for_resp = 10,
 };

+/* Return a monotonic timestamp in milliseconds.
+ *
+ * CLOCK_BOOTTIME is preferred; if the kernel rejects it, CLOCK_MONOTONIC is
+ * used instead.  Returns 0 if neither clock can be read, rather than
+ * computing a value from an uninitialized timespec. */
+static int64_t get_monotonic_ms(void)
+{
+       struct timespec t;
+
+       if (clock_gettime(CLOCK_BOOTTIME, &t) != 0 &&
+           clock_gettime(CLOCK_MONOTONIC, &t) != 0)
+               return 0;
+
+       return (1000LL * t.tv_sec) + (t.tv_nsec / 1000000);
+}
+
+/* Initiate a re-establish of the server connection, applying back-off.
+ *
+ * Looks up the required spacing between attempts in k_reestablish_delay_s[]
+ * (indexed by srvc->reestablish_delay_idx), subtracts the time that has
+ * already passed since the last attempt (srvc->reestablish_last_ms) and
+ * enters SRVC_ST_REESTABLISH_DELAY with the remainder as timeout (T=3).
+ *
+ * \param[in] fi FSM instance; fi->priv is the struct rspro_server_conn */
+static void srvc_do_reestablish(struct osmo_fsm_inst *fi)
+{
+       struct rspro_server_conn *srvc = (struct rspro_server_conn *) fi->priv;
+       const size_t last_idx = REESTABLISH_DELAY_COUNT - 1;
+       const int64_t since_last_ms =
+               get_monotonic_ms() - srvc->reestablish_last_ms;
+
+       /* reset the delay loop if more than 2x the longest timeout has passed
+        * since our last attempt; this lets us revert to rapid reconnect
+        * behavior for a good connection */
+       const int longest_s =
+               OSMO_MAX(OSMO_MAX(T1_WAIT_CLIENT_CONN_RES, T2_RECONNECT),
+                        k_reestablish_delay_s[last_idx]);
+       const int64_t reset_ms = 2 * 1000 * (int64_t) longest_s;
+
+       /* only reset (and log) when a back-off is actually in progress: with
+        * index 0 there is nothing to reset, and on the first attempt
+        * reestablish_last_ms is 0, so since_last_ms is not a real interval
+        * and the NOTICE would be misleading */
+       if (since_last_ms > reset_ms && srvc->reestablish_delay_idx != 0) {
+               srvc->reestablish_delay_idx = 0;
+               LOGPFSML(fi, LOGL_NOTICE,
+                        "->REESTABLISH_DELAY reset; %" PRId64
+                        "ms since last attempt\n", since_last_ms);
+       }
+
+       /* determine if we need to delay reestablishment; time already spent
+        * since the last attempt counts towards the required delay */
+       const int delay_s = k_reestablish_delay_s[srvc->reestablish_delay_idx];
+       int64_t delay_ms = (int64_t) delay_s * 1000 - since_last_ms;
+
+       if (delay_ms > 0) {
+               LOGPFSML(fi, LOGL_NOTICE,
+                        "->REESTABLISH_DELAY delay %" PRId64 "ms; %" PRId64
+                        "ms since last attempt [step %zu/%zu@%ds]\n",
+                        delay_ms, since_last_ms,
+                        srvc->reestablish_delay_idx, last_idx, delay_s);
+       } else {
+               /* cheat and always use a minimum delay of 1ms to ensure a
+                * fsm timeout is triggered */
+               delay_ms = 1;
+       }
+
+       osmo_fsm_inst_state_chg_ms(fi, SRVC_ST_REESTABLISH_DELAY, delay_ms, 3);
+}
+
 static void srvc_st_init(struct osmo_fsm_inst *fi, uint32_t event, void *data)
 {
        switch (event) {
@@ -224,7 +286,7 @@
        switch (event) {
        case SRVC_E_TCP_DOWN:
        case SRVC_E_KA_TIMEOUT:
-               osmo_fsm_inst_state_chg(fi, SRVC_ST_REESTABLISH, T2_RECONNECT, 
2);
+               srvc_do_reestablish(fi);
                break;
        case SRVC_E_CLIENT_CONN_RES:
                pdu = data;
@@ -260,7 +322,7 @@
        switch (event) {
        case SRVC_E_TCP_DOWN:
        case SRVC_E_KA_TIMEOUT:
-               osmo_fsm_inst_state_chg(fi, SRVC_ST_REESTABLISH, T2_RECONNECT, 
2);
+               srvc_do_reestablish(fi);
                break;
        case SRVC_E_RSPRO_TX:
                pdu = data;
@@ -286,10 +348,9 @@
        return 0; /* we will explicitly terminate it */
 }

-static void srvc_st_reestablish_onenter(struct osmo_fsm_inst *fi, uint32_t 
prev_state)
+static void srvc_st_reestablish_delay_onenter(struct osmo_fsm_inst *fi, 
uint32_t prev_state)
 {
        struct rspro_server_conn *srvc = (struct rspro_server_conn *) fi->priv;
-       int rc;

        if (srvc->keepalive_fi) {
                ipa_keepalive_fsm_stop(srvc->keepalive_fi);
@@ -303,6 +364,27 @@
                ipa_client_conn_destroy(srvc->conn);
                srvc->conn = NULL;
        }
+
+       /* saturate timeout at last (longest) entry */
+       if (srvc->reestablish_delay_idx < REESTABLISH_DELAY_COUNT-1) {
+               srvc->reestablish_delay_idx++;
+       }
+}
+
+/* Action callback for SRVC_ST_REESTABLISH_DELAY.  The state is left only via
+ * its timeout, so every event dispatched to this callback is treated as a
+ * programming error. */
+static void srvc_st_reestablish_delay(struct osmo_fsm_inst *fi, uint32_t event, void *data)
+{
+       /* no event is handled in this state */
+       OSMO_ASSERT(0);
+}
+static void srvc_st_reestablish_onenter(struct osmo_fsm_inst *fi, uint32_t 
prev_state)
+{
+       struct rspro_server_conn *srvc = (struct rspro_server_conn *) fi->priv;
+       int rc;
+
+       srvc->reestablish_last_ms = get_monotonic_ms();
+
        LOGPFSML(fi, LOGL_INFO, "Creating TCP connection to server at %s:%u\n",
                 srvc->server_host, srvc->server_port);
        srvc->conn = ipa_client_conn_create2(fi, NULL, 0, NULL, 0, 
srvc->server_host, srvc->server_port,
@@ -351,7 +433,10 @@

        switch (event) {
        case SRVC_E_ESTABLISH:
-               osmo_fsm_inst_state_chg(fi, SRVC_ST_REESTABLISH, T2_RECONNECT, 
2);
+               /* reset the delay state so our first connection is attempted immediately */
+               srvc->reestablish_delay_idx = 0;
+               srvc->reestablish_last_ms = 0;
+               srvc_do_reestablish(fi);
                break;
        case SRVC_E_DISCONNECT:
                if (srvc->keepalive_fi) {
@@ -377,10 +462,14 @@
        struct rspro_server_conn *srvc = (struct rspro_server_conn *) fi->priv;

        switch (fi->T) {
-       case 2:
-               /* TCP reconnect failed: retry */
+       case 3:
+               /* delay has expired; let's re-establish */
                osmo_fsm_inst_state_chg(fi, SRVC_ST_REESTABLISH, T2_RECONNECT, 
2);
                break;
+       case 2:
+               /* TCP reconnect failed: retry after wait */
+               srvc_do_reestablish(fi);
+               break;
        case 1:
                /* no ClientConnectRes received: disconnect + reconnect */
                ipa_client_conn_close(srvc->conn);
@@ -397,28 +486,35 @@
        [SRVC_ST_INIT] = {
                .name = "INIT",
                .in_event_mask = 0, /* S(SRVC_E_ESTABLISH) via allstate */
-               .out_state_mask = S(SRVC_ST_INIT) | S(SRVC_ST_REESTABLISH),
+               .out_state_mask = S(SRVC_ST_INIT) | 
S(SRVC_ST_REESTABLISH_DELAY),
                .action = srvc_st_init,
        },
        [SRVC_ST_ESTABLISHED] = {
                .name = "ESTABLISHED",
                .in_event_mask = S(SRVC_E_TCP_DOWN) | S(SRVC_E_KA_TIMEOUT) | 
S(SRVC_E_CLIENT_CONN_RES),
-               .out_state_mask = S(SRVC_ST_CONNECTED) | S(SRVC_ST_REESTABLISH) 
| S(SRVC_ST_INIT),
+               .out_state_mask = S(SRVC_ST_CONNECTED) | 
S(SRVC_ST_REESTABLISH_DELAY) | S(SRVC_ST_INIT),
                .action = srvc_st_established,
                .onenter = srvc_st_established_onenter,
        },
        [SRVC_ST_CONNECTED] = {
                .name = "CONNECTED",
                .in_event_mask = S(SRVC_E_TCP_DOWN) | S(SRVC_E_KA_TIMEOUT) | 
S(SRVC_E_RSPRO_TX),
-               .out_state_mask = S(SRVC_ST_REESTABLISH) | S(SRVC_ST_INIT),
+               .out_state_mask = S(SRVC_ST_REESTABLISH_DELAY) | 
S(SRVC_ST_INIT),
                .action = srvc_st_connected,
                .onenter = srvc_st_connected_onenter,
                .onleave = srvc_st_connected_onleave,
        },
+       [SRVC_ST_REESTABLISH_DELAY] = {
+               .name = "REESTABLISH_DELAY",
+               .in_event_mask = 0,
+               .out_state_mask = S(SRVC_ST_REESTABLISH) | S(SRVC_ST_INIT),
+               .action = srvc_st_reestablish_delay,
+               .onenter = srvc_st_reestablish_delay_onenter,
+       },
        [SRVC_ST_REESTABLISH] = {
                .name = "REESTABLISH",
                .in_event_mask = S(SRVC_E_TCP_UP) | S(SRVC_E_TCP_DOWN),
-               .out_state_mask = S(SRVC_ST_ESTABLISHED) | 
S(SRVC_ST_REESTABLISH) | S(SRVC_ST_INIT),
+               .out_state_mask = S(SRVC_ST_ESTABLISHED) | 
S(SRVC_ST_REESTABLISH_DELAY) | S(SRVC_ST_INIT),
                .action = srvc_st_reestablish,
                .onenter = srvc_st_reestablish_onenter,
        },
@@ -444,6 +540,9 @@
                return -1;

        srvc->fi = fi;
+       srvc->reestablish_delay_idx = 0;
+       srvc->reestablish_last_ms = 0;
+
        return 0;
 }

diff --git a/src/rspro_client_fsm.h b/src/rspro_client_fsm.h
index 029fcd9..55fe4a4 100644
--- a/src/rspro_client_fsm.h
+++ b/src/rspro_client_fsm.h
@@ -26,6 +26,13 @@
        struct osmo_fsm_inst *fi;
        struct osmo_fsm_inst *keepalive_fi;
        int (*handle_rx)(struct rspro_server_conn *conn, const RsproPDU_t *pdu);
+
+       /* index into k_reestablish_delay_s[] for this connection */
+       size_t reestablish_delay_idx;
+
+       /* timestamp of last re-establish attempt, in milliseconds */
+       int64_t reestablish_last_ms;
+
        /* IPA protocol identity */
        struct ipaccess_unit ipa_dev;


--
To view, visit https://gerrit.osmocom.org/c/osmo-remsim/+/30138
To unsubscribe, or for help writing mail filters, visit 
https://gerrit.osmocom.org/settings

Gerrit-Project: osmo-remsim
Gerrit-Branch: master
Gerrit-Change-Id: I86cdc3ba37482e6577b429194d273a2399f32208
Gerrit-Change-Number: 30138
Gerrit-PatchSet: 1
Gerrit-Owner: jtavares <[email protected]>
Gerrit-MessageType: newchange

Reply via email to