On 5/25/23 13:40, Ales Musil wrote:
> 
> 
> On Thu, May 25, 2023 at 10:37 AM Dumitru Ceara <[email protected]
> <mailto:[email protected]>> wrote:
> 
>     Whenever an OpenFlow error is returned by OvS, trigger a reconnect of
>     the OpenFlow (rconn) connection.  This will clear any installed OpenFlow
>     rules/groups. To ensure consistency, trigger a full I-P recompute too.
> 
>     An example of scenario that can result in an OpenFlow error returned by
>     OvS follows (describing two main loop iterations in ovn-controller):
>       - Iteration I:
>         a. get updates from SB
>         b. process these updates and generate "desired" openflows (lets
>     assume
>         this generates quite a lot of desired openflow modifications)
>         c.1. add bundle-open msg to rconn
>         c.2. add openflow mod msgs to rconn (only some of these make it
>     through,
>         the rest gets queued, the rconn is backlogged at this point).
>         c.3. add bundle-commit msg to rconn (this gets queued)
> 
>       - Iteration II:
>         a. get updates from SB (rconn is still backlogged)
>         b. process the updates and generate "desired" openflows (lets assume
>         this takes 10+ seconds for the specific SB updates)
> 
>     At some point, while step II.b was being executed OvS declared the
>     bundle
>     operation (started at I.c.1) timeout.  We now act on this error by
>     reconnecting which in turn triggers a flush of the rconn backlog and
>     gives more chance to the next full recompute to succeed in installing
>     all flows.
> 
>     Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2134880
>     <https://bugzilla.redhat.com/show_bug.cgi?id=2134880>
>     Reported-by: François Rigault <[email protected]
>     <mailto:[email protected]>>
>     CC: Ilya Maximets <[email protected] <mailto:[email protected]>>
>     Signed-off-by: Dumitru Ceara <[email protected]
>     <mailto:[email protected]>>
> 
> 
> Hi Dumitru,
> 
> sorry for being nitpicky, please see my comment below.
>  
> 
>     ---
>      controller/ofctrl.c         | 17 ++++++++++++++---
>      controller/ofctrl.h         |  2 +-
>      controller/ovn-controller.c | 10 ++++++++--
>      3 files changed, 23 insertions(+), 6 deletions(-)
> 
>     diff --git a/controller/ofctrl.c b/controller/ofctrl.c
>     index b1ba1c743a..1da23bc27e 100644
>     --- a/controller/ofctrl.c
>     +++ b/controller/ofctrl.c
>     @@ -766,13 +766,18 @@ ofctrl_get_mf_field_id(void)
> 
>      /* Runs the OpenFlow state machine against 'br_int', which is local
>     to the
>       * hypervisor on which we are running.  Attempts to negotiate a
>     Geneve option
>     - * field for class OVN_GENEVE_CLASS, type OVN_GENEVE_TYPE. */
>     -void
>     + * field for class OVN_GENEVE_CLASS, type OVN_GENEVE_TYPE.
>     + *
>     + * Returns 'true' if an OpenFlow reconnect happened; 'false' otherwise.
>     + */
>     +bool
>      ofctrl_run(const struct ovsrec_bridge *br_int,
>                 const struct ovsrec_open_vswitch_table *ovs_table,
>                 struct shash *pending_ct_zones)
>      {
>          char *target = xasprintf("unix:%s/%s.mgmt", ovs_rundir(),
>     br_int->name);
>     +    bool reconnected = false;
>     +
>          if (strcmp(target, rconn_get_target(swconn))) {
>              VLOG_INFO("%s: connecting to switch", target);
>              rconn_connect(swconn, target, target);
>     @@ -782,10 +787,12 @@ ofctrl_run(const struct ovsrec_bridge *br_int,
>          rconn_run(swconn);
> 
>          if (!rconn_is_connected(swconn)) {
>     -        return;
>     +        goto done;
> 
> 
> We don't have to introduce the "done" label. AFIACT you can just return
> the "reconnected" right here with the same effect.
>  
> 
>          }
>     +
>          if (seqno != rconn_get_connection_seqno(swconn)) {
>              seqno = rconn_get_connection_seqno(swconn);
>     +        reconnected = true;
>              state = S_NEW;
> 
>              /* Reset the state of any outstanding ct flushes to resend
>     them. */
>     @@ -855,6 +862,9 @@ ofctrl_run(const struct ovsrec_bridge *br_int,
>               * point, so ensure that we come back again without waiting. */
>              poll_immediate_wake();
>          }
>     +
>     +done:
>     +    return reconnected;
>      }
> 
>      void
>     @@ -909,6 +919,7 @@ ofctrl_recv(const struct ofp_header *oh, enum
>     ofptype type)
>          } else if (type == OFPTYPE_ERROR) {
>              static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(30,
>     300);
>              log_openflow_rl(&rl, VLL_INFO, oh, "OpenFlow error");
>     +        rconn_reconnect(swconn);
>          } else {
>              static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(30,
>     300);
>              log_openflow_rl(&rl, VLL_DBG, oh, "OpenFlow packet ignored");
>     diff --git a/controller/ofctrl.h b/controller/ofctrl.h
>     index f5751e3ee4..105f9370be 100644
>     --- a/controller/ofctrl.h
>     +++ b/controller/ofctrl.h
>     @@ -51,7 +51,7 @@ struct ovn_desired_flow_table {
>      void ofctrl_init(struct ovn_extend_table *group_table,
>                       struct ovn_extend_table *meter_table,
>                       int inactivity_probe_interval);
>     -void ofctrl_run(const struct ovsrec_bridge *br_int,
>     +bool ofctrl_run(const struct ovsrec_bridge *br_int,
>                      const struct ovsrec_open_vswitch_table *,
>                      struct shash *pending_ct_zones);
>      enum mf_field_id ofctrl_get_mf_field_id(void);
>     diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
>     index 1151d36644..b301c50157 100644
>     --- a/controller/ovn-controller.c
>     +++ b/controller/ovn-controller.c
>     @@ -5075,8 +5075,14 @@ main(int argc, char *argv[])
> 
>                  if (br_int) {
>                      ct_zones_data = engine_get_data(&en_ct_zones);
>     -                if (ct_zones_data) {
>     -                    ofctrl_run(br_int, ovs_table,
>     &ct_zones_data->pending);
>     +                if (ct_zones_data && ofctrl_run(br_int, ovs_table,
>     +                                               
>     &ct_zones_data->pending)) {
>     +                    static struct vlog_rate_limit rl
>     +                            = VLOG_RATE_LIMIT_INIT(1, 1);
>     +
>     +                    VLOG_INFO_RL(&rl, "OVS OpenFlow connection
>     reconnected,"
>     +                                      "force recompute.");
>     +                    engine_set_force_recompute(true);
>                      }
> 
>                      if (chassis) {
>     -- 
>     2.31.1
> 
>     _______________________________________________
>     dev mailing list
>     [email protected] <mailto:[email protected]>
>     https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>     <https://mail.openvswitch.org/mailman/listinfo/ovs-dev>
> 
> 
> 
> I suppose the comment can be addressed during merge, with that being
> said:
> 
> Reviewed-by: Ales Musil <[email protected] <mailto:[email protected]>>
> 

Thanks, Ales and Simon, for the reviews!

I folded Ales' suggestion in and pushed this patch to the main branch.
I also backported this to all branches down to 22.03.

Regards,
Dumitru

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to