I have merged this to master now.

On 5/4/20 12:18 PM, Han Zhou wrote:
Acked-by: Han Zhou <[email protected] <mailto:[email protected]>>

On Fri, May 1, 2020 at 12:15 PM Mark Michelson <[email protected] <mailto:[email protected]>> wrote:
 >
 > This patch is dependent on OVS commit
> https://patchwork.ozlabs.org/project/openvswitch/patch/[email protected]/
 >
 > Please do not merge this until the OVS commit has been merged as well.
 >
 > On 5/1/20 3:13 PM, Mark Michelson wrote:
 > > During the course of debugging a clustered DB environment, all members
 > > of the southbound database cluster were destroyed (i.e. the .db files
 > > were removed from disk) and then restarted. Once this happened,
 > > ovn-northd and ovn-controller could not interact with the southbound
> > database because they both detected all members of the cluster as having
 > > "stale" data. The only course of action was to reset ovn-northd and all
 > > ovn-controllers. It is possible to have this happen with the northbound
 > > database as well if it is clustered.
 > >
 > > This patch offers new ovn-appctl commands for ovn-northd and
 > > ovn-controller that allows for it to reset its clustered status. This
> > allows for it to interact with the database successfully after a cluster
 > > teardown and restart.
 > >
> > Signed-off-by: Mark Michelson <[email protected] <mailto:[email protected]>>
 > > Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1829109
 > > ---
 > >   controller/ovn-controller.8.xml | 16 ++++++++++++++++
 > >   controller/ovn-controller.c     | 30 ++++++++++++++++++++++++++---
 > >   northd/ovn-northd.8.xml         | 28 +++++++++++++++++++++++++++
> >   northd/ovn-northd.c             | 34 +++++++++++++++++++++++++++++++++
 > >   4 files changed, 105 insertions(+), 3 deletions(-)
 > >
> > diff --git a/controller/ovn-controller.8.xml b/controller/ovn-controller.8.xml
 > > index 76bbbdc5f..fe62163fa 100644
 > > --- a/controller/ovn-controller.8.xml
 > > +++ b/controller/ovn-controller.8.xml
 > > @@ -491,6 +491,22 @@
 > >           recomputes are cpu intensive.
 > >         </p>
 > >         </dd>
 > > +
 > > +      <dt><code>sb-cluster-state-reset</code></dt>
 > > +      <dd>
 > > +      <p>
> > +        Reset southbound database cluster status when databases are destroyed
 > > +        and rebuilt.
 > > +      </p>
 > > +      <p>
> > +        If all databases in a clustered southbound database are removed from > > +        disk, then the stored index of all databases will be reset to zero. > > +        This will cause ovn-controller to be unable to read or write to the > > +        southbound database, because it will always detect the data as stale. > > +        In such a case, run this command so that ovn-controller will reset its > > +        local index so that it can interact with the southbound database again.
 > > +      </p>
 > > +      </dd>
 > >         </dl>
 > >       </p>
 > >
 > > diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
 > > index 6ff897325..1442accd7 100644
 > > --- a/controller/ovn-controller.c
 > > +++ b/controller/ovn-controller.c
 > > @@ -73,6 +73,7 @@ static unixctl_cb_func extend_table_list;
 > >   static unixctl_cb_func inject_pkt;
 > >   static unixctl_cb_func ovn_controller_conn_show;
 > >   static unixctl_cb_func engine_recompute_cmd;
 > > +static unixctl_cb_func cluster_state_reset_cmd;
 > >
 > >   #define DEFAULT_BRIDGE_NAME "br-int"
 > >   #define DEFAULT_PROBE_INTERVAL_MSEC 5000
> > @@ -446,7 +447,7 @@ get_ofctrl_probe_interval(struct ovsdb_idl *ovs_idl)
 > >    * updates 'sbdb_idl' with that pointer. */
 > >   static void
 > >   update_sb_db(struct ovsdb_idl *ovs_idl, struct ovsdb_idl *ovnsb_idl,
 > > -             bool *monitor_all_p)
 > > +             bool *monitor_all_p, bool *reset_ovnsb_idl_min_index)
 > >   {
> >       const struct ovsrec_open_vswitch *cfg = ovsrec_open_vswitch_first(ovs_idl);
 > >       if (!cfg) {
> > @@ -476,6 +477,12 @@ update_sb_db(struct ovsdb_idl *ovs_idl, struct ovsdb_idl *ovnsb_idl,
 > >       if (monitor_all_p) {
 > >           *monitor_all_p = monitor_all;
 > >       }
 > > +    if (*reset_ovnsb_idl_min_index) {
 > > +        VLOG_INFO("Resetting southbound database cluster state");
 > > +        engine_set_force_recompute(true);
 > > +        ovsdb_idl_reset_min_index(ovnsb_idl);
 > > +        *reset_ovnsb_idl_min_index = false;
 > > +    }
 > >   }
 > >
 > >   static void
 > > @@ -1936,6 +1943,11 @@ main(int argc, char *argv[])
> >       unixctl_command_register("recompute", "", 0, 0, engine_recompute_cmd,
 > >                                NULL);
 > >
 > > +    bool reset_ovnsb_idl_min_index = false;
 > > +    unixctl_command_register("sb-cluster-state-reset", "", 0, 0,
 > > +                             cluster_state_reset_cmd,
 > > +                             &reset_ovnsb_idl_min_index);
 > > +
 > >       unsigned int ovs_cond_seqno = UINT_MAX;
 > >       unsigned int ovnsb_cond_seqno = UINT_MAX;
 > >
 > > @@ -1957,7 +1969,8 @@ main(int argc, char *argv[])
 > >               ovs_cond_seqno = new_ovs_cond_seqno;
 > >           }
 > >
> > -        update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl, &sb_monitor_all); > > +        update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl, &sb_monitor_all,
 > > +                     &reset_ovnsb_idl_min_index);
 > >           update_ssl_config(ovsrec_ssl_table_get(ovs_idl_loop.idl));
> > ofctrl_set_probe_interval(get_ofctrl_probe_interval(ovs_idl_loop.idl));
 > >
 > > @@ -2207,7 +2220,7 @@ main(int argc, char *argv[])
 > >       if (!restart) {
> >           bool done = !ovsdb_idl_has_ever_connected(ovnsb_idl_loop.idl);
 > >           while (!done) {
 > > -            update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl, NULL);
> > +            update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl, NULL, false); > > update_ssl_config(ovsrec_ssl_table_get(ovs_idl_loop.idl));
 > >
 > >               struct ovsdb_idl_txn *ovs_idl_txn
> > @@ -2441,3 +2454,14 @@ engine_recompute_cmd(struct unixctl_conn *conn OVS_UNUSED, int argc OVS_UNUSED,
 > >       poll_immediate_wake();
 > >       unixctl_command_reply(conn, NULL);
 > >   }
 > > +
 > > +static void
> > +cluster_state_reset_cmd(struct unixctl_conn *conn, int argc OVS_UNUSED,
 > > +               const char *argv[] OVS_UNUSED, void *idl_reset_)
 > > +{
 > > +    bool *idl_reset = idl_reset_;
 > > +
 > > +    *idl_reset = true;
 > > +    poll_immediate_wake();
 > > +    unixctl_command_reply(conn, NULL);
 > > +}
 > > diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
 > > index 1f817423a..ddb296aaa 100644
 > > --- a/northd/ovn-northd.8.xml
 > > +++ b/northd/ovn-northd.8.xml
 > > @@ -96,6 +96,34 @@
> >           acquired OVSDB lock on SB DB, "standby" if it has not or "paused" if
 > >           this instance is paused.
 > >         </dd>
 > > +
 > > +      <dt><code>sb-cluster-state-reset</code></dt>
 > > +      <dd>
 > > +      <p>
> > +        Reset southbound database cluster status when databases are destroyed
 > > +        and rebuilt.
 > > +      </p>
 > > +      <p>
> > +        If all databases in a clustered southbound database are removed from > > +        disk, then the stored index of all databases will be reset to zero. > > +        This will cause ovn-northd to be unable to read or write to the > > +        southbound database, because it will always detect the data as stale. > > +        In such a case, run this command so that ovn-northd will reset its > > +        local index so that it can interact with the southbound database again.
 > > +      </p>
 > > +      </dd>
 > > +
 > > +      <dt><code>nb-cluster-state-reset</code></dt>
 > > +      <dd>
 > > +      <p>
> > +        Reset northbound database cluster status when databases are destroyed
 > > +        and rebuilt.
 > > +      </p>
 > > +      <p>
> > +        This performs the same task as <code>sb-cluster-state-reset</code>
 > > +        except for the northbound database client.
 > > +      </p>
 > > +      </dd>
 > >         </dl>
 > >       </p>
 > >
 > > diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
 > > index 742aad85e..3714488bb 100644
 > > --- a/northd/ovn-northd.c
 > > +++ b/northd/ovn-northd.c
 > > @@ -56,6 +56,7 @@ static unixctl_cb_func ovn_northd_pause;
 > >   static unixctl_cb_func ovn_northd_resume;
 > >   static unixctl_cb_func ovn_northd_is_paused;
 > >   static unixctl_cb_func ovn_northd_status;
 > > +static unixctl_cb_func cluster_state_reset_cmd;
 > >
 > >   struct northd_context {
 > >       struct ovsdb_idl *ovnnb_idl;
 > > @@ -11783,6 +11784,16 @@ main(int argc, char *argv[])
 > >                                &state);
> >       unixctl_command_register("status", "", 0, 0, ovn_northd_status, &state);
 > >
 > > +    bool reset_ovnsb_idl_min_index = false;
 > > +    unixctl_command_register("sb-cluster-state-reset", "", 0, 0,
 > > +                             cluster_state_reset_cmd,
 > > +                             &reset_ovnsb_idl_min_index);
 > > +
 > > +    bool reset_ovnnb_idl_min_index = false;
 > > +    unixctl_command_register("nb-cluster-state-reset", "", 0, 0,
 > > +                             cluster_state_reset_cmd,
 > > +                             &reset_ovnnb_idl_min_index);
 > > +
 > >       daemonize_complete();
 > >
 > >       /* We want to detect (almost) all changes to the ovn-nb db. */
 > > @@ -12068,6 +12079,18 @@ main(int argc, char *argv[])
> >           ovsdb_idl_set_probe_interval(ovnnb_idl_loop.idl, northd_probe_interval); > >           ovsdb_idl_set_probe_interval(ovnsb_idl_loop.idl, northd_probe_interval);
 > >
 > > +        if (reset_ovnsb_idl_min_index) {
 > > +            VLOG_INFO("Resetting southbound database cluster state");
 > > +            ovsdb_idl_reset_min_index(ovnsb_idl_loop.idl);
 > > +            reset_ovnsb_idl_min_index = false;
 > > +        }
 > > +
 > > +        if (reset_ovnnb_idl_min_index) {
 > > +            VLOG_INFO("Resetting northbound database cluster state");
 > > +            ovsdb_idl_reset_min_index(ovnnb_idl_loop.idl);
 > > +            reset_ovnnb_idl_min_index = false;
 > > +        }
 > > +
 > >           poll_block();
 > >           if (should_service_stop()) {
 > >               exiting = true;
> > @@ -12146,3 +12169,14 @@ ovn_northd_status(struct unixctl_conn *conn, int argc OVS_UNUSED,
 > >       unixctl_command_reply(conn, ds_cstr(&s));
 > >       ds_destroy(&s);
 > >   }
 > > +
 > > +static void
> > +cluster_state_reset_cmd(struct unixctl_conn *conn, int argc OVS_UNUSED,
 > > +               const char *argv[] OVS_UNUSED, void *idl_reset_)
 > > +{
 > > +    bool *idl_reset = idl_reset_;
 > > +
 > > +    *idl_reset = true;
 > > +    poll_immediate_wake();
 > > +    unixctl_command_reply(conn, NULL);
 > > +}
 > >
 >
 > _______________________________________________
 > dev mailing list
 > [email protected] <mailto:[email protected]>
 > https://mail.openvswitch.org/mailman/listinfo/ovs-dev

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to