If the user triggers a pause command to the ovn-controller the current implementation will wait for commands from unixctl server only and ignore the other component.
This implementation works fine if we don't have inactivity_probe set in the SB DataBase, but once the user sets the inactivity_probe in SB DataBase the connection will be dropped by the SBDB. Once the controller resumes the execution it will try to commit some changes to the SBDB but the transaction will fail since we lost the connection to the SBDB and the controller must reconnect before committing the transaction again. To avoid the above scenario the controller can keep handling unconditional IDL messages to avoid reconnecting to SB. Signed-off-by: Mohammad Heib <[email protected]> --- controller/ovn-controller.c | 16 +++++++++--- ovn-sb.xml | 2 +- tests/ovn-controller.at | 51 +++++++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 4 deletions(-) diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c index 856e5e270..d2c8f66d9 100644 --- a/controller/ovn-controller.c +++ b/controller/ovn-controller.c @@ -5534,12 +5534,22 @@ main(int argc, char *argv[]) simap_destroy(&usage); } - /* If we're paused just run the unixctl server and skip most of the - * processing loop. + /* If we're paused just run the unixctl-server/unconditional IDL and + * skip most of the processing loop. */ if (paused) { unixctl_server_run(unixctl); + int ovnsb_seq = ovsdb_idl_get_seqno(ovnsb_idl_loop.idl); + ovsdb_idl_run(ovnsb_idl_loop.idl); + int new_ovnsb_seq = ovsdb_idl_get_seqno(ovnsb_idl_loop.idl); + /* If the IDL content has changed while the controller is + * in pause state, trigger a recompute. + */ + if (new_ovnsb_seq != ovnsb_seq) { + engine_set_force_recompute(true); + } unixctl_server_wait(unixctl); + ovsdb_idl_wait(ovnsb_idl_loop.idl); goto loop_done; } @@ -6009,7 +6019,6 @@ main(int argc, char *argv[]) OVS_NOT_REACHED(); } - ovsdb_idl_track_clear(ovnsb_idl_loop.idl); ovsdb_idl_track_clear(ovs_idl_loop.idl); lflow_cache_run(ctrl_engine_ctx.lflow_cache); @@ -6017,6 +6026,7 @@ main(int argc, char *argv[]) loop_done: memory_wait(); + ovsdb_idl_track_clear(ovnsb_idl_loop.idl); poll_block(); if (should_service_stop()) { exit_args.exiting = true; diff --git a/ovn-sb.xml b/ovn-sb.xml index e393f92b3..43c13f23c 100644 --- a/ovn-sb.xml +++ b/ovn-sb.xml @@ -4308,7 +4308,7 @@ tcp.flags = RST; <column name="inactivity_probe"> Maximum number of milliseconds of idle time on connection to the client before sending an inactivity probe message. If Open vSwitch does not - communicate with the client for the specified number of seconds, it + communicate with the client for the specified number of milliseconds,it will send a probe. If a response is not received for the same additional amount of time, Open vSwitch assumes the connection has been broken and attempts to reconnect. Default is implementation-specific. diff --git a/tests/ovn-controller.at b/tests/ovn-controller.at index 9d2a37c72..04e4c52e7 100644 --- a/tests/ovn-controller.at +++ b/tests/ovn-controller.at @@ -352,6 +352,57 @@ OVS_APP_EXIT_AND_WAIT([ovsdb-server]) AT_CLEANUP ]) +# Check that the connection to the Southbound database +# is not dropped when probe-interval is set and the controller +# is in pause state. +OVN_FOR_EACH_NORTHD([ +AT_SETUP([ovn-controller - check sbdb connection while pause]) +AT_KEYWORDS([ovn]) +ovn_start + +net_add n1 +sim_add hv +as hv +ovs-vsctl add-br br-phys +ovn_attach n1 br-phys 192.168.0.1 + +ovs-vsctl -- add-port br-int hv1-vif1 -- \ + set interface hv1-vif1 external-ids:iface-id=sw0-p1 \ + options:tx_pcap=hv1/vif1-tx.pcap \ + options:rxq_pcap=hv1/vif1-rx.pcap \ + ofport-request=1 + +ovs-vsctl set open . external_ids:ovn-remote-probe-interval=100000 +ovn-sbctl set connection . inactivity_probe=1 + +ovn-nbctl ls-add sw0 +check ovn-nbctl lsp-add sw0 sw0-p1 +check ovn-nbctl lsp-set-addresses sw0-p1 "50:54:00:00:00:03 10.0.0.3" +check ovn-nbctl lsp-set-port-security sw0-p1 "50:54:00:00:00:03 10.0.0.3" +ovn-nbctl --wait=hv sync + +sleep_controller hv +# Trigger DB change to make SBDB connect to controller. +check ovn-nbctl lsp-del sw0-p1 + +# wait for 2 sec to give enough time to the SBDB to drop the connection +# if there is no answer from the controller. The connection should not +# be dropped since we keep handle the idl messages from SBDB even if we +# in pause state. +sleep 2 +wake_up_controller hv + +check ovn-nbctl lsp-add sw0 sw0-p1 +check ovn-nbctl lsp-set-addresses sw0-p1 "50:54:00:00:00:03 10.0.0.3" +check ovn-nbctl lsp-set-port-security sw0-p1 "50:54:00:00:00:03 10.0.0.3" +ovn-nbctl --wait=hv sync + +OVS_WAIT_UNTIL([test 0 = `cat hv/ovn-controller.log| grep -c "OVNSB commit failed, force recompute next time"`]) +OVS_WAIT_UNTIL([test 0 = `cat hv/ovn-controller.log| grep -c "connection closed by peer"`]) +OVN_CLEANUP([hv]) +AT_CLEANUP +]) + # Checks that ovn-controller recreates its chassis record when deleted externally. OVN_FOR_EACH_NORTHD([ AT_SETUP([ovn-controller - Chassis self record]) -- 2.34.3 _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
