Currently we're only tracking the last refresh time and perform reconciliation of non-active connections on every refresh. This is causing issues in large clusters when tunnels are added sequentially. Consider the following example:
1. Tun-1 added -> refresh() -> Tun-1: adding 'in' and starting 'out'. 2. Tun-2 added -> refresh() -> Tun-2: adding 'in' and starting 'out'. -> Tun-1: The other side didn't have time to initiate the 'in' connection yet, so it is not active. But we see that it's not active and trying to start it. 3. Tun-3 added -> refresh() -> Tun-3: adding 'in' and starting 'out'. -> Tun-2: The other side didn't have time to initiate the 'in' connection yet, so it is not active. But we see that it's not active and trying to start it. -> Tun-1: The connection still had no time to become active, but we declare it 'defunct' and re-creating. Behavior above is specific to Libreswan 4. Libreswan 5 will report UP connections as active in most cases, so they will not be marked as defunct, but they will still be started quickly after addition when it is not needed. This creates unnecessary churn in the cluster and puts Libreswan into an uncomfortable position where crossing stream issues (where both sides are trying to establish the same connection at the same time) are far more likely. Fix that by specifically tracking time between reconciliation attempts instead of just the last time we refreshed for any reason. This should make ovs-monitor-ipsec to actually wait for the reconciliation interval before attempting to repair connections and give Libreswan a decent amount of time to process the changes and try to establish connections normally. Fixes: 25a301822e0d ("ipsec: libreswan: Reconcile missing connections periodically.") Reported-at: https://issues.redhat.com/browse/FDP-1364 Signed-off-by: Ilya Maximets <i.maxim...@ovn.org> --- ipsec/ovs-monitor-ipsec.in | 27 +++++++---- tests/system-ipsec.at | 94 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 9 deletions(-) diff --git a/ipsec/ovs-monitor-ipsec.in b/ipsec/ovs-monitor-ipsec.in index 7309bdf3c..53111a7c8 100755 --- a/ipsec/ovs-monitor-ipsec.in +++ b/ipsec/ovs-monitor-ipsec.in @@ -515,7 +515,7 @@ conn prevent_unencrypted_vxlan self.IPSEC_CTL = libreswan_root_prefix + ipsec_ctl self.conf_file = None self.conns_not_active = set() - self.last_refresh = time.time() + self.last_reconciliation = time.time() self.secrets_file = None self.use_default_conn = self.IPSEC_CONF == self.ROOT_IPSEC_CONF self.use_default_crypto = args.use_default_crypto @@ -656,6 +656,10 @@ conn prevent_unencrypted_vxlan set(loaded_conns.keys()) | \ set(active_conns.keys()) + reconcile = False + if time.time() - self.last_reconciliation >= RECONCILIATION_INTERVAL: + reconcile = True + for name in all_names: desired = set(self.get_conn_names(monitor, name)) loaded = set(loaded_conns.get(name, dict()).keys()) @@ -663,11 +667,13 @@ conn prevent_unencrypted_vxlan # Untrack connections that became active. self.conns_not_active.difference_update(active) - # Remove connections that didn't become active after --start - # and another explicit --up. - for conn in self.conns_not_active & loaded: - self._delete_ipsec_connection(conn, "is defunct") - loaded.remove(conn) + + if reconcile: + # Remove connections that didn't become active after --start + # and another explicit --up. + for conn in self.conns_not_active & loaded: + self._delete_ipsec_connection(conn, "is defunct") + loaded.remove(conn) # Remove all the loaded or active but not desired connections. for conn in loaded | active: @@ -699,7 +705,7 @@ conn prevent_unencrypted_vxlan else: vlog.info("Starting ipsec connection %s" % conn) self._start_ipsec_connection(conn, "start") - else: + elif reconcile: # Ask pluto to bring UP connections that are loaded, # but not active for some reason. # @@ -746,7 +752,10 @@ conn prevent_unencrypted_vxlan "--delete", "--asynchronous", "prevent_unencrypted_vxlan"]) monitor.conf_in_use["skb_mark"] = monitor.conf["skb_mark"] - self.last_refresh = time.time() + + if reconcile: + self.last_reconciliation = time.time() + vlog.info("Refreshing is done.") def get_conns_from_status(self, pattern): @@ -809,7 +818,7 @@ conn prevent_unencrypted_vxlan return conns def need_to_reconcile(self, monitor): - if time.time() - self.last_refresh < RECONCILIATION_INTERVAL: + if time.time() - self.last_reconciliation < RECONCILIATION_INTERVAL: return False conns_dict = self.get_active_conns() diff --git a/tests/system-ipsec.at b/tests/system-ipsec.at index de3ecd361..5828ad437 100644 --- a/tests/system-ipsec.at +++ b/tests/system-ipsec.at @@ -535,6 +535,100 @@ AT_CHECK([grep -q -E "(ike|ikev2|esp)=" $ovs_base/right/custom.conf], [1]) OVS_TRAFFIC_VSWITCHD_STOP() AT_CLEANUP +AT_SETUP([IPsec -- Libreswan - reconciliation interval is respected - ipv4]) +AT_KEYWORDS([ipsec libreswan ipv4 geneve psk reconciliation]) +dnl Note: Geneve test may not work on older kernels due to CVE-2020-25645 +dnl https://bugzilla.redhat.com/show_bug.cgi?id=1883988 + +CHECK_LIBRESWAN() +OVS_TRAFFIC_VSWITCHD_START() +IPSEC_SETUP_UNDERLAY() + +m4_define([PSK_OPTIONS], [options:remote_ip=$1 options:psk=swordfish]) + +dnl Set up only the left host. +IPSEC_ADD_NODE_LEFT([10.1.1.1], [10.1.1.254]) +IPSEC_ADD_TUNNEL_LEFT([geneve], PSK_OPTIONS([10.1.1.2])) +dnl Wait for the monitor to find the new connection. +OVS_WAIT_UNTIL([grep -q 'tun appeared' left/ovs-monitor-ipsec.log]) + +dnl Add a few more tunels waiting for the monitor to find them too. +m4_for([id], [1], 5, [1], [ + OVS_VSCTL([left], add-port br-ipsec tun-id \ + -- set Interface tun-id type=geneve PSK_OPTIONS(10.1.2.id)) + OVS_WAIT_UNTIL(grep -q 'tun-id appeared' left/ovs-monitor-ipsec.log) +]) +dnl And now remove all the extra tunnels. +m4_for([id], [1], 5, [1], [ + OVS_VSCTL([left], del-port tun-id) + OVS_WAIT_UNTIL(grep -q 'tun-id disappeared' left/ovs-monitor-ipsec.log) +]) + +dnl Check that none of the connections were marked for reconciliation yet. +dnl It should take at least 15 seconds for this to happen. +AT_CHECK([grep -E 'half-loaded|defunct' left/ovs-monitor-ipsec.log], [1]) +AT_CHECK([grep 'Bringing up' left/ovs-monitor-ipsec.log], [1]) +dnl But we should eventually attempt to reconcile the original 'tun' tunnel. +OVS_WAIT_UNTIL([grep -q 'do not match desired' left/ovs-monitor-ipsec.log]) +OVS_WAIT_UNTIL([grep -qE 'Bringing up ipsec connection tun-[[io]]' \ + left/ovs-monitor-ipsec.log]) + +dnl Now add the right host and check that connection works properly. +IPSEC_ADD_NODE_RIGHT([10.1.1.2], [10.1.1.254]) +IPSEC_ADD_TUNNEL_RIGHT([geneve], PSK_OPTIONS([10.1.1.1])) + +CHECK_ESP_TRAFFIC + +OVS_TRAFFIC_VSWITCHD_STOP() +AT_CLEANUP + +AT_SETUP([IPsec -- Libreswan - reconciliation interval is respected - ipv6]) +AT_KEYWORDS([ipsec libreswan ipv6 geneve psk reconciliation]) +dnl Note: Geneve test may not work on older kernels due to CVE-2020-25645 +dnl https://bugzilla.redhat.com/show_bug.cgi?id=1883988 + +CHECK_LIBRESWAN() +OVS_TRAFFIC_VSWITCHD_START() +IPSEC_SETUP_UNDERLAY() + +m4_define([PSK_OPTIONS], [options:remote_ip=$1 options:psk=swordfish]) + +dnl Set up only the left host. +IPSEC_ADD_NODE_LEFT([fd01::101], [fd01::254]) +IPSEC_ADD_TUNNEL_LEFT([geneve], PSK_OPTIONS([fd01::102])) +dnl Wait for the monitor to find the new connection. +OVS_WAIT_UNTIL([grep -q 'tun appeared' left/ovs-monitor-ipsec.log]) + +dnl Add a few more tunels waiting for the monitor to find them too. +m4_for([id], [1], 5, [1], [ + OVS_VSCTL([left], add-port br-ipsec tun-id \ + -- set Interface tun-id type=geneve PSK_OPTIONS(fd02::id)) + OVS_WAIT_UNTIL(grep -q 'tun-id appeared' left/ovs-monitor-ipsec.log) +]) +dnl And now remove all the extra tunnels. +m4_for([id], [1], 5, [1], [ + OVS_VSCTL([left], del-port tun-id) + OVS_WAIT_UNTIL(grep -q 'tun-id disappeared' left/ovs-monitor-ipsec.log) +]) + +dnl Check that none of the connections were marked for reconciliation yet. +dnl It should take at least 15 seconds for this to happen. +AT_CHECK([grep -E 'half-loaded|defunct' left/ovs-monitor-ipsec.log], [1]) +AT_CHECK([grep 'Bringing up' left/ovs-monitor-ipsec.log], [1]) +dnl But we should eventually attempt to reconcile the original 'tun' tunnel. +OVS_WAIT_UNTIL([grep -q 'do not match desired' left/ovs-monitor-ipsec.log]) +OVS_WAIT_UNTIL([grep -qE 'Bringing up ipsec connection tun-[[io]]' \ + left/ovs-monitor-ipsec.log]) + +dnl Now add the right host and check that connection works properly. +IPSEC_ADD_NODE_RIGHT([fd01::102], [fd01::254]) +IPSEC_ADD_TUNNEL_RIGHT([geneve], PSK_OPTIONS([fd01::101])) + +CHECK_ESP_TRAFFIC + +OVS_TRAFFIC_VSWITCHD_STOP() +AT_CLEANUP + AT_SETUP([IPsec -- Libreswan - established conns survive new additions - ipv4]) AT_KEYWORDS([ipsec libreswan ipv4 geneve psk persistence]) dnl Note: Geneve test may not work on older kernels due to CVE-2020-25645 -- 2.49.0 _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev