On 30/10/2024 15:50, Ilya Maximets wrote:
> There are cases where ipsec commands may fail to add new connections or
> remove the old ones. Unfortunately, this means that those connections
> may actually never be added or removed, since ovs-monitor-ipsec will
> not re-visit them, unless something else changes.
>
> Wake up the monitor periodically to check if something changed in the
> system or if some connections still need loading.
>
> This addresses two main use cases:
>
> 1. Connection failed to start for some reason and was not added
> to pluto or properly started. The logic will go over all the
> desired, loaded and active connections and make sure that
> any undesired connections are removed, non-loaded connections
> are loaded and non-active connections are brought UP.
>
> 2. If pluto re-starts it loads all the connections, but doesn't
> bring them up, because we're using route (ondemand) activation
> strategy. This change in this commit will notice all the
> loaded but not active connections and will bring them up.
> This helps avoiding packet drops on first packets until the
> connection activates.
>
> Choosing 15 seconds as an interval to wake up to give pluto some
> breathing room, i.e. a chance to activate the connections properly
> before we start poking them. And also if pluto is down, 15 second
> interval will create less spam in the logs.
>
> StrongSwan doesn't need such a logic, because it supports a single
> command 'ipsec update' that re-loads the config as a whole and
> figures out what configuration changes are needed. But since we're
> starting all the connections separately with Libreswan, we have to
> keep track and reconcile manually.
>
> Some more details of the logic are in the comments in the code.
>
> Signed-off-by: Ilya Maximets <[email protected]>
> ---
> ipsec/ovs-monitor-ipsec.in | 185 ++++++++++++++++++++++++-------------
> 1 file changed, 123 insertions(+), 62 deletions(-)
>
> diff --git a/ipsec/ovs-monitor-ipsec.in b/ipsec/ovs-monitor-ipsec.in
> index 14004f924..0ac6297bb 100755
> --- a/ipsec/ovs-monitor-ipsec.in
> +++ b/ipsec/ovs-monitor-ipsec.in
> @@ -20,6 +20,7 @@ import os
> import re
> import subprocess
> import sys
> +import time
> from string import Template
>
> import ovs.daemon
> @@ -82,6 +83,7 @@ vlog = ovs.vlog.Vlog("ovs-monitor-ipsec")
> exiting = False
> monitor = None
> xfrm = None
> +RECONCILIATION_INTERVAL = 15 # seconds
>
>
> def run_command(args, description=None):
> @@ -295,6 +297,9 @@ conn prevent_unencrypted_vxlan
>
> return conns
>
> + def need_to_reconcile(self, monitor):
> + return False
> +
> def config_init(self):
> self.conf_file = open(self.IPSEC_CONF, "w")
> self.secrets_file = open(self.IPSEC_SECRETS, "w")
> @@ -511,6 +516,7 @@ conn prevent_unencrypted_vxlan
> self.IPSEC_D = "sql:" + libreswan_root_prefix + ipsec_d
> self.IPSEC_CTL = libreswan_root_prefix + ipsec_ctl
> self.conf_file = None
> + self.last_refresh = time.time()
> self.secrets_file = None
> vlog.dbg("Using: " + self.IPSEC)
> vlog.dbg("Configuration file: " + self.IPSEC_CONF)
> @@ -622,51 +628,50 @@ conn prevent_unencrypted_vxlan
> "--config", self.IPSEC_CONF,
> "--rereadsecrets"],
> "re-read secrets")
> - tunnels = set(monitor.tunnels.keys())
> -
> - # Delete old connections
> - conns_dict = self.get_active_conns()
> - for ifname, conns in conns_dict.items():
> - tunnel = monitor.tunnels.get(ifname)
> -
> - for conn in conns:
> - # IPsec "connection" names must start with Interface name
> - if not conn.startswith(ifname):
> - vlog.err("%s does not start with %s" % (conn, ifname))
> - continue
> -
> - # version number should be the first integer after
> - # interface name in IPsec "connection"
> - try:
> - ver = int(re.findall(r'\d+', conn[len(ifname):])[0])
> - except ValueError:
> - vlog.err("%s does not contain version number")
> - continue
> - except IndexError:
> - vlog.err("%s does not contain version number")
> - continue
>
> - if not tunnel or tunnel.version != ver:
> - vlog.info("%s is outdated %u" % (conn, ver))
> - run_command(self.IPSEC_AUTO +
> - ["--ctlsocket", self.IPSEC_CTL,
> - "--config", self.IPSEC_CONF,
> - "--delete", conn], "delete %s" % conn)
> - elif ifname in tunnels:
> - tunnels.remove(ifname)
> -
> - # Activate new connections
> - for name in tunnels:
> - ver = monitor.tunnels[name].version
> -
> - if monitor.tunnels[name].conf["tunnel_type"] == "gre":
> - conn = "%s-%s" % (name, ver)
> - self._start_ipsec_connection(conn)
> + loaded_conns = self.get_loaded_conns()
> + active_conns = self.get_active_conns()
> +
> + all_names = set(monitor.tunnels.keys()) | \
> + set(loaded_conns.keys()) | \
> + set(active_conns.keys())
> +
> + for name in all_names:
> + desired = set(self.get_conn_names(monitor, name))
> + loaded = set(loaded_conns.get(name, dict()).keys())
> + active = set(active_conns.get(name, dict()).keys())
> +
> + # Remove all the loaded or active but not desired connections.
> + for conn in loaded | active:
> + if conn not in desired:
> + self._delete_ipsec_connection(conn, "is outdated")
> + loaded.discard(conn)
> + active.discard(conn)
> +
> + # If not all desired are loaded, remove all the loaded and
> + # active for this tunnel and re-load only the desired ones.
> + # Need to do that, because connections for the same tunnel
> + # may share SAs. If one is loaded and the other is not,
> + # it means the second one failed, so the shared SA may be in
> + # a broken state.
> + if desired != loaded:
> + for conn in loaded | active:
> + self._delete_ipsec_connection(conn, "is half-loaded")
> + loaded.discard(conn)
> + active.discard(conn)
> +
> + for conn in desired:
> + vlog.info("Starting ipsec connection %s" % conn)
> + self._start_ipsec_connection(conn, "start")
> else:
> - conn_in = "%s-in-%s" % (name, ver)
> - conn_out = "%s-out-%s" % (name, ver)
> - self._start_ipsec_connection(conn_in)
> - self._start_ipsec_connection(conn_out)
> + # Ask pluto to bring UP connections that are loaded,
> + # but not active for some reason.
> + #
> + # desired == loaded and desired >= loaded + active,
> + # so loaded >= active
> + for conn in loaded - active:
> + vlog.info("Bringing up ipsec connection %s" % conn)
> + self._start_ipsec_connection(conn, "up")
>
> # Update shunt policy if changed
> if monitor.conf_in_use["skb_mark"] != monitor.conf["skb_mark"]:
> @@ -713,23 +718,27 @@ conn prevent_unencrypted_vxlan
> "--delete",
> "--asynchronous", "prevent_unencrypted_vxlan"])
> monitor.conf_in_use["skb_mark"] = monitor.conf["skb_mark"]
> + self.last_refresh = time.time()
> + vlog.info("Refreshing is done.")
>
> - def get_active_conns(self):
> + def get_conns_from_status(self, pattern):
> """This function parses output from 'ipsec status' command.
> It returns dictionary where <key> is interface name (as in OVSDB)
> and <value> is another dictionary. This another dictionary
> uses LibreSwan connection name as <key> and more detailed
> - sample line from the parsed outpus as <value>. """
> + sample line from the parsed outpus as <value>. 'pattern' should
> + be a regular expression that parses out the connection name.
> + Only the lines that match the pattern will be parsed. """
>
> conns = {}
> ret, pout, perr = run_command([self.IPSEC, 'status',
> '--ctlsocket', self.IPSEC_CTL],
> - "get active connections")
> + "get ipsec status")
> if ret:
> return conns
>
> for line in pout.splitlines():
> - m = re.search(r"#\d+: .*\"(.*)\".*", line)
> + m = re.search(pattern, line)
> if not m:
> continue
>
> @@ -748,25 +757,76 @@ conn prevent_unencrypted_vxlan
>
> return conns
>
> - def _start_ipsec_connection(self, conn):
> - # In a corner case, LibreSwan daemon restarts for some reason and
> - # the "ipsec auto --start" command is lost. Just retry to make sure
> - # the command is received by LibreSwan.
> - while True:
> - ret, pout, perr = run_command(self.IPSEC_AUTO +
> - ["--config", self.IPSEC_CONF,
> - "--ctlsocket", self.IPSEC_CTL,
> - "--start",
> - "--asynchronous", conn],
> - "start %s" % conn)
> - if not re.match(r".*Connection refused.*", perr) and \
> - not re.match(r".*need --listen.*", pout):
> - break
> + def get_active_conns(self):
> + return self.get_conns_from_status(r"#\d+: .*\"(.*)\".*")
> +
> + def get_loaded_conns(self):
> + return self.get_conns_from_status(r"\"(.*)\": \d+.*(===|\.\.\.).*")
> +
> + def get_conn_names(self, monitor, ifname):
> + conns = []
> + if ifname not in monitor.tunnels:
> + return conns
> +
> + tunnel = monitor.tunnels.get(ifname)
> + ver = tunnel.version
> +
> + if tunnel.conf["tunnel_type"] == "gre":
> + conns.append("%s-%s" % (ifname, ver))
> + else:
> + conns.append("%s-in-%s" % (ifname, ver))
> + conns.append("%s-out-%s" % (ifname, ver))
> +
> + return conns
> +
> + def need_to_reconcile(self, monitor):
> + if time.time() - self.last_refresh < RECONCILIATION_INTERVAL:
> + return False
> +
> + conns_dict = self.get_active_conns()
> + for ifname, tunnel in monitor.tunnels.items():
> + if ifname not in conns_dict:
> + vlog.info("Connection for port %s is not active, "
> + "need to reconcile" % ifname)
> + return True
> +
> + existing_conns = conns_dict.get(ifname)
> + desired_conns = self.get_conn_names(monitor, ifname)
> +
> + if set(existing_conns.keys()) != set(desired_conns):
> + vlog.info("Active connections for port %s %s do not match "
> + "desired %s, need to reconcile"
> + % (ifname, list(existing_conns.keys()),
> + desired_conns))
> + return True
> +
> + return False
> +
> + def _delete_ipsec_connection(self, conn, reason):
> + vlog.info("%s %s, removing" % (conn, reason))
> + run_command(self.IPSEC_AUTO +
> + ["--ctlsocket", self.IPSEC_CTL,
> + "--config", self.IPSEC_CONF,
> + "--delete", conn], "delete %s" % conn)
> +
> + def _start_ipsec_connection(self, conn, action):
> + ret, pout, perr = run_command(self.IPSEC_AUTO +
> + ["--config", self.IPSEC_CONF,
> + "--ctlsocket", self.IPSEC_CTL,
> + "--" + action,
> + "--asynchronous", conn],
> + "%s %s" % (action, conn))
>
> if re.match(r".*[F|f]ailed to initiate connection.*", pout):
> vlog.err('Failed to initiate connection through'
> ' Interface %s.\n' % (conn.split('-')[0]))
> vlog.err("stdout: %s" % pout)
> + ret = 1
> +
> + if ret:
> + # We don't know in which state the connection was left on
> + # failure. Try to clean it up.
> + self._delete_ipsec_connection(conn, "--%s failed" % action)
>
> def _nss_clear_database(self):
> """Remove all OVS IPsec related state from the NSS database"""
> @@ -1192,7 +1252,7 @@ class IPsecMonitor(object):
> self.ike_helper.clear_tunnel_state(self.tunnels[name])
> del self.tunnels[name]
>
> - if needs_refresh:
> + if needs_refresh or self.ike_helper.need_to_reconcile(self):
> self.ike_helper.refresh(self)
>
> def _get_cn_from_cert(self, cert):
> @@ -1365,6 +1425,7 @@ def main():
> poller = ovs.poller.Poller()
> unixctl_server.wait(poller)
> idl.wait(poller)
> + poller.timer_wait(RECONCILIATION_INTERVAL * 1000)
> poller.block()
>
> unixctl_server.close()
Acked-by: Roi Dayan <[email protected]>
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev