On 01/11/2024 3:23, Ilya Maximets wrote:
> Expand the NxN test with the network connectivity check between all the
> nodes. Unfortunately, we can't really run this test with Libreswan 4.x,
> since, due to internal issues in these versions, we are getting into
> states where everything is loaded and active, but no traffic can pass.
> This is an internal issue in Libreswan that we can't work around from
> the outside. So, the fix is required in Libreswan itself. 4.5 and
> earlier versions seem to not be affected by this problem, at least not
> severely affected, but it's easier to just cut off all the 4.x versions
> from the test.
>
> Version 3.32 from Ubuntu 22.04 and Libreswan 5.1 work just fine with
> this test.
>
> The test is relatively long, but it is very valuable, IMO. Besides
> stressing ovs-monitor-ipsec with various failure and asynchronous
> connection establishment conditions, which are important for OVS, it
> also was used to reproduce and fix several bugs in Libreswan 4.x.
> Unfortunately, not all the issues are understood and fixed yet.
>
> Signed-off-by: Ilya Maximets <[email protected]>
> ---
> tests/system-ipsec.at | 84 ++++++++++++++++++++++++++++++++++++++-----
> 1 file changed, 76 insertions(+), 8 deletions(-)
>
> diff --git a/tests/system-ipsec.at b/tests/system-ipsec.at
> index de459804b..4ab384d89 100644
> --- a/tests/system-ipsec.at
> +++ b/tests/system-ipsec.at
> @@ -71,7 +71,9 @@ m4_define([IPSEC_ADD_NODE],
> on_exit "kill `cat $ovs_base/$1/ovs-monitor-ipsec.pid`"
>
> dnl Set up OVS bridge
> - NS_EXEC([$1], [ovs-vsctl --db unix:$ovs_base/$1/db.sock add-br br-ipsec])]
> + NS_CHECK_EXEC([$1],
> + [ovs-vsctl --db unix:$ovs_base/$1/db.sock add-br br-ipsec \
> + -- set-controller br-ipsec punix:$ovs_base/br-ipsec.$1.mgmt])]
> )
> m4_define([IPSEC_ADD_NODE_LEFT], [IPSEC_ADD_NODE(left, p0, $1, $2)])
> m4_define([IPSEC_ADD_NODE_RIGHT], [IPSEC_ADD_NODE(right, p1, $1, $2)])
> @@ -429,7 +431,8 @@ m4_for([id], [1], NODES, [1], [
> self-sign node-id], [0], [stdout])
> AT_CHECK(OVS_VSCTL([node-id], set Open_vSwitch . \
> other_config:certificate=${ovs_base}/node-id-cert.pem \
> - other_config:private_key=${ovs_base}/node-id-privkey.pem),
> + other_config:private_key=${ovs_base}/node-id-privkey.pem \
> + -- set bridge br-ipsec other-config:hwaddr=f2:ff:00:00:00:id),
> [0], [ignore], [ignore])
> on_exit "ipsec --rundir $ovs_base/node-id status > $ovs_base/node-id/status"
> ])
> @@ -445,11 +448,18 @@ m4_for([LEFT], [1], NODES, [1], [
> fi
> ])])
>
> +dnl These are not necessary, but nice to have in the test log in
> +dnl order to spot pluto failures during the test.
> +on_exit "grep -E 'Timed out|outdated|half-loaded|defunct' \
> + $ovs_base/node-*/ovs-monitor-ipsec.log"
> +on_exit "grep -E 'ABORT|ERROR' $ovs_base/node-*/pluto.log"
> +
> m4_define([WAIT_FOR_LOADED_CONNS], [
> m4_for([id], [1], NODES, [1], [
> echo "================== node-id ========================="
> iterations=0
> loaded=0
> + active=0
> dnl Using a custom loop instead of OVS_WAIT_UNTIL, because it may take
> dnl much longer than a default timeout. The default retransmit timeout
> dnl for pluto is 60 seconds. Also, we need to make sure pluto didn't
> @@ -463,8 +473,11 @@ m4_define([WAIT_FOR_LOADED_CONNS], [
> START_PLUTO([node-id])
> else
> loaded=$(IPSEC_STATUS_LOADED(node-id))
> + m4_if([$1], [active],
> + [active=$(IPSEC_STATUS_ACTIVE(node-id))], [active=$loaded])
> fi
> - if test "$loaded" -ne $(( (NODES - 1) * 2 )); then
> + if test "$loaded" -ne "$(( (NODES - 1) * 2 ))" -o \
> + "$loaded" -ne "$active"; then
> sleep 3
> else
> break
> @@ -505,11 +518,66 @@ OVS_WAIT_UNTIL([grep -q 'tun-2.*need to reconcile' \
> dnl Wait for all the connections to be loaded back.
> WAIT_FOR_LOADED_CONNS()
>
> -dnl These are not necessary, but nice to have in the test log in
> -dnl order to spot pluto failures during the test.
> -grep -E 'Timed out|outdated|half-loaded|defunct' \
> - $ovs_base/node-*/ovs-monitor-ipsec.log
> -grep -E 'ABORT|ERROR' $ovs_base/node-*/pluto.log
> +dnl Next section will check connectivity between all the nodes.
> +dnl Different versions of Libreswan 4.x have issues where connections
> +dnl are not being correctly established or never become active in a
> +dnl way that can not be mitigated from ovs-monitor-ipsec or the test.
> +dnl So, only checking connectivity for Libreswan 3- or 5+.
> +dnl Skipping in the middle of the test, so test can still fail while
> +dnl testing with Libreswan 4, if the first half fails.
> +AT_SKIP_IF([ipsec --version 2>&1 | grep -q 'Libreswan 4\.'])
> +
> +dnl Turn off IPv6 and add static ARP entries for all namespaces to avoid
> +dnl any broadcast / multicast traffic that would otherwise be multiplied
> +dnl by each node creating a traffic storm. Add specific OpenFlow rules
> +dnl to forward traffic to exact destinations without any MAC learning.
> +m4_for([LEFT], [1], NODES, [1], [
> + NS_CHECK_EXEC([node-LEFT], [sysctl -w net.ipv6.conf.all.disable_ipv6=1],
> + [0], [ignore])
> + AT_CHECK([ovs-ofctl del-flows unix:$ovs_base/br-ipsec.node-LEFT.mgmt])
> + AT_CHECK([ovs-ofctl add-flow unix:$ovs_base/br-ipsec.node-LEFT.mgmt \
> + "dl_dst=f2:ff:00:00:00:LEFT actions=LOCAL"])
> + m4_for([RIGHT], [1], NODES, [1], [
> + if test LEFT -ne RIGHT; then
> + NS_CHECK_EXEC([node-LEFT],
> + [ip neigh add 192.0.0.RIGHT lladdr f2:ff:00:00:00:RIGHT dev br-ipsec])
> + AT_CHECK([ovs-ofctl add-flow unix:$ovs_base/br-ipsec.node-LEFT.mgmt \
> + "dl_dst=f2:ff:00:00:00:RIGHT actions=tun-RIGHT"])
> + fi
> + ])
> +])
> +
> +dnl Bring up and add IP addresses for br-ipsec interface.
> +m4_for([id], [1], NODES, [1], [
> + echo "================== node-id ========================="
> + NS_CHECK_EXEC([node-id], [ip addr add 192.0.0.id/24 dev br-ipsec])
> + NS_CHECK_EXEC([node-id], [ip link set dev br-ipsec up])
> +])
> +
> +dnl Wait for all the connections to be loaded and active. In case one of
> +dnl the pluto processes crashed some of the connections may never become
> +dnl active. But we did run this loop with a pluto reviving logic twice
> +dnl already, so the chances for pluto to be down here are much lower.
> +WAIT_FOR_LOADED_CONNS([active])
> +
> +dnl Check the full mesh ping.
> +m4_for([LEFT], [1], NODES, [1], [
> + m4_for([RIGHT], [1], NODES, [1], [
> + if test LEFT -ne RIGHT; then
> + echo "====== ping: node-LEFT --> node-RIGHT =========="
> + dnl Ping without checking in case connection will recover after the
> + dnl first packet.
> + NS_CHECK_EXEC([node-LEFT],
> + [ping -q -c 1 -W 2 192.0.0.RIGHT | FORMAT_PING],
> + [ignore], [stdout])
> + dnl Now check. If this one fails, there is no actual connectivity.
> + NS_CHECK_EXEC([node-LEFT],
> + [ping -q -c 3 -i 0.1 -W 2 192.0.0.RIGHT | FORMAT_PING],
> + [0], [dnl
> +3 packets transmitted, 3 received, 0% packet loss, time 0ms
> +])
> + fi
> +])])
>
> OVS_TRAFFIC_VSWITCHD_STOP()
> AT_CLEANUP
Acked-by: Roi Dayan <[email protected]>
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev