Introduce specif flows for E/W ICMPv{4,6} packets if tunnelled packets do not fit path MTU. This patch enable PMTUD for East/West Geneve traffic.
Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2241711 Tested-by: Jaime Ruiz <jcaam...@redhat.com> Signed-off-by: Lorenzo Bianconi <lorenzo.bianc...@redhat.com> --- Changes since v1: - add fix for vxlan and stt tunnels --- NEWS | 1 + controller/physical.c | 45 +++++++++++++++++++++++++++ northd/northd.c | 24 ++++++++++++++ northd/ovn-northd.8.xml | 8 +++++ tests/multinode.at | 69 +++++++++++++++++++++++++++++++++++++++++ tests/ovn-northd.at | 6 ++++ 6 files changed, 153 insertions(+) diff --git a/NEWS b/NEWS index e10fb79dd..acb3b854f 100644 --- a/NEWS +++ b/NEWS @@ -9,6 +9,7 @@ Post v23.09.0 connection method and doesn't require additional probing. external_ids:ovn-openflow-probe-interval configuration option for ovn-controller no longer matters and is ignored. + - Enable PMTU discovery on geneve tunnels for E/W traffic. OVN v23.09.0 - 15 Sep 2023 -------------------------- diff --git a/controller/physical.c b/controller/physical.c index ba88e1d8b..ce3b88d16 100644 --- a/controller/physical.c +++ b/controller/physical.c @@ -2446,6 +2446,51 @@ physical_run(struct physical_ctx *p_ctx, &ofpacts, hc_uuid); } + /* Add specif flows for E/W ICMPv{4,6} packets if tunnelled packets do not + * fit path MTU. + */ + HMAP_FOR_EACH (tun, hmap_node, p_ctx->chassis_tunnels) { + ofpbuf_clear(&ofpacts); + if (tun->type == GENEVE) { + put_move(MFF_TUN_ID, 0, MFF_LOG_DATAPATH, 0, 24, &ofpacts); + put_move(p_ctx->mff_ovn_geneve, 16, MFF_LOG_OUTPORT, 0, 16, + &ofpacts); + put_move(p_ctx->mff_ovn_geneve, 0, MFF_LOG_INPORT, 0, 15, + &ofpacts); + } else if (tun->type == STT) { + put_move(MFF_TUN_ID, 0, MFF_LOG_DATAPATH, 0, 24, &ofpacts); + put_move(MFF_TUN_ID, 40, MFF_LOG_OUTPORT, 0, 16, &ofpacts); + put_move(MFF_TUN_ID, 24, MFF_LOG_INPORT, 0, 15, &ofpacts); + } else if (tun->type == VXLAN) { + put_move(MFF_TUN_ID, 0, MFF_LOG_DATAPATH, 0, 12, &ofpacts); + put_move(MFF_TUN_ID, 12, MFF_LOG_INPORT, 0, 12, &ofpacts); + } else { + OVS_NOT_REACHED(); + } + put_resubmit(OFTABLE_LOG_INGRESS_PIPELINE, &ofpacts); + + /* IPv4 */ + struct match match = MATCH_CATCHALL_INITIALIZER; + match_set_in_port(&match, tun->ofport); + match_set_dl_type(&match, htons(ETH_TYPE_IP)); + match_set_nw_proto(&match, IPPROTO_ICMP); + match_set_icmp_type(&match, 3); + match_set_icmp_code(&match, 4); + + ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 120, 0, &match, + &ofpacts, hc_uuid); + /* IPv6 */ + match_init_catchall(&match); + match_set_in_port(&match, tun->ofport); + match_set_dl_type(&match, htons(ETH_TYPE_IPV6)); + match_set_nw_proto(&match, IPPROTO_ICMPV6); + match_set_icmp_type(&match, 2); + match_set_icmp_code(&match, 0); + + ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 120, 0, &match, + &ofpacts, hc_uuid); + } + /* Add VXLAN specific rules to transform port keys * from 12 bits to 16 bits used elsewhere. */ HMAP_FOR_EACH (tun, hmap_node, p_ctx->chassis_tunnels) { diff --git a/northd/northd.c b/northd/northd.c index 29245d633..07dffb15a 100644 --- a/northd/northd.c +++ b/northd/northd.c @@ -12765,6 +12765,28 @@ build_lrouter_force_snat_flows(struct hmap *lflows, struct ovn_datapath *od, ds_destroy(&actions); } +/* Following flows are used to manage traffic redirected by the kernel + * (e.g. ICMP errors packets) that enter the cluster from the geneve ports + */ +static void +build_lrouter_redirected_traffic_flows( + struct ovn_port *op, struct hmap *lflows, + struct ds *match, struct ds *actions) +{ + ovs_assert(op->nbrp); + if (!is_l3dgw_port(op)) { + return; + } + + ds_clear(match); + ds_put_format(match, "inport == %s && !is_chassis_resident(%s)", + op->cr_port->json_key, op->cr_port->json_key); + ds_clear(actions); + ds_put_format(actions, "inport = %s; next;", op->json_key); + ovn_lflow_add(lflows, op->od, S_ROUTER_IN_ADMISSION, 120, + ds_cstr(match), ds_cstr(actions)); +} + static void build_lrouter_force_snat_flows_op(struct ovn_port *op, struct hmap *lflows, @@ -16168,6 +16190,8 @@ build_lswitch_and_lrouter_iterate_by_lrp(struct ovn_port *op, &lsi->match, &lsi->actions, lsi->meter_groups); build_lrouter_force_snat_flows_op(op, lsi->lflows, &lsi->match, &lsi->actions); + build_lrouter_redirected_traffic_flows(op, lsi->lflows, &lsi->match, + &lsi->actions); } static void * diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml index 97718821f..972854684 100644 --- a/northd/ovn-northd.8.xml +++ b/northd/ovn-northd.8.xml @@ -2463,6 +2463,14 @@ output; (LBs, NAT). </p> + <p> + For each gateway port <var>GW</var> on a distributed logical router + a priority-120 flow that matches <code>inport == <var>cr-GW</var> + && !is_chassis_resident(<var>cr-GW</var>)</code> where + <var>cr-GW</var> is the chassis resident port of <var>GW</var>, + stores <var>GW</var> as inport. + </p> + <p> For a distributed logical router or for gateway router where the port is configured with <code>options:gateway_mtu</code> diff --git a/tests/multinode.at b/tests/multinode.at index 2b199b4bc..a051ea8c2 100644 --- a/tests/multinode.at +++ b/tests/multinode.at @@ -72,3 +72,72 @@ M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -q -c 3 -i 0.3 -w 2 20.0.0.3 | F ]) AT_CLEANUP + +AT_SETUP([ovn multinode geneve pmtu]) + +# Check that ovn-fake-multinode setup is up and running +check_fake_multinode_setup + +# Delete the multinode NB and OVS resources before starting the test. +cleanup_multinode_resources + +# Test East-West switching +check multinode_nbctl ls-add sw0 +check multinode_nbctl lsp-add sw0 sw0-port1 +check multinode_nbctl lsp-set-addresses sw0-port1 "50:54:00:00:00:03 10.0.0.3 1000::3" +check multinode_nbctl lsp-add sw0 sw0-port2 +check multinode_nbctl lsp-set-addresses sw0-port2 "50:54:00:00:00:04 10.0.0.4 1000::4" + +m_as ovn-chassis-1 /data/create_fake_vm.sh sw0-port1 sw0p1 50:54:00:00:00:03 10.0.0.3 24 10.0.0.1 1000::3/64 1000::a +m_as ovn-chassis-2 /data/create_fake_vm.sh sw0-port2 sw0p2 50:54:00:00:00:04 10.0.0.4 24 10.0.0.1 1000::4/64 1000::a + +m_wait_for_ports_up + +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -q -c 3 -i 0.3 -w 2 10.0.0.4 | FORMAT_PING], \ +[0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +# Create the second logical switch with one port +check multinode_nbctl ls-add sw1 +check multinode_nbctl lsp-add sw1 sw1-port1 +check multinode_nbctl lsp-set-addresses sw1-port1 "40:54:00:00:00:03 20.0.0.3 2000::3" + +# Create a logical router and attach both logical switches +check multinode_nbctl lr-add lr0 +check multinode_nbctl lrp-add lr0 lr0-sw0 00:00:00:00:ff:01 10.0.0.1/24 1000::a/64 +check multinode_nbctl lsp-add sw0 sw0-lr0 +check multinode_nbctl lsp-set-type sw0-lr0 router +check multinode_nbctl lsp-set-addresses sw0-lr0 router +check multinode_nbctl lsp-set-options sw0-lr0 router-port=lr0-sw0 + +check multinode_nbctl lrp-add lr0 lr0-sw1 00:00:00:00:ff:02 20.0.0.1/24 2000::a/64 +check multinode_nbctl lsp-add sw1 sw1-lr0 +check multinode_nbctl lsp-set-type sw1-lr0 router +check multinode_nbctl lsp-set-addresses sw1-lr0 router +check multinode_nbctl lsp-set-options sw1-lr0 router-port=lr0-sw1 + +m_as ovn-chassis-2 /data/create_fake_vm.sh sw1-port1 sw1p1 40:54:00:00:00:03 20.0.0.3 24 20.0.0.1 2000::4/64 1000::a + +m_wait_for_ports_up sw1-port1 + +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -q -c 3 -i 0.3 -w 2 20.0.0.3 | FORMAT_PING], \ +[0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +# Change ptmu for the geneve tunnel +m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1200 dev eth1 +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -c 5 -s 1300 -M do 20.0.0.3 2>&1 |grep -q "message too long, mtu=1142"]) + +check multinode_nbctl lrp-set-gateway-chassis lr0-sw0 ovn-chassis-1 10 +check multinode_nbctl lrp-set-gateway-chassis lr0-sw1 ovn-chassis-2 10 + +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route flush dev sw0p1]) +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add 10.0.0.0/24 dev sw0p1]) +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add default via 10.0.0.1 dev sw0p1]) + +m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1200 dev eth1 +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -c 5 -s 1300 -M do 20.0.0.3 2>&1 |grep -q "message too long, mtu=1142"]) + +AT_CLEANUP diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at index a8099786c..e7910e83c 100644 --- a/tests/ovn-northd.at +++ b/tests/ovn-northd.at @@ -6480,6 +6480,9 @@ AT_CAPTURE_FILE([lrflows]) # Check the flows in lr_in_admission stage AT_CHECK([grep lr_in_admission lrflows | grep cr-DR | sort], [0], [dnl + table=0 (lr_in_admission ), priority=120 , match=(inport == "cr-DR-S1" && !is_chassis_resident("cr-DR-S1")), action=(inport = "DR-S1"; next;) + table=0 (lr_in_admission ), priority=120 , match=(inport == "cr-DR-S2" && !is_chassis_resident("cr-DR-S2")), action=(inport = "DR-S2"; next;) + table=0 (lr_in_admission ), priority=120 , match=(inport == "cr-DR-S3" && !is_chassis_resident("cr-DR-S3")), action=(inport = "DR-S3"; next;) table=0 (lr_in_admission ), priority=50 , match=(eth.dst == 02:ac:10:01:00:01 && inport == "DR-S1" && is_chassis_resident("cr-DR-S1")), action=(xreg0[[0..47]] = 02:ac:10:01:00:01; next;) table=0 (lr_in_admission ), priority=50 , match=(eth.dst == 03:ac:10:01:00:01 && inport == "DR-S2" && is_chassis_resident("cr-DR-S2")), action=(xreg0[[0..47]] = 03:ac:10:01:00:01; next;) table=0 (lr_in_admission ), priority=50 , match=(eth.dst == 04:ac:10:01:00:01 && inport == "DR-S3" && is_chassis_resident("cr-DR-S3")), action=(xreg0[[0..47]] = 04:ac:10:01:00:01; next;) @@ -6539,6 +6542,7 @@ AT_CAPTURE_FILE([lrflows]) # Check the flows in lr_in_admission stage AT_CHECK([grep lr_in_admission lrflows | grep lrp1 | sed 's/table=../table=??/' | sort], [0], [dnl + table=??(lr_in_admission ), priority=120 , match=(inport == "cr-lrp1" && !is_chassis_resident("cr-lrp1")), action=(inport = "lrp1"; next;) table=??(lr_in_admission ), priority=50 , match=(eth.dst == 00:00:00:00:00:01 && inport == "lrp1" && is_chassis_resident("cr-lrp1")), action=(xreg0[[0..47]] = 00:00:00:00:00:01; next;) table=??(lr_in_admission ), priority=50 , match=(eth.mcast && inport == "lrp1"), action=(xreg0[[0..47]] = 00:00:00:00:00:01; next;) ]) @@ -6560,6 +6564,7 @@ AT_CAPTURE_FILE([lrflows]) # Check the flows in lr_in_admission stage AT_CHECK([grep lr_in_admission lrflows | grep lrp1 | sed 's/table=../table=??/' | sort], [0], [dnl + table=??(lr_in_admission ), priority=120 , match=(inport == "cr-lrp1" && !is_chassis_resident("cr-lrp1")), action=(inport = "lrp1"; next;) table=??(lr_in_admission ), priority=50 , match=(eth.dst == 00:00:00:00:00:01 && inport == "lrp1"), action=(xreg0[[0..47]] = 00:00:00:00:00:01; next;) table=??(lr_in_admission ), priority=50 , match=(eth.mcast && inport == "lrp1"), action=(xreg0[[0..47]] = 00:00:00:00:00:01; next;) ]) @@ -6578,6 +6583,7 @@ AT_CAPTURE_FILE([lrflows]) # Check the flows in lr_in_admission stage AT_CHECK([grep lr_in_admission lrflows | grep lrp1 | sed 's/table=../table=??/' | sort], [0], [dnl + table=??(lr_in_admission ), priority=120 , match=(inport == "cr-lrp1" && !is_chassis_resident("cr-lrp1")), action=(inport = "lrp1"; next;) table=??(lr_in_admission ), priority=50 , match=(eth.dst == 00:00:00:00:00:01 && inport == "lrp1" && is_chassis_resident("cr-lrp1")), action=(xreg0[[0..47]] = 00:00:00:00:00:01; next;) table=??(lr_in_admission ), priority=50 , match=(eth.mcast && inport == "lrp1"), action=(xreg0[[0..47]] = 00:00:00:00:00:01; next;) ]) -- 2.43.0 _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev