When multichassis ports attached to localnet switches are forced to use
tunneling to communicate with other ports, MTU of the path between ports
may effectively change (to accommodate tunnel headers or otherwise).

This patch implements Path MTU Discovery for IPv4 and IPv6 for
multichassis ports (both egress and ingress).

This patch adds new flows as follows:

- table 30: store check_pkt_len(mtu)->reg9[1]
- table 37: if reg9[1] is set, then send the packet to controller.

TODO: document behavior.
TODO: dynamically calculate MTU of the interface.
TODO: constrain new flows to communication involving m-chassis ports only.

Signed-off-by: Ihar Hrachyshka <[email protected]>
---
 controller/physical.c | 167 ++++++++++++++++++++++++++++
 tests/ovn.at          | 252 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 419 insertions(+)

diff --git a/controller/physical.c b/controller/physical.c
index 705146316..96c0dc31a 100644
--- a/controller/physical.c
+++ b/controller/physical.c
@@ -1072,6 +1072,170 @@ setup_activation_strategy(const struct 
sbrec_port_binding *binding,
     }
 }
 
+static size_t
+encode_start_controller_op(enum action_opcode opcode, bool pause,
+                           uint32_t meter_id, struct ofpbuf *ofpacts)
+{
+    size_t ofs = ofpacts->size;
+
+    struct ofpact_controller *oc = ofpact_put_CONTROLLER(ofpacts);
+    oc->max_len = UINT16_MAX;
+    oc->reason = OFPR_ACTION;
+    oc->pause = pause;
+    if (!ovs_feature_is_supported(OVS_DP_METER_SUPPORT)) {
+        meter_id = NX_CTLR_NO_METER;
+    }
+    oc->meter_id = meter_id;
+
+    struct action_header ah = { .opcode = htonl(opcode) };
+    ofpbuf_put(ofpacts, &ah, sizeof ah);
+
+    return ofs;
+}
+
+static void
+encode_finish_controller_op(size_t ofs, struct ofpbuf *ofpacts)
+{
+    struct ofpact_controller *oc = ofpbuf_at_assert(ofpacts, ofs, sizeof *oc);
+    ofpacts->header = oc;
+    oc->userdata_len = ofpacts->size - (ofs + sizeof *oc);
+    ofpact_finish_CONTROLLER(ofpacts, &oc);
+}
+
+static void
+handle_oversized_ip_packets(struct ovn_desired_flow_table *flow_table,
+                            const struct sbrec_port_binding *binding,
+                            bool is_ipv6)
+{
+    uint32_t dp_key = binding->datapath->tunnel_key;
+
+    struct match match;
+    match_init_catchall(&match);
+    match_set_metadata(&match, htonll(dp_key));
+
+    struct ofpbuf ofpacts;
+    ofpbuf_init(&ofpacts, 0);
+
+    /* Store packet too large flag in reg9[1]. */
+    /* TODO: limit to multichassis traffic? */
+    match_init_catchall(&match);
+    match_set_dl_type(&match, htons(is_ipv6 ? ETH_TYPE_IPV6 : ETH_TYPE_IP));
+    match_set_metadata(&match, htonll(dp_key));
+
+    /* TODO: get mtu from interface of the tunnel */
+    uint16_t frag_mtu = 1500;
+    struct ofpact_check_pkt_larger *pkt_larger =
+        ofpact_put_CHECK_PKT_LARGER(&ofpacts);
+    pkt_larger->pkt_len = frag_mtu;
+    pkt_larger->dst.field = mf_from_id(MFF_REG9);
+    pkt_larger->dst.ofs = 1;
+
+    put_resubmit(31, &ofpacts);
+    /* TODO: should we tag table=30 as consumed for this job anywhere? */
+    ofctrl_add_flow(flow_table, 30, 110,
+                    binding->header_.uuid.parts[0], &match, &ofpacts,
+                    &binding->header_.uuid);
+    ofpbuf_uninit(&ofpacts);
+
+    /* Generate ICMP Fragmentation needed for IP packets that are too large
+     * (reg9[1] == 1) */
+    /* TODO: limit to multichassis traffic? */
+    match_init_catchall(&match);
+    match_set_dl_type(&match, htons(is_ipv6 ? ETH_TYPE_IPV6 : ETH_TYPE_IP));
+    match_set_reg_masked(&match, MFF_REG9 - MFF_REG0, 1 << 1, 1 << 1);
+
+    /* Return ICMP error with a part of the original IP packet included. */
+    ofpbuf_init(&ofpacts, 0);
+    size_t oc_offset = encode_start_controller_op(
+        ACTION_OPCODE_ICMP, true, NX_CTLR_NO_METER, &ofpacts);
+
+    /* Before sending the ICMP error packet back to the pipeline, set a number
+     * of fields. */
+    struct ofpbuf inner_ofpacts;
+    ofpbuf_init(&inner_ofpacts, 0);
+
+    /* The new error packet is no longer too large */
+    /* REGBIT_PKT_LARGER = 0 */
+    ovs_be32 value = htonl(0);
+    ovs_be32 mask = htonl(1 << 1);
+    ofpact_put_set_field(
+        &inner_ofpacts, mf_from_id(MFF_REG9), &value, &mask);
+
+    /* The new error packet is delivered locally */
+    /* REGBIT_EGRESS_LOOPBACK = 1 */
+    value = htonl(1 << MLF_ALLOW_LOOPBACK_BIT);
+    mask = htonl(1 << MLF_ALLOW_LOOPBACK_BIT);
+    ofpact_put_set_field(
+        &inner_ofpacts, mf_from_id(MFF_LOG_FLAGS), &value, &mask);
+
+    /* eth.dst */
+    put_stack(MFF_ETH_SRC, ofpact_put_STACK_PUSH(&inner_ofpacts));
+    put_stack(MFF_ETH_DST, ofpact_put_STACK_POP(&inner_ofpacts));
+
+    /* ip.dst */
+    put_stack(is_ipv6 ? MFF_IPV6_SRC : MFF_IPV4_SRC,
+              ofpact_put_STACK_PUSH(&inner_ofpacts));
+    put_stack(is_ipv6 ? MFF_IPV6_DST : MFF_IPV4_DST,
+              ofpact_put_STACK_POP(&inner_ofpacts));
+
+    /* ip.ttl */
+    struct ofpact_ip_ttl *ip_ttl = ofpact_put_SET_IP_TTL(&inner_ofpacts);
+    ip_ttl->ttl = 255;
+
+    if (is_ipv6) {
+        /* icmp6.type = 2 (Packet Too Big) */
+        /* icmp6.code = 0 */
+        uint8_t icmp_type = 2;
+        uint8_t icmp_code = 0;
+        ofpact_put_set_field(
+            &inner_ofpacts, mf_from_id(MFF_ICMPV6_TYPE), &icmp_type, NULL);
+        ofpact_put_set_field(
+            &inner_ofpacts, mf_from_id(MFF_ICMPV6_CODE), &icmp_code, NULL);
+
+        /* icmp6.frag_mtu */
+        size_t frag_mtu_oc_offset = encode_start_controller_op(
+            ACTION_OPCODE_PUT_ICMP6_FRAG_MTU, true, NX_CTLR_NO_METER,
+            &inner_ofpacts);
+        ovs_be32 frag_mtu_ovs = htonl(frag_mtu);
+        ofpbuf_put(&inner_ofpacts, &frag_mtu_ovs, sizeof(frag_mtu_ovs));
+        encode_finish_controller_op(frag_mtu_oc_offset, &inner_ofpacts);
+    } else {
+        /* icmp4.type = 3 (Destination Unreachable) */
+        /* icmp4.code = 4 (Fragmentation Needed) */
+        uint8_t icmp_type = 3;
+        uint8_t icmp_code = 4;
+        ofpact_put_set_field(
+            &inner_ofpacts, mf_from_id(MFF_ICMPV4_TYPE), &icmp_type, NULL);
+        ofpact_put_set_field(
+            &inner_ofpacts, mf_from_id(MFF_ICMPV4_CODE), &icmp_code, NULL);
+
+        /* icmp4.frag_mtu = */
+        size_t frag_mtu_oc_offset = encode_start_controller_op(
+            ACTION_OPCODE_PUT_ICMP4_FRAG_MTU, true, NX_CTLR_NO_METER,
+            &inner_ofpacts);
+        ovs_be16 frag_mtu_ovs = htons(frag_mtu);
+        ofpbuf_put(&inner_ofpacts, &frag_mtu_ovs, sizeof(frag_mtu_ovs));
+        encode_finish_controller_op(frag_mtu_oc_offset, &inner_ofpacts);
+    }
+
+    /* Finally, submit the ICMP error back to the ingress pipeline */
+    put_resubmit(8, &inner_ofpacts);
+
+    /* Attach nested actions to ICMP error controller handler */
+    ofpacts_put_openflow_actions(inner_ofpacts.data, inner_ofpacts.size,
+                                 &ofpacts, OFP15_VERSION);
+
+    /* Finalize the ICMP error controller handler */
+    encode_finish_controller_op(oc_offset, &ofpacts);
+
+    ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 1000,
+                    binding->header_.uuid.parts[0], &match, &ofpacts,
+                    &binding->header_.uuid);
+
+    ofpbuf_uninit(&inner_ofpacts);
+    ofpbuf_uninit(&ofpacts);
+}
+
 static void
 enforce_tunneling_for_multichassis_ports(
     struct local_datapath *ld,
@@ -1124,6 +1288,9 @@ enforce_tunneling_for_multichassis_ports(
                         binding->header_.uuid.parts[0], &match, &ofpacts,
                         &binding->header_.uuid);
         ofpbuf_uninit(&ofpacts);
+
+        handle_oversized_ip_packets(flow_table, binding, false);
+        handle_oversized_ip_packets(flow_table, binding, true);
     }
 
     struct tunnel *tun_elem;
diff --git a/tests/ovn.at b/tests/ovn.at
index f8b8db4df..91ecd0814 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -14887,6 +14887,258 @@ OVN_CLEANUP([hv1],[hv2],[hv3])
 AT_CLEANUP
 ])
 
+OVN_FOR_EACH_NORTHD([
+AT_SETUP([localnet connectivity with multiple requested-chassis, path mtu 
discovery])
+AT_KEYWORDS([multi-chassis])
+ovn_start
+
+net_add n1
+for i in 1 2; do
+    sim_add hv$i
+    as hv$i
+    check ovs-vsctl add-br br-phys
+    ovn_attach n1 br-phys 192.168.0.$i
+    check ovs-vsctl set open . external-ids:ovn-bridge-mappings=phys:br-phys
+done
+
+check ovn-nbctl ls-add ls0 -- add Logical_Switch ls0 other_config 
vlan-passthru=true
+check ovn-nbctl lsp-add ls0 first
+check ovn-nbctl lsp-add ls0 second
+check ovn-nbctl lsp-add ls0 migrator
+check ovn-nbctl lsp-set-addresses first "00:00:00:00:00:01 10.0.0.1 ff::01"
+check ovn-nbctl lsp-set-addresses second "00:00:00:00:00:02 10.0.0.2 ff::02"
+check ovn-nbctl lsp-set-addresses migrator "00:00:00:00:00:ff 10.0.0.100 
ff::ff"
+
+check ovn-nbctl lsp-add ls0 public
+check ovn-nbctl lsp-set-type public localnet
+check ovn-nbctl lsp-set-addresses public unknown
+check ovn-nbctl lsp-set-options public network_name=phys
+
+# TODO: do we need mtu request here?
+check ovn-nbctl lsp-set-options first requested-chassis=hv1 
vif-plug-mtu-request=1500
+check ovn-nbctl lsp-set-options second requested-chassis=hv2 
vif-plug-mtu-request=1500
+check ovn-nbctl lsp-set-options migrator requested-chassis=hv1,hv2 
vif-plug-mtu-request=1500
+
+as hv1 check ovs-vsctl -- add-port br-int first -- \
+    set Interface first external-ids:iface-id=first \
+    options:tx_pcap=hv1/first-tx.pcap \
+    options:rxq_pcap=hv1/first-rx.pcap \
+    ofport-request=1
+as hv2 check ovs-vsctl -- add-port br-int second -- \
+    set Interface second external-ids:iface-id=second \
+    options:tx_pcap=hv2/second-tx.pcap \
+    options:rxq_pcap=hv2/second-rx.pcap \
+    ofport-request=2
+
+# Create Migrator interfaces on both hv1 and hv2
+for hv in hv1 hv2; do
+    as $hv check ovs-vsctl -- add-port br-int migrator -- \
+        set Interface migrator external-ids:iface-id=migrator \
+        options:tx_pcap=$hv/migrator-tx.pcap \
+        options:rxq_pcap=$hv/migrator-rx.pcap \
+        ofport-request=100
+done
+
+send_ip_packet() {
+    local inport=$1 hv=$2 eth_src=$3 eth_dst=$4 ipv4_src=$5 ipv4_dst=$6 
ip_chksum=$7 data=$8 ip_len=$9
+    local ip_ttl=ff
+    local 
packet=${eth_dst}${eth_src}08004500${ip_len}00004000${ip_ttl}fd${ip_chksum}${ipv4_src}${ipv4_dst}${data}
+    as hv$hv ovs-appctl netdev-dummy/receive $inport $packet
+    echo $packet
+}
+
+send_ip6_packet() {
+    local inport=$1 hv=$2 eth_src=$3 eth_dst=$4 ipv6_src=$5 ipv6_dst=$6 
data=$7 ip_len=$8
+    local ip_ttl=ff
+    local 
packet=${eth_dst}${eth_src}86dd60000000${ip_len}fd${ip_ttl}${ipv6_src}${ipv6_dst}${data}
+    as hv$hv ovs-appctl netdev-dummy/receive $inport $packet
+    echo $packet
+}
+
+reset_env() {
+    as hv1 reset_pcap_file first hv1/first
+    as hv2 reset_pcap_file second hv2/second
+    as hv1 reset_pcap_file migrator hv1/migrator
+    as hv2 reset_pcap_file migrator hv2/migrator
+
+    for port in hv1/migrator hv2/migrator hv1/first hv2/second; do
+        : > $port.expected
+    done
+}
+
+first_mac=000000000001
+second_mac=000000000002
+migrator_mac=0000000000ff
+first_ip=$(ip_to_hex 10 0 0 1)
+second_ip=$(ip_to_hex 10 0 0 2)
+migrator_ip=$(ip_to_hex 10 0 0 100)
+first_ip6=ff000000000000000000000000000001
+second_ip6=ff000000000000000000000000000002
+migrator_ip6=ff0000000000000000000000000000ff
+
+OVN_POPULATE_ARP
+
+reset_env
+
+# Check that packets of proper size are delivered from multichassis to 
intended ports
+ip_len=058c
+len=1400
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip_packet migrator 1 $migrator_mac $first_mac $migrator_ip 
$first_ip 0000 $data $ip_len)
+echo $packet >> hv1/first.expected
+
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip_packet migrator 1 $migrator_mac $second_mac $migrator_ip 
$second_ip 0000 $data $ip_len)
+echo $packet >> hv2/second.expected
+
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip6_packet migrator 1 $migrator_mac $first_mac $migrator_ip6 
$first_ip6 $data $ip_len)
+echo $packet >> hv1/first.expected
+
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip6_packet migrator 1 $migrator_mac $second_mac $migrator_ip6 
$second_ip6 $data $ip_len)
+echo $packet >> hv2/second.expected
+
+OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/first-tx.pcap], [hv1/first.expected])
+OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/second-tx.pcap], [hv2/second.expected])
+OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/migrator-tx.pcap], 
[hv1/migrator.expected])
+OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/migrator-tx.pcap], 
[hv2/migrator.expected])
+
+reset_env
+
+# Now check that oversized packets are not delivered to the intended ports, and
+# that the originating port receives an ICMP error suggesting fragmentation
+
+ip_len=0bcc
+len=3000
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip_packet migrator 1 $migrator_mac $first_mac $migrator_ip 
$first_ip 0000 $data $ip_len)
+echo $packet >> hv1/migrator.expected
+
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip_packet migrator 1 $migrator_mac $second_mac $migrator_ip 
$second_ip 0000 $data $ip_len)
+echo $packet >> hv1/migrator.expected
+
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip6_packet migrator 1 $migrator_mac $first_mac $migrator_ip6 
$first_ip6 $data $ip_len)
+echo $packet >> hv1/migrator.expected
+
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip6_packet migrator 1 $migrator_mac $second_mac $migrator_ip6 
$second_ip6 $data $ip_len)
+echo $packet >> hv1/migrator.expected
+
+OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/first-tx.pcap], [hv1/first.expected])
+OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/second-tx.pcap], [hv2/second.expected])
+
+# it's harder to construct the original packet that includes geneve headers
+# etc. so check manually
+$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv1/migrator-tx.pcap > 
hv1/migrator.packets
+sed -i '/ffffffffffff/d' hv1/migrator.packets
+$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv2/migrator-tx.pcap > 
hv2/migrator.packets
+sed -i '/ffffffffffff/d' hv2/migrator.packets
+
+# icmp errors are delivered only to the port binding that originated the 
offending packet
+AT_CHECK([wc -l hv1/migrator.packets | cut -c 1], [0], [4
+])
+AT_CHECK([wc -l hv2/migrator.packets | cut -c 1], [0], [0
+])
+
+for line in $(cat hv1/migrator.expected); do
+    trimmed=$(echo $line | cut -c 125-225)
+    echo "checking if packet $trimmed... generated an icmp error"
+    AT_CHECK([grep ${trimmed} hv1/migrator.packets | wc -l | cut -c 1], [0], [1
+])
+done
+
+reset_env
+
+# Check that packets of proper size are delivered from regular to multichassis 
ports
+ip_len=058c
+len=1400
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip_packet first 1 $first_mac $migrator_mac $first_ip 
$migrator_ip 0000 $data $ip_len)
+echo $packet >> hv1/migrator.expected
+echo $packet >> hv2/migrator.expected
+
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip_packet second 2 $second_mac $migrator_mac $second_ip 
$migrator_ip 0000 $data $ip_len)
+echo $packet >> hv1/migrator.expected
+echo $packet >> hv2/migrator.expected
+
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip6_packet first 1 $first_mac $migrator_mac $first_ip6 
$migrator_ip6 $data $ip_len)
+echo $packet >> hv1/migrator.expected
+echo $packet >> hv2/migrator.expected
+
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip6_packet second 2 $second_mac $migrator_mac $second_ip6 
$migrator_ip6 $data $ip_len)
+echo $packet >> hv1/migrator.expected
+echo $packet >> hv2/migrator.expected
+
+OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/first-tx.pcap], [hv1/first.expected])
+OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/second-tx.pcap], [hv2/second.expected])
+OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/migrator-tx.pcap], 
[hv1/migrator.expected])
+OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/migrator-tx.pcap], 
[hv2/migrator.expected])
+
+reset_env
+
+# Now check that oversized packets are not delivered to multichassis ports, and
+# that the originating ports receive an ICMP error suggesting fragmentation
+
+ip_len=0bcc
+len=3000
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip_packet first 1 $first_mac $migrator_mac $first_ip 
$migrator_ip 0000 $data $ip_len)
+echo $packet >> hv1/first.expected
+
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip_packet second 2 $second_mac $migrator_mac $second_ip 
$migrator_ip 0000 $data $ip_len)
+echo $packet >> hv2/second.expected
+
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip6_packet first 1 $first_mac $migrator_mac $first_ip6 
$migrator_ip6 $data $ip_len)
+echo $packet >> hv1/first.expected
+
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip6_packet second 2 $second_mac $migrator_mac $second_ip6 
$migrator_ip6 $data $ip_len)
+echo $packet >> hv2/second.expected
+
+OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/migrator-tx.pcap], 
[hv1/migrator.expected])
+OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/migrator-tx.pcap], 
[hv2/migrator.expected])
+
+# it's harder to construct the original packet that includes geneve headers
+# etc. so check manually
+$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv1/first-tx.pcap > 
hv1/first.packets
+sed -i '/ffffffffffff/d' hv1/first.packets
+$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv2/second-tx.pcap > 
hv2/second.packets
+sed -i '/ffffffffffff/d' hv2/second.packets
+
+AT_CHECK([wc -l hv1/first.packets | cut -c 1], [0], [2
+])
+AT_CHECK([wc -l hv2/second.packets | cut -c 1], [0], [2
+])
+
+for line in $(cat hv1/first.expected); do
+    trimmed=$(echo $line | cut -c 125-225)
+    echo "checking if packet $trimmed... generated an icmp error"
+    AT_CHECK([grep ${trimmed} hv1/first.packets | wc -l | cut -c 1], [0], [1
+])
+done
+
+for line in $(cat hv2/second.expected); do
+    trimmed=$(echo $line | cut -c 125-225)
+    echo "checking if packet $trimmed... generated an icmp error"
+    AT_CHECK([grep ${trimmed} hv2/second.packets | wc -l | cut -c 1], [0], [1
+])
+done
+
+# todo: test multichassis to multichassis
+
+OVN_CLEANUP([hv1],[hv2])
+
+AT_CLEANUP
+])
+
 OVN_FOR_EACH_NORTHD([
 AT_SETUP([options:activation-strategy for logical port])
 AT_KEYWORDS([multi-chassis])
-- 
2.34.3

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to