Add support in the userspace datapath for Path MTU on tunnel interfaces. This feature allows users to configure an MTU on tunnel ports. If set, when the userspace datapath attempts to encapsulate a packet that would exceed the tunnel's MTU, OVS will drop the packet and send an ICMP Fragmentation Needed (IPv4) or ICMPv6 Packet Too Big (IPv6) message back to the source host.
If an MTU is not set on the tunnel interface, there is no change in behaviour. Reported-at: https://issues.redhat.com/browse/FDP-256 Signed-off-by: Mike Pattrick <[email protected]> --- v2: Correct counter in unit test --- NEWS | 2 + lib/dpif-netdev.c | 106 +++++++++++++++++++++++++++++++++- lib/netdev-vport-private.h | 1 + lib/netdev-vport.c | 29 ++++++++++ lib/packets.c | 96 ++++++++++++++++++++++++++++++ lib/packets.h | 11 ++++ ofproto/ofproto-dpif.c | 11 ++-- tests/tunnel-push-pop-ipv6.at | 69 ++++++++++++++++++++++ tests/tunnel-push-pop.at | 66 +++++++++++++++++++++ 9 files changed, 382 insertions(+), 9 deletions(-) diff --git a/NEWS b/NEWS index f9a74df1a..679c02694 100644 --- a/NEWS +++ b/NEWS @@ -10,6 +10,8 @@ Post-v3.6.0 - Userspace datapath: * Conntrack now supports the FTP commands EPSV and EPRT with IPv4 connections, instead of limiting these commands to IPv6 only. + * MTU can now be set on tunnel interfaces with the mtu_request + parameters. - DPDK: * OVS validated with DPDK 24.11.3. 
- OVSDB-IDL: diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 224ce7086..0862cb998 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -114,6 +114,7 @@ COVERAGE_DEFINE(datapath_drop_upcall_error); COVERAGE_DEFINE(datapath_drop_lock_error); COVERAGE_DEFINE(datapath_drop_userspace_action_error); COVERAGE_DEFINE(datapath_drop_tunnel_push_error); +COVERAGE_DEFINE(datapath_drop_tunnel_mtu_drop); COVERAGE_DEFINE(datapath_drop_tunnel_pop_error); COVERAGE_DEFINE(datapath_drop_recirc_error); COVERAGE_DEFINE(datapath_drop_invalid_port); @@ -9033,13 +9034,78 @@ pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd, return tx_port_lookup(&pmd->send_port_cache, port_no); } +static struct dp_packet * +netdev_generate_icmp_frag_needed(struct dp_packet *packet, int mtu) +{ + const struct eth_header *eth; + const void *l3; + size_t l3_len; + bool is_ipv6; + + eth = dp_packet_eth(packet); + if (!eth) { + return NULL; + } + + is_ipv6 = (eth->eth_type == htons(ETH_TYPE_IPV6)); + + if (!is_ipv6 && eth->eth_type != htons(ETH_TYPE_IP)) { + return NULL; + } + + l3 = dp_packet_l3(packet); + l3_len = dp_packet_l3_size(packet); + + if (is_ipv6) { + const struct ovs_16aligned_ip6_hdr *ip6; + struct in6_addr ip6_src; + struct in6_addr ip6_dst; + size_t max_payload; + + ip6 = (const struct ovs_16aligned_ip6_hdr *) l3; + + max_payload = 1280 - ETH_HEADER_LEN - IPV6_HEADER_LEN - + ICMP6_DATA_HEADER_LEN; + l3_len = l3_len < max_payload ? 
l3_len : max_payload; + + memcpy(&ip6_src, &ip6->ip6_dst, sizeof(ip6_dst)); + memcpy(&ip6_dst, &ip6->ip6_src, sizeof(ip6_dst)); + return compose_ipv6_ptb(eth->eth_dst, eth->eth_src, + &ip6_dst, &ip6_src, + htonl(mtu), l3, l3_len); + + } else { + const struct ip_header *ip; + size_t icmp_payload_len; + size_t available; + + ip = (const struct ip_header *) l3; + icmp_payload_len = IP_IHL(ip->ip_ihl_ver) * 4 + ICMP_ERROR_DATA_L4_LEN; + + available = l3_len; + if (icmp_payload_len > available) { + icmp_payload_len = available; + } + + return compose_ipv4_fn(eth->eth_dst, eth->eth_src, + get_16aligned_be32(&ip->ip_dst), + get_16aligned_be32(&ip->ip_src), + htons(mtu), l3, icmp_payload_len); + } +} + static int -push_tnl_action(const struct dp_netdev_pmd_thread *pmd, +push_tnl_action(struct dp_netdev_pmd_thread *pmd, const struct nlattr *attr, struct dp_packet_batch *batch) { - struct tx_port *tun_port; + size_t i, size = dp_packet_batch_size(batch); const struct ovs_action_push_tnl *data; + uint32_t *depth = recirc_depth_get(); + struct dp_packet *packet; + struct tx_port *tun_port; + struct netdev *netdev; + int mtu; int err; data = nl_attr_get(attr); @@ -9049,7 +9115,41 @@ push_tnl_action(const struct dp_netdev_pmd_thread *pmd, err = -EINVAL; goto error; } - err = netdev_push_header(tun_port->port->netdev, batch, data); + + netdev = tun_port->port->netdev; + if (netdev->mtu_user_config && + netdev_get_mtu(netdev, &mtu) == 0 && + *depth < MAX_RECIRC_DEPTH) { + struct dp_packet_batch icmp_batch; + + dp_packet_batch_init(&icmp_batch); + DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) { + int len = dp_packet_get_send_len(packet) + data->header_len; + if (mtu < len) { + struct dp_packet *icmp; + + COVERAGE_INC(datapath_drop_tunnel_mtu_drop); + icmp = netdev_generate_icmp_frag_needed(packet, mtu); + dp_packet_delete(packet); + + if (!icmp) { + continue; + } + + pkt_metadata_init(&icmp->md, data->tnl_port); + + dp_packet_batch_add(&icmp_batch, icmp); + } else { + 
dp_packet_batch_refill(batch, packet, i); + } + } + if (dp_packet_batch_size(&icmp_batch) > 0) { + (*depth)++; + dp_netdev_recirculate(pmd, &icmp_batch); + (*depth)--; + } + } + err = netdev_push_header(netdev, batch, data); if (!err) { return 0; } diff --git a/lib/netdev-vport-private.h b/lib/netdev-vport-private.h index 586231057..82ccbd8cc 100644 --- a/lib/netdev-vport-private.h +++ b/lib/netdev-vport-private.h @@ -42,6 +42,7 @@ struct netdev_vport { /* Tunnels. */ char egress_iface[IFNAMSIZ]; bool carrier_status; + int mtu; /* Patch Ports. */ char *peer; diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c index 033f9a6fd..e4016187b 100644 --- a/lib/netdev-vport.c +++ b/lib/netdev-vport.c @@ -101,6 +101,33 @@ get_netdev_tunnel_config(const struct netdev *netdev) return vport_tunnel_config(netdev_vport_cast(netdev)); } +static int +get_tunnel_mtu(const struct netdev *netdev, int *mtup) +{ + struct netdev_vport *dev = netdev_vport_cast(netdev); + + ovs_mutex_lock(&dev->mutex); + *mtup = dev->mtu; + ovs_mutex_unlock(&dev->mutex); + + return 0; +} + +static int +set_tunnel_mtu(struct netdev *netdev, int mtu) +{ + struct netdev_vport *dev = netdev_vport_cast(netdev); + + ovs_mutex_lock(&dev->mutex); + if (dev->mtu != mtu) { + dev->mtu = mtu; + } + ovs_mutex_unlock(&dev->mutex); + netdev_change_seq_changed(netdev); + + return 0; +} + bool netdev_vport_is_patch(const struct netdev *netdev) { @@ -1265,6 +1292,8 @@ netdev_vport_get_ifindex(const struct netdev *netdev_) .get_config = get_tunnel_config, \ .set_config = set_tunnel_config, \ .get_tunnel_config = get_netdev_tunnel_config, \ + .get_mtu = get_tunnel_mtu, \ + .set_mtu = set_tunnel_mtu, \ .get_status = tunnel_get_status void diff --git a/lib/packets.c b/lib/packets.c index 0c1f72e48..0d67dbbec 100644 --- a/lib/packets.c +++ b/lib/packets.c @@ -1783,6 +1783,102 @@ compose_ipv6(struct dp_packet *packet, uint8_t proto, return data; } +/* Compose an ICMP Fragmentation Needed message. 
*/ +struct dp_packet * +compose_ipv4_fn(const struct eth_addr eth_src, + const struct eth_addr eth_dst, + const ovs_be32 ip_src, + const ovs_be32 ip_dst, + ovs_be16 mtu, const void *body, size_t body_len) +{ + struct icmp_header *icmp; + struct ip_header *ip; + struct dp_packet *b; + + b = dp_packet_new(ETH_HEADER_LEN + IP_HEADER_LEN + + ICMP_HEADER_LEN + body_len); + if (!b) { + return NULL; + } + + ip = (struct ip_header *) eth_compose(b, eth_dst, eth_src, ETH_TYPE_IP, + IP_HEADER_LEN); + + ip->ip_ihl_ver = IP_IHL_VER(5, IP_VERSION); + ip->ip_tos = 0xc0; + ip->ip_tot_len = htons(IP_HEADER_LEN + ICMP_HEADER_LEN + body_len); + ip->ip_id = 0; + ip->ip_frag_off = 0; + ip->ip_ttl = 64; + ip->ip_proto = IPPROTO_ICMP; + ip->ip_csum = 0; + + put_16aligned_be32(&ip->ip_src, ip_src); + put_16aligned_be32(&ip->ip_dst, ip_dst); + ip->ip_csum = csum(ip, IP_HEADER_LEN); + + icmp = (struct icmp_header *) dp_packet_put_zeros(b, ICMP_HEADER_LEN + + body_len); + + icmp->icmp_type = ICMP4_DST_UNREACH; + icmp->icmp_code = 4; + icmp->icmp_csum = 0; + + icmp->icmp_fields.frag.mtu = mtu; + + if (body && body_len) { + void *payload = (void *)(icmp + 1); + memcpy(payload, body, body_len); + } + + uint32_t csum_val = csum_continue(0, icmp, ICMP_HEADER_LEN + body_len); + icmp->icmp_csum = csum_finish(csum_val); + + dp_packet_set_l3(b, ip); + dp_packet_set_l4(b, icmp); + + return b; +} + +/* Compose an ICMP Packet Too Big message. 
*/ +struct dp_packet * +compose_ipv6_ptb(const struct eth_addr eth_src, + const struct eth_addr eth_dst, + const struct in6_addr *ipv6_src, + const struct in6_addr *ipv6_dst, + ovs_be32 mtu, const void *body, size_t body_len) +{ + struct dp_packet *b; + struct icmp6_data_header *icmp6; + + b = dp_packet_new(ETH_HEADER_LEN + IPV6_HEADER_LEN + + ICMP6_DATA_HEADER_LEN + body_len); + if (!b) { + return NULL; + } + + eth_compose(b, eth_dst, eth_src, ETH_TYPE_IPV6, IPV6_HEADER_LEN); + + icmp6 = compose_ipv6(b, IPPROTO_ICMPV6, ipv6_src, ipv6_dst, + 0, 0, 255, ICMP6_DATA_HEADER_LEN + body_len); + + icmp6->icmp6_base.icmp6_type = ICMP6_PACKET_TOO_BIG; + icmp6->icmp6_base.icmp6_code = 0; + icmp6->icmp6_base.icmp6_cksum = 0; + put_16aligned_be32(&icmp6->icmp6_data.be32[0], mtu); + if (body && body_len) { + void *payload = (void *)(icmp6 + 1); + memcpy(payload, body, body_len); + } + + uint32_t icmp_csum = packet_csum_pseudoheader6(dp_packet_l3(b)); + icmp6->icmp6_base.icmp6_cksum = csum_finish( + csum_continue(icmp_csum, icmp6, ICMP6_DATA_HEADER_LEN + body_len)); + + dp_packet_set_l4(b, icmp6); + return b; +} + /* Compose an IPv6 Neighbor Discovery Neighbor Solicitation message. 
*/ void compose_nd_ns(struct dp_packet *b, const struct eth_addr eth_src, diff --git a/lib/packets.h b/lib/packets.h index ed46778fe..987bb72c6 100644 --- a/lib/packets.h +++ b/lib/packets.h @@ -1667,6 +1667,17 @@ void compose_arp(struct dp_packet *, uint16_t arp_op, const struct eth_addr arp_sha, const struct eth_addr arp_tha, bool broadcast, ovs_be32 arp_spa, ovs_be32 arp_tpa); +struct dp_packet *compose_ipv6_ptb(const struct eth_addr eth_src, + const struct eth_addr eth_dst, + const struct in6_addr *ipv6_src, + const struct in6_addr *ipv6_dst, + ovs_be32 mtu, const void *body, + size_t body_len); +struct dp_packet *compose_ipv4_fn(const struct eth_addr eth_src, + const struct eth_addr eth_dst, + ovs_be32 ip_src, ovs_be32 ip_dst, + ovs_be16 mtu, const void *body, + size_t body_len); void compose_nd_ns(struct dp_packet *, const struct eth_addr eth_src, const struct in6_addr *ipv6_src, const struct in6_addr *ipv6_dst); diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index b62ad5842..e6b1fc4d9 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -2283,12 +2283,11 @@ port_construct(struct ofport *port_) dpif_port_destroy(&dpif_port); return EBUSY; } - - ovs_rwlock_wrlock(&ofproto->backer->odp_to_ofport_lock); - hmap_insert(&ofproto->backer->odp_to_ofport_map, &port->odp_port_node, - hash_odp_port(port->odp_port)); - ovs_rwlock_unlock(&ofproto->backer->odp_to_ofport_lock); } + ovs_rwlock_wrlock(&ofproto->backer->odp_to_ofport_lock); + hmap_insert(&ofproto->backer->odp_to_ofport_map, &port->odp_port_node, + hash_odp_port(port->odp_port)); + ovs_rwlock_unlock(&ofproto->backer->odp_to_ofport_lock); dpif_port_destroy(&dpif_port); if (ofproto->sflow) { @@ -2350,7 +2349,7 @@ port_destruct(struct ofport *port_, bool del) port->peer = NULL; } - if (port->odp_port != ODPP_NONE && !port->is_tunnel) { + if (port->odp_port != ODPP_NONE) { ovs_rwlock_wrlock(&ofproto->backer->odp_to_ofport_lock); hmap_remove(&ofproto->backer->odp_to_ofport_map, 
&port->odp_port_node); ovs_rwlock_unlock(&ofproto->backer->odp_to_ofport_lock); diff --git a/tests/tunnel-push-pop-ipv6.at b/tests/tunnel-push-pop-ipv6.at index ca5cb4d19..505f634f7 100644 --- a/tests/tunnel-push-pop-ipv6.at +++ b/tests/tunnel-push-pop-ipv6.at @@ -940,3 +940,72 @@ AT_CHECK([grep -q "GENEVE_ACT" stdout]) OVS_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([tunnel_push_pop_ipv6 - tunnel ICMPv6 fragmentation needed]) +OVS_VSWITCHD_START( + [add-port br0 p0 \ + -- set Interface p0 type=dummy ofport_request=1 \ + other-config:hwaddr=aa:55:aa:55:00:00]) +AT_CHECK([ovs-appctl vlog/set dpif_netdev:dbg]) +AT_CHECK([ovs-vsctl add-br int-br -- set bridge int-br datapath_type=dummy]) +AT_CHECK([ovs-vsctl add-port int-br t1 \ + -- set Interface t1 type=geneve \ + options:remote_ip=2001:cafe::92 \ + options:key=123 ofport_request=2 \ + options:df_default=true mtu_request=1400]) + +dnl Setup an IP address. +AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 2001:cafe::88/64], [0], [OK +]) +dnl Checking that a local route for added IP was successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached | sort], [0], [dnl +Cached: 2001:cafe::/64 dev br0 SRC 2001:cafe::88 local +]) + +AT_CHECK([ovs-ofctl add-flow br0 action=normal]) + +dnl This Neighbor Advertisement from p0 has two effects: +dnl 1. The neighbor cache will learn that 2001:cafe::92 is at f8:bc:12:44:34:b6. +dnl 2. The br0 mac learning will learn that f8:bc:12:44:34:b6 is on p0. 
+AT_CHECK([ovs-appctl netdev-dummy/receive p0 dnl + 'recirc_id(0),in_port(1),dnl + eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x86dd),dnl + ipv6(src=2001:cafe::92,dst=2001:cafe::88,label=0,proto=58,tclass=0,hlimit=255,frag=no),dnl + icmpv6(type=136,code=0),dnl + nd(target=2001:cafe::92,sll=00:00:00:00:00:00,tll=f8:bc:12:44:34:b6)' +]) + +AT_CHECK([ovs-vsctl -- set Interface int-br options:pcap=int-br.pcap]) + +dnl Verify MTU is set on tunnel interface +AT_CHECK([ovs-vsctl get interface t1 mtu], [0], [dnl +1400 +]) + +AT_CHECK([ovs-ofctl add-flow int-br "priority=100,in_port=LOCAL,actions=output:2"]) +AT_CHECK([ovs-ofctl add-flow int-br "priority=1,actions=normal"]) + +dnl Send a 1500 byte packet, exceeding the tunnel's MTU. +zeros1458=$(printf '0%.0s' $(seq 2916)) +AT_CHECK([ovs-appctl netdev-dummy/receive int-br ]dnl +[aa55aa550000f8bc1244cafe86dd]dnl +[6000000005ba11402001cafe0000000000000000000000922001cafe000000000000000000000088]dnl +[c853003a05ba54d3]dnl +[${zeros1458}]) + +ovs-appctl time/warp 1000 + +dnl Verify that tunnel drop occurred. +AT_CHECK([ovs-appctl coverage/read-counter datapath_drop_tunnel_mtu_drop], [0], [1 +]) + +dnl Check for ICMPv6 Packet Too Big. 
+AT_CHECK([ovs-pcap int-br.pcap 2>/dev/null | grep -q ]dnl +[f8bc1244cafeaa55aa55000086dd]dnl +[6000000004ca3aff2001cafe0000000000000000000000922001cafe000000000000000000000088]dnl +[0200ab3a00000578]dnl +[6000000005ba11402001cafe0000000000000000000000922001cafe000000000000000000000088]dnl +[c853003a05ba54d3], [0]) + +OVS_VSWITCHD_STOP +AT_CLEANUP diff --git a/tests/tunnel-push-pop.at b/tests/tunnel-push-pop.at index a87ae3313..7a70427de 100644 --- a/tests/tunnel-push-pop.at +++ b/tests/tunnel-push-pop.at @@ -1459,3 +1459,69 @@ AT_CHECK([tail -1 stdout], [0], OVS_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([tunnel_push_pop - tunnel ICMP fragmentation needed]) +OVS_VSWITCHD_START([add-port br0 p0 -- set Interface p0 type=dummy ofport_request=1 other-config:hwaddr=aa:55:aa:55:00:00]) +AT_CHECK([ovs-appctl vlog/set dpif_netdev:dbg]) +AT_CHECK([ovs-appctl vlog/set netdev_vport:dbg]) +AT_CHECK([ovs-appctl vlog/set native_tnl:dbg]) +AT_CHECK([ovs-vsctl add-br int-br -- set bridge int-br datapath_type=dummy], [0]) +AT_CHECK([ovs-vsctl add-port int-br t1 -- set Interface t1 type=geneve \ + options:remote_ip=1.1.2.92 options:key=123 ofport_request=2 \ + options:df_default=true mtu_request=1400]) + +dnl Setup dummy interface IP addresses. +AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK +]) +AT_CHECK([ovs-appctl netdev-dummy/ip4addr int-br 10.0.0.1/24], [0], [OK +]) + +dnl Checking that a local route for added IP was successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached | sort], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local +Cached: 10.0.0.0/24 dev int-br SRC 10.0.0.1 local +]) + +AT_CHECK([ovs-ofctl add-flow br0 action=normal]) + +dnl This ARP reply from p0 has two effects: +dnl 1. The ARP cache will learn that 1.1.2.92 is at f8:bc:12:44:34:b6. +dnl 2. The br0 mac learning will learn that f8:bc:12:44:34:b6 is on p0. 
+AT_CHECK([ovs-appctl netdev-dummy/receive p0 ']dnl +[recirc_id(0),in_port(1),eth(src=f8:bc:12:44:34:b6,dst=ff:ff:ff:ff:ff:ff),eth_type(0x0806),]dnl +[arp(sip=1.1.2.92,tip=1.1.2.88,op=2,sha=f8:bc:12:44:34:b6,tha=00:00:00:00:00:00)']) + +AT_CHECK([ovs-vsctl -- set Interface int-br options:pcap=int-br.pcap]) + +dnl Verify MTU is set on tunnel interface +AT_CHECK([ovs-vsctl get interface t1 mtu], [0], [dnl +1400 +]) + +AT_CHECK([ovs-ofctl add-flow int-br "priority=100,in_port=LOCAL,actions=output:2"]) +AT_CHECK([ovs-ofctl add-flow int-br "priority=1,actions=normal"]) + +dnl Send a 1500 byte packet, exceeding the tunnel's MTU. +zeros1458=$(printf '0%.0s' $(seq 2916)) +AT_CHECK([ovs-appctl netdev-dummy/receive int-br ]dnl +[50540000000a5054000000090800]dnl +[450005b6000100004001f9ee0a0000020a000001]dnl +[08009abc00010001]dnl +[${zeros1458}]) + +ovs-appctl time/warp 1000 + +dnl Verify that tunnel drop occurred. +AT_CHECK([ovs-appctl coverage/read-counter datapath_drop_tunnel_mtu_drop], [0], [1 +]) + +dnl Check for ICMP Fragmentation Needed. +AT_CHECK([ovs-pcap int-br.pcap 2>/dev/null | grep -q ]dnl +[50540000000950540000000a0800]dnl +[45c0003800000000400166030a0000010a000002]dnl +[0304bc1a00000578]dnl +[450005b6000100004001f9ee0a0000020a000001]dnl +[08009abc00010001], [0]) + +OVS_VSWITCHD_STOP +AT_CLEANUP -- 2.51.1 _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
