Hello,

Sorry, not a full review but one part bothers me for TSO.

On Mon, 24 Nov 2025 at 21:23, Mike Pattrick via dev
<[email protected]> wrote:
> @@ -9033,13 +9037,138 @@ pmd_send_port_cache_lookup(const struct 
> dp_netdev_pmd_thread *pmd,
>      return tx_port_lookup(&pmd->send_port_cache, port_no);
>  }
>
> +/* Return NULL on no ICMP reply needed. */
> +static struct dp_packet *
> +netdev_generate_frag_needed(struct dp_packet *packet, int send_size, int mtu)
> +{
> +    const struct eth_header *eth;
> +    const void *l3;
> +    size_t l3_len;
> +    bool is_ipv6;
> +
> +    if (send_size < mtu) {
> +        return NULL;
> +    }
> +
> +    eth = dp_packet_eth(packet);
> +    if (!eth) {
> +        return NULL;
> +    }
> +
> +    if (eth->eth_type == htons(ETH_TYPE_IP)) {
> +        is_ipv6 = false;
> +    } else if (eth->eth_type == htons(ETH_TYPE_IPV6)) {
> +        is_ipv6 = true;
> +    } else {
> +        return NULL;
> +    }
> +
> +    l3 = dp_packet_l3(packet);
> +    l3_len = dp_packet_l3_size(packet);
> +
> +    if (is_ipv6) {
> +        const struct ovs_16aligned_ip6_hdr *ip6;
> +        struct in6_addr ip6_src;
> +        struct in6_addr ip6_dst;
> +        size_t max_payload;
> +        const void *l4_buf;
> +        uint8_t nw_proto;
> +        uint8_t nw_frag;
> +        size_t l4_len;
> +
> +        ip6 = (const struct ovs_16aligned_ip6_hdr *) l3;
> +
> +        if (mtu < 1280) {
> +            return NULL;
> +        }

I don't have all the context in mind.
What does 1280 stand for?


> +
> +        if (ipv6_addr_is_multicast(&ip6->ip6_src) ||
> +            ipv6_addr_is_any(&ip6->ip6_src) ||
> +            ipv6_addr_is_loopback(&ip6->ip6_src)) {
> +            return NULL;
> +        }
> +
> +        nw_proto = ip6->ip6_nxt;
> +        l4_buf = ip6 + 1;
> +        l4_len = l3_len - sizeof *ip6;
> +        if (!parse_ipv6_ext_hdrs(&l4_buf, &l4_len, &nw_proto, &nw_frag,
> +                                 NULL, NULL)) {
> +            return NULL;
> +        }
> +
> +        if (nw_frag == FLOW_NW_FRAG_LATER) {
> +            return NULL;
> +        }
> +
> +        if (nw_proto == IPPROTO_ICMPV6) {
> +            const struct icmp6_header *icmp = l4_buf;
> +            if (icmp && packet_icmpv6_is_err(icmp->icmp6_type)) {
> +                return NULL;
> +            }
> +        }
> +
> +        max_payload = 1280 - ETH_HEADER_LEN - IPV6_HEADER_LEN -
> +                      ICMP6_DATA_HEADER_LEN;
> +        l3_len = l3_len < max_payload ? l3_len : max_payload;
> +
> +        memcpy(&ip6_src, &ip6->ip6_dst, sizeof(ip6_dst));
> +        memcpy(&ip6_dst, &ip6->ip6_src, sizeof(ip6_dst));
> +        return compose_ipv6_ptb(eth->eth_dst, eth->eth_src,
> +                                &ip6_dst, &ip6_src,
> +                                htonl(mtu), l3, l3_len);
> +    } else {
> +        const struct ip_header *ip;
> +        size_t icmp_payload_len;
> +        size_t available;
> +
> +        ip = (const struct ip_header *) l3;
> +
> +        if (mtu < 576) {

And here, why 576?


> +            return NULL;
> +        }
> +
> +        if (!(ip->ip_frag_off & htons(IP_DF))) {
> +            return NULL;
> +        }
> +
> +        if (ip_is_multicast(get_16aligned_be32(&ip->ip_src)) ||
> +            ip_is_broadcast(get_16aligned_be32(&ip->ip_src)) ||
> +            ip_is_loopback(get_16aligned_be32(&ip->ip_src))) {
> +            return NULL;
> +        }
> +
> +        if (ip->ip_proto == IPPROTO_ICMP) {
> +            const struct icmp_header *icmp = dp_packet_l4(packet);
> +            if (icmp && packet_icmp_is_err(icmp->icmp_type)) {
> +                return NULL;
> +            }
> +        }
> +        icmp_payload_len = IP_IHL(ip->ip_ihl_ver) * 4 + 
> ICMP_ERROR_DATA_L4_LEN;
> +
> +        available = l3_len;
> +        if (icmp_payload_len > available) {
> +            icmp_payload_len = available;
> +        }
> +
> +        return compose_ipv4_fn(eth->eth_dst, eth->eth_src,
> +                               get_16aligned_be32(&ip->ip_dst),
> +                               get_16aligned_be32(&ip->ip_src),
> +                               htons(mtu), l3, icmp_payload_len);
> +    }
> +}
> +
>  static int
> -push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
> +push_tnl_action(struct dp_netdev_pmd_thread *pmd,
>                  const struct nlattr *attr,
>                  struct dp_packet_batch *batch)
>  {
> -    struct tx_port *tun_port;
> +    size_t i, size = dp_packet_batch_size(batch);
>      const struct ovs_action_push_tnl *data;
> +    uint32_t *depth = recirc_depth_get();
> +    struct dp_packet *packet;
> +    struct tx_port *tun_port;
> +    struct netdev *netdev;
> +    int mtu;
>      int err;
>
>      data = nl_attr_get(attr);
> @@ -9049,7 +9178,45 @@ push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
>          err = -EINVAL;
>          goto error;
>      }
> -    err = netdev_push_header(tun_port->port->netdev, batch, data);
> +
> +    netdev = tun_port->port->netdev;
> +    if (netdev->mtu_user_config &&
> +        netdev_get_mtu(netdev, &mtu) == 0) {
> +        struct dp_packet_batch icmp_batch;
> +
> +        dp_packet_batch_init(&icmp_batch);
> +        DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) {
> +            int len = dp_packet_get_send_len(packet) + data->header_len;

I suspect this will not work with TSO packets:
dp_packet_get_send_len() returns a length before segmentation and it
will not be the actual length of sent data.


> +
> +            struct dp_packet *icmp;
> +
> +            icmp = netdev_generate_frag_needed(packet, len, mtu);
> +            if (!icmp) {
> +                dp_packet_batch_refill(batch, packet, i);
> +                continue;
> +            }
> +
> +            dp_packet_delete(packet);
> +            COVERAGE_INC(datapath_drop_tunnel_mtu_drop);
> +
> +            pkt_metadata_init(&icmp->md, data->tnl_port);
> +
> +            dp_packet_batch_add(&icmp_batch, icmp);
> +        }
> +
> +        if (*depth >= MAX_RECIRC_DEPTH) {
> +            COVERAGE_ADD(datapath_drop_recirc_error,
> +                         dp_packet_batch_size(&icmp_batch));
> +            dp_packet_delete_batch(&icmp_batch, true);
> +        }
> +
> +        if (dp_packet_batch_size(&icmp_batch) > 0) {
> +            (*depth)++;
> +            dp_netdev_recirculate(pmd, &icmp_batch);
> +            (*depth)--;
> +        }
> +    }
> +    err = netdev_push_header(netdev, batch, data);
>      if (!err) {
>          return 0;
>      }


-- 
David Marchand

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to