On Tue, Nov 25, 2025 at 3:32 AM David Marchand <[email protected]>
wrote:

> Hello,
>
> Sorry, not a full review but one part bothers me for TSO.
>
> On Mon, 24 Nov 2025 at 21:23, Mike Pattrick via dev
> <[email protected]> wrote:
> > @@ -9033,13 +9037,138 @@ pmd_send_port_cache_lookup(const struct
> dp_netdev_pmd_thread *pmd,
> >      return tx_port_lookup(&pmd->send_port_cache, port_no);
> >  }
> >
> > +/* Return NULL on no ICMP reply needed. */
> > +static struct dp_packet *
> > +netdev_generate_frag_needed(struct dp_packet *packet, int send_size,
> int mtu)
> > +{
> > +    const struct eth_header *eth;
> > +    const void *l3;
> > +    size_t l3_len;
> > +    bool is_ipv6;
> > +
> > +    if (send_size < mtu) {
> > +        return NULL;
> > +    }
> > +
> > +    eth = dp_packet_eth(packet);
> > +    if (!eth) {
> > +        return NULL;
> > +    }
> > +
> > +    if (eth->eth_type == htons(ETH_TYPE_IP)) {
> > +        is_ipv6 = false;
> > +    } else if (eth->eth_type == htons(ETH_TYPE_IPV6)) {
> > +        is_ipv6 = true;
> > +    } else {
> > +        return NULL;
> > +    }
> > +
> > +    l3 = dp_packet_l3(packet);
> > +    l3_len = dp_packet_l3_size(packet);
> > +
> > +    if (is_ipv6) {
> > +        const struct ovs_16aligned_ip6_hdr *ip6;
> > +        struct in6_addr ip6_src;
> > +        struct in6_addr ip6_dst;
> > +        size_t max_payload;
> > +        const void *l4_buf;
> > +        uint8_t nw_proto;
> > +        uint8_t nw_frag;
> > +        size_t l4_len;
> > +
> > +        ip6 = (const struct ovs_16aligned_ip6_hdr *) l3;
> > +
> > +        if (mtu < 1280) {
> > +            return NULL;
> > +        }
>
> I don't have all the context in mind.
> What does 1280 stand for?
>
>
> > +
> > +        if (ipv6_addr_is_multicast(&ip6->ip6_src) ||
> > +            ipv6_addr_is_any(&ip6->ip6_src) ||
> > +            ipv6_addr_is_loopback(&ip6->ip6_src)) {
> > +            return NULL;
> > +        }
> > +
> > +        nw_proto = ip6->ip6_nxt;
> > +        l4_buf = ip6 + 1;
> > +        l4_len = l3_len - sizeof *ip6;
> > +        if (!parse_ipv6_ext_hdrs(&l4_buf, &l4_len, &nw_proto, &nw_frag,
> > +                                 NULL, NULL)) {
> > +            return NULL;
> > +        }
> > +
> > +        if (nw_frag == FLOW_NW_FRAG_LATER) {
> > +            return NULL;
> > +        }
> > +
> > +        if (nw_proto == IPPROTO_ICMPV6) {
> > +            const struct icmp6_header *icmp = l4_buf;
> > +            if (icmp && packet_icmpv6_is_err(icmp->icmp6_type)) {
> > +                return NULL;
> > +            }
> > +        }
> > +
> > +        max_payload = 1280 - ETH_HEADER_LEN - IPV6_HEADER_LEN -
> > +                      ICMP6_DATA_HEADER_LEN;
> > +        l3_len = l3_len < max_payload ? l3_len : max_payload;
> > +
> > +        memcpy(&ip6_src, &ip6->ip6_dst, sizeof(ip6_dst));
> > +        memcpy(&ip6_dst, &ip6->ip6_src, sizeof(ip6_dst));
> > +        return compose_ipv6_ptb(eth->eth_dst, eth->eth_src,
> > +                                &ip6_dst, &ip6_src,
> > +                                htonl(mtu), l3, l3_len);
> > +    } else {
> > +        const struct ip_header *ip;
> > +        size_t icmp_payload_len;
> > +        size_t available;
> > +
> > +        ip = (const struct ip_header *) l3;
> > +
> > +        if (mtu < 576) {
>
> And here, why 576?
>

The 1280 above and 576 here are RFC specified minimum mtu's, both of these
checks mirror how pmtu is implemented in the linux kernel.


>
> > +            return NULL;
> > +        }
> > +
> > +        if (!(ip->ip_frag_off & htons(IP_DF))) {
> > +            return NULL;
> > +        }
> > +
> > +        if (ip_is_multicast(get_16aligned_be32(&ip->ip_src)) ||
> > +            ip_is_broadcast(get_16aligned_be32(&ip->ip_src)) ||
> > +            ip_is_loopback(get_16aligned_be32(&ip->ip_src))) {
> > +            return NULL;
> > +        }
> > +
> > +        if (ip->ip_proto == IPPROTO_ICMP) {
> > +            const struct icmp_header *icmp = dp_packet_l4(packet);
> > +            if (icmp && packet_icmp_is_err(icmp->icmp_type)) {
> > +                return NULL;
> > +            }
> > +        }
> > +        icmp_payload_len = IP_IHL(ip->ip_ihl_ver) * 4 +
> ICMP_ERROR_DATA_L4_LEN;
> > +
> > +        available = l3_len;
> > +        if (icmp_payload_len > available) {
> > +            icmp_payload_len = available;
> > +        }
> > +
> > +        return compose_ipv4_fn(eth->eth_dst, eth->eth_src,
> > +                               get_16aligned_be32(&ip->ip_dst),
> > +                               get_16aligned_be32(&ip->ip_src),
> > +                               htons(mtu), l3, icmp_payload_len);
> > +    }
> > +}
> > +
> >  static int
> > -push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
> > +push_tnl_action(struct dp_netdev_pmd_thread *pmd,
> >                  const struct nlattr *attr,
> >                  struct dp_packet_batch *batch)
> >  {
> > -    struct tx_port *tun_port;
> > +    size_t i, size = dp_packet_batch_size(batch);
> >      const struct ovs_action_push_tnl *data;
> > +    uint32_t *depth = recirc_depth_get();
> > +    struct dp_packet *packet;
> > +    struct tx_port *tun_port;
> > +    struct netdev *netdev;
> > +    int mtu;
> >      int err;
> >
> >      data = nl_attr_get(attr);
> > @@ -9049,7 +9178,45 @@ push_tnl_action(const struct dp_netdev_pmd_thread
> *pmd,
> >          err = -EINVAL;
> >          goto error;
> >      }
> > -    err = netdev_push_header(tun_port->port->netdev, batch, data);
> > +
> > +    netdev = tun_port->port->netdev;
> > +    if (netdev->mtu_user_config &&
> > +        netdev_get_mtu(netdev, &mtu) == 0) {
> > +        struct dp_packet_batch icmp_batch;
> > +
> > +        dp_packet_batch_init(&icmp_batch);
> > +        DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) {
> > +            int len = dp_packet_get_send_len(packet) + data->header_len;
>
> I suspect this will not work with TSO packets:
> dp_packet_get_send_len() returns a length before segmentation and it
> will not be the actual length of sent data.
>

I believe you are correct, also I think this is the second misuse of
dp_packet_get_send_len() - out of three current uses - in a few months.
Maybe that function should be renamed for clarity?

As a side note, I was looking at the kernel module and unless I'm reading
it wrong, queue_gso_packets() appears to segment the packet before applying
cutlen whereas other areas appear to apply cutlen before segmentation. I
don't know if that's intentional.

I can send in a new version that appropriately accounts for seg size.


Thanks,
M


>
>
> > +
> > +            struct dp_packet *icmp;
> > +
> > +            icmp = netdev_generate_frag_needed(packet, len, mtu);
> > +            if (!icmp) {
> > +                dp_packet_batch_refill(batch, packet, i);
> > +                continue;
> > +            }
> > +
> > +            dp_packet_delete(packet);
> > +            COVERAGE_INC(datapath_drop_tunnel_mtu_drop);
> > +
> > +            pkt_metadata_init(&icmp->md, data->tnl_port);
> > +
> > +            dp_packet_batch_add(&icmp_batch, icmp);
> > +        }
> > +
> > +        if (*depth >= MAX_RECIRC_DEPTH) {
> > +            COVERAGE_ADD(datapath_drop_recirc_error,
> > +                         dp_packet_batch_size(&icmp_batch));
> > +            dp_packet_delete_batch(&icmp_batch, true);
> > +        }
> > +
> > +        if (dp_packet_batch_size(&icmp_batch) > 0) {
> > +            (*depth)++;
> > +            dp_netdev_recirculate(pmd, &icmp_batch);
> > +            (*depth)--;
> > +        }
> > +    }
> > +    err = netdev_push_header(netdev, batch, data);
> >      if (!err) {
> >          return 0;
> >      }
>
>
> --
> David Marchand
>
>
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to