On Tue, Nov 25, 2025 at 3:32 AM David Marchand <[email protected]> wrote:
> Hello, > > Sorry, not a full review but one part bothers me for TSO. > > On Mon, 24 Nov 2025 at 21:23, Mike Pattrick via dev > <[email protected]> wrote: > > @@ -9033,13 +9037,138 @@ pmd_send_port_cache_lookup(const struct > dp_netdev_pmd_thread *pmd, > > return tx_port_lookup(&pmd->send_port_cache, port_no); > > } > > > > +/* Return NULL on no ICMP reply needed. */ > > +static struct dp_packet * > > +netdev_generate_frag_needed(struct dp_packet *packet, int send_size, > int mtu) > > +{ > > + const struct eth_header *eth; > > + const void *l3; > > + size_t l3_len; > > + bool is_ipv6; > > + > > + if (send_size < mtu) { > > + return NULL; > > + } > > + > > + eth = dp_packet_eth(packet); > > + if (!eth) { > > + return NULL; > > + } > > + > > + if (eth->eth_type == htons(ETH_TYPE_IP)) { > > + is_ipv6 = false; > > + } else if (eth->eth_type == htons(ETH_TYPE_IPV6)) { > > + is_ipv6 = true; > > + } else { > > + return NULL; > > + } > > + > > + l3 = dp_packet_l3(packet); > > + l3_len = dp_packet_l3_size(packet); > > + > > + if (is_ipv6) { > > + const struct ovs_16aligned_ip6_hdr *ip6; > > + struct in6_addr ip6_src; > > + struct in6_addr ip6_dst; > > + size_t max_payload; > > + const void *l4_buf; > > + uint8_t nw_proto; > > + uint8_t nw_frag; > > + size_t l4_len; > > + > > + ip6 = (const struct ovs_16aligned_ip6_hdr *) l3; > > + > > + if (mtu < 1280) { > > + return NULL; > > + } > > I don't have all the context in mind. > What does 1280 stand for? 
> > > > + > > + if (ipv6_addr_is_multicast(&ip6->ip6_src) || > > + ipv6_addr_is_any(&ip6->ip6_src) || > > + ipv6_addr_is_loopback(&ip6->ip6_src)) { > > + return NULL; > > + } > > + > > + nw_proto = ip6->ip6_nxt; > > + l4_buf = ip6 + 1; > > + l4_len = l3_len - sizeof *ip6; > > + if (!parse_ipv6_ext_hdrs(&l4_buf, &l4_len, &nw_proto, &nw_frag, > > + NULL, NULL)) { > > + return NULL; > > + } > > + > > + if (nw_frag == FLOW_NW_FRAG_LATER) { > > + return NULL; > > + } > > + > > + if (nw_proto == IPPROTO_ICMPV6) { > > + const struct icmp6_header *icmp = l4_buf; > > + if (icmp && packet_icmpv6_is_err(icmp->icmp6_type)) { > > + return NULL; > > + } > > + } > > + > > + max_payload = 1280 - ETH_HEADER_LEN - IPV6_HEADER_LEN - > > + ICMP6_DATA_HEADER_LEN; > > + l3_len = l3_len < max_payload ? l3_len : max_payload; > > + > > + memcpy(&ip6_src, &ip6->ip6_dst, sizeof(ip6_dst)); > > + memcpy(&ip6_dst, &ip6->ip6_src, sizeof(ip6_dst)); > > + return compose_ipv6_ptb(eth->eth_dst, eth->eth_src, > > + &ip6_dst, &ip6_src, > > + htonl(mtu), l3, l3_len); > > + } else { > > + const struct ip_header *ip; > > + size_t icmp_payload_len; > > + size_t available; > > + > > + ip = (const struct ip_header *) l3; > > + > > + if (mtu < 576) { > > And here, why 576? > The 1280 above and 576 here are the RFC-specified minimum MTUs (RFC 8200 for IPv6, RFC 791/1191 for IPv4); both of these checks mirror how PMTU discovery is implemented in the Linux kernel. 
> > > + return NULL; > > + } > > + > > + if (!(ip->ip_frag_off & htons(IP_DF))) { > > + return NULL; > > + } > > + > > + if (ip_is_multicast(get_16aligned_be32(&ip->ip_src)) || > > + ip_is_broadcast(get_16aligned_be32(&ip->ip_src)) || > > + ip_is_loopback(get_16aligned_be32(&ip->ip_src))) { > > + return NULL; > > + } > > + > > + if (ip->ip_proto == IPPROTO_ICMP) { > > + const struct icmp_header *icmp = dp_packet_l4(packet); > > + if (icmp && packet_icmp_is_err(icmp->icmp_type)) { > > + return NULL; > > + } > > + } > > + icmp_payload_len = IP_IHL(ip->ip_ihl_ver) * 4 + > ICMP_ERROR_DATA_L4_LEN; > > + > > + available = l3_len; > > + if (icmp_payload_len > available) { > > + icmp_payload_len = available; > > + } > > + > > + return compose_ipv4_fn(eth->eth_dst, eth->eth_src, > > + get_16aligned_be32(&ip->ip_dst), > > + get_16aligned_be32(&ip->ip_src), > > + htons(mtu), l3, icmp_payload_len); > > + } > > +} > > + > > static int > > -push_tnl_action(const struct dp_netdev_pmd_thread *pmd, > > +push_tnl_action(struct dp_netdev_pmd_thread *pmd, > > const struct nlattr *attr, > > struct dp_packet_batch *batch) > > { > > - struct tx_port *tun_port; > > + size_t i, size = dp_packet_batch_size(batch); > > const struct ovs_action_push_tnl *data; > > + uint32_t *depth = recirc_depth_get(); > > + struct dp_packet *packet; > > + struct tx_port *tun_port; > > + struct netdev *netdev; > > + int mtu; > > int err; > > > > data = nl_attr_get(attr); > > @@ -9049,7 +9178,45 @@ push_tnl_action(const struct dp_netdev_pmd_thread > *pmd, > > err = -EINVAL; > > goto error; > > } > > - err = netdev_push_header(tun_port->port->netdev, batch, data); > > + > > + netdev = tun_port->port->netdev; > > + if (netdev->mtu_user_config && > > + netdev_get_mtu(netdev, &mtu) == 0) { > > + struct dp_packet_batch icmp_batch; > > + > > + dp_packet_batch_init(&icmp_batch); > > + DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) { > > + int len = dp_packet_get_send_len(packet) + data->header_len; > > I 
suspect this will not work with TSO packets: > dp_packet_get_send_len() returns a length before segmentation and it > will not be the actual length of sent data. > I believe you are correct, also I think this is the second misuse of dp_packet_get_send_len() - out of three current uses - in a few months. Maybe that function should be renamed for clarity? As a side note, I was looking at the kernel module and unless I'm reading it wrong, queue_gso_packets() appears to segment the packet before applying cutlen whereas other areas appear to apply cutlen before segmentation. I don't know if that's intentional. I can send in a new version that appropriately accounts for seg size. Thanks, M > > > > + > > + struct dp_packet *icmp; > > + > > + icmp = netdev_generate_frag_needed(packet, len, mtu); > > + if (!icmp) { > > + dp_packet_batch_refill(batch, packet, i); > > + continue; > > + } > > + > > + dp_packet_delete(packet); > > + COVERAGE_INC(datapath_drop_tunnel_mtu_drop); > > + > > + pkt_metadata_init(&icmp->md, data->tnl_port); > > + > > + dp_packet_batch_add(&icmp_batch, icmp); > > + } > > + > > + if (*depth >= MAX_RECIRC_DEPTH) { > > + COVERAGE_ADD(datapath_drop_recirc_error, > > + dp_packet_batch_size(&icmp_batch)); > > + dp_packet_delete_batch(&icmp_batch, true); > > + } > > + > > + if (dp_packet_batch_size(&icmp_batch) > 0) { > > + (*depth)++; > > + dp_netdev_recirculate(pmd, &icmp_batch); > > + (*depth)--; > > + } > > + } > > + err = netdev_push_header(netdev, batch, data); > > if (!err) { > > return 0; > > } > > > -- > David Marchand > > _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
