Manual unrolling makes sense! Are you OK if we land the minimal changes for
the __rte_raw_cksum optimization first and consider manually unrolling the
ipv4/ipv6 headers as a follow-up? Morten requested that I break the
patch up and minimize the changes (I'm working on this now). If these
were the only cases causing pain for my patch, it would make more sense
to do the unroll first, but there are other cases to consider:

- mlx5_flow_dv.c: its usage of __rte_raw_cksum could arguably
be unrolled too, but the trade-off is that manually unrolled code
ends up spread around. One option is for rte_cksum.h to provide
specialized unrolled functions for fixed lengths, which keeps the code
consolidated (at the cost of additional API surface).
- hinic_pmd_tx.c: it should call rte_ipv6_phdr_cksum and
rte_ipv4_phdr_cksum instead of duplicating their logic.


On Fri, Jan 9, 2026 at 12:39 AM Stephen Hemminger
<[email protected]> wrote:
>
> On Thu,  8 Jan 2026 16:47:13 -0500
> [email protected] wrote:
>
> > diff --git a/lib/net/rte_ip6.h b/lib/net/rte_ip6.h
> > index d1abf1f5d5..8a7e5e4b8a 100644
> > --- a/lib/net/rte_ip6.h
> > +++ b/lib/net/rte_ip6.h
> > @@ -560,19 +560,18 @@ rte_ipv6_phdr_cksum(const struct rte_ipv6_hdr *ipv6_hdr, uint64_t ol_flags)
> >  static inline uint16_t
> >  rte_ipv6_phdr_cksum(const struct rte_ipv6_hdr *ipv6_hdr, uint64_t ol_flags)
> >  {
> > -     uint32_t sum;
> >       struct {
> >               rte_be32_t len;   /* L4 length. */
> >               rte_be32_t proto; /* L4 protocol - top 3 bytes must be zero */
> > -     } psd_hdr;
> > -
> > -     psd_hdr.proto = (uint32_t)(ipv6_hdr->proto << 24);
> > -     if (ol_flags & (RTE_MBUF_F_TX_TCP_SEG | RTE_MBUF_F_TX_UDP_SEG))
> > -             psd_hdr.len = 0;
> > -     else
> > -             psd_hdr.len = ipv6_hdr->payload_len;
> > +     } psd_hdr = {
> > +             .len = (ol_flags & (RTE_MBUF_F_TX_TCP_SEG | RTE_MBUF_F_TX_UDP_SEG))
> > +                     ? (rte_be32_t)0
> > +                     : ipv6_hdr->payload_len,
> > +             .proto = (uint32_t)(ipv6_hdr->proto << 24)
> > +     };
> > +     RTE_SUPPRESS_UNINITIALIZED_WARNING(psd_hdr);
> >
> > -     sum = __rte_raw_cksum(&ipv6_hdr->src_addr,
> > +     uint32_t sum = __rte_raw_cksum(&ipv6_hdr->src_addr,
> >               sizeof(ipv6_hdr->src_addr) + sizeof(ipv6_hdr->dst_addr),
> >               0);
> >       sum = __rte_raw_cksum(&psd_hdr, sizeof(psd_hdr), sum);
> > --
>
> Seems like this could be unrolled as well.
>
> static inline uint16_t
> rte_ipv6_phdr_cksum(const struct rte_ipv6_hdr *ipv6_hdr, uint64_t ol_flags)
> {
>         union {
>                 struct {
>                         struct rte_ipv6_addr src_addr;  /* 16 bytes */
>                         struct rte_ipv6_addr dst_addr;  /* 16 bytes */
>                         rte_be32_t len;                 /* 4 bytes  */
>                         rte_be32_t proto;               /* 4 bytes  */
>                 } psd;
>                 uint16_t u16[20];
>         } hdr = {
>                 .psd = {
>                         .src_addr = ipv6_hdr->src_addr,
>                         .dst_addr = ipv6_hdr->dst_addr,
>                         .proto = (uint32_t)(ipv6_hdr->proto << 24),
>                 }
>         };
>         uint32_t sum;
>
>         if (!(ol_flags & (RTE_MBUF_F_TX_TCP_SEG | RTE_MBUF_F_TX_UDP_SEG)))
>                 hdr.psd.len = ipv6_hdr->payload_len;
>
>         /* Unrolled sum of 20 uint16_t words:
>          * [0-7]:   src_addr
>          * [8-15]:  dst_addr
>          * [16-17]: len
>          * [18-19]: proto (3 zero bytes + next header)
>          */
>         sum = hdr.u16[0]  + hdr.u16[1]  + hdr.u16[2]  + hdr.u16[3]  +
>               hdr.u16[4]  + hdr.u16[5]  + hdr.u16[6]  + hdr.u16[7]  +
>               hdr.u16[8]  + hdr.u16[9]  + hdr.u16[10] + hdr.u16[11] +
>               hdr.u16[12] + hdr.u16[13] + hdr.u16[14] + hdr.u16[15] +
>               hdr.u16[16] + hdr.u16[17] + hdr.u16[18] + hdr.u16[19];
>
>         sum = (sum & 0xffff) + (sum >> 16);
>         sum = (sum & 0xffff) + (sum >> 16);
>         return (uint16_t)sum;
> }

Reply via email to