> I just tested building our application with modified macros using GCC.
> With the changes below, GCC emitted slightly more efficient code (fewer 
> instructions).
> It shows that using uintptr_t instead of normal pointer arithmetic confuses 
> the compiler.
>
> /**
>   * add a byte-value offset to a const pointer
>   */
> -#define RTE_CONST_PTR_ADD(ptr, x) ((const void*)((uintptr_t)(ptr) + (x)))
> +#define RTE_CONST_PTR_ADD(ptr, x) ((const void*)((const char*)(ptr) + (x)))
>
>  /**
>   * subtract a byte-value offset from a const pointer
>   */
> -#define RTE_CONST_PTR_SUB(ptr, x) ((const void*)((uintptr_t)(ptr) - (x)))
> +#define RTE_CONST_PTR_SUB(ptr, x) ((const void*)((const char*)(ptr) - (x)))
>
>
> Snippet of code emitted using const char*:
>
>     if (likely((eth->ether_type == RTE_BE16(RTE_ETHER_TYPE_VLAN)) |
>   4dc47e:       41 0f b7 45 0c          movzwl 0xc(%r13),%eax
>   4dc483:       89 c7                   mov    %eax,%edi
>   4dc485:       83 e7 ef                and    $0xffffffef,%edi
>   4dc488:       66 81 ff 81 00          cmp    $0x81,%di
>   4dc48d:       74 0a                   je     4dc499 
> <service_ingress_dedicated_management+0x1b9>
>   4dc48f:       66 3d 88 a8             cmp    $0xa888,%ax
>   4dc493:       0f 85 3f 01 00 00       jne    4dc5d8 
> <service_ingress_dedicated_management+0x2f8>
>         if (vhdr->eth_proto == RTE_BE16(RTE_ETHER_TYPE_VLAN)) {
>   4dc499:       66 41 81 7d 10 81 00    cmpw   $0x81,0x10(%r13)
>   4dc4a0:       0f 84 1a 01 00 00       je     4dc5c0 
> <service_ingress_dedicated_management+0x2e0>
>             eth = RTE_CONST_PTR_ADD(eth, sizeof(struct rte_vlan_hdr));
>   4dc4a6:       49 8d 7d 04             lea    0x4(%r13),%rdi
>             packet_type = (union packet_type){
>   4dc4aa:       b8 01 00 00 00          mov    $0x1,%eax
>   4dc4af:       41 b8 12 00 00 00       mov    $0x12,%r8d
>     if (likely((*(const rte_be32_t *)RTE_CONST_PTR_ADD(eth, offsetof(struct rte_ether_hdr, ether_type)) & RTE_BE32(0xFFFFFF00)) ==
>   4dc4b5:       44 8b 4f 0c             mov    0xc(%rdi),%r9d
>
> Snippet of code emitted using uintptr_t:
>
>     if (likely((eth->ether_type == RTE_BE16(RTE_ETHER_TYPE_VLAN)) |
>   4dc47e:       41 0f b7 45 0c          movzwl 0xc(%r13),%eax
> +            eth = RTE_CONST_PTR_ADD(eth, 2 * sizeof(struct rte_vlan_hdr));
> +  4dc483:      4c 89 ef                mov    %r13,%rdi
> +    if (likely((eth->ether_type == RTE_BE16(RTE_ETHER_TYPE_VLAN)) |
>   4dc486:       41 89 c0                mov    %eax,%r8d
>   4dc489:       41 83 e0 ef             and    $0xffffffef,%r8d
>   4dc48d:       66 41 81 f8 81 00       cmp    $0x81,%r8w
>   4dc493:       74 0a                   je     4dc49f 
> <service_ingress_dedicated_management+0x1bf>
>   4dc495:       66 3d 88 a8             cmp    $0xa888,%ax
>   4dc499:       0f 85 51 01 00 00       jne    4dc5f0 
> <service_ingress_dedicated_management+0x310>
>         if (vhdr->eth_proto == RTE_BE16(RTE_ETHER_TYPE_VLAN)) {
>   4dc49f:       66 41 81 7d 10 81 00    cmpw   $0x81,0x10(%r13)
>   4dc4a6:       0f 84 24 01 00 00       je     4dc5d0 
> <service_ingress_dedicated_management+0x2f0>
>             eth = RTE_CONST_PTR_ADD(eth, sizeof(struct rte_vlan_hdr));
>   4dc4ac:       49 8d 7d 04             lea    0x4(%r13),%rdi
>             packet_type = (union packet_type){
>   4dc4b0:       b8 01 00 00 00          mov    $0x1,%eax
>   4dc4b5:       41 b8 12 00 00 00       mov    $0x12,%r8d
> +            eth = RTE_CONST_PTR_ADD(eth, sizeof(struct rte_vlan_hdr));
> +  4dc4bb:      49 89 f9                mov    %rdi,%r9
>     if (likely((*(const rte_be32_t *)RTE_CONST_PTR_ADD(eth, offsetof(struct rte_ether_hdr, ether_type)) & RTE_BE32(0xFFFFFF00)) ==
>   4dc4be:       8b 7f 0c                mov    0xc(%rdi),%edi

https://godbolt.org/z/5bc1bTrhe
In addition to producing less efficient assembly, the uintptr_t-based
pointer arithmetic also prevents clang from vectorizing and unrolling
loops (even with a constant length).

Reply via email to