On 26 Sep 2022, at 15:29, Emma Finn wrote:

> This commit adds support for the AVX512 implementation of the
> ipv6_set_addrs action as well as an AVX512 implementation of
> updating the L4 checksums.
>
> Signed-off-by: Emma Finn <[email protected]>

Hi Emma,

Thanks for further enhancing the implementation of the AVX512 actions. Below 
are some comments, mostly style related, but with one additional optimization.

Cheers,

Eelco

> ---
> v3:
>   - Added a runtime check for AVX512 vbmi.
> v2:
>   - Added check for availbility of s6_addr32 field of struct in6_addr.
>   - Fixed network headers for freebsd builds.
> ---
> ---
>  lib/odp-execute-avx512.c  | 176 ++++++++++++++++++++++++++++++++++++++
>  lib/odp-execute-private.c |  17 ++++
>  lib/odp-execute-private.h |   1 +
>  3 files changed, 194 insertions(+)
>
> diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
> index 6c7713251..f97b3c2f7 100644
> --- a/lib/odp-execute-avx512.c
> +++ b/lib/odp-execute-avx512.c
> @@ -20,6 +20,9 @@
>
>  #include <config.h>
>  #include <errno.h>
> +#include <sys/types.h>
> +#include <netinet/in.h>
> +#include <netinet/ip6.h>
>
>  #include "csum.h"
>  #include "dp-packet.h"
> @@ -483,6 +486,172 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch 
> *batch,
>      }
>  }
>
> +#if HAVE_AVX512VBMI
> +static inline uint16_t ALWAYS_INLINE
> +__attribute__((__target__("avx512vbmi")))
> +avx512_ipv6_get_delta(__m512i ip6_header)

I guess the function name was from before you split up this function, as it's 
not at all what you're doing in this function.
I would suggest changing the name to something like avx512_ipv6_sum_header().

Also, can you go over the register naming and comment text below, as they also 
make no sense in the current form?

> +{
> +    __m256i v_zeros = _mm256_setzero_si256();
> +    __m512i v_shuf_src_dst = _mm512_setr_epi64(0x01, 0x02, 0x03, 0x04,
> +                                               0xFF, 0xFF, 0xFF, 0xFF);
> +
> +    __m512i v_header = _mm512_permutexvar_epi64(v_shuf_src_dst, ip6_header);
> +    __m256i v_ip6_src_dst =  _mm512_extracti64x4_epi64(v_header, 0);

Remove the extra space after the equal sign.

Please add a new line before the comment.

> +    /* These two shuffle masks, v_swap16a and v_swap16b, are to shuffle the
> +     * src and dst fields and add padding after each 16-bit value for the
> +     * following carry over addition. */
> +    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xFFFF, 0x0302, 0xFFFF,
> +                                          0x0504, 0xFFFF, 0x0706, 0xFFFF,
> +                                          0x0100, 0xFFFF, 0x0302, 0xFFFF,
> +                                          0x0504, 0xFFFF, 0x0706, 0xFFFF);
> +    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xFFFF, 0x0B0A, 0xFFFF,
> +                                          0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF,
> +                                          0x0908, 0xFFFF, 0x0B0A, 0xFFFF,
> +                                          0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF);
> +    __m256i v_shuf_old1 = _mm256_shuffle_epi8(v_ip6_src_dst, v_swap16a);
> +    __m256i v_shuf_old2 = _mm256_shuffle_epi8(v_ip6_src_dst, v_swap16b);
> +
> +    /* Add each part of the old and new headers together. */
> +    __m256i v_delta = _mm256_add_epi32(v_shuf_old1, v_shuf_old2);
> +
> +    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
> +    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
> +    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
> +
> +    /* Shuffle 32-bit value from 3rd lane into first lane for final
> +     * horizontal add. */
> +    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
> +                                          0xF, 0xF, 0xF, 0xF);
> +    v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta);
> +
> +    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
> +    v_delta = _mm256_hadd_epi16(v_delta, v_zeros);
> +
> +    /* Extract delta value. */
> +    return _mm256_extract_epi16(v_delta, 0);
> +}
> +
> +static inline uint16_t ALWAYS_INLINE
> +__attribute__((__target__("avx512vbmi")))
> +avx512_ipv6_addr_csum_delta(__m512i old_header, __m512i new_header)
> +{
> +    uint16_t delta;
> +    uint16_t old_delta = avx512_ipv6_get_delta(old_header);
> +    uint16_t new_delta = avx512_ipv6_get_delta(new_header);
> +    old_delta = ~old_delta;
> +    uint32_t csum_delta = old_delta + new_delta;
> +    delta = csum_finish(csum_delta);
> +
> +    return ~delta;

This function looks rather cluttered, what about the following:

{
    uint16_t old_delta = avx512_ipv6_get_delta(old_header);
    uint16_t new_delta = avx512_ipv6_get_delta(new_header);
    uint32_t csum_delta = ~old_delta + new_delta;

    return ~csum_finish(csum_delta);
}

> +}
> +
> +/* This function performs the same operation on each packet in the batch as
> + * the scalar odp_set_ipv6() function. */
> +static void
> +__attribute__((__target__("avx512vbmi")))
> +action_avx512_ipv6_set_addrs(struct dp_packet_batch *batch,
> +                             const struct nlattr *a)
> +{
> +    const struct ovs_key_ipv6 *key, *mask;
> +    struct dp_packet *packet;

Add a new line between definitions and code.

> +    a = nl_attr_get(a);
> +    key = nl_attr_get(a);
> +    mask = odp_get_key_mask(a, struct ovs_key_ipv6);

We have build asserts for the ovs_key_ipv4 key structure to make sure they do 
not change, we should add the same for v6.

> +
> +    /* Read the content of the key and mask in the respective registers. We
> +     * only load the size of the actual structure, which is only 40 bytes. */
> +    __m512i v_key = _mm512_maskz_loadu_epi64(0x1F, (void *) key);
> +    __m512i v_mask = _mm512_maskz_loadu_epi64(0x1F, (void *) mask);
> +
> +    /* This shuffle mask v_shuffle, is to shuffle key and mask to match the
> +     * ip6_hdr structure layout. */
> +    static const uint8_t ip_shuffle_mask[64] = {
> +            0x20, 0x21, 0x22, 0x23, 0xFF, 0xFF, 0x24, 0x26,
> +            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
> +            0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
> +            0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
> +            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
> +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF, 0xFF,
> +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
> +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF
> +    };
> +
> +    __m512i v_shuffle = _mm512_loadu_si512((void *) ip_shuffle_mask);
> +
> +    /* This shuffle is required for key and mask to match the layout of the
> +     * ip6_hdr struct. */
> +    __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key);
> +    __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle, v_mask);
> +
> +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> +        struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet);
> +
> +        /* Load the 40 bytes of the IPv6 header. */
> +        __m512i v_packet = _mm512_maskz_loadu_epi64(0x1F, (void *) nh);
> +
> +        /* AND the v_pkt_mask to the packet data (v_packet). */
> +        __m512i v_pkt_masked = _mm512_andnot_si512(v_mask_shuf, v_packet);
> +
> +        /* OR the new addresses (v_key_shuf) with the masked packet addresses
> +         * (v_pkt_masked). */
> +        __m512i v_new_hdr = _mm512_or_si512(v_key_shuf, v_pkt_masked);
> +
> +        /* If ip6_src or ip6_dst has been modified, L4 checksum needs to
> +         * be updated. */
> +        bool do_checksum = false;

So as we are trying to optimise code, this only needs to be done once, so we 
should move this outside the DP_PACKET_BATCH_FOR_EACH() loop.

> +#ifdef s6_addr32
> +        for (int j = 0; j < 4; j++) {
> +            if (mask->ipv6_dst.s6_addr32[j] || mask->ipv6_src.s6_addr32[j]) {
> +                do_checksum = true;
> +            }
> +        }
> +#else
> +        for (int j = 0; j < 16; j++) {
> +             if (mask->ipv6_dst.s6_addr[j] || mask->ipv6_src.s6_addr[j]) {
> +                do_checksum = true;
> +            }
> +        }
> +#endif

Not sure how fast slow/fast the above is compared with doing an AVX512 AND on 
the v_mask with a new v_address_mask + popcount?

> +        if (do_checksum) {
> +            uint8_t proto = nh->ip6_nxt;
> +            uint16_t delta_checksum = avx512_ipv6_addr_csum_delta(v_packet,
> +                                                                  v_new_hdr);
> +
> +            if (proto == IPPROTO_UDP) {
> +                struct udp_header *uh = dp_packet_l4(packet);

Add a new line here.

> +                if (uh->udp_csum) {
> +                    uint16_t old_udp_checksum = ~uh->udp_csum;
> +                    uint32_t udp_checksum = old_udp_checksum + 
> delta_checksum;

Add a new line here.

> +                    udp_checksum = csum_finish(udp_checksum);
> +
> +                    if (!udp_checksum) {
> +                        udp_checksum = htons(0xffff);
> +                    }
> +
> +                    uh->udp_csum = udp_checksum;
> +                }
> +            } else if (proto == IPPROTO_TCP) {
> +                struct tcp_header *th = dp_packet_l4(packet);
> +                uint16_t old_tcp_checksum = ~th->tcp_csum;
> +                uint32_t tcp_checksum = old_tcp_checksum + delta_checksum;

Add a new line here.

> +                tcp_checksum = csum_finish(tcp_checksum);
> +

Remove the new line.

> +                th->tcp_csum = tcp_checksum;
> +            } else if (proto == IPPROTO_ICMPV6) {
> +                struct icmp6_header *icmp = dp_packet_l4(packet);
> +                uint16_t old_icmp_checksum = ~icmp->icmp6_cksum;

Keep name consistency, so I would call it old_icmp6_checksum.

> +                uint32_t icmp6_checksum = old_icmp_checksum + delta_checksum;

Add a new line here.

> +                icmp6_checksum = csum_finish(icmp6_checksum);
> +

Remove the new line.

> +                icmp->icmp6_cksum = icmp6_checksum;
> +            }
> +        }
> +        /* Write back the modified IPv6 addresses. */
> +         _mm512_mask_storeu_epi64((void *) nh, 0x1F, v_new_hdr);
> +    }
> +}
> +#endif

As the #if is way out of sight, I would make this +#endif /* HAVE_AVX512VBMI */

> +
>  static void
>  action_avx512_set_masked(struct dp_packet_batch *batch, const struct nlattr 
> *a)
>  {
> @@ -514,6 +683,13 @@ action_avx512_init(struct odp_execute_action_impl *self 
> OVS_UNUSED)
>      impl_set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = 
> action_avx512_eth_set_addrs;
>      impl_set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_avx512_ipv4_set_addrs;
>
> +#if HAVE_AVX512VBMI
> +    if (action_avx512vbmi_isa_probe()) {
> +        impl_set_masked_funcs[OVS_KEY_ATTR_IPV6] =
> +                              action_avx512_ipv6_set_addrs;
> +    }
> +#endif
> +
>      return 0;
>  }
>
> diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
> index f80ae5a23..ff29e116f 100644
> --- a/lib/odp-execute-private.c
> +++ b/lib/odp-execute-private.c
> @@ -60,6 +60,23 @@ action_avx512_isa_probe(void)
>
>  #endif
>
> +#if ACTION_IMPL_AVX512_CHECK && HAVE_AVX512VBMI
> +bool
> +action_avx512vbmi_isa_probe(void)
> +{
> +    if (!cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI)) {
> +        return false;
> +    }
> +    return true;
> +}

just a nit, but I would make this as follows:

bool
action_avx512vbmi_isa_probe(void)
{
    if (cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI)) {
        return true;
    }
    return false;
}

> +#else
> +bool
> +action_avx512vbmi_isa_probe(void)
> +{
> +    return false;
> +}
> +#endif
> +
>  static struct odp_execute_action_impl action_impls[] = {
>      [ACTION_IMPL_AUTOVALIDATOR] = {
>          .available = false,
> diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h
> index 940180c99..643f41c2a 100644
> --- a/lib/odp-execute-private.h
> +++ b/lib/odp-execute-private.h
> @@ -78,6 +78,7 @@ BUILD_ASSERT_DECL(ACTION_IMPL_AUTOVALIDATOR == 1);
>  #define ACTION_IMPL_BEGIN (ACTION_IMPL_AUTOVALIDATOR + 1)
>
>  bool action_avx512_isa_probe(void);
> +bool action_avx512vbmi_isa_probe(void);
>
>  /* Odp execute init handles setting up the state of the actions functions at
>   * initialization time. It cannot return errors, as it must always succeed in
> -- 
> 2.25.1

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to