Re: [ovs-dev] [v6 11/11] odp-execute: Add ISA implementation of set_masked IPv4 action

Eelco Chaudron Thu, 02 Jun 2022 07:43:45 -0700

On 10 May 2022, at 16:22, Emma Finn wrote:

> This commit adds support for the AVX512 implementation of the
> ipv4_set_addrs action as well as an AVX512 implementation of
> updating the checksums.
>
> Signed-off-by: Emma Finn <[email protected]>
> ---
>  lib/odp-execute-avx512.c  | 194 ++++++++++++++++++++++++++++++++++++++
>  lib/odp-execute-private.c |   1 +
>  lib/odp-execute.c         |  21 ++++-
>  3 files changed, 211 insertions(+), 5 deletions(-)
>
> diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
> index ede00b750..618fa37a7 100644
> --- a/lib/odp-execute-avx512.c
> +++ b/lib/odp-execute-avx512.c
> @@ -22,6 +22,7 @@
>  #include <config.h>
>  #include <errno.h>
>
> +#include "csum.h"
>  #include "cpu.h"
>  #include "dp-packet.h"
>  #include "immintrin.h"
> @@ -175,6 +176,197 @@ action_avx512_eth_set_addrs(void *dp OVS_UNUSED, struct 
> dp_packet_batch *batch,
>      }
>  }
>
> +static inline uint16_t ALWAYS_INLINE
> +avx512_l4_update_csum(struct ip_header *old_header, __m256i res)
> +{


Please add comments to the two below checksum functions, and I’ll do a full 
review in the next revision.

> +    uint16_t tmp_checksum;
> +    __m256i v_zeros = _mm256_setzero_si256();
> +    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xffff, 0x0302, 0xffff,
> +                                          0x0504, 0xffff, 0x0706, 0xffff,
> +                                          0x0100, 0xffff, 0x0302, 0xffff,
> +                                          0xffff, 0xffff, 0xffff, 0xffff);
> +    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xffff, 0xffff, 0xffff,
> +                                          0x0d0c, 0xffff, 0x0f0e, 0xffff,
> +                                          0xffff, 0xffff, 0xffff, 0xffff,
> +                                          0xffff, 0xffff, 0xffff, 0xffff);
> +    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
> +                                          0xF, 0xF, 0xF, 0xF);
> +
> +    __m256i oh = _mm256_loadu_si256((void *) old_header);
> +    oh = _mm256_mask_blend_epi16(0x3C0, oh, res);
> +    __m256i v_shuf1 = _mm256_shuffle_epi8(oh, v_swap16a);
> +    __m256i v_shuf2 = _mm256_shuffle_epi8(oh, v_swap16b);
> +
> +    /* Add field values. */
> +    __m256i v_sum = _mm256_add_epi32(v_shuf1, v_shuf2);
> +
> +    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
> +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> +
> +    /* Shuffle 32-bit value from 3rd lane into first lane for final hadd. */
> +    v_sum = _mm256_permutexvar_epi32(v_swap32a, v_sum);
> +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> +    v_sum = _mm256_hadd_epi16(v_sum, v_zeros);
> +
> +    /* Extract checksum value. */
> +    tmp_checksum = _mm256_extract_epi16(v_sum, 0);
> +
> +    return ~tmp_checksum;
> +}
> +
> +static inline uint16_t ALWAYS_INLINE
> +avx512_ipv4_recalc_csum(__m256i res)

Dont think this is a recalc, but just a new calculation, so maybe just call it
avx512_ipv4_csum()?

> +{
> +    uint32_t new_checksum;
> +    __m256i v_zeros = _mm256_setzero_si256();
> +
> +    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xffff, 0x0302, 0xffff,
> +                                          0x0504, 0xffff, 0x0706, 0xffff,
> +                                          0x0100, 0xffff, 0x0302, 0xffff,
> +                                          0xffff, 0xffff, 0xffff, 0xffff);
> +
> +    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xffff, 0xffff, 0xffff,
> +                                          0x0d0c, 0xffff, 0x0f0e, 0xffff,
> +                                          0xffff, 0xffff, 0xffff, 0xffff,
> +                                          0xffff, 0xffff, 0xffff, 0xffff);
> +
> +    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
> +                                          0xF, 0xF, 0xF, 0xF);

The above constant data seem to be the same as in avx512_l4_update_csum() so 
maybe define them as a constant and add some description to them.

> +
> +    __m256i v_shuf1 = _mm256_shuffle_epi8(res, v_swap16a);
> +    __m256i v_shuf2 = _mm256_shuffle_epi8(res, v_swap16b);
> +
> +    /* Add field values. */
> +    __m256i v_sum = _mm256_add_epi32(v_shuf1, v_shuf2);
> +
> +    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
> +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> +
> +    /* Shuffle 32-bit value from 3rd lane into first lane for final hadd. */
> +    v_sum = _mm256_permutexvar_epi32(v_swap32a, v_sum);
> +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> +    v_sum = _mm256_hadd_epi16(v_sum, v_zeros);
> +
> +    /* Extract new checksum value. */
> +    new_checksum = _mm256_extract_epi16(v_sum, 0);
> +
> +    return ~new_checksum;

How are IP options handled here?

> +}
> +
> +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_src) +
> +                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_src) ==
> +                  offsetof(struct ovs_key_ipv4, ipv4_dst));
> +
> +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_dst) +
> +                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_dst) ==
> +                  offsetof(struct ovs_key_ipv4, ipv4_proto));
> +
> +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_proto) +
> +                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_proto) ==
> +                  offsetof(struct ovs_key_ipv4, ipv4_tos));
> +
> +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_tos) +
> +                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_tos) ==
> +                  offsetof(struct ovs_key_ipv4, ipv4_ttl));

Some comments on why we need these asserts and how they related to the code 
above/below.

> +
> +static void
> +action_avx512_ipv4_set_addrs(void *dp OVS_UNUSED,
> +                             struct dp_packet_batch *batch,
> +                             const struct nlattr *a,
> +                             bool should_steal OVS_UNUSED)
> +{
> +    a = nl_attr_get(a);
> +    const struct ovs_key_ipv4 *key = nl_attr_get(a);
> +    const struct ovs_key_ipv4 *mask = get_mask(a, struct ovs_key_ipv4);
> +    struct dp_packet *packet;
> +    ovs_be16 old_csum;
> +
> +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> +        struct ip_header *nh = dp_packet_l3(packet);
> +        old_csum = nh->ip_csum;
> +
> +        __m256i v_key = _mm256_loadu_si256((void *) key);
> +        __m256i v_mask = _mm256_loadu_si256((void *) mask);

These two are not overwritten can we load them outside the loop?

> +        __m256i v_packet = _mm256_loadu_si256((void *) nh);
> +
> +        /* Shuffle key and mask to match ip_header struct layout. */
> +        static const uint8_t ip_shuffle_mask[32] = {
> +            0xFF, 5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
> +            6, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
> +            0, 1, 2, 3, 0xFF, 0xFF, 0xFF, 0xFF,
> +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};

Explain what the shuffle here?

> +        __m256i v_shuf32 = _mm256_setr_epi32(0x0, 0x2, 0xF, 0xF,
> +                                             0x1, 0xF, 0xF, 0xF);

v_shuf32 makes no sense to me, can we add a comment on what we do? Here and all 
the code below, as I do not want to figure it out each review ;)

> +
> +        __m256i v_shuffle = _mm256_loadu_si256((void *) ip_shuffle_mask);
> +
> +        __m256i v_key_shuf = _mm256_permutexvar_epi32(v_shuf32, v_key);
> +        v_key_shuf = _mm256_shuffle_epi8(v_key_shuf, v_shuffle);
> +
> +        __m256i v_mask_shuf = _mm256_permutexvar_epi32(v_shuf32, v_mask);
> +        v_mask_shuf = _mm256_shuffle_epi8(v_mask_shuf, v_shuffle);
> +
> +        __m256i v_pkt_masked = _mm256_andnot_si256(v_mask_shuf, v_packet);
> +        __m256i v_res = _mm256_or_si256(v_key_shuf, v_pkt_masked);
> +
> +        /* Update checksum. */
> +        uint16_t checksum = avx512_ipv4_recalc_csum(v_res);

> +
> +        /* Insert new checksum. */
> +        v_res = _mm256_insert_epi16(v_res, checksum, 5);
> +
> +       /* If ip_src or ip_dst has been modified, L4 checksum needs to
> +        * be updated too.
> +        */
> +        int update_mask = _mm256_movemask_epi8(v_mask);
> +        if (update_mask & 0xFF) {
> +
> +            uint16_t tmp_checksum = avx512_l4_update_csum(nh, v_res);
> +            tmp_checksum = ~tmp_checksum;
> +            uint16_t csum;
> +
> +            if (nh->ip_proto == IPPROTO_UDP) {
> +                /* New UDP checksum. */
> +                struct udp_header *uh = dp_packet_l4(packet);

This can be moved under the if statement below.

> +                if (uh->udp_csum) {
> +                    uint16_t old_udp_checksum = ~uh->udp_csum;
> +
> +                    uint32_t udp_checksum = old_csum + tmp_checksum;
> +                    udp_checksum = csum_finish(udp_checksum);
> +                    uint16_t udp_csum = ~udp_checksum;
> +
> +                    uint32_t nw_udp_checksum = udp_csum + old_udp_checksum;
> +
> +                    csum =  csum_finish(nw_udp_checksum);
> +

I think there is this special UDP case that might need handling here:

if (!csum) {
     csum = htons(0xffff);

> +                    /* Insert new udp checksum. */
> +                    v_res = _mm256_insert_epi16(v_res, csum, 13);
> +                }
> +            }

Guess this could be an " } else if if (nh->ip_proto == IPPROTO_TCP) {"

> +            if (nh->ip_proto == IPPROTO_TCP) {
> +                /* New TCP checksum. */
> +                struct tcp_header *th = dp_packet_l4(packet);
> +                uint16_t old_tcp_checksum = ~th->tcp_csum;
> +
> +                uint32_t tcp_checksum = old_csum + tmp_checksum;
> +                tcp_checksum = csum_finish(tcp_checksum);
> +                uint16_t tcp_csum = ~tcp_checksum;
> +
> +                uint32_t nw_tcp_checksum = tcp_csum + old_tcp_checksum;
> +
> +                csum =  csum_finish(nw_tcp_checksum);
> +
> +                th->tcp_csum = csum;
> +            }
> +        }
> +
> +        /* Store new IP header. */
> +        _mm256_storeu_si256((void *) nh, v_res);
> +    }
> +}
> +

Did not review the below, as the architecture needs changing (see the previous 
patch).

>  static void
>  action_avx512_set_masked(void *dp OVS_UNUSED,
>                           struct dp_packet_batch *batch OVS_UNUSED,
> @@ -233,6 +425,8 @@ action_avx512_init(struct odp_execute_action_impl *self)
>      self->funcs[OVS_ACTION_ATTR_SET_MASKED] = action_avx512_set_masked;
>      self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] =
>                              action_avx512_eth_set_addrs;
> +    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] =
> +                            action_avx512_ipv4_set_addrs;
>      active_impl = *self;
>
>      return 0;
> diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
> index 34f13523a..cb77bab31 100644
> --- a/lib/odp-execute-private.c
> +++ b/lib/odp-execute-private.c
> @@ -284,6 +284,7 @@ action_autoval_init(struct odp_execute_action_impl *self)
>      self->funcs[OVS_ACTION_ATTR_PUSH_VLAN] = action_autoval_generic;
>      self->funcs[OVS_ACTION_ATTR_SET_MASKED] = action_set_masked_init;
>      self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_autoval_generic;
> +    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_autoval_generic;
>      active_impl = *self;
>
>      return 0;
> diff --git a/lib/odp-execute.c b/lib/odp-execute.c
> index 5c4dd8e33..cbf528f93 100644
> --- a/lib/odp-execute.c
> +++ b/lib/odp-execute.c
> @@ -585,11 +585,6 @@ odp_execute_masked_set_action(struct dp_packet *packet,
>          break;
>      }
>
> -    case OVS_KEY_ATTR_IPV4:
> -        odp_set_ipv4(packet, nl_attr_get(a),
> -                     get_mask(a, struct ovs_key_ipv4));
> -        break;
> -
>      case OVS_KEY_ATTR_IPV6:
>          odp_set_ipv6(packet, nl_attr_get(a),
>                       get_mask(a, struct ovs_key_ipv6));
> @@ -657,6 +652,7 @@ odp_execute_masked_set_action(struct dp_packet *packet,
>      case OVS_KEY_ATTR_ETHERNET:
>      case OVS_KEY_ATTR_ETHERTYPE:
>      case OVS_KEY_ATTR_IN_PORT:
> +    case OVS_KEY_ATTR_IPV4:
>      case OVS_KEY_ATTR_VLAN:
>      case OVS_KEY_ATTR_ICMP:
>      case OVS_KEY_ATTR_ICMPV6:
> @@ -892,6 +888,20 @@ action_mod_eth(void *dp OVS_UNUSED, struct 
> dp_packet_batch *batch,
>      }
>  }
>
> +static void
> +action_mod_ipv4(void *dp OVS_UNUSED, struct dp_packet_batch *batch,
> +                const struct nlattr *a OVS_UNUSED,
> +                bool should_steal OVS_UNUSED)
> +{
> +    a = nl_attr_get(a);
> +    struct dp_packet *packet;
> +
> +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> +        odp_set_ipv4(packet, nl_attr_get(a),
> +                     get_mask(a, struct ovs_key_ipv4));
> +    }
> +}
> +
>  /* Implementation of the scalar actions impl init function. Build up the
>   * array of func ptrs here.
>   */
> @@ -902,6 +912,7 @@ odp_action_scalar_init(struct odp_execute_action_impl 
> *self)
>      self->funcs[OVS_ACTION_ATTR_PUSH_VLAN] = action_push_vlan;
>      self->funcs[OVS_ACTION_ATTR_SET_MASKED] = action_set_masked;
>      self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_mod_eth;
> +    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_mod_ipv4;
>      actions_active_impl = *self;
>
>      return 0;
> -- 
> 2.25.1

This completes my initial review of this series, let me know if something is 
not clear. I have not done any actual testing on an AVX machine, but I will try 
to do that on the next revision.

Also, wondering how you tested performance on all of this? Maybe you can some 
details to the cover letter on how the relative performance numbers were 
gathered? Also wondering if you tested all of this without including DPDK in 
your build (it was/is on my TODO but I have not AVX machine yet)?

Cheers,

Eelco

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Re: [ovs-dev] [v6 11/11] odp-execute: Add ISA implementation of set_masked IPv4 action

Reply via email to