> -----Original Message-----
> From: Eelco Chaudron <[email protected]>
> Sent: Thursday 2 June 2022 15:43
> To: Finn, Emma <[email protected]>
> Cc: Van Haaren, Harry <[email protected]>; Amber, Kumar
> <[email protected]>; Stokes, Ian <[email protected]>;
> [email protected]
> Subject: Re: [v6 11/11] odp-execute: Add ISA implementation of
> set_masked IPv4 action
> 
> On 10 May 2022, at 16:22, Emma Finn wrote:
> 
> > This commit adds support for the AVX512 implementation of the
> > ipv4_set_addrs action as well as an AVX512 implementation of updating
> > the checksums.
> >
> > Signed-off-by: Emma Finn <[email protected]>
> > ---
> >  lib/odp-execute-avx512.c  | 194
> ++++++++++++++++++++++++++++++++++++++
> >  lib/odp-execute-private.c |   1 +
> >  lib/odp-execute.c         |  21 ++++-
> >  3 files changed, 211 insertions(+), 5 deletions(-)
> >
> > diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index
> > ede00b750..618fa37a7 100644
> > --- a/lib/odp-execute-avx512.c
> > +++ b/lib/odp-execute-avx512.c
> > @@ -22,6 +22,7 @@
> >  #include <config.h>
> >  #include <errno.h>
> >
> > +#include "csum.h"
> >  #include "cpu.h"
> >  #include "dp-packet.h"
> >  #include "immintrin.h"
> > @@ -175,6 +176,197 @@ action_avx512_eth_set_addrs(void *dp
> OVS_UNUSED, struct dp_packet_batch *batch,
> >      }
> >  }
> >
> > +static inline uint16_t ALWAYS_INLINE
> > +avx512_l4_update_csum(struct ip_header *old_header, __m256i res) {
> 
> Please add comments to the two below checksum functions, and I’ll do a
> full review in the next revision.
> 
> > +    uint16_t tmp_checksum;
> > +    __m256i v_zeros = _mm256_setzero_si256();
> > +    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xffff, 0x0302,
> 0xffff,
> > +                                          0x0504, 0xffff, 0x0706, 0xffff,
> > +                                          0x0100, 0xffff, 0x0302, 0xffff,
> > +                                          0xffff, 0xffff, 0xffff, 0xffff);
> > +    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xffff, 0xffff,
> 0xffff,
> > +                                          0x0d0c, 0xffff, 0x0f0e, 0xffff,
> > +                                          0xffff, 0xffff, 0xffff, 0xffff,
> > +                                          0xffff, 0xffff, 0xffff, 0xffff);
> > +    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
> > +                                          0xF, 0xF, 0xF, 0xF);
> > +
> > +    __m256i oh = _mm256_loadu_si256((void *) old_header);
> > +    oh = _mm256_mask_blend_epi16(0x3C0, oh, res);
> > +    __m256i v_shuf1 = _mm256_shuffle_epi8(oh, v_swap16a);
> > +    __m256i v_shuf2 = _mm256_shuffle_epi8(oh, v_swap16b);
> > +
> > +    /* Add field values. */
> > +    __m256i v_sum = _mm256_add_epi32(v_shuf1, v_shuf2);
> > +
> > +    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
> > +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> > +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> > +
> > +    /* Shuffle 32-bit value from 3rd lane into first lane for final hadd. 
> > */
> > +    v_sum = _mm256_permutexvar_epi32(v_swap32a, v_sum);
> > +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> > +    v_sum = _mm256_hadd_epi16(v_sum, v_zeros);
> > +
> > +    /* Extract checksum value. */
> > +    tmp_checksum = _mm256_extract_epi16(v_sum, 0);
> > +
> > +    return ~tmp_checksum;
> > +}
> > +
> > +static inline uint16_t ALWAYS_INLINE
> > +avx512_ipv4_recalc_csum(__m256i res)
> 
> Dont think this is a recalc, but just a new calculation, so maybe just call it
> avx512_ipv4_csum()?
> 
> > +{
> > +    uint32_t new_checksum;
> > +    __m256i v_zeros = _mm256_setzero_si256();
> > +
> > +    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xffff, 0x0302,
> 0xffff,
> > +                                          0x0504, 0xffff, 0x0706, 0xffff,
> > +                                          0x0100, 0xffff, 0x0302, 0xffff,
> > +                                          0xffff, 0xffff, 0xffff,
> > + 0xffff);
> > +
> > +    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xffff, 0xffff,
> 0xffff,
> > +                                          0x0d0c, 0xffff, 0x0f0e, 0xffff,
> > +                                          0xffff, 0xffff, 0xffff, 0xffff,
> > +                                          0xffff, 0xffff, 0xffff,
> > + 0xffff);
> > +
> > +    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
> > +                                          0xF, 0xF, 0xF, 0xF);
> 
> The above constant data seem to be the same as in
> avx512_l4_update_csum() so maybe define them as a constant and add
> some description to them.
> 
> > +
> > +    __m256i v_shuf1 = _mm256_shuffle_epi8(res, v_swap16a);
> > +    __m256i v_shuf2 = _mm256_shuffle_epi8(res, v_swap16b);
> > +
> > +    /* Add field values. */
> > +    __m256i v_sum = _mm256_add_epi32(v_shuf1, v_shuf2);
> > +
> > +    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
> > +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> > +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> > +
> > +    /* Shuffle 32-bit value from 3rd lane into first lane for final hadd. 
> > */
> > +    v_sum = _mm256_permutexvar_epi32(v_swap32a, v_sum);
> > +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> > +    v_sum = _mm256_hadd_epi16(v_sum, v_zeros);
> > +
> > +    /* Extract new checksum value. */
> > +    new_checksum = _mm256_extract_epi16(v_sum, 0);
> > +
> > +    return ~new_checksum;
> 
> How are IP options handled here?
> 
> > +}
> > +
> > +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_src) +
> > +                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_src) ==
> > +                  offsetof(struct ovs_key_ipv4, ipv4_dst));
> > +
> > +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_dst) +
> > +                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_dst) ==
> > +                  offsetof(struct ovs_key_ipv4, ipv4_proto));
> > +
> > +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_proto) +
> > +                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_proto) ==
> > +                  offsetof(struct ovs_key_ipv4, ipv4_tos));
> > +
> > +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_tos) +
> > +                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_tos) ==
> > +                  offsetof(struct ovs_key_ipv4, ipv4_ttl));
> 
> Some comments on why we need these asserts and how they related to the
> code above/below.
> 
> > +
> > +static void
> > +action_avx512_ipv4_set_addrs(void *dp OVS_UNUSED,
> > +                             struct dp_packet_batch *batch,
> > +                             const struct nlattr *a,
> > +                             bool should_steal OVS_UNUSED) {
> > +    a = nl_attr_get(a);
> > +    const struct ovs_key_ipv4 *key = nl_attr_get(a);
> > +    const struct ovs_key_ipv4 *mask = get_mask(a, struct ovs_key_ipv4);
> > +    struct dp_packet *packet;
> > +    ovs_be16 old_csum;
> > +
> > +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> > +        struct ip_header *nh = dp_packet_l3(packet);
> > +        old_csum = nh->ip_csum;
> > +
> > +        __m256i v_key = _mm256_loadu_si256((void *) key);
> > +        __m256i v_mask = _mm256_loadu_si256((void *) mask);
> 
> These two are not overwritten can we load them outside the loop?
> 
> > +        __m256i v_packet = _mm256_loadu_si256((void *) nh);
> > +
> > +        /* Shuffle key and mask to match ip_header struct layout. */
> > +        static const uint8_t ip_shuffle_mask[32] = {
> > +            0xFF, 5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
> > +            6, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
> > +            0, 1, 2, 3, 0xFF, 0xFF, 0xFF, 0xFF,
> > +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
> 
> Explain what the shuffle here?
> 
> > +        __m256i v_shuf32 = _mm256_setr_epi32(0x0, 0x2, 0xF, 0xF,
> > +                                             0x1, 0xF, 0xF, 0xF);
> 
> v_shuf32 makes no sense to me, can we add a comment on what we do?
> Here and all the code below, as I do not want to figure it out each review ;)
> 
> > +
> > +        __m256i v_shuffle = _mm256_loadu_si256((void *)
> > + ip_shuffle_mask);
> > +
> > +        __m256i v_key_shuf = _mm256_permutexvar_epi32(v_shuf32,
> v_key);
> > +        v_key_shuf = _mm256_shuffle_epi8(v_key_shuf, v_shuffle);
> > +
> > +        __m256i v_mask_shuf = _mm256_permutexvar_epi32(v_shuf32,
> v_mask);
> > +        v_mask_shuf = _mm256_shuffle_epi8(v_mask_shuf, v_shuffle);
> > +
> > +        __m256i v_pkt_masked = _mm256_andnot_si256(v_mask_shuf,
> v_packet);
> > +        __m256i v_res = _mm256_or_si256(v_key_shuf, v_pkt_masked);
> > +
> > +        /* Update checksum. */
> > +        uint16_t checksum = avx512_ipv4_recalc_csum(v_res);
> 
> > +
> > +        /* Insert new checksum. */
> > +        v_res = _mm256_insert_epi16(v_res, checksum, 5);
> > +
> > +       /* If ip_src or ip_dst has been modified, L4 checksum needs to
> > +        * be updated too.
> > +        */
> > +        int update_mask = _mm256_movemask_epi8(v_mask);
> > +        if (update_mask & 0xFF) {
> > +
> > +            uint16_t tmp_checksum = avx512_l4_update_csum(nh, v_res);
> > +            tmp_checksum = ~tmp_checksum;
> > +            uint16_t csum;
> > +
> > +            if (nh->ip_proto == IPPROTO_UDP) {
> > +                /* New UDP checksum. */
> > +                struct udp_header *uh = dp_packet_l4(packet);
> 
> This can be moved under the if statement below.
> 
> > +                if (uh->udp_csum) {
> > +                    uint16_t old_udp_checksum = ~uh->udp_csum;
> > +
> > +                    uint32_t udp_checksum = old_csum + tmp_checksum;
> > +                    udp_checksum = csum_finish(udp_checksum);
> > +                    uint16_t udp_csum = ~udp_checksum;
> > +
> > +                    uint32_t nw_udp_checksum = udp_csum +
> > + old_udp_checksum;
> > +
> > +                    csum =  csum_finish(nw_udp_checksum);
> > +
> 
> I think there is this special UDP case that might need handling here:
> 
> if (!csum) {
>      csum = htons(0xffff);
> 
> > +                    /* Insert new udp checksum. */
> > +                    v_res = _mm256_insert_epi16(v_res, csum, 13);
> > +                }
> > +            }
> 
> Guess this could be an " } else if if (nh->ip_proto == IPPROTO_TCP) {"
> 
> > +            if (nh->ip_proto == IPPROTO_TCP) {
> > +                /* New TCP checksum. */
> > +                struct tcp_header *th = dp_packet_l4(packet);
> > +                uint16_t old_tcp_checksum = ~th->tcp_csum;
> > +
> > +                uint32_t tcp_checksum = old_csum + tmp_checksum;
> > +                tcp_checksum = csum_finish(tcp_checksum);
> > +                uint16_t tcp_csum = ~tcp_checksum;
> > +
> > +                uint32_t nw_tcp_checksum = tcp_csum +
> > + old_tcp_checksum;
> > +
> > +                csum =  csum_finish(nw_tcp_checksum);
> > +
> > +                th->tcp_csum = csum;
> > +            }
> > +        }
> > +
> > +        /* Store new IP header. */
> > +        _mm256_storeu_si256((void *) nh, v_res);
> > +    }
> > +}
> > +
> 
> Did not review the below, as the architecture needs changing (see the
> previous patch).
> 
> >  static void
> >  action_avx512_set_masked(void *dp OVS_UNUSED,
> >                           struct dp_packet_batch *batch OVS_UNUSED, @@
> > -233,6 +425,8 @@ action_avx512_init(struct odp_execute_action_impl
> *self)
> >      self->funcs[OVS_ACTION_ATTR_SET_MASKED] =
> action_avx512_set_masked;
> >      self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] =
> >                              action_avx512_eth_set_addrs;
> > +    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] =
> > +                            action_avx512_ipv4_set_addrs;
> >      active_impl = *self;
> >
> >      return 0;
> > diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
> > index 34f13523a..cb77bab31 100644
> > --- a/lib/odp-execute-private.c
> > +++ b/lib/odp-execute-private.c
> > @@ -284,6 +284,7 @@ action_autoval_init(struct
> odp_execute_action_impl *self)
> >      self->funcs[OVS_ACTION_ATTR_PUSH_VLAN] =
> action_autoval_generic;
> >      self->funcs[OVS_ACTION_ATTR_SET_MASKED] =
> action_set_masked_init;
> >      self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] =
> > action_autoval_generic;
> > +    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] =
> > + action_autoval_generic;
> >      active_impl = *self;
> >
> >      return 0;
> > diff --git a/lib/odp-execute.c b/lib/odp-execute.c index
> > 5c4dd8e33..cbf528f93 100644
> > --- a/lib/odp-execute.c
> > +++ b/lib/odp-execute.c
> > @@ -585,11 +585,6 @@ odp_execute_masked_set_action(struct
> dp_packet *packet,
> >          break;
> >      }
> >
> > -    case OVS_KEY_ATTR_IPV4:
> > -        odp_set_ipv4(packet, nl_attr_get(a),
> > -                     get_mask(a, struct ovs_key_ipv4));
> > -        break;
> > -
> >      case OVS_KEY_ATTR_IPV6:
> >          odp_set_ipv6(packet, nl_attr_get(a),
> >                       get_mask(a, struct ovs_key_ipv6)); @@ -657,6
> > +652,7 @@ odp_execute_masked_set_action(struct dp_packet *packet,
> >      case OVS_KEY_ATTR_ETHERNET:
> >      case OVS_KEY_ATTR_ETHERTYPE:
> >      case OVS_KEY_ATTR_IN_PORT:
> > +    case OVS_KEY_ATTR_IPV4:
> >      case OVS_KEY_ATTR_VLAN:
> >      case OVS_KEY_ATTR_ICMP:
> >      case OVS_KEY_ATTR_ICMPV6:
> > @@ -892,6 +888,20 @@ action_mod_eth(void *dp OVS_UNUSED, struct
> dp_packet_batch *batch,
> >      }
> >  }
> >
> > +static void
> > +action_mod_ipv4(void *dp OVS_UNUSED, struct dp_packet_batch
> *batch,
> > +                const struct nlattr *a OVS_UNUSED,
> > +                bool should_steal OVS_UNUSED) {
> > +    a = nl_attr_get(a);
> > +    struct dp_packet *packet;
> > +
> > +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> > +        odp_set_ipv4(packet, nl_attr_get(a),
> > +                     get_mask(a, struct ovs_key_ipv4));
> > +    }
> > +}
> > +
> >  /* Implementation of the scalar actions impl init function. Build up the
> >   * array of func ptrs here.
> >   */
> > @@ -902,6 +912,7 @@ odp_action_scalar_init(struct
> odp_execute_action_impl *self)
> >      self->funcs[OVS_ACTION_ATTR_PUSH_VLAN] = action_push_vlan;
> >      self->funcs[OVS_ACTION_ATTR_SET_MASKED] = action_set_masked;
> >      self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] =
> action_mod_eth;
> > +    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_mod_ipv4;
> >      actions_active_impl = *self;
> >
> >      return 0;
> > --
> > 2.25.1
> 
> This completes my initial review of this series, let me know if something is
> not clear. I have not done any actual testing on an AVX machine, but I will
> try to do that on the next revision.
> 
> Also, wondering how you tested performance on all of this? Maybe you can
> some details to the cover letter on how the relative performance numbers
> were gathered? Also wondering if you tested all of this without including
> DPDK in your build (it was/is on my TODO but I have not AVX machine yet)?
> 
> Cheers,
> 
> Eelco

Hi Eelco, 

Thanks for the review. Will take the comments on board and start re working for 
the next revision. 

Thanks, 
Emma 

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to