> -----Original Message----- > From: Eelco Chaudron <[email protected]> > Sent: Thursday 2 June 2022 15:43 > To: Finn, Emma <[email protected]> > Cc: Van Haaren, Harry <[email protected]>; Amber, Kumar > <[email protected]>; Stokes, Ian <[email protected]>; > [email protected] > Subject: Re: [v6 11/11] odp-execute: Add ISA implementation of > set_masked IPv4 action > > On 10 May 2022, at 16:22, Emma Finn wrote: > > > This commit adds support for the AVX512 implementation of the > > ipv4_set_addrs action as well as an AVX512 implementation of updating > > the checksums. > > > > Signed-off-by: Emma Finn <[email protected]> > > --- > > lib/odp-execute-avx512.c | 194 > ++++++++++++++++++++++++++++++++++++++ > > lib/odp-execute-private.c | 1 + > > lib/odp-execute.c | 21 ++++- > > 3 files changed, 211 insertions(+), 5 deletions(-) > > > > diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index > > ede00b750..618fa37a7 100644 > > --- a/lib/odp-execute-avx512.c > > +++ b/lib/odp-execute-avx512.c > > @@ -22,6 +22,7 @@ > > #include <config.h> > > #include <errno.h> > > > > +#include "csum.h" > > #include "cpu.h" > > #include "dp-packet.h" > > #include "immintrin.h" > > @@ -175,6 +176,197 @@ action_avx512_eth_set_addrs(void *dp > OVS_UNUSED, struct dp_packet_batch *batch, > > } > > } > > > > +static inline uint16_t ALWAYS_INLINE > > +avx512_l4_update_csum(struct ip_header *old_header, __m256i res) { > > Please add comments to the two below checksum functions, and I’ll do a > full review in the next revision. > > > + uint16_t tmp_checksum; > > + __m256i v_zeros = _mm256_setzero_si256(); > > + __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xffff, 0x0302, > 0xffff, > > + 0x0504, 0xffff, 0x0706, 0xffff, > > + 0x0100, 0xffff, 0x0302, 0xffff, > > + 0xffff, 0xffff, 0xffff, 0xffff); > > + __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xffff, 0xffff, > 0xffff, > > + 0x0d0c, 0xffff, 0x0f0e, 0xffff, > > + 0xffff, 0xffff, 0xffff, 0xffff, > > + 0xffff, 0xffff, 0xffff, 0xffff); > > + __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF, > > + 0xF, 0xF, 0xF, 0xF); > > + > > + __m256i oh = _mm256_loadu_si256((void *) old_header); > > + oh = _mm256_mask_blend_epi16(0x3C0, oh, res); > > + __m256i v_shuf1 = _mm256_shuffle_epi8(oh, v_swap16a); > > + __m256i v_shuf2 = _mm256_shuffle_epi8(oh, v_swap16b); > > + > > + /* Add field values. */ > > + __m256i v_sum = _mm256_add_epi32(v_shuf1, v_shuf2); > > + > > + /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */ > > + v_sum = _mm256_hadd_epi32(v_sum, v_zeros); > > + v_sum = _mm256_hadd_epi32(v_sum, v_zeros); > > + > > + /* Shuffle 32-bit value from 3rd lane into first lane for final hadd. > > */ > > + v_sum = _mm256_permutexvar_epi32(v_swap32a, v_sum); > > + v_sum = _mm256_hadd_epi32(v_sum, v_zeros); > > + v_sum = _mm256_hadd_epi16(v_sum, v_zeros); > > + > > + /* Extract checksum value. */ > > + tmp_checksum = _mm256_extract_epi16(v_sum, 0); > > + > > + return ~tmp_checksum; > > +} > > + > > +static inline uint16_t ALWAYS_INLINE > > +avx512_ipv4_recalc_csum(__m256i res) > > Dont think this is a recalc, but just a new calculation, so maybe just call it > avx512_ipv4_csum()? > > > +{ > > + uint32_t new_checksum; > > + __m256i v_zeros = _mm256_setzero_si256(); > > + > > + __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xffff, 0x0302, > 0xffff, > > + 0x0504, 0xffff, 0x0706, 0xffff, > > + 0x0100, 0xffff, 0x0302, 0xffff, > > + 0xffff, 0xffff, 0xffff, > > + 0xffff); > > + > > + __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xffff, 0xffff, > 0xffff, > > + 0x0d0c, 0xffff, 0x0f0e, 0xffff, > > + 0xffff, 0xffff, 0xffff, 0xffff, > > + 0xffff, 0xffff, 0xffff, > > + 0xffff); > > + > > + __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF, > > + 0xF, 0xF, 0xF, 0xF); > > The above constant data seem to be the same as in > avx512_l4_update_csum() so maybe define them as a constant and add > some description to them. > > > + > > + __m256i v_shuf1 = _mm256_shuffle_epi8(res, v_swap16a); > > + __m256i v_shuf2 = _mm256_shuffle_epi8(res, v_swap16b); > > + > > + /* Add field values. */ > > + __m256i v_sum = _mm256_add_epi32(v_shuf1, v_shuf2); > > + > > + /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */ > > + v_sum = _mm256_hadd_epi32(v_sum, v_zeros); > > + v_sum = _mm256_hadd_epi32(v_sum, v_zeros); > > + > > + /* Shuffle 32-bit value from 3rd lane into first lane for final hadd. > > */ > > + v_sum = _mm256_permutexvar_epi32(v_swap32a, v_sum); > > + v_sum = _mm256_hadd_epi32(v_sum, v_zeros); > > + v_sum = _mm256_hadd_epi16(v_sum, v_zeros); > > + > > + /* Extract new checksum value. */ > > + new_checksum = _mm256_extract_epi16(v_sum, 0); > > + > > + return ~new_checksum; > > How are IP options handled here? > > > +} > > + > > +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_src) + > > + MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_src) == > > + offsetof(struct ovs_key_ipv4, ipv4_dst)); > > + > > +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_dst) + > > + MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_dst) == > > + offsetof(struct ovs_key_ipv4, ipv4_proto)); > > + > > +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_proto) + > > + MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_proto) == > > + offsetof(struct ovs_key_ipv4, ipv4_tos)); > > + > > +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_tos) + > > + MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_tos) == > > + offsetof(struct ovs_key_ipv4, ipv4_ttl)); > > Some comments on why we need these asserts and how they related to the > code above/below. > > > + > > +static void > > +action_avx512_ipv4_set_addrs(void *dp OVS_UNUSED, > > + struct dp_packet_batch *batch, > > + const struct nlattr *a, > > + bool should_steal OVS_UNUSED) { > > + a = nl_attr_get(a); > > + const struct ovs_key_ipv4 *key = nl_attr_get(a); > > + const struct ovs_key_ipv4 *mask = get_mask(a, struct ovs_key_ipv4); > > + struct dp_packet *packet; > > + ovs_be16 old_csum; > > + > > + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { > > + struct ip_header *nh = dp_packet_l3(packet); > > + old_csum = nh->ip_csum; > > + > > + __m256i v_key = _mm256_loadu_si256((void *) key); > > + __m256i v_mask = _mm256_loadu_si256((void *) mask); > > These two are not overwritten can we load them outside the loop? > > > + __m256i v_packet = _mm256_loadu_si256((void *) nh); > > + > > + /* Shuffle key and mask to match ip_header struct layout. */ > > + static const uint8_t ip_shuffle_mask[32] = { > > + 0xFF, 5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, > > + 6, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, > > + 0, 1, 2, 3, 0xFF, 0xFF, 0xFF, 0xFF, > > + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}; > > Explain what the shuffle here? > > > + __m256i v_shuf32 = _mm256_setr_epi32(0x0, 0x2, 0xF, 0xF, > > + 0x1, 0xF, 0xF, 0xF); > > v_shuf32 makes no sense to me, can we add a comment on what we do? > Here and all the code below, as I do not want to figure it out each review ;) > > > + > > + __m256i v_shuffle = _mm256_loadu_si256((void *) > > + ip_shuffle_mask); > > + > > + __m256i v_key_shuf = _mm256_permutexvar_epi32(v_shuf32, > v_key); > > + v_key_shuf = _mm256_shuffle_epi8(v_key_shuf, v_shuffle); > > + > > + __m256i v_mask_shuf = _mm256_permutexvar_epi32(v_shuf32, > v_mask); > > + v_mask_shuf = _mm256_shuffle_epi8(v_mask_shuf, v_shuffle); > > + > > + __m256i v_pkt_masked = _mm256_andnot_si256(v_mask_shuf, > v_packet); > > + __m256i v_res = _mm256_or_si256(v_key_shuf, v_pkt_masked); > > + > > + /* Update checksum. */ > > + uint16_t checksum = avx512_ipv4_recalc_csum(v_res); > > > + > > + /* Insert new checksum. */ > > + v_res = _mm256_insert_epi16(v_res, checksum, 5); > > + > > + /* If ip_src or ip_dst has been modified, L4 checksum needs to > > + * be updated too. > > + */ > > + int update_mask = _mm256_movemask_epi8(v_mask); > > + if (update_mask & 0xFF) { > > + > > + uint16_t tmp_checksum = avx512_l4_update_csum(nh, v_res); > > + tmp_checksum = ~tmp_checksum; > > + uint16_t csum; > > + > > + if (nh->ip_proto == IPPROTO_UDP) { > > + /* New UDP checksum. */ > > + struct udp_header *uh = dp_packet_l4(packet); > > This can be moved under the if statement below. > > > + if (uh->udp_csum) { > > + uint16_t old_udp_checksum = ~uh->udp_csum; > > + > > + uint32_t udp_checksum = old_csum + tmp_checksum; > > + udp_checksum = csum_finish(udp_checksum); > > + uint16_t udp_csum = ~udp_checksum; > > + > > + uint32_t nw_udp_checksum = udp_csum + > > + old_udp_checksum; > > + > > + csum = csum_finish(nw_udp_checksum); > > + > > I think there is this special UDP case that might need handling here: > > if (!csum) { > csum = htons(0xffff); > > > + /* Insert new udp checksum. */ > > + v_res = _mm256_insert_epi16(v_res, csum, 13); > > + } > > + } > > Guess this could be an " } else if if (nh->ip_proto == IPPROTO_TCP) {" > > > + if (nh->ip_proto == IPPROTO_TCP) { > > + /* New TCP checksum. */ > > + struct tcp_header *th = dp_packet_l4(packet); > > + uint16_t old_tcp_checksum = ~th->tcp_csum; > > + > > + uint32_t tcp_checksum = old_csum + tmp_checksum; > > + tcp_checksum = csum_finish(tcp_checksum); > > + uint16_t tcp_csum = ~tcp_checksum; > > + > > + uint32_t nw_tcp_checksum = tcp_csum + > > + old_tcp_checksum; > > + > > + csum = csum_finish(nw_tcp_checksum); > > + > > + th->tcp_csum = csum; > > + } > > + } > > + > > + /* Store new IP header. */ > > + _mm256_storeu_si256((void *) nh, v_res); > > + } > > +} > > + > > Did not review the below, as the architecture needs changing (see the > previous patch). > > > static void > > action_avx512_set_masked(void *dp OVS_UNUSED, > > struct dp_packet_batch *batch OVS_UNUSED, @@ > > -233,6 +425,8 @@ action_avx512_init(struct odp_execute_action_impl > *self) > > self->funcs[OVS_ACTION_ATTR_SET_MASKED] = > action_avx512_set_masked; > > self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = > > action_avx512_eth_set_addrs; > > + self->set_masked_funcs[OVS_KEY_ATTR_IPV4] = > > + action_avx512_ipv4_set_addrs; > > active_impl = *self; > > > > return 0; > > diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c > > index 34f13523a..cb77bab31 100644 > > --- a/lib/odp-execute-private.c > > +++ b/lib/odp-execute-private.c > > @@ -284,6 +284,7 @@ action_autoval_init(struct > odp_execute_action_impl *self) > > self->funcs[OVS_ACTION_ATTR_PUSH_VLAN] = > action_autoval_generic; > > self->funcs[OVS_ACTION_ATTR_SET_MASKED] = > action_set_masked_init; > > self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = > > action_autoval_generic; > > + self->set_masked_funcs[OVS_KEY_ATTR_IPV4] = > > + action_autoval_generic; > > active_impl = *self; > > > > return 0; > > diff --git a/lib/odp-execute.c b/lib/odp-execute.c index > > 5c4dd8e33..cbf528f93 100644 > > --- a/lib/odp-execute.c > > +++ b/lib/odp-execute.c > > @@ -585,11 +585,6 @@ odp_execute_masked_set_action(struct > dp_packet *packet, > > break; > > } > > > > - case OVS_KEY_ATTR_IPV4: > > - odp_set_ipv4(packet, nl_attr_get(a), > > - get_mask(a, struct ovs_key_ipv4)); > > - break; > > - > > case OVS_KEY_ATTR_IPV6: > > odp_set_ipv6(packet, nl_attr_get(a), > > get_mask(a, struct ovs_key_ipv6)); @@ -657,6 > > +652,7 @@ odp_execute_masked_set_action(struct dp_packet *packet, > > case OVS_KEY_ATTR_ETHERNET: > > case OVS_KEY_ATTR_ETHERTYPE: > > case OVS_KEY_ATTR_IN_PORT: > > + case OVS_KEY_ATTR_IPV4: > > case OVS_KEY_ATTR_VLAN: > > case OVS_KEY_ATTR_ICMP: > > case OVS_KEY_ATTR_ICMPV6: > > @@ -892,6 +888,20 @@ action_mod_eth(void *dp OVS_UNUSED, struct > dp_packet_batch *batch, > > } > > } > > > > +static void > > +action_mod_ipv4(void *dp OVS_UNUSED, struct dp_packet_batch > *batch, > > + const struct nlattr *a OVS_UNUSED, > > + bool should_steal OVS_UNUSED) { > > + a = nl_attr_get(a); > > + struct dp_packet *packet; > > + > > + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { > > + odp_set_ipv4(packet, nl_attr_get(a), > > + get_mask(a, struct ovs_key_ipv4)); > > + } > > +} > > + > > /* Implementation of the scalar actions impl init function. Build up the > > * array of func ptrs here. > > */ > > @@ -902,6 +912,7 @@ odp_action_scalar_init(struct > odp_execute_action_impl *self) > > self->funcs[OVS_ACTION_ATTR_PUSH_VLAN] = action_push_vlan; > > self->funcs[OVS_ACTION_ATTR_SET_MASKED] = action_set_masked; > > self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = > action_mod_eth; > > + self->set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_mod_ipv4; > > actions_active_impl = *self; > > > > return 0; > > -- > > 2.25.1 > > This completes my initial review of this series, let me know if something is > not clear. I have not done any actual testing on an AVX machine, but I will > try to do that on the next revision. > > Also, wondering how you tested performance on all of this? Maybe you can > some details to the cover letter on how the relative performance numbers > were gathered? Also wondering if you tested all of this without including > DPDK in your build (it was/is on my TODO but I have not AVX machine yet)? > > Cheers, > > Eelco
Hi Eelco, Thanks for the review. Will take the comments on board and start re working for the next revision. Thanks, Emma _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
