Only one real concern about the -1 usage, the rest looks fine (just some
questions).
//Eelco
On 28 Jan 2022, at 16:20, Harry van Haaren wrote:
> This commit fixes the minimum packet size for the vlan/ipv4/tcp
> traffic profile, which was previously incorrectly set.
>
> This commit also disallows any fragmented IPv4 packets from being
> matched in the optimized miniflow-extract, avoiding complexity of
> handling fragmented packets and using scalar fallback instead.
> The DF (don't fragment) bit is now ignored, and stripped from the
> resulting miniflow.
>
> Fixes: aa85a25095 ("dpif-netdev/mfex: Add more AVX512 traffic profiles.")
>
> Signed-off-by: Harry van Haaren <[email protected]>
>
> ---
>
> v2:
> - Fixup the "frag-offset" mask from incorrect value, to ignore DF bit (Eelco)
> - The OVS_UNLIKELY() is added as the extra instructions/inline-func-call
> was confusing the compiler here, resulting in slow code. By marking
> the branch as unlikely, the code sequence generated is optimal again, and
> the extra AND instruction has no measurable performance impact.
>
> ---
> lib/dpif-netdev-extract-avx512.c | 30 +++++++++++++++++++++++++-----
> 1 file changed, 25 insertions(+), 5 deletions(-)
>
> diff --git a/lib/dpif-netdev-extract-avx512.c
> b/lib/dpif-netdev-extract-avx512.c
> index d23349482..a35c73510 100644
> --- a/lib/dpif-netdev-extract-avx512.c
> +++ b/lib/dpif-netdev-extract-avx512.c
> @@ -157,7 +157,7 @@ _mm512_maskz_permutexvar_epi8_wrap(__mmask64 kmask,
> __m512i idx, __m512i a)
> 0, 0, 0, 0, /* Src IP */ \
> 0, 0, 0, 0, /* Dst IP */
>
> -#define PATTERN_IPV4_MASK PATTERN_IPV4_GEN(0xFF, 0xFE, 0xFF, 0xFF)
> +#define PATTERN_IPV4_MASK PATTERN_IPV4_GEN(0xFF, 0xBF, 0xFF, 0xFF)
Thanks for correcting my wrongly calculated bit mask ;)
> #define PATTERN_IPV4_UDP PATTERN_IPV4_GEN(0x45, 0, 0, 0x11)
> #define PATTERN_IPV4_TCP PATTERN_IPV4_GEN(0x45, 0, 0, 0x06)
>
> @@ -389,11 +389,28 @@ static const struct mfex_profile
> mfex_profiles[PROFILE_COUNT] =
> .dp_pkt_offs = {
> 14, UINT16_MAX, 18, 38,
> },
> - .dp_pkt_min_size = 46,
> + .dp_pkt_min_size = 58,
> },
> };
>
>
> +/* Static data to strip away the DF bit from an Eth/IPv4 miniflow. */
> +static union mfex_data eth_ipv4_df_strip_mask = {
> + .u8_data = {
> + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
> + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xBF, -1, -1, -1,
> + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
> + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
> +};
This is really confusing to me OCD brain, you define an unsigned u8 data and
pass in -1.
Guess you wanted 0xFF but looked for some way to do distinguish that you do not
care about those values?
If this is true, why not define something like you do in other parts of this
code with “#define NU 0”?
> +
> +static void ALWAYS_INLINE
> +mfex_ipv4_strip_df_bit(__m512i v_blk0, __m512i v_df_mask, uint64_t *blocks)
> +{
> + /* strip away ipv4 DF bit. */
> + __m512i v_blk0_df_strip = _mm512_and_si512(v_blk0, v_df_mask);
> + _mm512_storeu_si512(&blocks[2], v_blk0_df_strip);
> +}
> +
> /* Protocol specific helper functions, for calculating offsets/lenghts. */
> static int32_t
> mfex_ipv4_set_l2_pad_size(struct dp_packet *pkt, struct ip_header *nh,
> @@ -471,6 +488,7 @@ mfex_avx512_process(struct dp_packet_batch *packets,
> __m512i v_vals = _mm512_loadu_si512(&profile->probe_data);
> __m512i v_mask = _mm512_loadu_si512(&profile->probe_mask);
> __m512i v_shuf = _mm512_loadu_si512(&profile->store_shuf);
> + __m512i v_ipv4_df_mask = _mm512_loadu_si512(ð_ipv4_df_strip_mask);
Guess loading them always (even if we do not need them for other cases in the
future) will not harm performance?
And probably is needed to avoid a stall when executing this is
mfex_ipv4_strip_df_bit() itself.
>
> __mmask64 k_shuf = profile->store_kmsk;
> __m128i v_bits = _mm_loadu_si128((void *) &profile->mf_bits);
> @@ -498,7 +516,7 @@ mfex_avx512_process(struct dp_packet_batch *packets,
>
> __m512i v_pkt0_masked = _mm512_and_si512(v_pkt0, v_mask);
> __mmask64 k_cmp = _mm512_cmpeq_epi8_mask(v_pkt0_masked, v_vals);
> - if (k_cmp != UINT64_MAX) {
> + if (OVS_UNLIKELY(k_cmp != UINT64_MAX)) {
> continue;
> }
>
> @@ -526,8 +544,6 @@ mfex_avx512_process(struct dp_packet_batch *packets,
> v_blk0 = _mm512_maskz_permutex2var_epi8_skx(k_shuf, v_pkt0,
> v_shuf, v512_zeros);
> }
> - _mm512_storeu_si512(&blocks[2], v_blk0);
Moving this store to mfex_ipv4_strip_df_bit() might hide it when people add new
cases?
But I guess the tests will fail and they find out quickly…
>
> /* Perform "post-processing" per profile, handling details not easily
> * handled in the above generic AVX512 code. Examples include TCP
> flag
> @@ -539,6 +555,7 @@ mfex_avx512_process(struct dp_packet_batch *packets,
> break;
>
> case PROFILE_ETH_VLAN_IPV4_TCP: {
> + mfex_ipv4_strip_df_bit(v_blk0, v_ipv4_df_mask, blocks);
> mfex_vlan_pcp(pkt[14], &keys[i].buf[4]);
>
> uint32_t size_from_ipv4 = size - VLAN_ETH_HEADER_LEN;
> @@ -554,6 +571,7 @@ mfex_avx512_process(struct dp_packet_batch *packets,
> } break;
>
> case PROFILE_ETH_VLAN_IPV4_UDP: {
> + mfex_ipv4_strip_df_bit(v_blk0, v_ipv4_df_mask, blocks);
> mfex_vlan_pcp(pkt[14], &keys[i].buf[4]);
>
> uint32_t size_from_ipv4 = size - VLAN_ETH_HEADER_LEN;
> @@ -565,6 +583,7 @@ mfex_avx512_process(struct dp_packet_batch *packets,
> } break;
>
> case PROFILE_ETH_IPV4_TCP: {
> + mfex_ipv4_strip_df_bit(v_blk0, v_ipv4_df_mask, blocks);
> /* Process TCP flags, and store to blocks. */
> const struct tcp_header *tcp = (void *)&pkt[34];
> mfex_handle_tcp_flags(tcp, &blocks[6]);
> @@ -579,6 +598,7 @@ mfex_avx512_process(struct dp_packet_batch *packets,
> } break;
>
> case PROFILE_ETH_IPV4_UDP: {
> + mfex_ipv4_strip_df_bit(v_blk0, v_ipv4_df_mask, blocks);
> /* Handle dynamic l2_pad_size. */
> uint32_t size_from_ipv4 = size - sizeof(struct eth_header);
> struct ip_header *nh = (void *)&pkt[sizeof(struct
> eth_header)];
> --
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev