On Wed, May 06, 2020 at 02:06:09PM +0100, Harry van Haaren wrote:
> This commit adds an AVX-512 dpcls lookup implementation.
> It uses the AVX-512 SIMD ISA to perform multiple miniflow
> operations in parallel.
> 
> To run this implementation, the "avx512f" and "bmi2" ISAs are
> required. These ISA checks are performed at runtime while
> probing the subtable implementation. If a CPU does not provide
> both "avx512f" and "bmi2", then this code does not execute.
> 
> The avx512 code is built as a separate static library, with added
> CFLAGS to enable the required ISA features. By building only this
> static library with avx512 enabled, it is ensured that the main OVS
> core library is *not* using avx512, and that OVS continues to run
> as before on CPUs that do not support avx512.
> 
> The approach taken in this implementation is to use the
> gather instruction to access the packet miniflow, allowing
> any miniflow blocks to be loaded into an AVX-512 register.
> This maximises the usefulness of the register, and hence this
> implementation handles any subtable with up to 8 miniflow bits.
> 
> Note that specialization of these avx512 lookup routines
> still provides performance value, as the hashing of the
> resulting data is performed in scalar code, and compile-time
> loop unrolling occurs when specialized to miniflow bits.
> 

Hi Harry,

I haven't tried running the code because my machine only supports
AVX2. There are some minor issues such as indentation, but I read
through it with the example below and I think it's correct.
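
(In case it helps anyone else wondering whether their box could take
this code path at all, here is a quick stand-alone check.  It assumes
GCC/Clang's __builtin_cpu_supports() and is only my illustration, not
how the patch itself probes the CPU.)

    #include <stdio.h>

    /* Hypothetical probe; the patch performs its own checks while
     * probing the subtable implementation. */
    int
    main(void)
    {
        int ok = __builtin_cpu_supports("avx512f")
                 && __builtin_cpu_supports("bmi2");

        printf("avx512 gather dpcls lookup %s usable on this CPU\n",
               ok ? "is" : "is NOT");
        return ok ? 0 : 1;
    }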

Given that you have to do a lot of preparation (e.g. popcount,
creating bit masks, broadcasts, etc.) before using the AVX
instructions, do you have some performance numbers? I didn't see any
from OVS Conf 2018 or 2019. Is using AVX-512 much better than AVX2?

> Signed-off-by: Harry van Haaren <[email protected]>
> ---
>  lib/automake.mk                        |  16 ++
>  lib/dpif-netdev-lookup-avx512-gather.c | 255 +++++++++++++++++++++++++
>  lib/dpif-netdev-lookup.c               |   7 +
>  lib/dpif-netdev-lookup.h               |   7 +
>  lib/dpif-netdev.c                      |   4 +
>  5 files changed, 289 insertions(+)
>  create mode 100644 lib/dpif-netdev-lookup-avx512-gather.c
> 
> diff --git a/lib/automake.mk b/lib/automake.mk
> index 19e454c4b..d8a05b384 100644
> --- a/lib/automake.mk
> +++ b/lib/automake.mk
> @@ -8,13 +8,16 @@
>  # libopenvswitch.la is the library to link against for binaries like vswitchd.
>  # The code itself is built as two separate static libraries;
>  # - core: Core files, always compiled with distro provided CFLAGS
> +# - lookupavx512: ISA optimized routines that require CPUID checks at runtime
>  lib_LTLIBRARIES += lib/libopenvswitch.la
>  lib_LTLIBRARIES += lib/libopenvswitchcore.la
> +lib_LTLIBRARIES += lib/libopenvswitchlookupavx512.la
>  
>  # Dummy library to link against doesn't have any sources, but does
>  # depend on libopenvswitchcore static library
>  lib_libopenvswitch_la_SOURCES =
>  lib_libopenvswitch_la_LIBADD = lib/libopenvswitchcore.la
> +lib_libopenvswitch_la_LIBADD += lib/libopenvswitchlookupavx512.la
>  
>  # Dummy library continues to depend on external libraries as before
>  lib_libopenvswitch_la_LIBADD += $(SSL_LIBS)
> @@ -31,6 +34,19 @@ lib_libopenvswitch_la_LDFLAGS = \
>          $(lib_libopenvswitchcore_la_LIBS) \
>          $(AM_LDFLAGS)
>  
> +
> +# Build lookupavx512 library with extra CFLAGS enabled. This allows the
> +# compiler to use the ISA features required for the ISA optimized code-paths.
> +lib_libopenvswitchlookupavx512_la_CFLAGS = \
> +     -mavx512f \
> +     -mavx512bw \
> +     -mavx512dq \
> +     -mbmi2 \
> +     $(AM_CFLAGS)
> +lib_libopenvswitchlookupavx512_la_SOURCES = \
> +     lib/dpif-netdev-lookup-avx512-gather.c
> +
> +
>  # Build core vswitch libraries as before
>  lib_libopenvswitchcore_la_SOURCES = \
>       lib/aes128.c \
> diff --git a/lib/dpif-netdev-lookup-avx512-gather.c b/lib/dpif-netdev-lookup-avx512-gather.c
> new file mode 100644
> index 000000000..52348041b
> --- /dev/null
> +++ b/lib/dpif-netdev-lookup-avx512-gather.c
> @@ -0,0 +1,255 @@
> +/*
> + * Copyright (c) 2020, Intel Corporation.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifdef __x86_64__
> +
> +#include <config.h>
> +
> +#include "dpif-netdev.h"
> +#include "dpif-netdev-lookup.h"
> +#include "dpif-netdev-private.h"
> +#include "cmap.h"
> +#include "flow.h"
> +#include "pvector.h"
> +#include "openvswitch/vlog.h"
> +
> +#include <immintrin.h>
> +
> +VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
> +
> +static inline __m512i
> +_mm512_popcnt_epi64_manual(__m512i v_in)
> +{
> +    static const uint8_t pop_lut[64] = {
> +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> +    };
> +    __m512i v_pop_lut = _mm512_loadu_si512(pop_lut);
> +
> +    __m512i v_in_srl8 = _mm512_srli_epi64(v_in, 4);
> +    __m512i v_nibble_mask = _mm512_set1_epi8(0xF);
> +    __m512i v_in_lo = _mm512_and_si512(v_in, v_nibble_mask);
> +    __m512i v_in_hi = _mm512_and_si512(v_in_srl8, v_nibble_mask);
> +
> +    __m512i v_lo_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_lo);
> +    __m512i v_hi_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_hi);
> +    __m512i v_u8_pop = _mm512_add_epi8(v_lo_pop, v_hi_pop);
> +
> +    return _mm512_sad_epu8(v_u8_pop, _mm512_setzero_si512());
> +}
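
For readers not used to the vpshufb nibble-LUT trick: per 64-bit lane
this computes the same value as __builtin_popcountll().  A scalar
model of one lane (my own illustration, not part of the patch):

    /* Scalar model of what _mm512_popcnt_epi64_manual() computes in
     * each 64-bit lane (illustration only). */
    static inline uint64_t
    popcnt_u64_nibble_lut(uint64_t v)
    {
        static const uint8_t pop_lut[16] = {
            0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
        };
        uint64_t count = 0;

        for (int i = 0; i < 16; i++) {
            count += pop_lut[(v >> (i * 4)) & 0xF];
        }
        return count;   /* Same result as __builtin_popcountll(v). */
    }

The vector version does the low/high nibble lookups with
_mm512_shuffle_epi8() and then sums the eight per-byte counts of each
u64 lane into that lane with _mm512_sad_epu8() against zero.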
> +
> +static inline uint64_t
> +netdev_rule_matches_key(const struct dpcls_rule *rule,
> +                        const uint32_t mf_bits_total,
> +                        const uint64_t * block_cache)
> +{
> +    ovs_assert(mf_bits_total <= 8);
> +    const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
> +    const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
> +    const uint32_t lane_mask = (1 << mf_bits_total) - 1;
> +
> +    /* Always load a full cache line from blocks_cache. Other loads must be
> +     * trimmed to the amount of data required for mf_bits_total blocks.
> +     */
> +    __m512i v_blocks = _mm512_loadu_si512(&block_cache[0]);
> +    __m512i v_mask   = _mm512_maskz_loadu_epi64(lane_mask, &maskp[0]);
> +    __m512i v_key    = _mm512_maskz_loadu_epi64(lane_mask, &keyp[0]);
> +
> +    __m512i v_data = _mm512_and_si512(v_blocks, v_mask);
> +    uint32_t res_mask = _mm512_mask_cmpeq_epi64_mask(lane_mask, v_data, v_key);
> +
> +    /* returns 1 assuming result of SIMD compare is all blocks */
> +    return res_mask == lane_mask;
> +}
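
In scalar terms the above boils down to checking
(block & mask) == key for each of the mf_bits_total blocks; a rough
scalar equivalent (my own sketch, just to spell out the semantics):

    /* Scalar sketch of the same rule/key check (illustration only). */
    static inline uint64_t
    rule_matches_key_scalar(const uint64_t *block_cache,
                            const uint64_t *keyp, const uint64_t *maskp,
                            uint32_t mf_bits_total)
    {
        for (uint32_t i = 0; i < mf_bits_total; i++) {
            if ((block_cache[i] & maskp[i]) != keyp[i]) {
                return 0;
            }
        }
        return 1;
    }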
> +

I think the function below is the most difficult one.
I wonder if there is a better way to make it easier to understand,
e.g. by breaking it into subfunctions or utility functions?
(I sketch one possible split right after the function.)
I ended up working through it using an example based on slide 2 of
your slides here:
https://www.openvswitch.org/support/ovscon2019/day1/1108-next_steps_sw_datapath_hvh.pdf
and the intrinsics API documentation here:
https://software.intel.com/sites/landingpage/IntrinsicsGuide/

> +static inline uint32_t ALWAYS_INLINE
> +avx512_lookup_impl(struct dpcls_subtable *subtable,
> +                   uint32_t keys_map,
> +                   const struct netdev_flow_key *keys[],
> +                   struct dpcls_rule **rules,
> +                   const uint32_t bit_count_u0,
> +                   const uint32_t bit_count_u1)
> +{
> +    const uint32_t bit_count_total = bit_count_u0 + bit_count_u1;
> +    int i;
> +    uint32_t hashes[NETDEV_MAX_BURST];
> +    const uint32_t n_pkts = __builtin_popcountll(keys_map);
> +    ovs_assert(NETDEV_MAX_BURST >= n_pkts);
> +
> +    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)uint64_t block_cache[NETDEV_MAX_BURST * 8];
> +
> +    const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0];
> +    const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1];
> +    ovs_assert(__builtin_popcountll(tbl_u0) == bit_count_u0);
> +    ovs_assert(__builtin_popcountll(tbl_u1) == bit_count_u1);
> +
> +    /* Load subtable blocks for masking later */
> +    const uint64_t *tbl_blocks = miniflow_get_values(&subtable->mask.mf);
> +    const __m512i v_tbl_blocks = _mm512_loadu_si512(&tbl_blocks[0]);
> +
> +    /* Load pre-created subtable masks for each block in subtable */
> +    const __mmask8 bit_count_total_mask = (1 << bit_count_total) - 1;
> +    const __m512i v_mf_masks = _mm512_maskz_loadu_epi64(bit_count_total_mask,
> +                                                        subtable->mf_masks);
> +
> +    ULLONG_FOR_EACH_1 (i, keys_map) {
> +        const uint64_t pkt_mf_u0_bits = keys[i]->mf.map.bits[0];
> +        const uint64_t pkt_mf_u0_pop = __builtin_popcountll(pkt_mf_u0_bits);
> +
> +        /* Pre-create register with *PER PACKET* u0 offset */
> +        const __mmask8 u1_bcast_mask = (UINT8_MAX << bit_count_u0);
> +        const __m512i v_idx_u0_offset = _mm512_maskz_set1_epi64(u1_bcast_mask,
> +                                                                pkt_mf_u0_pop);
> +
> +        /* Broadcast u0, u1 bitmasks to 8x u64 lanes */
> +        __m512i v_u0 = _mm512_set1_epi64(pkt_mf_u0_bits);
> +        __m512i v_pkt_bits = _mm512_mask_set1_epi64(v_u0, u1_bcast_mask,
> +                                         keys[i]->mf.map.bits[1]);
> +
> +        /* Bitmask by pre-created masks */
> +        __m512i v_masks = _mm512_and_si512(v_pkt_bits, v_mf_masks);
> +
> +        /* Manual AVX512 popcount for u64 lanes */
> +        __m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
> +
> +        /* Offset popcounts for u1 with pre-created offset register */
> +        __m512i v_indexes = _mm512_add_epi64(v_popcnts, v_idx_u0_offset);
> +
> +        /* Gather u64 on single packet, merge with zero reg, up to 8 blocks */
> +        const __m512i v_zeros = _mm512_setzero_si512();
> +        const uint64_t *pkt_data = miniflow_get_values(&keys[i]->mf);
> +        __m512i v_all_blocks = _mm512_mask_i64gather_epi64(v_zeros,
> +                                 bit_count_total_mask, v_indexes, pkt_data, 8);
The indentation looks off here.

> +
> +        /* Zero out bits that pkt doesn't have:
> +         * - 2x pext() to extract bits from packet miniflow as needed by TBL
> +         * - Shift u1 over by bit_count of u0, OR to create zero bitmask
> +         */
> +         uint64_t u0_to_zero = _pext_u64(keys[i]->mf.map.bits[0], tbl_u0);
> +         uint64_t u1_to_zero = _pext_u64(keys[i]->mf.map.bits[1], tbl_u1);
> +         uint64_t zero_mask = (u1_to_zero << bit_count_u0) | u0_to_zero;
indentation: remove one space
> +
> +        /* Mask blocks using AND with subtable blocks, use k-mask to zero
> +         * where lanes as required for this packet.
> +         */
> +        __m512i v_masked_blocks = _mm512_maskz_and_epi64(zero_mask,
> +                                                v_all_blocks, v_tbl_blocks);
> +
> +        /* Store to blocks cache, full cache line aligned */
> +        _mm512_storeu_si512(&block_cache[i * 8], v_masked_blocks);
> +    }
> +
> +    /* Hash the now linearized blocks of packet metadata. */
> +    ULLONG_FOR_EACH_1 (i, keys_map) {
> +        uint64_t *block_ptr = &block_cache[i * 8];
> +        uint32_t hash = hash_add_words64(0, block_ptr, bit_count_total);
> +        hashes[i] = hash_finish(hash, bit_count_total * 8);
> +    }
> +
> +    /* Lookup: this returns a bitmask of packets where the hash table had
> +     * an entry for the given hash key. Presence of a hash key does not
> +     * guarantee matching the key, as there can be hash collisions.
> +     */
> +    uint32_t found_map;
> +    const struct cmap_node *nodes[NETDEV_MAX_BURST];
> +    found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
> +
> +    /* Verify that packet actually matched rule. If not found, a hash
> +     * collision has taken place, so continue searching with the next node.
> +     */
> +    ULLONG_FOR_EACH_1 (i, found_map) {
> +        struct dpcls_rule *rule;
> +
> +        CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
> +            const uint32_t cidx = i * 8;
> +            uint32_t match = netdev_rule_matches_key(rule, bit_count_total,
> +                                                     &block_cache[cidx]);
> +            if (OVS_LIKELY(match)) {
> +                rules[i] = rule;
> +                subtable->hit_cnt++;
> +                goto next;
> +            }
> +        }
> +
> +        /* None of the found rules was a match.  Clear the i-th bit to
> +         * search for this key in the next subtable. */
> +        ULLONG_SET0(found_map, i);
> +    next:
> +        ;                     /* Keep Sparse happy. */
> +    }
> +
> +    return found_map;
> +}
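
On the subfunction idea above: as a rough sketch (the helper name and
the exact split are mine, not something the patch has to adopt), the
per-packet index computation could be pulled out on its own, e.g.:

    /* Hypothetical helper (name and split are mine): compute, for one
     * packet, the index of each subtable block inside the packet's
     * miniflow value array.  This is the broadcast/mask/popcount part
     * of the loop above. */
    static inline __m512i ALWAYS_INLINE
    avx512_mf_block_indexes(const struct netdev_flow_key *key,
                            __m512i v_mf_masks, uint32_t bit_count_u0)
    {
        const uint64_t pkt_mf_u0_bits = key->mf.map.bits[0];
        const uint64_t pkt_mf_u0_pop = __builtin_popcountll(pkt_mf_u0_bits);

        /* Indexes of u1 blocks are offset by the packet's u0 popcount. */
        const __mmask8 u1_bcast_mask = (UINT8_MAX << bit_count_u0);
        const __m512i v_idx_u0_offset =
            _mm512_maskz_set1_epi64(u1_bcast_mask, pkt_mf_u0_pop);

        /* Broadcast the packet's u0/u1 bitmaps, mask each lane down to
         * the bits below its subtable bit, and popcount per lane. */
        __m512i v_u0 = _mm512_set1_epi64(pkt_mf_u0_bits);
        __m512i v_pkt_bits = _mm512_mask_set1_epi64(v_u0, u1_bcast_mask,
                                                    key->mf.map.bits[1]);
        __m512i v_masks = _mm512_and_si512(v_pkt_bits, v_mf_masks);
        __m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);

        return _mm512_add_epi64(v_popcnts, v_idx_u0_offset);
    }

The loop body would then read as: compute indexes, gather, build the
pext zero-mask, maskz_and with the subtable blocks, store, which maps
more directly onto the steps in the slides.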

If anyone is interested, the example below, together with the slides,
helps in understanding the above function.

diff --git a/lib/dpif-netdev-lookup-avx512-gather.c b/lib/dpif-netdev-lookup-avx512-gather.c
index 52348041bd00..f84a95423cf8 100644
--- a/lib/dpif-netdev-lookup-avx512-gather.c
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -93,56 +93,77 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,
 
     OVS_ALIGNED_VAR(CACHE_LINE_SIZE)uint64_t block_cache[NETDEV_MAX_BURST * 8];
 
-    const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0];
-    const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1];
-    ovs_assert(__builtin_popcountll(tbl_u0) == bit_count_u0);
-    ovs_assert(__builtin_popcountll(tbl_u1) == bit_count_u1);
+    const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0]; //1000,0000
+    const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1]; //0100,0000
+    ovs_assert(__builtin_popcountll(tbl_u0) == bit_count_u0); //1
+    ovs_assert(__builtin_popcountll(tbl_u1) == bit_count_u1); //1
+    // bit_count_total = 2
 
     /* Load subtable blocks for masking later */
-    const uint64_t *tbl_blocks = miniflow_get_values(&subtable->mask.mf);
-    const __m512i v_tbl_blocks = _mm512_loadu_si512(&tbl_blocks[0]);
+    const uint64_t *tbl_blocks = miniflow_get_values(&subtable->mask.mf); //point to ipv4 mask
+    const __m512i v_tbl_blocks = _mm512_loadu_si512(&tbl_blocks[0]);      //point to ipv4 mask
 
     /* Load pre-created subtable masks for each block in subtable */
-    const __mmask8 bit_count_total_mask = (1 << bit_count_total) - 1;
-    const __m512i v_mf_masks = _mm512_maskz_loadu_epi64(bit_count_total_mask,
+    const __mmask8 bit_count_total_mask = (1 << bit_count_total) - 1; // (1 << 2) - 1 = 0x3
+    const __m512i v_mf_masks = _mm512_maskz_loadu_epi64(bit_count_total_mask /* 0x3 */,
                                                         subtable->mf_masks);
+    // subtable->mf_masks[0] = 0b01111111
+    // subtable->mf_masks[1] = 0b00111111
+    // v_mf_masks = [0,0,0,0,0,0, 0b00111111, 0b01111111]
 
-    ULLONG_FOR_EACH_1 (i, keys_map) {
-        const uint64_t pkt_mf_u0_bits = keys[i]->mf.map.bits[0];
-        const uint64_t pkt_mf_u0_pop = __builtin_popcountll(pkt_mf_u0_bits);
+    ULLONG_FOR_EACH_1 (i, keys_map) { // for each packet in the batch
+        const uint64_t pkt_mf_u0_bits = keys[i]->mf.map.bits[0]; //0b1000,0100
+        const uint64_t pkt_mf_u0_pop = __builtin_popcountll(pkt_mf_u0_bits); //2
 
         /* Pre-create register with *PER PACKET* u0 offset */
-        const __mmask8 u1_bcast_mask = (UINT8_MAX << bit_count_u0);
+        const __mmask8 u1_bcast_mask = (UINT8_MAX << bit_count_u0); //(0xff << 1) = 0xfe
         const __m512i v_idx_u0_offset = _mm512_maskz_set1_epi64(u1_bcast_mask,
                                                                 pkt_mf_u0_pop);
+        //v_idx_u0_offset = [2,2,2,2,2,2,2,0]

         /* Broadcast u0, u1 bitmasks to 8x u64 lanes */
-        __m512i v_u0 = _mm512_set1_epi64(pkt_mf_u0_bits);
-        __m512i v_pkt_bits = _mm512_mask_set1_epi64(v_u0, u1_bcast_mask,
-                                         keys[i]->mf.map.bits[1]);
+        __m512i v_u0 = _mm512_set1_epi64(pkt_mf_u0_bits); // [0b10000100, 0b10000100, 0b10000100, ...]
+
+        __m512i v_pkt_bits = _mm512_mask_set1_epi64(v_u0, u1_bcast_mask /*0xfe*/,
+                                         keys[i]->mf.map.bits[1] /* 0b01100000 */);
+        // [0b01100000, 0b01100000, 0b01100000, 0b01100000, 0b01100000, 0b01100000, 0b01100000, 0b10000100]
 
-        /* Bitmask by pre-created masks */
+
+        /* Bitmask by pre-created masks. */
         __m512i v_masks = _mm512_and_si512(v_pkt_bits, v_mf_masks);
+        // v_masks = [0,0,0,0,0,0, 0b00100000,0b00000100]
 
         /* Manual AVX512 popcount for u64 lanes */
         __m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
+        // v_popcnts = [0,0,0,0,0,0,1,1]
 
         /* Offset popcounts for u1 with pre-created offset register */
         __m512i v_indexes = _mm512_add_epi64(v_popcnts, v_idx_u0_offset);
+        // v_indexes = [0,0,0,0,0,0,3,1]
 
         /* Gather u64 on single packet, merge with zero reg, up to 8 blocks */
         const __m512i v_zeros = _mm512_setzero_si512();
         const uint64_t *pkt_data = miniflow_get_values(&keys[i]->mf);
+        // pkt_data = ipv4_src, ipv4_dst, mac_src, vlan_tci
+
         __m512i v_all_blocks = _mm512_mask_i64gather_epi64(v_zeros,
-                                 bit_count_total_mask, v_indexes, pkt_data, 8);
+                                 bit_count_total_mask /* 0x3 */,
+                                 v_indexes, pkt_data, 8);
+        //v_all_blocks: use v_index[0]=1*8 , v_index[1]=3*8 to gather data
+        //v_all_blocks = [0,0,0,0,0,0, ipv4_dst, vlan_tci]
 
         /* Zero out bits that pkt doesn't have:
          * - 2x pext() to extract bits from packet miniflow as needed by TBL
          * - Shift u1 over by bit_count of u0, OR to create zero bitmask
          */
-         uint64_t u0_to_zero = _pext_u64(keys[i]->mf.map.bits[0], tbl_u0);
-         uint64_t u1_to_zero = _pext_u64(keys[i]->mf.map.bits[1], tbl_u1);
+         uint64_t u0_to_zero = _pext_u64(keys[i]->mf.map.bits[0] /* 0b1000,0100 */,
+                                         tbl_u0 /* 0b1000,0000 */);
+         // u0_to_zero = 0b00000001
+         uint64_t u1_to_zero = _pext_u64(keys[i]->mf.map.bits[1] /* 0b0110,0000 */,
+                                         tbl_u1 /* 0b0100,0000 */);
+         // u1_to_zero = 0b00000001
          uint64_t zero_mask = (u1_to_zero << bit_count_u0) | u0_to_zero;
+        // 0b00000011
 
         /* Mask blocks using AND with subtable blocks, use k-mask to zero
          * where lanes as required for this packet.

---
Pretty cool piece of code. Thanks!

William
