On Wed, Jun 10, 2020 at 3:47 AM Harry van Haaren <[email protected]> wrote:
>
> This commit adds an AVX-512 dpcls lookup implementation.
> It uses the AVX-512 SIMD ISA to perform multiple miniflow
> operations in parallel.
>
> To run this implementation, the "avx512f" and "bmi2" ISAs are
> required. These ISA checks are performed at runtime while
> probing the subtable implementation. If a CPU does not provide
> both "avx512f" and "bmi2", then this code does not execute.
>
> The avx512 code is built as a separate static library, with added
> CFLAGS to enable the required ISA features. By building only this
> static library with avx512 enabled, it is ensured that the main OVS
> core library is *not* using avx512, and that OVS continues to run
> as before on CPUs that do not support avx512.
>
> The approach taken in this implementation is to use the
> gather instruction to access the packet miniflow, allowing
> any miniflow blocks to be loaded into an AVX-512 register.
> This maximises the usefulness of the register, and hence this
> implementation handles any subtable with up to 8 miniflow bits.
>
> Note that specialization of these avx512 lookup routines
> still provides performance value, as the hashing of the
> resulting data is performed in scalar code, and compile-time
> loop unrolling occurs when specialized to miniflow bits.
>
> Signed-off-by: Harry van Haaren <[email protected]>
>
> ---
>
> v3:
> - Improve function name for _any subtable lookup
> - Use "" include not <> for immintrin.h
> - Add checks for SSE42 instructions in core OVS for CRC32-based hashing.
>   If not available, disable the AVX512 lookup implementation, as it uses
>   CRC32 for hashing, and the hashing algorithm must match core OVS.
>   Issue a #warning when building x86_64 without SSE42 for core OVS.
Where did you add this warning?

> - Rework ovs_asserts() into a function-selection-time check
> - Add #define for magic number 8, the number of u64 blocks in an AVX512
>   register
> - Add #if CHECKER around AVX code; sparse doesn't like checking it
> - Remove #warning if SSE42 isn't available. There is now no message if
>   the AVX512 routines are not being compiled into the binary; however,
>   the "subtable-lookup-get" command will not return it in the list.
>
> hvh: comment #warning for crc32 sse42 isa
>
> Signed-off-by: Harry van Haaren <[email protected]>
>
> hvh: avx512 add #if __CHECKER__
>
> Signed-off-by: Harry van Haaren <[email protected]>
> ---
>  lib/automake.mk                        |  16 ++
>  lib/dpif-netdev-lookup-avx512-gather.c | 265 +++++++++++++++++++++++++
>  lib/dpif-netdev-lookup.c               |  15 ++
>  lib/dpif-netdev-lookup.h               |   7 +
>  lib/dpif-netdev.c                      |   4 +
>  5 files changed, 307 insertions(+)
>  create mode 100644 lib/dpif-netdev-lookup-avx512-gather.c
>
> diff --git a/lib/automake.mk b/lib/automake.mk
> index 19e454c4b..d8a05b384 100644
> --- a/lib/automake.mk
> +++ b/lib/automake.mk
> @@ -8,13 +8,16 @@
>  # libopenvswitch.la is the library to link against for binaries like vswitchd.
>  # The code itself is built as two separate static libraries;
>  #  - core: Core files, always compiled with distro provided CFLAGS
> +#  - lookupavx512: ISA optimized routines that require CPUID checks at runtime
>  lib_LTLIBRARIES += lib/libopenvswitch.la
>  lib_LTLIBRARIES += lib/libopenvswitchcore.la
> +lib_LTLIBRARIES += lib/libopenvswitchlookupavx512.la
>
>  # Dummy library to link against doesn't have any sources, but does
>  # depend on libopenvswitchcore static library
>  lib_libopenvswitch_la_SOURCES =
>  lib_libopenvswitch_la_LIBADD = lib/libopenvswitchcore.la
> +lib_libopenvswitch_la_LIBADD += lib/libopenvswitchlookupavx512.la
>
>  # Dummy library continues to depend on external libraries as before
>  lib_libopenvswitch_la_LIBADD += $(SSL_LIBS)
> @@ -31,6 +34,19 @@ lib_libopenvswitch_la_LDFLAGS = \
>          $(lib_libopenvswitchcore_la_LIBS) \
>          $(AM_LDFLAGS)
>
> +
> +# Build lookupavx512 library with extra CFLAGS enabled. This allows the
> +# compiler to use the ISA features required for the ISA optimized code-paths.
> +lib_libopenvswitchlookupavx512_la_CFLAGS = \
> +        -mavx512f \
> +        -mavx512bw \
> +        -mavx512dq \
> +        -mbmi2 \
> +        $(AM_CFLAGS)
> +lib_libopenvswitchlookupavx512_la_SOURCES = \
> +        lib/dpif-netdev-lookup-avx512-gather.c
> +

The robot is showing an error:

  gcc: error: unrecognized command line option '-mavx512f'

It looks like older versions of gcc don't have this option. I don't know a
better way to check gcc flag support; maybe add a check at acinclude.m4, the
_OVS_CHECK_CC_OPTION?

William

> +
>  # Build core vswitch libraries as before
>  lib_libopenvswitchcore_la_SOURCES = \
>      lib/aes128.c \
> diff --git a/lib/dpif-netdev-lookup-avx512-gather.c b/lib/dpif-netdev-lookup-avx512-gather.c
> new file mode 100644
> index 000000000..754cd0e3c
> --- /dev/null
> +++ b/lib/dpif-netdev-lookup-avx512-gather.c
> @@ -0,0 +1,265 @@
> +/*
> + * Copyright (c) 2020, Intel Corporation.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifdef __x86_64__
> +#if !defined(__CHECKER__)
> +
> +#include <config.h>
> +
> +#include "dpif-netdev.h"
> +#include "dpif-netdev-lookup.h"
> +#include "dpif-netdev-private.h"
> +#include "cmap.h"
> +#include "flow.h"
> +#include "pvector.h"
> +#include "openvswitch/vlog.h"
> +
> +#include "immintrin.h"
> +
> +/* Each AVX512 register (zmm register in assembly notation) can contain up to
> + * 512 bits, which is equivalent to 8 uint64_t variables. This is the maximum
> + * number of miniflow blocks that can be processed in a single pass of the
> + * AVX512 code at a time.
> + */
> +#define NUM_U64_IN_ZMM_REG (8)
> +#define BLOCKS_CACHE_SIZE (NETDEV_MAX_BURST * NUM_U64_IN_ZMM_REG)
> +
> +
> +VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
> +
> +static inline __m512i
> +_mm512_popcnt_epi64_manual(__m512i v_in)
> +{
> +    static const uint8_t pop_lut[64] = {
> +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> +    };
> +    __m512i v_pop_lut = _mm512_loadu_si512(pop_lut);
> +
> +    __m512i v_in_srl8 = _mm512_srli_epi64(v_in, 4);
> +    __m512i v_nibble_mask = _mm512_set1_epi8(0xF);
> +    __m512i v_in_lo = _mm512_and_si512(v_in, v_nibble_mask);
> +    __m512i v_in_hi = _mm512_and_si512(v_in_srl8, v_nibble_mask);
> +
> +    __m512i v_lo_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_lo);
> +    __m512i v_hi_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_hi);
> +    __m512i v_u8_pop = _mm512_add_epi8(v_lo_pop, v_hi_pop);
> +
> +    return _mm512_sad_epu8(v_u8_pop, _mm512_setzero_si512());
> +}
> +
> +static inline uint64_t
> +netdev_rule_matches_key(const struct dpcls_rule *rule,
> +                        const uint32_t mf_bits_total,
> +                        const uint64_t *block_cache)
> +{
> +    const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
> +    const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
> +    const uint32_t lane_mask = (1 << mf_bits_total) - 1;
> +
> +    /* Always load a full cache line from blocks_cache. Other loads must be
> +     * trimmed to the amount of data required for mf_bits_total blocks.
> +     */
> +    __m512i v_blocks = _mm512_loadu_si512(&block_cache[0]);
> +    __m512i v_mask = _mm512_maskz_loadu_epi64(lane_mask, &maskp[0]);
> +    __m512i v_key = _mm512_maskz_loadu_epi64(lane_mask, &keyp[0]);
> +
> +    __m512i v_data = _mm512_and_si512(v_blocks, v_mask);
> +    uint32_t res_mask = _mm512_mask_cmpeq_epi64_mask(lane_mask, v_data,
> +                                                     v_key);
> +
> +    /* Returns 1 if the SIMD compare matched on all mf_bits_total blocks. */
> +    return res_mask == lane_mask;
> +}
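Not an issue, just a note in case it helps other reviewers: the masked
compare above is roughly equivalent to the scalar loop below. This is only a
sketch to illustrate the intent, not part of the patch:

    /* Scalar equivalent of netdev_rule_matches_key(): a rule matches when
     * every packet block, ANDed with the rule's mask, equals the rule's
     * key. Illustration only. */
    static uint64_t
    rule_matches_key_scalar(const uint64_t *key, const uint64_t *mask,
                            const uint64_t *block_cache,
                            uint32_t mf_bits_total)
    {
        for (uint32_t i = 0; i < mf_bits_total; i++) {
            if ((block_cache[i] & mask[i]) != key[i]) {
                return 0;
            }
        }
        return 1;
    }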
> +
> +static inline uint32_t ALWAYS_INLINE
> +avx512_lookup_impl(struct dpcls_subtable *subtable,
> +                   uint32_t keys_map,
> +                   const struct netdev_flow_key *keys[],
> +                   struct dpcls_rule **rules,
> +                   const uint32_t bit_count_u0,
> +                   const uint32_t bit_count_u1)
> +{
> +    OVS_ALIGNED_VAR(CACHE_LINE_SIZE) uint64_t block_cache[BLOCKS_CACHE_SIZE];
> +
> +    const uint32_t bit_count_total = bit_count_u0 + bit_count_u1;
> +    int i;
> +    uint32_t hashes[NETDEV_MAX_BURST];
> +    const uint32_t n_pkts = __builtin_popcountll(keys_map);
> +    ovs_assert(NETDEV_MAX_BURST >= n_pkts);
> +
> +    const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0];
> +    const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1];
> +
> +    /* Load subtable blocks for masking later */
> +    const uint64_t *tbl_blocks = miniflow_get_values(&subtable->mask.mf);
> +    const __m512i v_tbl_blocks = _mm512_loadu_si512(&tbl_blocks[0]);
> +
> +    /* Load pre-created subtable masks for each block in subtable */
> +    const __mmask8 bit_count_total_mask = (1 << bit_count_total) - 1;
> +    const __m512i v_mf_masks = _mm512_maskz_loadu_epi64(bit_count_total_mask,
> +                                                        subtable->mf_masks);
> +
> +    ULLONG_FOR_EACH_1 (i, keys_map) {
> +        const uint64_t pkt_mf_u0_bits = keys[i]->mf.map.bits[0];
> +        const uint64_t pkt_mf_u0_pop = __builtin_popcountll(pkt_mf_u0_bits);
> +
> +        /* Pre-create register with *PER PACKET* u0 offset */
> +        const __mmask8 u1_bcast_mask = (UINT8_MAX << bit_count_u0);
> +        const __m512i v_idx_u0_offset = _mm512_maskz_set1_epi64(u1_bcast_mask,
> +                                                                pkt_mf_u0_pop);
> +
> +        /* Broadcast u0, u1 bitmasks to 8x u64 lanes */
> +        __m512i v_u0 = _mm512_set1_epi64(pkt_mf_u0_bits);
> +        __m512i v_pkt_bits = _mm512_mask_set1_epi64(v_u0, u1_bcast_mask,
> +                                                    keys[i]->mf.map.bits[1]);
> +
> +        /* Bitmask by pre-created masks */
> +        __m512i v_masks = _mm512_and_si512(v_pkt_bits, v_mf_masks);
> +
> +        /* Manual AVX512 popcount for u64 lanes */
> +        __m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
> +
> +        /* Offset popcounts for u1 with pre-created offset register */
> +        __m512i v_indexes = _mm512_add_epi64(v_popcnts, v_idx_u0_offset);
> +
> +        /* Gather u64 blocks from packet miniflow */
> +        const __m512i v_zeros = _mm512_setzero_si512();
> +        const uint64_t *pkt_data = miniflow_get_values(&keys[i]->mf);
> +        __m512i v_all_blocks = _mm512_mask_i64gather_epi64(v_zeros,
> +                                   bit_count_total_mask, v_indexes,
> +                                   pkt_data, 8);
> +
> +        /* Zero out bits that pkt doesn't have:
> +         * - 2x pext() to extract bits from packet miniflow as needed by TBL
> +         * - Shift u1 over by bit_count of u0, OR to create zero bitmask
> +         */
> +        uint64_t u0_to_zero = _pext_u64(keys[i]->mf.map.bits[0], tbl_u0);
> +        uint64_t u1_to_zero = _pext_u64(keys[i]->mf.map.bits[1], tbl_u1);
> +        uint64_t zero_mask = (u1_to_zero << bit_count_u0) | u0_to_zero;
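A note in case the pext() step is hard to follow: in scalar terms,
_pext_u64(pkt, tbl) walks the set bits of the subtable map and packs the
packet's corresponding map bits into the low bits of the result, i.e. one
bit per subtable block saying "the packet actually carries this block".
Roughly this (a sketch only, not part of the patch):

    /* Scalar equivalent of _pext_u64(pkt, tbl) as used above. */
    static uint64_t
    pext_u64_scalar(uint64_t pkt, uint64_t tbl)
    {
        uint64_t out = 0;
        int out_idx = 0;

        while (tbl) {
            uint64_t lowest_set = tbl & -tbl;  /* Lowest set bit of tbl. */
            if (pkt & lowest_set) {
                out |= UINT64_C(1) << out_idx;
            }
            out_idx++;
            tbl &= tbl - 1;                    /* Clear that bit. */
        }
        return out;
    }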
> +
> +        /* Mask blocks using AND with subtable blocks, and use the k-mask
> +         * to zero out lanes as required for this packet.
> +         */
> +        __m512i v_masked_blocks = _mm512_maskz_and_epi64(zero_mask,
> +                                      v_all_blocks, v_tbl_blocks);
> +
> +        /* Store to blocks cache, full cache line aligned */
> +        _mm512_storeu_si512(&block_cache[i * 8], v_masked_blocks);
> +    }
> +
> +    /* Hash the now linearized blocks of packet metadata. */
> +    ULLONG_FOR_EACH_1 (i, keys_map) {
> +        uint64_t *block_ptr = &block_cache[i * 8];
> +        uint32_t hash = hash_add_words64(0, block_ptr, bit_count_total);
> +        hashes[i] = hash_finish(hash, bit_count_total * 8);
> +    }
> +
> +    /* Lookup: this returns a bitmask of packets where the hash table had
> +     * an entry for the given hash key. Presence of a hash key does not
> +     * guarantee matching the key, as there can be hash collisions.
> +     */
> +    uint32_t found_map;
> +    const struct cmap_node *nodes[NETDEV_MAX_BURST];
> +    found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
> +
> +    /* Verify that the packet actually matched the rule. If not found, a
> +     * hash collision has taken place, so continue searching with the next
> +     * node.
> +     */
> +    ULLONG_FOR_EACH_1 (i, found_map) {
> +        struct dpcls_rule *rule;
> +
> +        CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
> +            const uint32_t cidx = i * 8;
> +            uint32_t match = netdev_rule_matches_key(rule, bit_count_total,
> +                                                     &block_cache[cidx]);
> +            if (OVS_LIKELY(match)) {
> +                rules[i] = rule;
> +                subtable->hit_cnt++;
> +                goto next;
> +            }
> +        }
> +
> +        /* None of the found rules was a match. Clear the i-th bit to
> +         * search for this key in the next subtable. */
> +        ULLONG_SET0(found_map, i);
> +    next:
> +        ; /* Keep Sparse happy. */
> +    }
> +
> +    return found_map;
> +}
> +
> +/* Expand out specialized functions with U0 and U1 bit attributes. */
> +#define DECLARE_OPTIMIZED_LOOKUP_FUNCTION(U0, U1)                         \
> +    static uint32_t                                                      \
> +    dpcls_avx512_gather_skx_mf_##U0##_##U1(                              \
> +                         struct dpcls_subtable *subtable,                \
> +                         uint32_t keys_map,                              \
> +                         const struct netdev_flow_key *keys[],           \
> +                         struct dpcls_rule **rules)                      \
> +    {                                                                    \
> +        return avx512_lookup_impl(subtable, keys_map, keys, rules,       \
> +                                  U0, U1);                               \
> +    }                                                                    \
> +
> +DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1)
> +DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 1)
> +DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 0)
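For reviewers less used to this style of macro:
DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1) expands to roughly the following
(reconstructed by hand, whitespace aside):

    static uint32_t
    dpcls_avx512_gather_skx_mf_5_1(struct dpcls_subtable *subtable,
                                   uint32_t keys_map,
                                   const struct netdev_flow_key *keys[],
                                   struct dpcls_rule **rules)
    {
        /* U0 = 5 and U1 = 1 are now compile-time constants, so the
         * compiler can specialize and unroll avx512_lookup_impl(). */
        return avx512_lookup_impl(subtable, keys_map, keys, rules, 5, 1);
    }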
> +
> +/* Check if a specialized function is valid for the required subtable. */
> +#define CHECK_LOOKUP_FUNCTION(U0, U1)                                    \
> +    ovs_assert((U0 + U1) <= NUM_U64_IN_ZMM_REG);                         \
> +    if (!f && u0_bits == U0 && u1_bits == U1) {                          \
> +        f = dpcls_avx512_gather_skx_mf_##U0##_##U1;                      \
> +    }
> +
> +static uint32_t
> +dpcls_avx512_gather_mf_any(struct dpcls_subtable *subtable, uint32_t keys_map,
> +                           const struct netdev_flow_key *keys[],
> +                           struct dpcls_rule **rules)
> +{
> +    return avx512_lookup_impl(subtable, keys_map, keys, rules,
> +                              subtable->mf_bits_set_unit0,
> +                              subtable->mf_bits_set_unit1);
> +}
> +
> +dpcls_subtable_lookup_func
> +dpcls_subtable_avx512_gather_probe(uint32_t u0_bits, uint32_t u1_bits)
> +{
> +    dpcls_subtable_lookup_func f = NULL;
> +
> +    int avx512f_available = dpdk_get_cpu_has_isa("x86_64", "avx512f");
> +    int bmi2_available = dpdk_get_cpu_has_isa("x86_64", "bmi2");
> +    if (!avx512f_available || !bmi2_available) {
> +        return NULL;
> +    }
> +
> +    CHECK_LOOKUP_FUNCTION(5, 1);
> +    CHECK_LOOKUP_FUNCTION(4, 1);
> +    CHECK_LOOKUP_FUNCTION(4, 0);
> +
> +    if (!f && (u0_bits + u1_bits) < NUM_U64_IN_ZMM_REG) {
> +        f = dpcls_avx512_gather_mf_any;
> +        VLOG_INFO("Using avx512_gather_mf_any for subtable (%d,%d)\n",
> +                  u0_bits, u1_bits);
> +    }
> +
> +    return f;
> +}
> +
> +#endif /* CHECKER */
> +#endif /* __x86_64__ */
> diff --git a/lib/dpif-netdev-lookup.c b/lib/dpif-netdev-lookup.c
> index 1aa0c3c0c..c324407b3 100644
> --- a/lib/dpif-netdev-lookup.c
> +++ b/lib/dpif-netdev-lookup.c
> @@ -34,6 +34,21 @@ static struct dpcls_subtable_lookup_info_t subtable_lookups[] = {
>      { .prio = 1,
>        .probe = dpcls_subtable_generic_probe,
>        .name = "generic", },
> +
> +#ifdef __x86_64__
> +#ifdef __SSE4_2__
> +    /* Only available on x86_64 builds with SSE 4.2 used for OVS core. */
> +    { .prio = 0,
> +      .probe = dpcls_subtable_avx512_gather_probe,
> +      .name = "avx512_gather", },
> +#else
> +    /* Disabling AVX512 at compile time, due to core OVS not using the SSE42
> +     * instruction set. The SSE42 instructions are required to use CRC32
> +     * ISA for high performance hashing. Consider ./configure of OVS with
> +     * -msse4.2 (or newer) to enable CRC32 hashing and higher performance.
> +     */
> +#endif
> +#endif
>  };
>
>  int32_t
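One thing I like about the probe approach: each implementation can refuse at
runtime by returning NULL, so a caller can always fall back. In effect it
allows something like the following (a sketch only; the real selection is
driven by the prioritized subtable_lookups[] array above, and this helper is
hypothetical):

    /* Hypothetical helper, not in the patch: pick a lookup function for a
     * subtable with the given block counts, falling back to generic. */
    static dpcls_subtable_lookup_func
    subtable_lookup_select(uint32_t u0_bits, uint32_t u1_bits)
    {
        dpcls_subtable_lookup_func f;

        f = dpcls_subtable_avx512_gather_probe(u0_bits, u1_bits);
        if (!f) {
            /* CPU lacks avx512f/bmi2, or the subtable needs more than
             * NUM_U64_IN_ZMM_REG blocks. */
            f = dpcls_subtable_generic_probe(u0_bits, u1_bits);
        }
        return f;
    }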
> diff --git a/lib/dpif-netdev-lookup.h b/lib/dpif-netdev-lookup.h
> index 61f44b9e8..07a9bf694 100644
> --- a/lib/dpif-netdev-lookup.h
> +++ b/lib/dpif-netdev-lookup.h
> @@ -21,6 +21,9 @@
>  #include "dpif-netdev.h"
>  #include "dpif-netdev-private.h"
>
> +/* Extreme debugging for developers only */
> +#define DPIF_NETDEV_LOOKUP_DATAPATH_DEBUG 1
> +
>  /* Function to perform a probe for the subtable bit fingerprint.
>   * Returns NULL if not valid, or a valid function pointer to call for this
>   * subtable on success.
> @@ -42,6 +45,10 @@ dpcls_subtable_autovalidator_probe(uint32_t u0_bit_count,
>  dpcls_subtable_lookup_func
>  dpcls_subtable_generic_probe(uint32_t u0_bit_count, uint32_t u1_bit_count);
>
> +/* Probe function for the AVX-512 gather implementation. */
> +dpcls_subtable_lookup_func
> +dpcls_subtable_avx512_gather_probe(uint32_t u0_bit_cnt, uint32_t u1_bit_cnt);
> +
>
>  /* Subtable registration and iteration helpers */
>  struct dpcls_subtable_lookup_info_t {
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index cd4e1dbb1..cbd525100 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -1292,6 +1292,10 @@ static void
>  dpif_netdev_subtable_lookup_set(struct unixctl_conn *conn, int argc,
>                                  const char *argv[], void *aux OVS_UNUSED)
>  {
> +    /* TODO: If fewer than 2 parameters are provided, return a list of the
> +     * known dpcls implementations compiled in?
> +     */
> +
>      /* This function requires 2 parameters (argv[1] and argv[2]) to execute.
>       * argv[1] is subtable name
>       * argv[2] is priority
> --
> 2.17.1
