On Wed, May 06, 2020 at 02:06:09PM +0100, Harry van Haaren wrote:
> This commit adds an AVX-512 dpcls lookup implementation.
> It uses the AVX-512 SIMD ISA to perform multiple miniflow
> operations in parallel.
>
> To run this implementation, the "avx512f" and "bmi2" ISAs are
> required. These ISA checks are performed at runtime while
> probing the subtable implementation. If a CPU does not provide
> both "avx512f" and "bmi2", then this code does not execute.
>
> The avx512 code is built as a separate static library, with added
> CFLAGS to enable the required ISA features. By building only this
> static library with avx512 enabled, it is ensured that the main OVS
> core library is *not* using avx512, and that OVS continues to run
> as before on CPUs that do not support avx512.
>
> The approach taken in this implementation is to use the
> gather instruction to access the packet miniflow, allowing
> any miniflow blocks to be loaded into an AVX-512 register.
> This maximises the usefulness of the register, and hence this
> implementation handles any subtable with up to 8 miniflow bits.
>
> Note that specialization of these avx512 lookup routines
> still provides performance value, as the hashing of the
> resulting data is performed in scalar code, and compile-time
> loop unrolling occurs when specialized to miniflow bits.
>

Hi Harry,

I haven't tried running the code because my machine only supports avx2.
There are some minor issues such as indentation, but I read through it with
the example below and I think it is correct.

Given that you have to do a lot of preparation (ex: popcount, creating
bit_masks, broadcast, ...etc) before using the avx instructions, do you have
some performance numbers? I didn't see any from ovsconf 18 or 19. Is using
avx512 much better than avx2?
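As an aside for anyone reading along who hasn't used the gather approach the
commit message describes: below is a minimal, standalone sketch of what a
masked 64-bit gather does. The array contents and indexes are made up purely
for illustration; this is not code from the patch (compile with -mavx512f):

    #include <immintrin.h>
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Eight 64-bit "blocks", standing in for one packet's miniflow data. */
        uint64_t blocks[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };

        /* Per-lane indexes into blocks[]; lane 0 = 1, lane 1 = 3. */
        __m512i idx = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 3, 1);

        /* Only the two low lanes are enabled; disabled lanes keep the
         * value from the zero register. */
        __mmask8 k = 0x3;
        __m512i res = _mm512_mask_i64gather_epi64(_mm512_setzero_si512(), k,
                                                  idx, blocks, 8);

        uint64_t out[8];
        _mm512_storeu_si512(out, res);
        printf("%" PRIu64 " %" PRIu64 "\n", out[0], out[1]); /* prints 11 13 */
        return 0;
    }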
> Signed-off-by: Harry van Haaren <[email protected]>
> ---
>  lib/automake.mk                        |  16 ++
>  lib/dpif-netdev-lookup-avx512-gather.c | 255 +++++++++++++++++++++++++
>  lib/dpif-netdev-lookup.c               |   7 +
>  lib/dpif-netdev-lookup.h               |   7 +
>  lib/dpif-netdev.c                      |   4 +
>  5 files changed, 289 insertions(+)
>  create mode 100644 lib/dpif-netdev-lookup-avx512-gather.c
>
> diff --git a/lib/automake.mk b/lib/automake.mk
> index 19e454c4b..d8a05b384 100644
> --- a/lib/automake.mk
> +++ b/lib/automake.mk
> @@ -8,13 +8,16 @@
>  # libopenvswitch.la is the library to link against for binaries like vswitchd.
>  # The code itself is built as two separate static libraries;
>  #  - core: Core files, always compiled with distro provided CFLAGS
> +#  - lookupavx512: ISA optimized routines that require CPUID checks at runtime
>  lib_LTLIBRARIES += lib/libopenvswitch.la
>  lib_LTLIBRARIES += lib/libopenvswitchcore.la
> +lib_LTLIBRARIES += lib/libopenvswitchlookupavx512.la
>
>  # Dummy library to link against doesn't have any sources, but does
>  # depend on libopenvswitchcore static library
>  lib_libopenvswitch_la_SOURCES =
>  lib_libopenvswitch_la_LIBADD = lib/libopenvswitchcore.la
> +lib_libopenvswitch_la_LIBADD += lib/libopenvswitchlookupavx512.la
>
>  # Dummy library continues to depend on external libraries as before
>  lib_libopenvswitch_la_LIBADD += $(SSL_LIBS)
> @@ -31,6 +34,19 @@ lib_libopenvswitch_la_LDFLAGS = \
>          $(lib_libopenvswitchcore_la_LIBS) \
>          $(AM_LDFLAGS)
>
> +
> +# Build lookupavx512 library with extra CFLAGS enabled. This allows the
> +# compiler to use the ISA features required for the ISA optimized code-paths.
> +lib_libopenvswitchlookupavx512_la_CFLAGS = \
> +    -mavx512f \
> +    -mavx512bw \
> +    -mavx512dq \
> +    -mbmi2 \
> +    $(AM_CFLAGS)
> +lib_libopenvswitchlookupavx512_la_SOURCES = \
> +    lib/dpif-netdev-lookup-avx512-gather.c
> +
> +
>  # Build core vswitch libraries as before
>  lib_libopenvswitchcore_la_SOURCES = \
>      lib/aes128.c \
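The build split above is what keeps -mavx512f out of the core library; the
other half of the story is the runtime probe the commit message mentions,
which decides whether the AVX-512 lookup may be handed out at all. The
patch's actual probe function isn't in the hunks quoted here, so the sketch
below only shows the general shape of that pattern -- the names are mine,
not the patch's:

    #include <stdint.h>

    typedef uint32_t (*subtable_lookup_func)(void);   /* simplified signature */

    static uint32_t lookup_generic(void) { return 0; }        /* always safe   */
    static uint32_t lookup_avx512_gather(void) { return 0; }  /* needs the ISA */

    /* Only hand out the AVX-512 implementation when the CPU really has the
     * required extensions; GCC/Clang expose the check via
     * __builtin_cpu_supports(). */
    static subtable_lookup_func
    subtable_lookup_probe(void)
    {
        if (__builtin_cpu_supports("avx512f")
            && __builtin_cpu_supports("bmi2")) {
            return lookup_avx512_gather;
        }
        return lookup_generic;
    }

    int main(void) { return subtable_lookup_probe()(); }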
> diff --git a/lib/dpif-netdev-lookup-avx512-gather.c b/lib/dpif-netdev-lookup-avx512-gather.c
> new file mode 100644
> index 000000000..52348041b
> --- /dev/null
> +++ b/lib/dpif-netdev-lookup-avx512-gather.c
> @@ -0,0 +1,255 @@
> +/*
> + * Copyright (c) 2020, Intel Corporation.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifdef __x86_64__
> +
> +#include <config.h>
> +
> +#include "dpif-netdev.h"
> +#include "dpif-netdev-lookup.h"
> +#include "dpif-netdev-private.h"
> +#include "cmap.h"
> +#include "flow.h"
> +#include "pvector.h"
> +#include "openvswitch/vlog.h"
> +
> +#include <immintrin.h>
> +
> +VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
> +
> +static inline __m512i
> +_mm512_popcnt_epi64_manual(__m512i v_in)
> +{
> +    static const uint8_t pop_lut[64] = {
> +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> +    };
> +    __m512i v_pop_lut = _mm512_loadu_si512(pop_lut);
> +
> +    __m512i v_in_srl8 = _mm512_srli_epi64(v_in, 4);
> +    __m512i v_nibble_mask = _mm512_set1_epi8(0xF);
> +    __m512i v_in_lo = _mm512_and_si512(v_in, v_nibble_mask);
> +    __m512i v_in_hi = _mm512_and_si512(v_in_srl8, v_nibble_mask);
> +
> +    __m512i v_lo_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_lo);
> +    __m512i v_hi_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_hi);
> +    __m512i v_u8_pop = _mm512_add_epi8(v_lo_pop, v_hi_pop);
> +
> +    return _mm512_sad_epu8(v_u8_pop, _mm512_setzero_si512());
> +}
> +
> +static inline uint64_t
> +netdev_rule_matches_key(const struct dpcls_rule *rule,
> +                        const uint32_t mf_bits_total,
> +                        const uint64_t * block_cache)
> +{
> +    ovs_assert(mf_bits_total <= 8);
> +    const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
> +    const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
> +    const uint32_t lane_mask = (1 << mf_bits_total) - 1;
> +
> +    /* Always load a full cache line from blocks_cache. Other loads must be
> +     * trimmed to the amount of data required for mf_bits_total blocks.
> +     */
> +    __m512i v_blocks = _mm512_loadu_si512(&block_cache[0]);
> +    __m512i v_mask = _mm512_maskz_loadu_epi64(lane_mask, &maskp[0]);
> +    __m512i v_key = _mm512_maskz_loadu_epi64(lane_mask, &keyp[0]);
> +
> +    __m512i v_data = _mm512_and_si512(v_blocks, v_mask);
> +    uint32_t res_mask = _mm512_mask_cmpeq_epi64_mask(lane_mask, v_data,
> +                                                     v_key);
> +
> +    /* returns 1 assuming result of SIMD compare is all blocks */
> +    return res_mask == lane_mask;
> +}
> +
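For readers who don't think in AVX-512, the scalar equivalent of the masked
loads and compare in netdev_rule_matches_key() is roughly the following (my
own restatement with plain arrays, not code from the patch):

    #include <stdbool.h>
    #include <stdint.h>

    /* Every block the subtable uses must satisfy
     * (packet_block & rule_mask_block) == rule_key_block.
     * The SIMD version does all (up to 8) blocks in one compare and then
     * checks that every enabled lane reported equality. */
    bool
    rule_matches_key_scalar(const uint64_t *rule_key, const uint64_t *rule_mask,
                            const uint64_t *block_cache, uint32_t mf_bits_total)
    {
        for (uint32_t i = 0; i < mf_bits_total; i++) {
            if ((block_cache[i] & rule_mask[i]) != rule_key[i]) {
                return false;
            }
        }
        return true;
    }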
I think the below function is the most difficult one. I wonder if there is a
better way to make it easier to understand, e.g. breaking it into subfunctions
or utility functions? I ended up using an example from slide 2 of your slides
here:
https://www.openvswitch.org/support/ovscon2019/day1/1108-next_steps_sw_datapath_hvh.pdf
and the API document here:
https://software.intel.com/sites/landingpage/IntrinsicsGuide/

> +static inline uint32_t ALWAYS_INLINE
> +avx512_lookup_impl(struct dpcls_subtable *subtable,
> +                   uint32_t keys_map,
> +                   const struct netdev_flow_key *keys[],
> +                   struct dpcls_rule **rules,
> +                   const uint32_t bit_count_u0,
> +                   const uint32_t bit_count_u1)
> +{
> +    const uint32_t bit_count_total = bit_count_u0 + bit_count_u1;
> +    int i;
> +    uint32_t hashes[NETDEV_MAX_BURST];
> +    const uint32_t n_pkts = __builtin_popcountll(keys_map);
> +    ovs_assert(NETDEV_MAX_BURST >= n_pkts);
> +
> +    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)uint64_t block_cache[NETDEV_MAX_BURST * 8];
> +
> +    const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0];
> +    const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1];
> +    ovs_assert(__builtin_popcountll(tbl_u0) == bit_count_u0);
> +    ovs_assert(__builtin_popcountll(tbl_u1) == bit_count_u1);
> +
> +    /* Load subtable blocks for masking later */
> +    const uint64_t *tbl_blocks = miniflow_get_values(&subtable->mask.mf);
> +    const __m512i v_tbl_blocks = _mm512_loadu_si512(&tbl_blocks[0]);
> +
> +    /* Load pre-created subtable masks for each block in subtable */
> +    const __mmask8 bit_count_total_mask = (1 << bit_count_total) - 1;
> +    const __m512i v_mf_masks = _mm512_maskz_loadu_epi64(bit_count_total_mask,
> +                                                        subtable->mf_masks);
> +
> +    ULLONG_FOR_EACH_1 (i, keys_map) {
> +        const uint64_t pkt_mf_u0_bits = keys[i]->mf.map.bits[0];
> +        const uint64_t pkt_mf_u0_pop = __builtin_popcountll(pkt_mf_u0_bits);
> +
> +        /* Pre-create register with *PER PACKET* u0 offset */
> +        const __mmask8 u1_bcast_mask = (UINT8_MAX << bit_count_u0);
> +        const __m512i v_idx_u0_offset = _mm512_maskz_set1_epi64(u1_bcast_mask,
> +                                                                pkt_mf_u0_pop);
> +
> +        /* Broadcast u0, u1 bitmasks to 8x u64 lanes */
> +        __m512i v_u0 = _mm512_set1_epi64(pkt_mf_u0_bits);
> +        __m512i v_pkt_bits = _mm512_mask_set1_epi64(v_u0, u1_bcast_mask,
> +                                                    keys[i]->mf.map.bits[1]);
> +
> +        /* Bitmask by pre-created masks */
> +        __m512i v_masks = _mm512_and_si512(v_pkt_bits, v_mf_masks);
> +
> +        /* Manual AVX512 popcount for u64 lanes */
> +        __m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
> +
> +        /* Offset popcounts for u1 with pre-created offset register */
> +        __m512i v_indexes = _mm512_add_epi64(v_popcnts, v_idx_u0_offset);
> +
> +        /* Gather u64 on single packet, merge with zero reg, up to 8 blocks */
> +        const __m512i v_zeros = _mm512_setzero_si512();
> +        const uint64_t *pkt_data = miniflow_get_values(&keys[i]->mf);
> +        __m512i v_all_blocks = _mm512_mask_i64gather_epi64(v_zeros,
> +                bit_count_total_mask, v_indexes, pkt_data, 8);

Indentation is off on the continuation line above.

> +
> +        /* Zero out bits that pkt doesn't have:
> +         * - 2x pext() to extract bits from packet miniflow as needed by TBL
> +         * - Shift u1 over by bit_count of u0, OR to create zero bitmask
> +         */
> +        uint64_t u0_to_zero = _pext_u64(keys[i]->mf.map.bits[0], tbl_u0);
> +        uint64_t u1_to_zero = _pext_u64(keys[i]->mf.map.bits[1], tbl_u1);
> +         uint64_t zero_mask = (u1_to_zero << bit_count_u0) | u0_to_zero;

Indentation: remove one space on the zero_mask line above.
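Not a review comment as such, but writing down the scalar equivalent of the
index computation, gather and pext zero-mask helped me follow this part, so
I'll leave it here. This is my own sketch for a single packet, using plain
integers instead of the miniflow/dpcls types, so treat it as illustrative
only:

    #include <stdint.h>

    /* tbl_u0/tbl_u1: the subtable's miniflow map; pkt_u0/pkt_u1: the
     * packet's; pkt_data: the packet's packed block values; tbl_blocks:
     * the subtable's mask blocks.  out[] receives the same (up to 8)
     * masked blocks that the SIMD gather + maskz AND produce. */
    void
    gather_blocks_scalar(uint64_t tbl_u0, uint64_t tbl_u1,
                         uint64_t pkt_u0, uint64_t pkt_u1,
                         const uint64_t *pkt_data, const uint64_t *tbl_blocks,
                         uint64_t *out)
    {
        uint64_t pkt_u0_pop = __builtin_popcountll(pkt_u0);
        uint32_t n = 0;

        for (int unit = 0; unit < 2; unit++) {
            uint64_t tbl_bits = unit ? tbl_u1 : tbl_u0;
            uint64_t pkt_bits = unit ? pkt_u1 : pkt_u0;
            uint64_t base = unit ? pkt_u0_pop : 0;

            while (tbl_bits) {
                uint64_t bit = tbl_bits & -tbl_bits;    /* lowest set bit */
                tbl_bits &= tbl_bits - 1;

                uint64_t block = 0;
                if (pkt_bits & bit) {
                    /* Position of this block in the packet's packed values:
                     * packet bits below it, plus u0's popcount for u1. */
                    uint64_t idx = base
                        + __builtin_popcountll(pkt_bits & (bit - 1));
                    block = pkt_data[idx];
                }
                /* AND with the subtable mask block; lanes the packet does
                 * not have stay zero (the job of the pext zero_mask). */
                out[n] = block & tbl_blocks[n];
                n++;
            }
        }
    }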
> +
> +        /* Mask blocks using AND with subtable blocks, use k-mask to zero
> +         * where lanes as required for this packet.
> +         */
> +        __m512i v_masked_blocks = _mm512_maskz_and_epi64(zero_mask,
> +                v_all_blocks, v_tbl_blocks);
> +
> +        /* Store to blocks cache, full cache line aligned */
> +        _mm512_storeu_si512(&block_cache[i * 8], v_masked_blocks);
> +    }
> +
> +    /* Hash the now linearized blocks of packet metadata. */
> +    ULLONG_FOR_EACH_1 (i, keys_map) {
> +        uint64_t *block_ptr = &block_cache[i * 8];
> +        uint32_t hash = hash_add_words64(0, block_ptr, bit_count_total);
> +        hashes[i] = hash_finish(hash, bit_count_total * 8);
> +    }
> +
> +    /* Lookup: this returns a bitmask of packets where the hash table had
> +     * an entry for the given hash key. Presence of a hash key does not
> +     * guarantee matching the key, as there can be hash collisions.
> +     */
> +    uint32_t found_map;
> +    const struct cmap_node *nodes[NETDEV_MAX_BURST];
> +    found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
> +
> +    /* Verify that packet actually matched rule. If not found, a hash
> +     * collision has taken place, so continue searching with the next node.
> +     */
> +    ULLONG_FOR_EACH_1 (i, found_map) {
> +        struct dpcls_rule *rule;
> +
> +        CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
> +            const uint32_t cidx = i * 8;
> +            uint32_t match = netdev_rule_matches_key(rule, bit_count_total,
> +                                                     &block_cache[cidx]);
> +            if (OVS_LIKELY(match)) {
> +                rules[i] = rule;
> +                subtable->hit_cnt++;
> +                goto next;
> +            }
> +        }
> +
> +        /* None of the found rules was a match. Clear the i-th bit to
> +         * search for this key in the next subtable. */
> +        ULLONG_SET0(found_map, i);
> +    next:
> +        ;                     /* Keep Sparse happy. */
> +    }
> +
> +    return found_map;
> +}
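One detail worth keeping in mind when reading the tail of the function:
cmap_find_batch() only says the hash was present, so every candidate rule
still has to be confirmed against the full masked key, which is why the
found_map bit is cleared again on a miss. Stripped of the batching and the
OVS cmap API, the pattern is just "probe by hash, then confirm the key" --
a toy sketch of my own, not OVS code:

    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    #define N_BUCKETS 16

    struct entry {
        bool in_use;
        uint32_t hash;
        uint32_t n_blocks;
        uint64_t blocks[8];
    };

    /* Probe a toy direct-mapped table by hash, then confirm the key so a
     * hash collision can never produce a false match. */
    const struct entry *
    toy_lookup(const struct entry table[N_BUCKETS], uint32_t hash,
               const uint64_t *blocks, uint32_t n_blocks)
    {
        const struct entry *e = &table[hash % N_BUCKETS];

        if (!e->in_use || e->hash != hash) {
            return NULL;                          /* hash not present */
        }
        if (e->n_blocks != n_blocks
            || memcmp(e->blocks, blocks, n_blocks * sizeof blocks[0])) {
            return NULL;                          /* collision: keys differ */
        }
        return e;                                 /* genuine match */
    }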
If someone is interested, the example below, together with the slides, helps
in understanding the above function.

diff --git a/lib/dpif-netdev-lookup-avx512-gather.c b/lib/dpif-netdev-lookup-avx512-gather.c
index 52348041bd00..f84a95423cf8 100644
--- a/lib/dpif-netdev-lookup-avx512-gather.c
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -93,56 +93,77 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,
     OVS_ALIGNED_VAR(CACHE_LINE_SIZE)uint64_t block_cache[NETDEV_MAX_BURST * 8];
 
-    const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0];
-    const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1];
-    ovs_assert(__builtin_popcountll(tbl_u0) == bit_count_u0);
-    ovs_assert(__builtin_popcountll(tbl_u1) == bit_count_u1);
+    const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0]; //0b1000,0000
+    const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1]; //0b0100,0000
+    ovs_assert(__builtin_popcountll(tbl_u0) == bit_count_u0); //1
+    ovs_assert(__builtin_popcountll(tbl_u1) == bit_count_u1); //1
+    // bit_count_total = 2
 
     /* Load subtable blocks for masking later */
-    const uint64_t *tbl_blocks = miniflow_get_values(&subtable->mask.mf);
-    const __m512i v_tbl_blocks = _mm512_loadu_si512(&tbl_blocks[0]);
+    const uint64_t *tbl_blocks = miniflow_get_values(&subtable->mask.mf); //points to ipv4 mask
+    const __m512i v_tbl_blocks = _mm512_loadu_si512(&tbl_blocks[0]); //points to ipv4 mask
 
     /* Load pre-created subtable masks for each block in subtable */
-    const __mmask8 bit_count_total_mask = (1 << bit_count_total) - 1;
-    const __m512i v_mf_masks = _mm512_maskz_loadu_epi64(bit_count_total_mask,
+    const __mmask8 bit_count_total_mask = (1 << bit_count_total) - 1; // (1 << 2) - 1 = 0x3
+    const __m512i v_mf_masks = _mm512_maskz_loadu_epi64(bit_count_total_mask /* 0x3 */,
                                                         subtable->mf_masks);
+    // subtable->mf_masks[0] = 0b01111111
+    // subtable->mf_masks[1] = 0b00111111
+    // v_mf_masks = [0,0,0,0,0,0, 0b00111111, 0b01111111]
 
-    ULLONG_FOR_EACH_1 (i, keys_map) {
-        const uint64_t pkt_mf_u0_bits = keys[i]->mf.map.bits[0];
-        const uint64_t pkt_mf_u0_pop = __builtin_popcountll(pkt_mf_u0_bits);
+    ULLONG_FOR_EACH_1 (i, keys_map) { // for each packet in the batch
+        const uint64_t pkt_mf_u0_bits = keys[i]->mf.map.bits[0]; //0b1000,0100
+        const uint64_t pkt_mf_u0_pop = __builtin_popcountll(pkt_mf_u0_bits); //2
 
         /* Pre-create register with *PER PACKET* u0 offset */
-        const __mmask8 u1_bcast_mask = (UINT8_MAX << bit_count_u0);
+        const __mmask8 u1_bcast_mask = (UINT8_MAX << bit_count_u0); //(0xff << 1) = 0xfe
         const __m512i v_idx_u0_offset = _mm512_maskz_set1_epi64(u1_bcast_mask,
                                                                 pkt_mf_u0_pop);
+        // v_idx_u0_offset = [2,2,2,2,2,2,2,0]
 
         /* Broadcast u0, u1 bitmasks to 8x u64 lanes */
-        __m512i v_u0 = _mm512_set1_epi64(pkt_mf_u0_bits);
-        __m512i v_pkt_bits = _mm512_mask_set1_epi64(v_u0, u1_bcast_mask,
-                                                    keys[i]->mf.map.bits[1]);
+        __m512i v_u0 = _mm512_set1_epi64(pkt_mf_u0_bits); // [0b10000100, 0b10000100, 0b10000100, ...]
+
+        __m512i v_pkt_bits = _mm512_mask_set1_epi64(v_u0, u1_bcast_mask /* 0xfe */,
+                                                    keys[i]->mf.map.bits[1] /* 0b01100000 */);
+        // [0b01100000, 0b01100000, 0b01100000, 0b01100000, 0b01100000, 0b01100000, 0b01100000, 0b10000100]
 
-        /* Bitmask by pre-created masks */
+
+        /* Bitmask by pre-created masks. */
         __m512i v_masks = _mm512_and_si512(v_pkt_bits, v_mf_masks);
+        // v_masks = [0,0,0,0,0,0, 0b00100000, 0b00000100]
 
         /* Manual AVX512 popcount for u64 lanes */
         __m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
+        // v_popcnts = [0,0,0,0,0,0,1,1]
 
         /* Offset popcounts for u1 with pre-created offset register */
         __m512i v_indexes = _mm512_add_epi64(v_popcnts, v_idx_u0_offset);
+        // v_indexes = [0,0,0,0,0,0,3,1]
 
         /* Gather u64 on single packet, merge with zero reg, up to 8 blocks */
         const __m512i v_zeros = _mm512_setzero_si512();
         const uint64_t *pkt_data = miniflow_get_values(&keys[i]->mf);
+        // pkt_data = ipv4_src, ipv4_dst, mac_src, vlan_tci
+
         __m512i v_all_blocks = _mm512_mask_i64gather_epi64(v_zeros,
-                bit_count_total_mask, v_indexes, pkt_data, 8);
+                                           bit_count_total_mask /* 0x3 */,
+                                           v_indexes, pkt_data, 8);
+        // v_all_blocks: use v_index[0]=1 * 8 and v_index[1]=3 * 8 to gather data
+        // v_all_blocks = [0,0,0,0,0,0, vlan_tci, ipv4_dst]
 
         /* Zero out bits that pkt doesn't have:
          * - 2x pext() to extract bits from packet miniflow as needed by TBL
          * - Shift u1 over by bit_count of u0, OR to create zero bitmask
          */
-        uint64_t u0_to_zero = _pext_u64(keys[i]->mf.map.bits[0], tbl_u0);
-        uint64_t u1_to_zero = _pext_u64(keys[i]->mf.map.bits[1], tbl_u1);
+        uint64_t u0_to_zero = _pext_u64(keys[i]->mf.map.bits[0] /* 0b1000,0100 */,
+                                        tbl_u0 /* 0b1000,0000 */);
+        // u0_to_zero = 0b00000001
+        uint64_t u1_to_zero = _pext_u64(keys[i]->mf.map.bits[1] /* 0b0110,0000 */,
+                                        tbl_u1 /* 0b0100,0000 */);
+        // u1_to_zero = 0b00000001
         uint64_t zero_mask = (u1_to_zero << bit_count_u0) | u0_to_zero;
+        // zero_mask = 0b00000011
 
         /* Mask blocks using AND with subtable blocks, use k-mask to zero
          * where lanes as required for this packet.
---

Pretty cool piece of code. Thanks!

William

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev
