Checking for each of the required AVX512 ISA separately will allow the
compiler to generate some AVX512 code where there is some support in the
compiler rather than only generating all AVX512 code when all of it is
supported or no AVX512 code at all.

For example, in GCC 4.9 where there is just support for AVX512F, this
patch will allow building the AVX512 DPIF.

Another example, in GCC 5 and 6, most AVX512 code can be generated, just
without AVX512VPOPCNTDQ support.

Signed-off-by: Cian Ferriter <cian.ferri...@intel.com>

---
v5:
* Create a selector function for the permutexvar implementations based
  on Sunil's feedback on the v4.  This hides the complexity of compile
  time and run time selection of permutexvar implementations.
* Add a comment explaining why VPOPCNTDQ_TARGET is defined and used.

v4:
* Combine the 3 commits which added checks for AVX512 ISA into this
  single commit since the first 2 commits were only useful and active
  when the 3rd commit was applied. This also takes care of Sunil's
  comment about explaining that the first 2 commits are precursors.
* Don't check for AVX512DQ availability in the compiler. This ISA isn't
  used in OVS.
* Put all AVX512 ISA checks in the OVS_CHECK_AVX512 macro as per Sunil's
  feedback.
* Define a function in acinclude.m4, (OVS_CONDITIONAL_CC_OPTION_DEFINE),
  to help with checking for AVX512 ISA support in the compiler.
* Remove the '__AVX512VPOPCNTDQ__' check. Use the HAVE_AVX512* pattern
  consistently with all AVX512 ISA checks instead. Fixup the comment
  explaining the _mm512_popcnt_epi64_wrapper() function to reflect this.

v3:
* Preserve the order of the mfex impl list. v2 changed this order. We
  want the order to be preserved because VBMI functions should be chosen
  by the mfex study impl where possible.

v2:
* Don't register vbmi specialized mfex impls unless VBMI is actually
  available.
  * This required some re-ordering of the mfex impl lists.
---
 acinclude.m4                           | 26 +++++++----
 lib/automake.mk                        | 14 ++++--
 lib/dpif-netdev-extract-avx512.c       | 64 ++++++++++++++++++--------
 lib/dpif-netdev-lookup-avx512-gather.c | 33 +++++++++----
 lib/dpif-netdev-lookup.c               |  3 +-
 lib/dpif-netdev-private-extract.c      | 18 ++++----
 lib/dpif-netdev-private-extract.h      | 19 +++++++-
 7 files changed, 127 insertions(+), 50 deletions(-)

diff --git a/acinclude.m4 b/acinclude.m4
index 61e88105f..7b2889a40 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -73,16 +73,13 @@ AC_DEFUN([OVS_CHECK_DPIF_AVX512_DEFAULT], [
 
 dnl OVS_CHECK_AVX512
 dnl
-dnl Checks if compiler and binutils supports AVX512.
+dnl Checks if compiler and binutils supports various AVX512 ISA.
 AC_DEFUN([OVS_CHECK_AVX512], [
   OVS_CHECK_BINUTILS_AVX512
-  OVS_CHECK_CC_OPTION(
-    [-mavx512f -mavx512vpopcntdq], [ovs_have_cc_mavx512f=yes], 
[ovs_have_cc_mavx512f=no])
-  AM_CONDITIONAL([HAVE_AVX512F], [test $ovs_have_cc_mavx512f = yes])
-  if test "$ovs_have_cc_mavx512f" = yes; then
-    AC_DEFINE([HAVE_AVX512F], [1],
-              [Define to 1 if compiler supports AVX512.])
-  fi
+  OVS_CONDITIONAL_CC_OPTION_DEFINE([-mavx512f], [HAVE_AVX512F])
+  OVS_CONDITIONAL_CC_OPTION_DEFINE([-mavx512bw], [HAVE_AVX512BW])
+  OVS_CONDITIONAL_CC_OPTION_DEFINE([-mavx512vbmi], [HAVE_AVX512VBMI])
+  OVS_CONDITIONAL_CC_OPTION_DEFINE([-mavx512vpopcntdq], [HAVE_AVX512VPOPCNTDQ])
 ])
 
 dnl OVS_ENABLE_WERROR
@@ -1360,6 +1357,19 @@ AC_DEFUN([OVS_CONDITIONAL_CC_OPTION],
    AM_CONDITIONAL([$2], [test $ovs_have_cc_option = yes])])
 dnl ----------------------------------------------------------------------
 
+dnl OVS_CONDITIONAL_CC_OPTION_DEFINE([OPTION], [CONDITIONAL])
+dnl Check whether the given C compiler OPTION is accepted.
+dnl If so, enable the given Automake CONDITIONAL and define it.
+dnl Example: OVS_CONDITIONAL_CC_OPTION_DEFINE([-mavx512f], [HAVE_AVX512F])
+AC_DEFUN([OVS_CONDITIONAL_CC_OPTION_DEFINE],
+  [OVS_CHECK_CC_OPTION(
+    [$1], [ovs_have_cc_option=yes], [ovs_have_cc_option=no])
+   AM_CONDITIONAL([$2], [test $ovs_have_cc_option = yes])
+   if test "$ovs_have_cc_option" = yes; then
+     AC_DEFINE([$2], [1],
+               [Define to 1 if compiler supports the '$1' option.])
+   fi])
+
 dnl Check for too-old XenServer.
 AC_DEFUN([OVS_CHECK_XENSERVER_VERSION],
   [AC_CACHE_CHECK([XenServer release], [ovs_cv_xsversion],
diff --git a/lib/automake.mk b/lib/automake.mk
index 14347bac6..cb50578eb 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -31,7 +31,6 @@ lib_LTLIBRARIES += lib/libopenvswitchavx512.la
 lib_libopenvswitch_la_LIBADD += lib/libopenvswitchavx512.la
 lib_libopenvswitchavx512_la_CFLAGS = \
        -mavx512f \
-       -mavx512bw \
        -mbmi \
        -mbmi2 \
        -fPIC \
@@ -39,13 +38,18 @@ lib_libopenvswitchavx512_la_CFLAGS = \
 lib_libopenvswitchavx512_la_SOURCES = \
        lib/cpu.c \
        lib/cpu.h \
-       lib/dpif-netdev-lookup-avx512-gather.c \
-       lib/dpif-netdev-extract-avx512.c \
        lib/dpif-netdev-avx512.c
+if HAVE_AVX512BW
+lib_libopenvswitchavx512_la_CFLAGS += \
+       -mavx512bw
+lib_libopenvswitchavx512_la_SOURCES += \
+       lib/dpif-netdev-extract-avx512.c \
+       lib/dpif-netdev-lookup-avx512-gather.c
+endif # HAVE_AVX512BW
 lib_libopenvswitchavx512_la_LDFLAGS = \
        -static
-endif
-endif
+endif # HAVE_LD_AVX512_GOOD
+endif # HAVE_AVX512F
 
 # Build core vswitch libraries as before
 lib_libopenvswitch_la_SOURCES = \
diff --git a/lib/dpif-netdev-extract-avx512.c b/lib/dpif-netdev-extract-avx512.c
index 4a94dfcfd..12271be17 100644
--- a/lib/dpif-netdev-extract-avx512.c
+++ b/lib/dpif-netdev-extract-avx512.c
@@ -108,13 +108,42 @@ _mm512_maskz_permutex2var_epi8_skx(__mmask64 k_mask,
     return v_result_kmskd;
 }
 
-/* Wrapper function required to enable ISA. */
+/* Wrapper function to enable VBMI ISA required by the
+ * _mm512_maskz_permutexvar_epi8 intrinsic. */
+#if HAVE_AVX512VBMI
 static inline __m512i
 __attribute__((__target__("avx512vbmi")))
 _mm512_maskz_permutexvar_epi8_wrap(__mmask64 kmask, __m512i idx, __m512i a)
 {
     return _mm512_maskz_permutexvar_epi8(kmask, idx, a);
 }
+#endif
+
+static inline __m512i
+_mm512_maskz_permutexvar_epi8_selector(__mmask64 k_shuf, __m512i v_shuf,
+                                       __m512i v_pkt0,
+                                       const uint32_t use_vbmi OVS_UNUSED)
+{
+    /* Permute the packet layout into miniflow blocks shape. */
+    __m512i v512_zeros = _mm512_setzero_si512();
+    __m512i v_blk0;
+#if HAVE_AVX512VBMI
+    if (__builtin_constant_p(use_vbmi) && use_vbmi) {
+        /* As different AVX512 ISA levels have different implementations,
+        * this specializes on the use_vbmi attribute passed in.
+        */
+        v_blk0 = _mm512_maskz_permutexvar_epi8_wrap(k_shuf, v_shuf, v_pkt0);
+
+    } else {
+        v_blk0 = _mm512_maskz_permutex2var_epi8_skx(k_shuf, v_pkt0, v_shuf,
+                                                    v512_zeros);
+    }
+#else
+    v_blk0 = _mm512_maskz_permutex2var_epi8_skx(k_shuf, v_pkt0, v_shuf,
+                                                v512_zeros);
+#endif
+    return v_blk0;
+}
 
 
 /* This file contains optimized implementations of miniflow_extract()
@@ -481,7 +510,7 @@ mfex_avx512_process(struct dp_packet_batch *packets,
                     odp_port_t in_port,
                     void *pmd_handle OVS_UNUSED,
                     const enum MFEX_PROFILES profile_id,
-                    const uint32_t use_vbmi)
+                    const uint32_t use_vbmi OVS_UNUSED)
 {
     uint32_t hitmask = 0;
     struct dp_packet *packet;
@@ -538,19 +567,9 @@ mfex_avx512_process(struct dp_packet_batch *packets,
         _mm_storeu_si128((void *) bits, v_bits);
         _mm_storeu_si128((void *) blocks, v_blocks01);
 
-        /* Permute the packet layout into miniflow blocks shape.
-         * As different AVX512 ISA levels have different implementations,
-         * this specializes on the "use_vbmi" attribute passed in.
-         */
-        __m512i v512_zeros = _mm512_setzero_si512();
-        __m512i v_blk0;
-        if (__builtin_constant_p(use_vbmi) && use_vbmi) {
-            v_blk0 = _mm512_maskz_permutexvar_epi8_wrap(k_shuf, v_shuf,
-                                                        v_pkt0);
-        } else {
-            v_blk0 = _mm512_maskz_permutex2var_epi8_skx(k_shuf, v_pkt0,
-                                                        v_shuf, v512_zeros);
-        }
+        __m512i v_blk0 = _mm512_maskz_permutexvar_epi8_selector(k_shuf, v_shuf,
+                                                                v_pkt0,
+                                                                use_vbmi);
 
         __m512i v_blk0_strip = _mm512_and_si512(v_blk0, v_strp);
         _mm512_storeu_si512(&blocks[2], v_blk0_strip);
@@ -629,7 +648,8 @@ mfex_avx512_process(struct dp_packet_batch *packets,
 }
 
 
-#define DECLARE_MFEX_FUNC(name, profile)                                \
+#if HAVE_AVX512VBMI
+#define VBMI_MFEX_FUNC(name, profile)                                   \
 uint32_t                                                                \
 __attribute__((__target__("avx512vbmi")))                               \
 mfex_avx512_vbmi_##name(struct dp_packet_batch *packets,                \
@@ -639,8 +659,12 @@ mfex_avx512_vbmi_##name(struct dp_packet_batch *packets,   
             \
 {                                                                       \
     return mfex_avx512_process(packets, keys, keys_size, in_port,       \
                                pmd_handle, profile, 1);                 \
-}                                                                       \
-                                                                        \
+}
+#else
+#define VBMI_MFEX_FUNC(name, profile)
+#endif
+
+#define BASIC_MFEX_FUNC(name, profile)                                  \
 uint32_t                                                                \
 mfex_avx512_##name(struct dp_packet_batch *packets,                     \
                    struct netdev_flow_key *keys, uint32_t keys_size,    \
@@ -651,6 +675,10 @@ mfex_avx512_##name(struct dp_packet_batch *packets,        
             \
                                pmd_handle, profile, 0);                 \
 }
 
+#define DECLARE_MFEX_FUNC(name, profile)                                \
+VBMI_MFEX_FUNC(name, profile)                                           \
+BASIC_MFEX_FUNC(name, profile)                                          \
+
 /* Each profile gets a single declare here, which specializes the function
  * as required.
  */
diff --git a/lib/dpif-netdev-lookup-avx512-gather.c 
b/lib/dpif-netdev-lookup-avx512-gather.c
index b396772bc..1e86be207 100644
--- a/lib/dpif-netdev-lookup-avx512-gather.c
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -78,22 +78,26 @@ _mm512_popcnt_epi64_manual(__m512i v_in)
     return _mm512_sad_epu8(v_u8_pop, _mm512_setzero_si512());
 }
 
-/* Wrapper function required to enable ISA. First enable the ISA via the
- * attribute target for this function, then check if the compiler actually
- * #defines the ISA itself. If the ISA is not #define-ed by the compiler it
- * indicates the compiler is too old or is not capable of compiling the
- * requested ISA level, so fallback to the integer manual implementation.
+/* Wrapper function required to enable ISA. First check if the compiler
+ * supports the ISA itself. If the ISA is supported, enable it via the
+ * attribute target.  If the ISA is not supported by the compiler it indicates
+ * the compiler is too old or is not capable of compiling the requested ISA
+ * level, so fallback to the integer manual implementation.
  */
+#if HAVE_AVX512VPOPCNTDQ
 static inline __m512i
 __attribute__((__target__("avx512vpopcntdq")))
 _mm512_popcnt_epi64_wrapper(__m512i v_in)
 {
-#ifdef __AVX512VPOPCNTDQ__
     return _mm512_popcnt_epi64(v_in);
+}
 #else
+static inline __m512i
+_mm512_popcnt_epi64_wrapper(__m512i v_in)
+{
     return _mm512_popcnt_epi64_manual(v_in);
-#endif
 }
+#endif
 
 static inline uint64_t
 netdev_rule_matches_key(const struct dpcls_rule *rule,
@@ -334,6 +338,19 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,
     return found_map;
 }
 
+/* Use a different pattern to conditionally use the VPOPCNTDQ target attribute
+ * here.
+ * The usual pattern using a '#if HAVE_AVX512VPOPCNTDQ' type check won't work
+ * inside a macro.
+ * Define VPOPCNTDQ_TARGET which will either be the "avx512vpopcntdq" target
+ * attribute or nothing depending on AVX512VPOPCNTDQ support in the compiler.
+ */
+#if HAVE_AVX512VPOPCNTDQ
+#define VPOPCNTDQ_TARGET __attribute__((__target__("avx512vpopcntdq")))
+#else
+#define VPOPCNTDQ_TARGET
+#endif
+
 /* Expand out specialized functions with U0 and U1 bit attributes. As the
  * AVX512 vpopcnt instruction is not supported on all AVX512 capable CPUs,
  * create two functions for each miniflow signature. This allows the runtime
@@ -351,7 +368,7 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,
                                   U0, U1, use_vpop);                          \
     }                                                                         \
                                                                               \
-    static uint32_t __attribute__((__target__("avx512vpopcntdq")))            \
+    static uint32_t VPOPCNTDQ_TARGET                                          \
     dpcls_avx512_gather_mf_##U0##_##U1##_vpop(struct dpcls_subtable *subtable,\
                                        uint32_t keys_map,                     \
                                        const struct netdev_flow_key *keys[],  \
diff --git a/lib/dpif-netdev-lookup.c b/lib/dpif-netdev-lookup.c
index bd0a99abe..c6aab6aed 100644
--- a/lib/dpif-netdev-lookup.c
+++ b/lib/dpif-netdev-lookup.c
@@ -43,7 +43,8 @@ static struct dpcls_subtable_lookup_info_t subtable_lookups[] 
= {
       .probe = dpcls_subtable_generic_probe,
       .name = "generic", },
 
-#if (__x86_64__ && HAVE_AVX512F && HAVE_LD_AVX512_GOOD && __SSE4_2__)
+#if (__x86_64__ && HAVE_AVX512F && HAVE_LD_AVX512_GOOD && HAVE_AVX512BW \
+     && __SSE4_2__)
     /* Only available on x86_64 bit builds with SSE 4.2 used for OVS core. */
     { .prio = 0,
       .probe = dpcls_subtable_avx512_gather_probe,
diff --git a/lib/dpif-netdev-private-extract.c 
b/lib/dpif-netdev-private-extract.c
index b7f094dac..9ce4e0909 100644
--- a/lib/dpif-netdev-private-extract.c
+++ b/lib/dpif-netdev-private-extract.c
@@ -54,42 +54,44 @@ static struct dpif_miniflow_extract_impl mfex_impls[] = {
         .name = "study", },
 
 /* Compile in implementations only if the compiler ISA checks pass. */
-#if (__x86_64__ && HAVE_AVX512F && HAVE_LD_AVX512_GOOD && __SSE4_2__)
+#if (__x86_64__ && HAVE_AVX512F && HAVE_LD_AVX512_GOOD && HAVE_AVX512BW \
+     && __SSE4_2__)
+#if HAVE_AVX512VBMI
     [MFEX_IMPL_VBMI_IPv4_UDP] = {
         .probe = mfex_avx512_vbmi_probe,
         .extract_func = mfex_avx512_vbmi_ip_udp,
         .name = "avx512_vbmi_ipv4_udp", },
-
+#endif
     [MFEX_IMPL_IPv4_UDP] = {
         .probe = mfex_avx512_probe,
         .extract_func = mfex_avx512_ip_udp,
         .name = "avx512_ipv4_udp", },
-
+#if HAVE_AVX512VBMI
     [MFEX_IMPL_VBMI_IPv4_TCP] = {
         .probe = mfex_avx512_vbmi_probe,
         .extract_func = mfex_avx512_vbmi_ip_tcp,
         .name = "avx512_vbmi_ipv4_tcp", },
-
+#endif
     [MFEX_IMPL_IPv4_TCP] = {
         .probe = mfex_avx512_probe,
         .extract_func = mfex_avx512_ip_tcp,
         .name = "avx512_ipv4_tcp", },
-
+#if HAVE_AVX512VBMI
     [MFEX_IMPL_VBMI_DOT1Q_IPv4_UDP] = {
         .probe = mfex_avx512_vbmi_probe,
         .extract_func = mfex_avx512_vbmi_dot1q_ip_udp,
         .name = "avx512_vbmi_dot1q_ipv4_udp", },
-
+#endif
     [MFEX_IMPL_DOT1Q_IPv4_UDP] = {
         .probe = mfex_avx512_probe,
         .extract_func = mfex_avx512_dot1q_ip_udp,
         .name = "avx512_dot1q_ipv4_udp", },
-
+#if HAVE_AVX512VBMI
     [MFEX_IMPL_VBMI_DOT1Q_IPv4_TCP] = {
         .probe = mfex_avx512_vbmi_probe,
         .extract_func = mfex_avx512_vbmi_dot1q_ip_tcp,
         .name = "avx512_vbmi_dot1q_ipv4_tcp", },
-
+#endif
     [MFEX_IMPL_DOT1Q_IPv4_TCP] = {
         .probe = mfex_avx512_probe,
         .extract_func = mfex_avx512_dot1q_ip_tcp,
diff --git a/lib/dpif-netdev-private-extract.h 
b/lib/dpif-netdev-private-extract.h
index ae5c161b4..092126106 100644
--- a/lib/dpif-netdev-private-extract.h
+++ b/lib/dpif-netdev-private-extract.h
@@ -81,14 +81,23 @@ enum dpif_miniflow_extract_impl_idx {
     MFEX_IMPL_AUTOVALIDATOR,
     MFEX_IMPL_SCALAR,
     MFEX_IMPL_STUDY,
-#if (__x86_64__ && HAVE_AVX512F && HAVE_LD_AVX512_GOOD && __SSE4_2__)
+#if (__x86_64__ && HAVE_AVX512F && HAVE_LD_AVX512_GOOD && HAVE_AVX512BW \
+     && __SSE4_2__)
+#if HAVE_AVX512VBMI
     MFEX_IMPL_VBMI_IPv4_UDP,
+#endif
     MFEX_IMPL_IPv4_UDP,
+#if HAVE_AVX512VBMI
     MFEX_IMPL_VBMI_IPv4_TCP,
+#endif
     MFEX_IMPL_IPv4_TCP,
+#if HAVE_AVX512VBMI
     MFEX_IMPL_VBMI_DOT1Q_IPv4_UDP,
+#endif
     MFEX_IMPL_DOT1Q_IPv4_UDP,
+#if HAVE_AVX512VBMI
     MFEX_IMPL_VBMI_DOT1Q_IPv4_TCP,
+#endif
     MFEX_IMPL_DOT1Q_IPv4_TCP,
 #endif
     MFEX_IMPL_MAX
@@ -99,9 +108,15 @@ extern struct ovs_mutex dp_netdev_mutex;
 /* Define a index which points to the first traffic optimized MFEX
  * option from the enum list else holds max value.
  */
-#if (__x86_64__ && HAVE_AVX512F && HAVE_LD_AVX512_GOOD && __SSE4_2__)
+#if (__x86_64__ && HAVE_AVX512F && HAVE_LD_AVX512_GOOD && HAVE_AVX512BW \
+     && __SSE4_2__)
 
+#if HAVE_AVX512VBMI
 #define MFEX_IMPL_START_IDX MFEX_IMPL_VBMI_IPv4_UDP
+#else
+#define MFEX_IMPL_START_IDX MFEX_IMPL_IPv4_UDP
+#endif
+
 #else
 
 #define MFEX_IMPL_START_IDX MFEX_IMPL_MAX
-- 
2.25.1

_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to