from:"Harry van Haaren"

[ovs-dev] [PATCH v10 09/10] odp-execute: Add ISA implementation of set_masked ETH

2022-07-13 Thread Harry van Haaren

From: Emma Finn 

This commit includes infrastructure changes for enabling set_masked_X
actions and also adds support for the AVX512 implementation of the
eth_set_addrs action.

Signed-off-by: Emma Finn 
---
 lib/odp-execute-avx512.c  | 90 +++
 lib/odp-execute-private.c | 14 ++
 lib/odp-execute-private.h |  3 ++
 lib/odp-execute.c | 49 +++--
 lib/odp-execute.h |  3 ++
 5 files changed, 137 insertions(+), 22 deletions(-)

diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
index 3449acff7..8ecdaecf6 100644
--- a/lib/odp-execute-avx512.c
+++ b/lib/odp-execute-avx512.c
@@ -23,6 +23,7 @@
 
 #include "dp-packet.h"
 #include "immintrin.h"
+#include "odp-execute.h"
 #include "odp-execute-private.h"
 #include "odp-netlink.h"
 #include "openvswitch/vlog.h"
@@ -50,6 +51,16 @@ BUILD_ASSERT_DECL(offsetof(struct dp_packet, l3_ofs) +
 BUILD_ASSERT_DECL(sizeof(struct dp_packet) -
   offsetof(struct dp_packet, l2_pad_size) >= sizeof(__m128i));
 
+/* The below build assert makes sure the order of the fields needed by the
+ * shuffle operations of the set masked functions does not change. This should
+ * not happen as these are defined under the Linux uapi. */
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ethernet, eth_src) +
+  MEMBER_SIZEOF(struct ovs_key_ethernet, eth_src) ==
+  offsetof(struct ovs_key_ethernet, eth_dst));
+
+/* Array of callback functions, one for each masked operation. */
+odp_execute_action_cb impl_set_masked_funcs[__OVS_KEY_ATTR_MAX];
+
 static inline void ALWAYS_INLINE
 avx512_dp_packet_resize_l2(struct dp_packet *b, int resize_by_bytes)
 {
@@ -207,6 +218,80 @@ action_avx512_push_vlan(struct dp_packet_batch *batch, 
const struct nlattr *a)
 }
 }
 
+/* This function performs the same operation on each packet in the batch as
+ * the scalar odp_eth_set_addrs() function: for every packet that has an
+ * ethernet header, the bits selected by the mask are replaced with the
+ * corresponding key bits. */
+static void
+action_avx512_eth_set_addrs(struct dp_packet_batch *batch,
+const struct nlattr *a)
+{
+const struct ovs_key_ethernet *key, *mask;
+struct dp_packet *packet;
+
+a = nl_attr_get(a);
+key = nl_attr_get(a);
+mask = odp_get_key_mask(a, struct ovs_key_ethernet);
+
+/* Read the content of the key(src) and mask in the respective registers.
+ * We only load the src and dest addresses, which is only 96-bits and not
+ * 128-bits (the 0x7 load mask selects the three lowest 32-bit lanes). */
+__m128i v_src = _mm_maskz_loadu_epi32(0x7,(void *) key);
+__m128i v_mask = _mm_maskz_loadu_epi32(0x7, (void *) mask);
+
+
+/* This shuffle mask is used below, and each position tells which source
+ * byte is moved there. So here, the byte at index 6 in
+ * ovs_key_ethernet is moved to byte location 0 in v_src/v_mask.
+ * The byte at index 7 is moved to 1, etc., etc.
+ * This swap is needed to move the src and dest MAC addresses in the
+ * same order as in the ethernet packet. */
+static const uint8_t eth_shuffle[16] = {
+6, 7, 8, 9, 10, 11, 0, 1,
+2, 3, 4, 5, 0xFF, 0xFF, 0xFF, 0xFF
+};
+
+/* Load the shuffle mask in v_shuf. */
+__m128i v_shuf = _mm_loadu_si128((void *) eth_shuffle);
+
+/* Swap the key/mask src and dest addresses to the ethernet order. */
+v_src = _mm_shuffle_epi8(v_src, v_shuf);
+v_mask = _mm_shuffle_epi8(v_mask, v_shuf);
+
+DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+
+struct eth_header *eh = dp_packet_eth(packet);
+
+if (!eh) {
+continue;
+}
+
+/* Load the first 128-bits of the packet into the v_dst register. */
+__m128i v_dst = _mm_loadu_si128((void *) eh);
+
+/* AND the inverted v_mask with the packet data (v_dst), i.e. clear the
+ * packet bits that are about to be replaced. */
+__m128i dst_masked = _mm_andnot_si128(v_mask, v_dst);
+
+/* OR the new addresses (v_src) with the masked packet addresses
+ * (dst_masked). */
+__m128i res = _mm_or_si128(v_src, dst_masked);
+
+/* Write back the modified ethernet addresses. */
+_mm_storeu_si128((void *) eh, res);
+}
+}
+
+/* Dispatches a set_masked action for the whole batch. The type of the
+ * attribute nested inside 'a' selects a callback from
+ * impl_set_masked_funcs[]; when no callback is registered for that type the
+ * batch is handed to odp_execute_scalar_action() instead. */
+static void
+action_avx512_set_masked(struct dp_packet_batch *batch, const struct nlattr *a)
+{
+/* Despite its name, 'mask' is the nested attribute whose type is used as
+ * the index into the callback array. */
+const struct nlattr *mask = nl_attr_get(a);
+enum ovs_key_attr attr_type = nl_attr_type(mask);
+
+if (attr_type <= OVS_KEY_ATTR_MAX && impl_set_masked_funcs[attr_type]) {
+impl_set_masked_funcs[attr_type](batch, a);
+} else {
+odp_execute_scalar_action(batch, a);
+}
+}
+
 int
 action_avx512_init(struct odp_execute_action_impl *self OVS_UNUSED)
 {
@@ -214,6 +299,11 @@ action_avx512_init(struct odp_execute_action_impl *self 
OVS_UNUSED)
  * are identified by OVS_ACTION_ATTR_*. */
 self->funcs[OVS_ACTION_ATTR_POP_VLAN] = action_avx512_pop_vlan;
 self->funcs[OVS_ACTION_ATTR_PUSH_VLAN] = action_avx512_push_vlan;
+self->funcs[OVS_ACTION_ATTR_SET_MASKED] = action_avx512_set_masked;
+
+/* Set function pointers for the individual operations supported by the
+ * SET_MASKED action.

[ovs-dev] [PATCH v10 10/10] odp-execute: Add ISA implementation of set_masked IPv4 action

2022-07-13 Thread Harry van Haaren

From: Emma Finn 

This commit adds support for the AVX512 implementation of the
ipv4_set_addrs action as well as an AVX512 implementation of
updating the checksums.

Signed-off-by: Emma Finn 
---
 lib/odp-execute-avx512.c | 208 +++
 1 file changed, 208 insertions(+)

diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
index 8ecdaecf6..a0c97f312 100644
--- a/lib/odp-execute-avx512.c
+++ b/lib/odp-execute-avx512.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 
+#include "csum.h"
 #include "dp-packet.h"
 #include "immintrin.h"
 #include "odp-execute.h"
@@ -58,6 +59,22 @@ BUILD_ASSERT_DECL(offsetof(struct ovs_key_ethernet, eth_src) 
+
   MEMBER_SIZEOF(struct ovs_key_ethernet, eth_src) ==
   offsetof(struct ovs_key_ethernet, eth_dst));
 
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_src) +
+  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_src) ==
+  offsetof(struct ovs_key_ipv4, ipv4_dst));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_dst) +
+  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_dst) ==
+  offsetof(struct ovs_key_ipv4, ipv4_proto));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_proto) +
+  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_proto) ==
+  offsetof(struct ovs_key_ipv4, ipv4_tos));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_tos) +
+  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_tos) ==
+  offsetof(struct ovs_key_ipv4, ipv4_ttl));
+
 /* Array of callback functions, one for each masked operation. */
 odp_execute_action_cb impl_set_masked_funcs[__OVS_KEY_ATTR_MAX];
 
@@ -279,6 +296,196 @@ action_avx512_eth_set_addrs(struct dp_packet_batch *batch,
 }
 }
 
+/* Sums the 16-bit words selected from 'old_header' and 'res' and returns the
+ * result reduced to a single 16-bit value. Each selected 16-bit word is first
+ * widened to 32 bits so that carries are kept during the additions; the final
+ * 16-bit horizontal add combines the two halves of the 32-bit sum. */
+static inline uint16_t ALWAYS_INLINE
+avx512_get_delta(__m256i old_header, __m256i res)
+{
+__m256i v_zeros = _mm256_setzero_si256();
+uint16_t delta;
+
+/* These two shuffle masks, v_swap16a and v_swap16b, are to shuffle the
+ * old and new header to add padding after each 16-bit value for the
+ * following carry over addition. A 0xFFFF entry fills that 16-bit
+ * position with zeros. */
+__m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xFFFF, 0x0302, 0xFFFF,
+  0x0504, 0xFFFF, 0x0706, 0xFFFF,
+  0x0100, 0xFFFF, 0x0302, 0xFFFF,
+  0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF);
+__m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xFFFF, 0x0B0A, 0xFFFF,
+  0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF,
+  0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+  0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF);
+__m256i v_shuf_old1 = _mm256_shuffle_epi8(old_header, v_swap16a);
+__m256i v_shuf_old2 = _mm256_shuffle_epi8(old_header, v_swap16b);
+__m256i v_shuf_new1 = _mm256_shuffle_epi8(res, v_swap16a);
+__m256i v_shuf_new2 = _mm256_shuffle_epi8(res, v_swap16b);
+
+/* Add each part of the old and new headers together. */
+__m256i v_delta1 = _mm256_add_epi32(v_shuf_old1, v_shuf_new1);
+__m256i v_delta2 = _mm256_add_epi32(v_shuf_old2, v_shuf_new2);
+
+/* Add old and new header. */
+__m256i v_delta = _mm256_add_epi32(v_delta1, v_delta2);
+
+/* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
+v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
+v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
+
+/* Shuffle 32-bit value from 3rd lane into first lane for final
+ * horizontal add. */
+__m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
+  0xF, 0xF, 0xF, 0xF);
+v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta);
+
+v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
+v_delta = _mm256_hadd_epi16(v_delta, v_zeros);
+
+/* Extract delta value. */
+delta = _mm256_extract_epi16(v_delta, 0);
+
+return delta;
+}
+
+/* Returns the 16-bit delta computed by avx512_get_delta() from the inverted
+ * 'old_header' and a copy of 'old_header' in which the 16-bit words selected
+ * by the 0x03C0 blend mask have been taken from 'res'. */
+static inline uint16_t ALWAYS_INLINE
+avx512_l4_update_csum(__m256i old_header, __m256i res)
+{
+__m256i v_zeros = _mm256_setzero_si256();
+uint16_t delta;
+
+/* Set the v_ones register to all one's. */
+__m256i v_ones = _mm256_cmpeq_epi16(v_zeros, v_zeros);
+
+/* Combine the old and new header, i.e. adding in the new IP addresses
+ * in the old header (oh). This is done by using the 0x03C0 16-bit mask,
+ * picking 16-bit word 7 till 10 (mask bits 6 to 9). */
+__m256i v_blend_new = _mm256_mask_blend_epi16(0x03C0, old_header, res);
+
+/* Invert the old_header register. */
+old_header =_mm256_andnot_si256(old_header, v_ones);
+
+/* Calculate the delta between the old and new header. */
+delta = avx512_get_delta(old_header, v_blend_new);
+
+return delta;
+
+}
+
+static inline uint16_t ALWAYS_INLINE
+avx512_ipv4_update_csum(__m256i res, __m256i old_header)
+{
+__m256i v_zeros = _mm256_setzero_si256();
+uint16_t delta;
+
+/*

[ovs-dev] [PATCH v10 08/10] odp-execute: Add ISA implementation of push_vlan action.

2022-07-13 Thread Harry van Haaren

From: Emma Finn 

This commit adds the AVX512 implementation of the
push_vlan action.

Signed-off-by: Emma Finn 
---
 lib/odp-execute-avx512.c | 55 
 lib/odp-execute.c| 22 +---
 2 files changed, 68 insertions(+), 9 deletions(-)

diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
index fd10f7f5c..3449acff7 100644
--- a/lib/odp-execute-avx512.c
+++ b/lib/odp-execute-avx512.c
@@ -154,12 +154,67 @@ action_avx512_pop_vlan(struct dp_packet_batch *batch,
 }
 }
 
+/* This function performs the same operation on each packet in the batch as
+ * the scalar eth_push_vlan() function. The TPID and TCI to insert are read
+ * from the ovs_action_push_vlan payload of attribute 'a'. */
+static void
+action_avx512_push_vlan(struct dp_packet_batch *batch, const struct nlattr *a)
+{
+struct dp_packet *packet;
+const struct ovs_action_push_vlan *vlan = nl_attr_get(a);
+ovs_be16 tpid, tci;
+
+/* This shuffle mask is used below, and each position tells where to
+ * move the bytes to. So here, the fourth byte in v_ether is moved to
+ * byte location 0 in v_shift. The fifth is moved to 1, etc., etc.
+ * The 0xFF is special it tells to fill that position with 0.
+ */
+static const uint8_t vlan_push_shuffle_mask[16] = {
+4, 5, 6, 7, 8, 9, 10, 11,
+12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF
+};
+
+/* Load the shuffle mask in v_index. */
+__m128i v_index = _mm_loadu_si128((void *) vlan_push_shuffle_mask);
+
+DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+tpid = vlan->vlan_tpid;
+tci = vlan->vlan_tci;
+
+/* As we are about to insert the VLAN_HEADER we now need to adjust all
+ * the offsets. */
+avx512_dp_packet_resize_l2(packet, VLAN_HEADER_LEN);
+
+char *pkt_data = (char *) dp_packet_data(packet);
+
+/* Build up the VLAN TCI/TPID in a single uint32_t. The VLAN_CFI bit is
+ * cleared from the TCI before it is placed in the upper 16 bits. */
+const uint32_t tci_proc = tci & htons(~VLAN_CFI);
+const uint32_t tpid_tci = (tci_proc << 16) | tpid;
+
+/* Load the first 128-bits of the packet into the v_ether register.
+ * Note that this includes the 4 unused bytes (VLAN_HEADER_LEN). */
+__m128i v_ether = _mm_loadu_si128((void *) pkt_data);
+
+/* Move(shuffle) the veth_dst and veth_src data to create room for
+ * the vlan header. */
+__m128i v_shift = _mm_shuffle_epi8(v_ether, v_index);
+
+/* Copy(insert) the 32-bit VLAN header, tpid_tci, at the 3rd 32-bit
+ * word offset, i.e. offsetof(vlan_eth_header, veth_type) */
+__m128i v_vlan_hdr = _mm_insert_epi32(v_shift, tpid_tci, 3);
+
+/* Write back the modified ethernet header. */
+_mm_storeu_si128((void *) pkt_data, v_vlan_hdr);
+}
+}
+
 int
 action_avx512_init(struct odp_execute_action_impl *self OVS_UNUSED)
 {
 /* Set function pointers for actions that can be applied directly, these
  * are identified by OVS_ACTION_ATTR_*. */
 self->funcs[OVS_ACTION_ATTR_POP_VLAN] = action_avx512_pop_vlan;
+self->funcs[OVS_ACTION_ATTR_PUSH_VLAN] = action_avx512_push_vlan;
+
 return 0;
 }
 
diff --git a/lib/odp-execute.c b/lib/odp-execute.c
index f112f3b48..0c5837640 100644
--- a/lib/odp-execute.c
+++ b/lib/odp-execute.c
@@ -846,6 +846,17 @@ action_pop_vlan(struct dp_packet_batch *batch,
 }
 }
 
+/* Scalar push_vlan action: calls eth_push_vlan() on every packet in 'batch'
+ * with the TPID and TCI taken from the ovs_action_push_vlan payload of
+ * attribute 'a'. */
+static void
+action_push_vlan(struct dp_packet_batch *batch, const struct nlattr *a)
+{
+struct dp_packet *packet;
+const struct ovs_action_push_vlan *vlan = nl_attr_get(a);
+
+DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+eth_push_vlan(packet, vlan->vlan_tpid, vlan->vlan_tci);
+}
+}
+
 /* Implementation of the scalar actions impl init function. Build up the
  * array of func ptrs here.
  */
@@ -855,6 +866,7 @@ odp_action_scalar_init(struct odp_execute_action_impl *self)
 /* Set function pointers for actions that can be applied directly, these
  * are identified by OVS_ACTION_ATTR_*. */
 self->funcs[OVS_ACTION_ATTR_POP_VLAN] = action_pop_vlan;
+self->funcs[OVS_ACTION_ATTR_PUSH_VLAN] = action_push_vlan;
 
 return 0;
 }
@@ -1045,15 +1057,6 @@ odp_execute_actions(void *dp, struct dp_packet_batch 
*batch, bool steal,
 break;
 }
 
-case OVS_ACTION_ATTR_PUSH_VLAN: {
-const struct ovs_action_push_vlan *vlan = nl_attr_get(a);
-
-DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
-eth_push_vlan(packet, vlan->vlan_tpid, vlan->vlan_tci);
-}
-break;
-}
-
 case OVS_ACTION_ATTR_PUSH_MPLS: {
 const struct ovs_action_push_mpls *mpls = nl_attr_get(a);
 
@@ -1206,6 +1209,7 @@ odp_execute_actions(void *dp, struct dp_packet_batch 
*batch, bool steal,
 case __OVS_ACTION_ATTR_MAX:
 /* The following actions are handled by the scalar implementation. */
 case OVS_ACTION_ATTR_POP_VLAN:
+case OVS_ACTION_ATTR_PUSH_VLAN:
 OVS_NOT_REACHED();
 }
 
-- 
2.32.0

[ovs-dev] [PATCH v10 07/10] odp-execute: Add ISA implementation of pop_vlan action.

2022-07-13 Thread Harry van Haaren

This commit adds the AVX512 implementation of the
pop_vlan action.

Signed-off-by: Emma Finn 

---

v10:
- Improved ISA checks to fix CI build
---
 lib/automake.mk   |   4 +
 lib/odp-execute-avx512.c  | 182 ++
 lib/odp-execute-private.c |  33 ++-
 lib/odp-execute-private.h |   2 +
 4 files changed, 220 insertions(+), 1 deletion(-)
 create mode 100644 lib/odp-execute-avx512.c

diff --git a/lib/automake.mk b/lib/automake.mk
index 5c3b05f6b..a76de6dbf 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -45,6 +45,10 @@ lib_libopenvswitchavx512_la_CFLAGS += \
 lib_libopenvswitchavx512_la_SOURCES += \
lib/dpif-netdev-extract-avx512.c \
lib/dpif-netdev-lookup-avx512-gather.c
+if HAVE_GCC_AVX512VL_GOOD
+lib_libopenvswitchavx512_la_SOURCES += \
+   lib/odp-execute-avx512.c
+endif # HAVE_GCC_AVX512VL_GOOD
 endif # HAVE_AVX512VL
 endif # HAVE_AVX512BW
 lib_libopenvswitchavx512_la_LDFLAGS = \
diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
new file mode 100644
index 0..fd10f7f5c
--- /dev/null
+++ b/lib/odp-execute-avx512.c
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2022 Intel.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __x86_64__
+/* Sparse cannot handle the AVX512 instructions. */
+#if !defined(__CHECKER__)
+
+#include 
+#include 
+
+#include "dp-packet.h"
+#include "immintrin.h"
+#include "odp-execute-private.h"
+#include "odp-netlink.h"
+#include "openvswitch/vlog.h"
+
+VLOG_DEFINE_THIS_MODULE(odp_execute_avx512);
+
+/* The below three build asserts make sure that l2_5_ofs, l3_ofs, and l4_ofs
+ * fields remain in the same order and offset to l2_pad_size. This is needed
+ * as the avx512_dp_packet_resize_l2() function will manipulate those fields at
+ * a fixed memory index based on the l2_pad_size offset. */
+BUILD_ASSERT_DECL(offsetof(struct dp_packet, l2_pad_size) +
+  MEMBER_SIZEOF(struct dp_packet, l2_pad_size) ==
+  offsetof(struct dp_packet, l2_5_ofs));
+
+BUILD_ASSERT_DECL(offsetof(struct dp_packet, l2_5_ofs) +
+  MEMBER_SIZEOF(struct dp_packet, l2_5_ofs) ==
+  offsetof(struct dp_packet, l3_ofs));
+
+BUILD_ASSERT_DECL(offsetof(struct dp_packet, l3_ofs) +
+   MEMBER_SIZEOF(struct dp_packet, l3_ofs) ==
+   offsetof(struct dp_packet, l4_ofs));
+
+/* The below build assert makes sure it's safe to read/write 128-bits starting
+ * at the l2_pad_size location. */
+BUILD_ASSERT_DECL(sizeof(struct dp_packet) -
+  offsetof(struct dp_packet, l2_pad_size) >= sizeof(__m128i));
+
+static inline void ALWAYS_INLINE
+avx512_dp_packet_resize_l2(struct dp_packet *b, int resize_by_bytes)
+{
+/* Update packet size/data pointers, same as the scalar implementation. */
+if (resize_by_bytes >= 0) {
+dp_packet_push_uninit(b, resize_by_bytes);
+} else {
+dp_packet_pull(b, -resize_by_bytes);
+}
+
+/* The next step is to update the l2_5_ofs, l3_ofs and l4_ofs fields which
+ * the scalar implementation does with the  dp_packet_adjust_layer_offset()
+ * function. */
+
+/* Set the v_zero register to all zero's. */
+const __m128i v_zeros = _mm_setzero_si128();
+
+/* Set the v_u16_max register to all one's. */
+const __m128i v_u16_max = _mm_cmpeq_epi16(v_zeros, v_zeros);
+
+/* Each lane represents 16 bits in a 128-bit register. In this case lanes
+ * 1 to 3 are selected, which will map to the l2_5_ofs, l3_ofs and
+ * l4_ofs fields. */
+const uint8_t k_lanes = 0b1110;
+
+/* Set all 16-bit words in the 128-bits v_offset register to the value we
+ * need to add/subtract from the l2_5_ofs, l3_ofs, and l4_ofs fields. */
+__m128i v_offset = _mm_set1_epi16(abs(resize_by_bytes));
+
+/* Load 128 bits from the dp_packet structure starting at the l2_pad_size
+ * offset. */
+void *adjust_ptr = >l2_pad_size;
+__m128i v_adjust_src = _mm_loadu_si128(adjust_ptr);
+
+/* Here is the tricky part, we only need to update the value of the three
+ * fields if they are not UINT16_MAX. The following function will return
+ * a mask of lanes (read fields) that are not UINT16_MAX. It will do this
+ * by comparing only the lanes we requested, k_lanes, and if they match
+ * v_u16_max, the bit will be set. */
+__mmask8 k_cmp = _mm_mask_cmpneq_epu16_mask(k_lanes, v_adjust_src,
+

[ovs-dev] [PATCH v10 06/10] odp-execute: Add ISA implementation of actions.

2022-07-13 Thread Harry van Haaren

From: Emma Finn 

This commit adds the AVX512 implementation of the action functionality.

Usage:
  $ ovs-appctl odp-execute/action-impl-set avx512

Signed-off-by: Emma Finn 
Signed-off-by: Harry van Haaren 

---

v10:
- Move location and rework documentation (Ilya)
- Improve ISA checks to fix CI build issue

v9: rebase conflict on NEWS
---
 Documentation/topics/dpdk/bridge.rst | 30 
 Documentation/topics/testing.rst | 24 ++
 NEWS |  1 +
 acinclude.m4 |  1 +
 lib/cpu.c|  1 +
 lib/cpu.h|  1 +
 lib/odp-execute-private.c|  8 
 lib/odp-execute-private.h| 12 +++
 m4/openvswitch.m4| 29 +++
 9 files changed, 99 insertions(+), 8 deletions(-)

diff --git a/Documentation/topics/dpdk/bridge.rst 
b/Documentation/topics/dpdk/bridge.rst
index 1f626c7c2..354f1ced1 100644
--- a/Documentation/topics/dpdk/bridge.rst
+++ b/Documentation/topics/dpdk/bridge.rst
@@ -321,3 +321,33 @@ following command::
 ``scalar`` can be selected on core ``3`` by the following command::
 
 $ ovs-appctl dpif-netdev/miniflow-parser-set -pmd 3 scalar
+
+
+Actions Implementations (Experimental)
+--
+
+Actions describe what processing or modification should be performed on a
+packet when it matches a given flow. Similar to the datapath interface,
+DPCLS and MFEX (see above), the implementation of these actions can be
+accelerated using SIMD instructions, resulting in improved performance.
+
+OVS provides multiple implementations of the actions, however some
+implementations require a CPU capable of executing the required SIMD
+instructions.
+
+Available implementations can be listed with the following command::
+
+$ ovs-appctl odp-execute/action-impl-show
+Available Actions implementations:
+scalar (available: Yes, active: Yes)
+autovalidator (available: Yes, active: No)
+avx512 (available: Yes, active: No)
+
+By default, ``scalar`` is used.  Implementations can be selected by
+name::
+
+$ ovs-appctl odp-execute/action-impl-set avx512
+Action implementation set to avx512.
+
+$ ovs-appctl odp-execute/action-impl-set scalar
+Action implementation set to scalar.
diff --git a/Documentation/topics/testing.rst b/Documentation/topics/testing.rst
index c15d5b38f..a6c747b18 100644
--- a/Documentation/topics/testing.rst
+++ b/Documentation/topics/testing.rst
@@ -361,12 +361,12 @@ testsuite.
 Userspace datapath: Testing and Validation of CPU-specific Optimizations
 
 
-As multiple versions of the datapath classifier and packet parsing functions
-can co-exist, each with different CPU ISA optimizations, it is important to
-validate that they all give the exact same results.  To easily test all the
-implementations, an ``autovalidator`` implementation of them exists.  This
-implementation runs all other available implementations, and verifies that the
-results are identical.
+As multiple versions of the datapath classifier, packet parsing functions and
+actions can co-exist, each with different CPU ISA optimizations, it is
+important to validate that they all give the exact same results.  To easily
+test all the implementations, an ``autovalidator`` implementation of them
+exists. This implementation runs all other available implementations, and
+verifies that the results are identical.
 
 Running the OVS unit tests with the autovalidator enabled ensures all
 implementations provide the same results.  Note that the performance of the
@@ -382,18 +382,26 @@ To set the autovalidator for the packet parser, use this 
command::
 
 $ ovs-appctl dpif-netdev/miniflow-parser-set autovalidator
 
+To set the autovalidator for actions, use this command::
+
+$ ovs-appctl odp-execute/action-impl-set autovalidator
+
 To run the OVS unit test suite with the autovalidator as the default
 implementation, it is required to recompile OVS.  During the recompilation,
 the default priority of the `autovalidator` implementation is set to the
-maximum priority, ensuring every test will be run with every implementation::
+maximum priority, ensuring every test will be run with every implementation.
+Priority is only related to mfex autovalidator and not the actions
+autovalidator.::
 
-$ ./configure --enable-autovalidator --enable-mfex-default-autovalidator
+$ ./configure --enable-autovalidator --enable-mfex-default-autovalidator \
+--enable-actions-default-autovalidator
 
 The following line should be seen in the configuration log when the above
 options are used::
 
 checking whether DPCLS Autovalidator is default implementation... yes
 checking whether MFEX Autovalidator is default implementation... yes
+checking whether actions

[ovs-dev] [PATCH v10 05/10] acinclude: Add configure option to enable actions autovalidator at build time.

2022-07-13 Thread Harry van Haaren

From: Kumar Amber 

This commit adds a new configure option to allow the user to enable the
actions autovalidator by default at build time, thus allowing the
unit tests to run with it by default.

 $ ./configure --enable-actions-default-autovalidator

Signed-off-by: Kumar Amber 
Acked-by: Harry van Haaren 

---

v9:
- rebase conflict on NEWS
- fixup missing "dnl" in comment introduced by previous line-wrapping
---
 NEWS  |  2 ++
 acinclude.m4  | 20 
 configure.ac  |  1 +
 lib/odp-execute.c |  4 
 4 files changed, 27 insertions(+)

diff --git a/NEWS b/NEWS
index d02733936..bb03457f6 100644
--- a/NEWS
+++ b/NEWS
@@ -54,6 +54,8 @@ Post-v2.17.0
implementations against default implementation.
  * Add command line option to switch between different actions
implementations available at run time.
+ * Add build time configure command to enable auto-validator as default
+   actions implementation at build time.
- Linux datapath:
  * Add offloading meter tc police.
  * Add support for offloading the check_pkt_len action.
diff --git a/acinclude.m4 b/acinclude.m4
index d15f11a4e..84fad425c 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -14,6 +14,26 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+dnl Set OVS Actions Autovalidator as the default action implementation
+dnl at compile time. This enables automatically running all unit tests
+dnl with all actions implementations. When the option is enabled,
+dnl ACTIONS_AUTOVALIDATOR_DEFAULT is defined to 1.
+AC_DEFUN([OVS_CHECK_ACTIONS_AUTOVALIDATOR], [
+  AC_ARG_ENABLE([actions-default-autovalidator],
+[AC_HELP_STRING([--enable-actions-default-autovalidator],
+[Enable actions autovalidator as default
+ ovs actions implementation.])],
+[autovalidator=yes],[autovalidator=no])
+  AC_MSG_CHECKING([whether actions Autovalidator is default implementation])
+  if test "$autovalidator" != yes; then
+AC_MSG_RESULT([no])
+  else
+AC_DEFINE([ACTIONS_AUTOVALIDATOR_DEFAULT], [1],
+  [Autovalidator for actions is a default implementation.])
+AC_MSG_RESULT([yes])
+  fi
+])
+
+
 dnl Set OVS MFEX Autovalidator as default miniflow extract at compile time?
 dnl This enables automatically running all unit tests with all MFEX
 dnl implementations.
diff --git a/configure.ac b/configure.ac
index 6f8679d7c..6c51e48ce 100644
--- a/configure.ac
+++ b/configure.ac
@@ -184,6 +184,7 @@ OVS_CONDITIONAL_CC_OPTION([-Wno-unused-parameter], 
[HAVE_WNO_UNUSED_PARAMETER])
 OVS_ENABLE_WERROR_TOP
 OVS_ENABLE_SPARSE
 OVS_CTAGS_IDENTIFIERS
+OVS_CHECK_ACTIONS_AUTOVALIDATOR
 OVS_CHECK_DPCLS_AUTOVALIDATOR
 OVS_CHECK_DPIF_AVX512_DEFAULT
 OVS_CHECK_MFEX_AUTOVALIDATOR
diff --git a/lib/odp-execute.c b/lib/odp-execute.c
index d5be190e0..f112f3b48 100644
--- a/lib/odp-execute.c
+++ b/lib/odp-execute.c
@@ -928,7 +928,11 @@ odp_execute_init(void)
 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
 if (ovsthread_once_start()) {
 odp_execute_action_init();
+#ifdef ACTIONS_AUTOVALIDATOR_DEFAULT
+odp_actions_impl_set("autovalidator");
+#else
 odp_actions_impl_set("scalar");
+#endif
 odp_execute_unixctl_init();
 ovsthread_once_done();
 }
-- 
2.32.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v10 04/10] odp-execute: Add command to switch action implementation.

2022-07-13 Thread Harry van Haaren

From: Emma Finn 

This commit adds a new command to allow the user to switch
the active action implementation at runtime.

Usage:
  $ ovs-appctl odp-execute/action-impl-set scalar

This commit also adds a new command to retrieve the list of available
action implementations. This can be used to check what implementations
of actions are available and what implementation is active during runtime.

Usage:
   $ ovs-appctl odp-execute/action-impl-show

Added separate test-case for ovs-actions show/set commands:
PMD - ovs-actions configuration

Signed-off-by: Emma Finn 
Signed-off-by: Kumar Amber 
Signed-off-by: Sunil Pai G 
Co-authored-by: Kumar Amber 
Co-authored-by: Sunil Pai G 
Acked-by: Harry van Haaren 

---

v9: rebase conflict on NEWS
---
 NEWS|  2 ++
 lib/automake.mk |  1 +
 lib/odp-execute-private.c   | 12 ++
 lib/odp-execute-private.h   |  2 ++
 lib/odp-execute-unixctl.man | 10 +
 lib/odp-execute.c   | 44 +
 tests/pmd.at| 39 
 vswitchd/ovs-vswitchd.8.in  |  1 +
 8 files changed, 111 insertions(+)
 create mode 100644 lib/odp-execute-unixctl.man

diff --git a/NEWS b/NEWS
index 1ef1175d0..d02733936 100644
--- a/NEWS
+++ b/NEWS
@@ -52,6 +52,8 @@ Post-v2.17.0
The old variant is kept for backward compatibility.
  * Add actions auto-validator function to compare different actions
implementations against default implementation.
+ * Add command line option to switch between different actions
+   implementations available at run time.
- Linux datapath:
  * Add offloading meter tc police.
  * Add support for offloading the check_pkt_len action.
diff --git a/lib/automake.mk b/lib/automake.mk
index 23ba4fab0..5c3b05f6b 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -584,6 +584,7 @@ MAN_FRAGMENTS += \
lib/netdev-dpdk-unixctl.man \
lib/dpif-netdev-unixctl.man \
lib/dpif-netlink-unixctl.man \
+   lib/odp-execute-unixctl.man \
lib/ofp-version.man \
lib/ovs.tmac \
lib/ovs-replay.man \
diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
index 780d6d289..38be22ec9 100644
--- a/lib/odp-execute-private.c
+++ b/lib/odp-execute-private.c
@@ -67,6 +67,18 @@ odp_execute_action_set(const char *name)
 return NULL;
 }
 
+/* Appends to 'string' a header line followed by one line per entry of
+ * action_impls[], giving the implementation's name, whether it is available
+ * and whether it is the currently active implementation. */
+void
+odp_execute_action_get_info(struct ds *string)
+{
+ds_put_cstr(string, "Available Actions implementations:\n");
+for (int i = 0; i < ACTION_IMPL_MAX; i++) {
+ds_put_format(string, "  %s (available: %s, active: %s)\n",
+  action_impls[i].name,
+  action_impls[i].available ? "Yes" : "No",
+  i == active_action_impl_index ? "Yes" : "No");
+}
+}
+
 void
 odp_execute_action_init(void)
 {
diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h
index 074a8d67e..d6eebbf37 100644
--- a/lib/odp-execute-private.h
+++ b/lib/odp-execute-private.h
@@ -84,4 +84,6 @@ struct odp_execute_action_impl * odp_execute_action_set(const 
char *name);
 
 int action_autoval_init(struct odp_execute_action_impl *self);
 
+void odp_execute_action_get_info(struct ds *name);
+
 #endif /* ODP_EXTRACT_PRIVATE */
diff --git a/lib/odp-execute-unixctl.man b/lib/odp-execute-unixctl.man
new file mode 100644
index 0..82d51e1d3
--- /dev/null
+++ b/lib/odp-execute-unixctl.man
@@ -0,0 +1,10 @@
+.SS "ODP-EXECUTE COMMANDS"
+These commands manage the "odp-execute" component.
+
+.IP "\fBodp-execute/action-impl-show\fR
+Lists the actions implementations that are available and highlights the
+currently enabled one.
+.
+.IP "\fBodp-execute/action-impl-set\fR \fIaction_impl\fR"
+Sets the action implementation to any available implementation. By default
+"scalar" is used.
diff --git a/lib/odp-execute.c b/lib/odp-execute.c
index 368876f27..d5be190e0 100644
--- a/lib/odp-execute.c
+++ b/lib/odp-execute.c
@@ -39,6 +39,7 @@
 #include "csum.h"
 #include "conntrack.h"
 #include "openvswitch/vlog.h"
+#include "unixctl.h"
 
 VLOG_DEFINE_THIS_MODULE(odp_execute);
 COVERAGE_DEFINE(datapath_drop_sample_error);
@@ -879,6 +880,48 @@ odp_actions_impl_set(const char *name)
 
 }
 
+static void
+action_impl_set(struct unixctl_conn *conn, int argc OVS_UNUSED,
+const char *argv[] OVS_UNUSED, void *aux OVS_UNUSED)
+{
+struct ds reply = DS_EMPTY_INITIALIZER;
+
+int err = odp_actions_impl_set(argv[1]);
+if (err) {
+ds_put_format(,
+  "Error: unknown action implementation, %s, specified!\n",
+  argv[1]);
+unixctl_command_reply_error(conn, ds_cstr());
+} else {
+ds_put_format(, "Action implementation set to %s.\n", argv[1]);
+unixctl_command_reply(conn, ds

[ovs-dev] [PATCH v10 03/10] odp-execute: Add auto validation function for actions.

2022-07-13 Thread Harry van Haaren

From: Emma Finn 

This commit introduced the auto-validation function which
allows users to compare the batch of packets obtained from
different action implementations against the linear
action implementation.

The autovalidator function can be triggered at runtime using the
following command:

$ ovs-appctl odp-execute/action-impl-set autovalidator

Signed-off-by: Emma Finn 
Acked-by: Harry van Haaren 
---
 NEWS  |  2 +
 lib/dp-packet.c   | 24 ++
 lib/dp-packet.h   |  4 ++
 lib/odp-execute-private.c | 99 +++
 lib/odp-execute-private.h |  6 +++
 5 files changed, 135 insertions(+)

diff --git a/NEWS b/NEWS
index 6001aeb1d..1ef1175d0 100644
--- a/NEWS
+++ b/NEWS
@@ -50,6 +50,8 @@ Post-v2.17.0
  * 'dpif-netdev/subtable-lookup-prio-get' appctl command renamed to
'dpif-netdev/subtable-lookup-info-get' to better reflect its purpose.
The old variant is kept for backward compatibility.
+ * Add actions auto-validator function to compare different actions
+   implementations against default implementation.
- Linux datapath:
  * Add offloading meter tc police.
  * Add support for offloading the check_pkt_len action.
diff --git a/lib/dp-packet.c b/lib/dp-packet.c
index 35c72542a..4538d2a61 100644
--- a/lib/dp-packet.c
+++ b/lib/dp-packet.c
@@ -506,3 +506,27 @@ dp_packet_resize_l2(struct dp_packet *b, int increment)
 dp_packet_adjust_layer_offset(>l2_5_ofs, increment);
 return dp_packet_data(b);
 }
+
+bool
+dp_packet_compare_offsets(struct dp_packet *b1, struct dp_packet *b2,
+  struct ds *err_str)
+{
+if ((b1->l2_pad_size != b2->l2_pad_size) ||
+(b1->l2_5_ofs != b2->l2_5_ofs) ||
+(b1->l3_ofs != b2->l3_ofs) ||
+(b1->l4_ofs != b2->l4_ofs)) {
+if (err_str) {
+ds_put_format(err_str, "Packet offset comparison failed\n");
+ds_put_format(err_str, "Buffer 1 offsets: l2_pad_size %u,"
+  " l2_5_ofs : %u l3_ofs %u, l4_ofs %u\n",
+  b1->l2_pad_size, b1->l2_5_ofs,
+  b1->l3_ofs, b1->l4_ofs);
+ds_put_format(err_str, "Buffer 2 offsets: l2_pad_size %u,"
+  " l2_5_ofs : %u l3_ofs %u, l4_ofs %u\n",
+  b2->l2_pad_size, b2->l2_5_ofs,
+  b2->l3_ofs, b2->l4_ofs);
+}
+return false;
+}
+return true;
+}
diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index eea5a9215..55eeaab2c 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -236,6 +236,10 @@ void *dp_packet_steal_data(struct dp_packet *);
 static inline bool dp_packet_equal(const struct dp_packet *,
const struct dp_packet *);
 
+bool dp_packet_compare_offsets(struct dp_packet *good,
+   struct dp_packet *test,
+   struct ds *err_str);
+
 
 /* Frees memory that 'b' points to, as well as 'b' itself. */
 static inline void
diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
index c1d153c6e..780d6d289 100644
--- a/lib/odp-execute-private.c
+++ b/lib/odp-execute-private.c
@@ -30,6 +30,12 @@ VLOG_DEFINE_THIS_MODULE(odp_execute_impl);
 static int active_action_impl_index;
 
 static struct odp_execute_action_impl action_impls[] = {
+[ACTION_IMPL_AUTOVALIDATOR] = {
+.available = false,
+.name = "autovalidator",
+.init_func = action_autoval_init,
+},
+
 [ACTION_IMPL_SCALAR] = {
 .available = false,
 .name = "scalar",
@@ -104,3 +110,96 @@ odp_execute_action_init(void)
 }
 }
 }
+
+/* Init sequence required to be scalar first to pick up the default scalar
+* implementations, allowing over-riding of the optimized functions later.
+*/
+BUILD_ASSERT_DECL(ACTION_IMPL_SCALAR == 0);
+BUILD_ASSERT_DECL(ACTION_IMPL_AUTOVALIDATOR == 1);
+
+/* Loop over packets, and validate each one for the given action. */
+static void
+action_autoval_generic(struct dp_packet_batch *batch, const struct nlattr *a)
+{
+struct odp_execute_action_impl *scalar = _impls[ACTION_IMPL_SCALAR];
+enum ovs_action_attr attr_type = nl_attr_type(a);
+struct dp_packet_batch original_batch;
+bool failed = false;
+
+dp_packet_batch_clone(_batch, batch);
+
+scalar->funcs[attr_type](batch, a);
+
+for (int impl = ACTION_IMPL_BEGIN; impl < ACTION_IMPL_MAX; impl++) {
+/* Clone original batch and execute implementation under test. */
+struct dp_packet_batch test_batch;
+
+dp_packet_batch_clone(_batch, _batch);
+action_impls[impl].funcs[attr_type](_batch, a);
+
+/* Loop over implementations, checking each one. */
+for (int pidx = 0; pidx < original_batch.count; pidx++) {
+struct dp_

[ovs-dev] [PATCH v10 02/10] odp-execute: Add function pointer for pop_vlan action.

2022-07-13 Thread Harry van Haaren

From: Emma Finn 

This commit removes the pop_vlan action from the large switch
and creates a separate function for batched processing. A function
pointer is also added to call the new batched function for the pop_vlan
action.

Signed-off-by: Emma Finn 
Acked-by: Harry van Haaren 
---
 lib/odp-execute-private.c | 16 +++-
 lib/odp-execute-private.h |  5 +
 lib/odp-execute.c | 32 ++--
 3 files changed, 46 insertions(+), 7 deletions(-)

diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
index 2c30ed05b..c1d153c6e 100644
--- a/lib/odp-execute-private.c
+++ b/lib/odp-execute-private.c
@@ -33,7 +33,7 @@ static struct odp_execute_action_impl action_impls[] = {
 [ACTION_IMPL_SCALAR] = {
 .available = false,
 .name = "scalar",
-.init_func = NULL,
+.init_func = odp_action_scalar_init,
 },
 };
 
@@ -88,5 +88,19 @@ odp_execute_action_init(void)
 
 VLOG_INFO("Action implementation %s (available: %s)",
   action_impls[i].name, avail ? "Yes" : "No");
+
+/* The following is a run-time check to make sure a scalar
+ * implementation exists for the given ISA implementation. This is to
+ * make sure the autovalidator works as expected. */
+if (avail && i != ACTION_IMPL_SCALAR) {
+for (int j = 0; j < __OVS_ACTION_ATTR_MAX; j++) {
+/* No ovs_assert(), as it can be compiled out. */
+if (action_impls[ACTION_IMPL_SCALAR].funcs[j] == NULL
+&& action_impls[i].funcs[j] != NULL) {
+ovs_assert_failure(OVS_SOURCE_LOCATOR, __func__,
+   "Missing scalar action function!");
+}
+}
+}
 }
 }
diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h
index 24126cdca..ae06fbc09 100644
--- a/lib/odp-execute-private.h
+++ b/lib/odp-execute-private.h
@@ -71,6 +71,11 @@ BUILD_ASSERT_DECL(ACTION_IMPL_SCALAR == 0);
  */
 void odp_execute_action_init(void);
 
+/* Init functions for the action implementations. Initializes the function
+ * pointers for optimized action types.
+ */
+int odp_action_scalar_init(struct odp_execute_action_impl *self);
+
 struct odp_execute_action_impl * odp_execute_action_set(const char *name);
 
 #endif /* ODP_EXTRACT_PRIVATE */
diff --git a/lib/odp-execute.c b/lib/odp-execute.c
index 7f998add6..368876f27 100644
--- a/lib/odp-execute.c
+++ b/lib/odp-execute.c
@@ -834,6 +834,30 @@ requires_datapath_assistance(const struct nlattr *a)
 return false;
 }
 
+static void
+action_pop_vlan(struct dp_packet_batch *batch,
+const struct nlattr *a OVS_UNUSED)
+{
+struct dp_packet *packet;
+
+DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+eth_pop_vlan(packet);
+}
+}
+
+/* Implementation of the scalar actions impl init function. Build up the
+ * array of func ptrs here.
+ */
+int
+odp_action_scalar_init(struct odp_execute_action_impl *self)
+{
+/* Set function pointers for actions that can be applied directly, these
+ * are identified by OVS_ACTION_ATTR_*. */
+self->funcs[OVS_ACTION_ATTR_POP_VLAN] = action_pop_vlan;
+
+return 0;
+}
+
 /* The active function pointers on the datapath. ISA optimized implementations
  * are enabled by plugging them into this static arary, which is consulted when
  * applying actions on the datapath.
@@ -982,12 +1006,6 @@ odp_execute_actions(void *dp, struct dp_packet_batch 
*batch, bool steal,
 break;
 }
 
-case OVS_ACTION_ATTR_POP_VLAN:
-DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
-eth_pop_vlan(packet);
-}
-break;
-
 case OVS_ACTION_ATTR_PUSH_MPLS: {
 const struct ovs_action_push_mpls *mpls = nl_attr_get(a);
 
@@ -1138,6 +1156,8 @@ odp_execute_actions(void *dp, struct dp_packet_batch 
*batch, bool steal,
 case OVS_ACTION_ATTR_CT:
 case OVS_ACTION_ATTR_UNSPEC:
 case __OVS_ACTION_ATTR_MAX:
+/* The following actions are handled by the scalar implementation. */
+case OVS_ACTION_ATTR_POP_VLAN:
 OVS_NOT_REACHED();
 }
 
-- 
2.32.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v10 00/10] Actions Infrastructure + Optimizations

2022-07-13 Thread Harry van Haaren



This patchset introduces actions infrastructure changes which allow
the user to choose between different action implementations based on
CPU ISA by using different commands.  The infrastructure also
provides a way to check the correctness of the ISA optimized action
version against the scalar version.

This series also introduces optimized versions of the following
actions:
 - push_vlan
 - pop_vlan
 - set_masked eth
 - set_masked ipv4

Below is a table indicating some relative performance benefits for
these actions.
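+-----------------------------------------------+--------------------+-----------------+
| Actions                                       | Scalar with series | AVX with series |
+-----------------------------------------------+--------------------+-----------------+
| mod_dl_dst                                    | 1.01x              | 1.13x           |
+-----------------------------------------------+--------------------+-----------------+
| push_vlan                                     | 1.01x              | 1.10x           |
+-----------------------------------------------+--------------------+-----------------+
| strip_vlan                                    | 1.01x              | 1.11x           |
+-----------------------------------------------+--------------------+-----------------+
| mod_ipv4 1 x field                            | 1.01x              | 1.02x           |
+-----------------------------------------------+--------------------+-----------------+
| mod_ipv4 4 x fields                           | 1.01x              | 1.21x           |
+-----------------------------------------------+--------------------+-----------------+
| strip_vlan + mod_dl_dst + mod_ipv4 4 x fields | 1.01x              | 1.36x           |
+-----------------------------------------------+--------------------+-----------------+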

---
V10;
- Fixed CI build issue on OSX around AVX512 linking (jenkins CI)
- Moved docs and reworded sections (thanks Ilya for feedback)
- Reworked one instance of <= OVS_ATTR_MAX back to original form(Eelco)
---
v9:
- Moved avx512 probe and init functions to later patch.
- Dependency on userspace datapath has been resolved.
- Fixed up comments from Sunil as posted on v8
- Note: Harry is sending this patchset, but it is Emma's rework,
  except for rebasing to latest git, and very minor fixups.

---
v8
- First patch changing unit tests has been removed from series.
- AVX checksum implementation has been reworked.
---
v7:
- Fix review comments from Eelco.
---
v6:
- Rebase to master
- Add ISA implementation of set_masked eth and ipv4 actions
- Fix incorrect checksums in input packets for ofproto-dpif unit
tests
---
v5:
- Rebase to master
- Minor change to variable names
- Added Tags from Harry.
---
v4:
- Rebase to master
- Add ISA implementation of push_vlan action
---
v3:
- Refactored to fix unit test failures
- Removed some sign-off on commits
---
v2:
- Fix the CI build issues
---

Emma Finn (8):
  odp-execute: Add function pointers to odp-execute for different action
implementations.
  odp-execute: Add function pointer for pop_vlan action.
  odp-execute: Add auto validation function for actions.
  odp-execute: Add command to switch action implementation.
  odp-execute: Add ISA implementation of actions.
  odp-execute: Add ISA implementation of push_vlan action.
  odp-execute: Add ISA implementation of set_masked ETH
  odp-execute: Add ISA implementation of set_masked IPv4 action

Harry van Haaren (1):
  odp-execute: Add ISA implementation of pop_vlan action.

Kumar Amber (1):
  acinclude: Add configure option to enable actions autovalidator at
build time.

 Documentation/topics/dpdk/bridge.rst |  30 ++
 Documentation/topics/testing.rst |  24 +-
 NEWS |   7 +
 acinclude.m4 |  21 ++
 configure.ac |   1 +
 lib/automake.mk  |   7 +
 lib/cpu.c|   1 +
 lib/cpu.h|   1 +
 lib/dp-packet.c  |  24 ++
 lib/dp-packet.h  |   4 +
 lib/odp-execute-avx512.c | 535 +++
 lib/odp-execute-private.c| 270 ++
 lib/odp-execute-private.h| 106 ++
 lib/odp-execute-unixctl.man  |  10 +
 lib/odp-execute.c| 202 --
 lib/odp-execute.h|  10 +
 m4/openvswitch.m4|  29 ++
 tests/ofproto-macros.at  |   1 +
 tests/pmd.at |  39 ++
 vswitchd/bridge.c|   3 +
 vswitchd/ovs-vswitchd.8.in   |   1 +
 21 files changed, 1280 insertions(+), 46 deletions(-)
 create mode 100644 lib/odp-execute-avx512.c
 create mode 100644 lib/odp-execute-private.c
 create mode 100644 lib/odp-execute-private.h
 create mode 100644 lib/odp-execute-unixctl.man

-- 
2.32.0

___
dev mailing lis

[ovs-dev] [PATCH v10 01/10] odp-execute: Add function pointers to odp-execute for different action implementations.

2022-07-13 Thread Harry van Haaren

From: Emma Finn 

This commit introduces the initial infrastructure required to allow
different implementations for OvS actions. The patch introduces action
function pointers which allow the user to switch between the different action
implementations available. This will allow for more performance and flexibility
so the user can choose the action implementation that best suits their use case.

Signed-off-by: Emma Finn 
Acked-by: Harry van Haaren 

---

v10:
- switch from < __OVS_ATTR_MAX back to <= OVS_ATTR_MAX (Eelco)

v9:
- rebase conflicts on NEWS
- As Actions is initialized in vswitchd/bridge.c now, logs are output on
  startup, which have been added to the tests/ofproto-macros.
---
 lib/automake.mk   |  2 +
 lib/odp-execute-private.c | 92 +++
 lib/odp-execute-private.h | 76 
 lib/odp-execute.c | 51 +-
 lib/odp-execute.h |  7 +++
 tests/ofproto-macros.at   |  1 +
 vswitchd/bridge.c |  3 ++
 7 files changed, 231 insertions(+), 1 deletion(-)
 create mode 100644 lib/odp-execute-private.c
 create mode 100644 lib/odp-execute-private.h

diff --git a/lib/automake.mk b/lib/automake.mk
index 1d00cfa20..23ba4fab0 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -216,6 +216,8 @@ lib_libopenvswitch_la_SOURCES = \
lib/object-collection.h \
lib/odp-execute.c \
lib/odp-execute.h \
+   lib/odp-execute-private.c \
+   lib/odp-execute-private.h \
lib/odp-util.c \
lib/odp-util.h \
lib/ofp-actions.c \
diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
new file mode 100644
index 0..2c30ed05b
--- /dev/null
+++ b/lib/odp-execute-private.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2022 Intel.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+#include "dpdk.h"
+#include "dp-packet.h"
+#include "odp-execute-private.h"
+#include "odp-netlink.h"
+#include "odp-util.h"
+#include "openvswitch/vlog.h"
+
+VLOG_DEFINE_THIS_MODULE(odp_execute_impl);
+static int active_action_impl_index;
+
+static struct odp_execute_action_impl action_impls[] = {
+[ACTION_IMPL_SCALAR] = {
+.available = false,
+.name = "scalar",
+.init_func = NULL,
+},
+};
+
+static void
+action_impl_copy_funcs(struct odp_execute_action_impl *dest,
+   const struct odp_execute_action_impl *src)
+{
+for (int i = 0; i < __OVS_ACTION_ATTR_MAX; i++) {
+atomic_store_relaxed(>funcs[i], src->funcs[i]);
+}
+}
+
+struct odp_execute_action_impl *
+odp_execute_action_set(const char *name)
+{
+for (int i = 0; i < ACTION_IMPL_MAX; i++) {
+/* String compare, and set ptrs atomically. */
+if (!strcmp(action_impls[i].name, name)) {
+active_action_impl_index = i;
+
+VLOG_INFO("Action implementation set to %s", name);
+return _impls[i];
+}
+}
+return NULL;
+}
+
+void
+odp_execute_action_init(void)
+{
+/* Each impl's function array is initialized to reflect the scalar
+ * implementation. This simplifies adding optimized implementations,
+ * as the autovalidator can always compare all actions.
+ *
+ * Below will check if impl is available and copies the scalar functions
+ * to all other implementations.
+ */
+for (int i = 0; i < ACTION_IMPL_MAX; i++) {
+bool avail = true;
+
+if (i != ACTION_IMPL_SCALAR) {
+action_impl_copy_funcs(_impls[i],
+   _impls[ACTION_IMPL_SCALAR]);
+}
+
+if (action_impls[i].init_func) {
+/* Return zero is success, non-zero means error. */
+avail = (action_impls[i].init_func(_impls[i]) == 0);
+}
+
+action_impls[i].available = avail;
+
+VLOG_INFO("Action implementation %s (available: %s)",
+  action_impls[i].name, avail ? "Yes" : "No");
+}
+}
diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h
new file mode 100644
index 0..24126cdca
--- /dev/null
+++ b/lib/odp-execute-private.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2022 Intel.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use th

[ovs-dev] [PATCH v9 10/10] odp-execute: Add ISA implementation of set_masked IPv4 action

2022-07-12 Thread Harry van Haaren

From: Emma Finn 

This commit adds support for the AVX512 implementation of the
ipv4_set_addrs action as well as an AVX512 implementation of
updating the checksums.

Signed-off-by: Emma Finn 
---
 lib/odp-execute-avx512.c | 208 +++
 1 file changed, 208 insertions(+)

diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
index 8ecdaecf6..a0c97f312 100644
--- a/lib/odp-execute-avx512.c
+++ b/lib/odp-execute-avx512.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 
+#include "csum.h"
 #include "dp-packet.h"
 #include "immintrin.h"
 #include "odp-execute.h"
@@ -58,6 +59,22 @@ BUILD_ASSERT_DECL(offsetof(struct ovs_key_ethernet, eth_src) 
+
   MEMBER_SIZEOF(struct ovs_key_ethernet, eth_src) ==
   offsetof(struct ovs_key_ethernet, eth_dst));
 
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_src) +
+  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_src) ==
+  offsetof(struct ovs_key_ipv4, ipv4_dst));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_dst) +
+  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_dst) ==
+  offsetof(struct ovs_key_ipv4, ipv4_proto));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_proto) +
+  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_proto) ==
+  offsetof(struct ovs_key_ipv4, ipv4_tos));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_tos) +
+  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_tos) ==
+  offsetof(struct ovs_key_ipv4, ipv4_ttl));
+
 /* Array of callback functions, one for each masked operation. */
 odp_execute_action_cb impl_set_masked_funcs[__OVS_KEY_ATTR_MAX];
 
@@ -279,6 +296,196 @@ action_avx512_eth_set_addrs(struct dp_packet_batch *batch,
 }
 }
 
+static inline uint16_t ALWAYS_INLINE
+avx512_get_delta(__m256i old_header, __m256i res)
+{
+__m256i v_zeros = _mm256_setzero_si256();
+uint16_t delta;
+
+/* These two shuffle masks, v_swap16a and v_swap16b, are to shuffle the
+ * old and new header to add padding after each 16-bit value for the
+ * following carry over addition. */
+__m256i v_swap16a = _mm256_setr_epi16(0x0100, 0x, 0x0302, 0x,
+  0x0504, 0x, 0x0706, 0x,
+  0x0100, 0x, 0x0302, 0x,
+  0x, 0x, 0x, 0x);
+__m256i v_swap16b = _mm256_setr_epi16(0x0908, 0x, 0x0B0A, 0x,
+  0x0D0C, 0x, 0x0F0E, 0x,
+  0x, 0x, 0x, 0x,
+  0x, 0x, 0x, 0x);
+__m256i v_shuf_old1 = _mm256_shuffle_epi8(old_header, v_swap16a);
+__m256i v_shuf_old2 = _mm256_shuffle_epi8(old_header, v_swap16b);
+__m256i v_shuf_new1 = _mm256_shuffle_epi8(res, v_swap16a);
+__m256i v_shuf_new2 = _mm256_shuffle_epi8(res, v_swap16b);
+
+/* Add each part of the old and new headers together. */
+__m256i v_delta1 = _mm256_add_epi32(v_shuf_old1, v_shuf_new1);
+__m256i v_delta2 = _mm256_add_epi32(v_shuf_old2, v_shuf_new2);
+
+/* Add old and new header. */
+__m256i v_delta = _mm256_add_epi32(v_delta1, v_delta2);
+
+/* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
+v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
+v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
+
+/* Shuffle 32-bit value from 3rd lane into first lane for final
+ * horizontal add. */
+__m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
+  0xF, 0xF, 0xF, 0xF);
+v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta);
+
+v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
+v_delta = _mm256_hadd_epi16(v_delta, v_zeros);
+
+/* Extract delta value. */
+delta = _mm256_extract_epi16(v_delta, 0);
+
+return delta;
+}
+
+static inline uint16_t ALWAYS_INLINE
+avx512_l4_update_csum(__m256i old_header, __m256i res)
+{
+__m256i v_zeros = _mm256_setzero_si256();
+uint16_t delta;
+
+/* Set the v_ones register to all one's. */
+__m256i v_ones = _mm256_cmpeq_epi16(v_zeros, v_zeros);
+
+/* Combine the old and new header, i.e. adding in the new IP addresses
+ * in the old header (oh). This is done by using the 0x03C 16-bit mask,
+ * picking 16-bit word 7 till 10.  */
+__m256i v_blend_new = _mm256_mask_blend_epi16(0x03C0, old_header, res);
+
+/* Invert the old_header register. */
+old_header =_mm256_andnot_si256(old_header, v_ones);
+
+/* Calculate the delta between the old and new header. */
+delta = avx512_get_delta(old_header, v_blend_new);
+
+return delta;
+
+}
+
+static inline uint16_t ALWAYS_INLINE
+avx512_ipv4_update_csum(__m256i res, __m256i old_header)
+{
+__m256i v_zeros = _mm256_setzero_si256();
+uint16_t delta;
+
+/*

[ovs-dev] [PATCH v9 09/10] odp-execute: Add ISA implementation of set_masked ETH

2022-07-12 Thread Harry van Haaren

From: Emma Finn 

This commit includes infrastructure changes for enabling set_masked_X
actions and also adds support for the AVX512 implementation of the
eth_set_addrs action.

Signed-off-by: Emma Finn 
---
 lib/odp-execute-avx512.c  | 90 +++
 lib/odp-execute-private.c | 14 ++
 lib/odp-execute-private.h |  3 ++
 lib/odp-execute.c | 49 +++--
 lib/odp-execute.h |  3 ++
 5 files changed, 137 insertions(+), 22 deletions(-)

diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
index 3449acff7..8ecdaecf6 100644
--- a/lib/odp-execute-avx512.c
+++ b/lib/odp-execute-avx512.c
@@ -23,6 +23,7 @@
 
 #include "dp-packet.h"
 #include "immintrin.h"
+#include "odp-execute.h"
 #include "odp-execute-private.h"
 #include "odp-netlink.h"
 #include "openvswitch/vlog.h"
@@ -50,6 +51,16 @@ BUILD_ASSERT_DECL(offsetof(struct dp_packet, l3_ofs) +
 BUILD_ASSERT_DECL(sizeof(struct dp_packet) -
   offsetof(struct dp_packet, l2_pad_size) >= sizeof(__m128i));
 
+/* The below build assert makes sure the order of the fields needed by
+ * the set masked functions shuffle operations do not change. This should not
+ * happen as these are defined under the Linux uapi. */
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ethernet, eth_src) +
+  MEMBER_SIZEOF(struct ovs_key_ethernet, eth_src) ==
+  offsetof(struct ovs_key_ethernet, eth_dst));
+
+/* Array of callback functions, one for each masked operation. */
+odp_execute_action_cb impl_set_masked_funcs[__OVS_KEY_ATTR_MAX];
+
 static inline void ALWAYS_INLINE
 avx512_dp_packet_resize_l2(struct dp_packet *b, int resize_by_bytes)
 {
@@ -207,6 +218,80 @@ action_avx512_push_vlan(struct dp_packet_batch *batch, 
const struct nlattr *a)
 }
 }
 
+/* This function performs the same operation on each packet in the batch as
+ * the scalar odp_eth_set_addrs() function. */
+static void
+action_avx512_eth_set_addrs(struct dp_packet_batch *batch,
+const struct nlattr *a)
+{
+const struct ovs_key_ethernet *key, *mask;
+struct dp_packet *packet;
+
+a = nl_attr_get(a);
+key = nl_attr_get(a);
+mask = odp_get_key_mask(a, struct ovs_key_ethernet);
+
+/* Read the content of the key(src) and mask in the respective registers.
+ * We only load the src and dest addresses, which is only 96-bits and not
+ * 128-bits. */
+__m128i v_src = _mm_maskz_loadu_epi32(0x7,(void *) key);
+__m128i v_mask = _mm_maskz_loadu_epi32(0x7, (void *) mask);
+
+
+/* These shuffle masks are used below, and each position tells where to
+ * move the bytes to. So here, the byte at offset 6 in
+ * ovs_key_ethernet is moved to byte location 0 in v_src/v_mask.
+ * The byte at offset 7 is moved to 1, etc., etc.
+ * This swap is needed to move the src and dest MAC addresses in the
+ * same order as in the ethernet packet. */
+static const uint8_t eth_shuffle[16] = {
+6, 7, 8, 9, 10, 11, 0, 1,
+2, 3, 4, 5, 0xFF, 0xFF, 0xFF, 0xFF
+};
+
+/* Load the shuffle mask in v_shuf. */
+__m128i v_shuf = _mm_loadu_si128((void *) eth_shuffle);
+
+/* Swap the key/mask src and dest addresses to the ethernet order. */
+v_src = _mm_shuffle_epi8(v_src, v_shuf);
+v_mask = _mm_shuffle_epi8(v_mask, v_shuf);
+
+DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+
+struct eth_header *eh = dp_packet_eth(packet);
+
+if (!eh) {
+continue;
+}
+
+/* Load the first 128-bits of the packet into the v_ether register. */
+__m128i v_dst = _mm_loadu_si128((void *) eh);
+
+/* AND the v_mask to the packet data (v_dst). */
+__m128i dst_masked = _mm_andnot_si128(v_mask, v_dst);
+
+/* OR the new addresses (v_src) with the masked packet addresses
+ * (dst_masked). */
+__m128i res = _mm_or_si128(v_src, dst_masked);
+
+/* Write back the modified ethernet addresses. */
+_mm_storeu_si128((void *) eh, res);
+}
+}
+
+static void
+action_avx512_set_masked(struct dp_packet_batch *batch, const struct nlattr *a)
+{
+const struct nlattr *mask = nl_attr_get(a);
+enum ovs_key_attr attr_type = nl_attr_type(mask);
+
+if (attr_type <= OVS_KEY_ATTR_MAX && impl_set_masked_funcs[attr_type]) {
+impl_set_masked_funcs[attr_type](batch, a);
+} else {
+odp_execute_scalar_action(batch, a);
+}
+}
+
 int
 action_avx512_init(struct odp_execute_action_impl *self OVS_UNUSED)
 {
@@ -214,6 +299,11 @@ action_avx512_init(struct odp_execute_action_impl *self 
OVS_UNUSED)
  * are identified by OVS_ACTION_ATTR_*. */
 self->funcs[OVS_ACTION_ATTR_POP_VLAN] = action_avx512_pop_vlan;
 self->funcs[OVS_ACTION_ATTR_PUSH_VLAN] = action_avx512_push_vlan;
+self->funcs[OVS_ACTION_ATTR_SET_MASKED] = action_avx512_set_masked;
+
+/* Set function pointers for the individual operations supported by the
+ * SET_MASKED action.

[ovs-dev] [PATCH v9 07/10] odp-execute: Add ISA implementation of pop_vlan action.

2022-07-12 Thread Harry van Haaren

From: Emma Finn 

This commit adds the AVX512 implementation of the
pop_vlan action.

Signed-off-by: Emma Finn 
---
 lib/automake.mk   |   3 +-
 lib/odp-execute-avx512.c  | 182 ++
 lib/odp-execute-private.c |  33 ++-
 lib/odp-execute-private.h |   2 +
 4 files changed, 218 insertions(+), 2 deletions(-)
 create mode 100644 lib/odp-execute-avx512.c

diff --git a/lib/automake.mk b/lib/automake.mk
index 5c3b05f6b..4ce5cc1ff 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -44,7 +44,8 @@ lib_libopenvswitchavx512_la_CFLAGS += \
-mavx512vl
 lib_libopenvswitchavx512_la_SOURCES += \
lib/dpif-netdev-extract-avx512.c \
-   lib/dpif-netdev-lookup-avx512-gather.c
+   lib/dpif-netdev-lookup-avx512-gather.c \
+   lib/odp-execute-avx512.c
 endif # HAVE_AVX512VL
 endif # HAVE_AVX512BW
 lib_libopenvswitchavx512_la_LDFLAGS = \
diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
new file mode 100644
index 0..fd10f7f5c
--- /dev/null
+++ b/lib/odp-execute-avx512.c
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2022 Intel.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __x86_64__
+/* Sparse cannot handle the AVX512 instructions. */
+#if !defined(__CHECKER__)
+
+#include 
+#include 
+
+#include "dp-packet.h"
+#include "immintrin.h"
+#include "odp-execute-private.h"
+#include "odp-netlink.h"
+#include "openvswitch/vlog.h"
+
+VLOG_DEFINE_THIS_MODULE(odp_execute_avx512);
+
+/* The below three build asserts make sure that l2_5_ofs, l3_ofs, and l4_ofs
+ * fields remain in the same order and offset to l2_padd_size. This is needed
+ * as the avx512_dp_packet_resize_l2() function will manipulate those fields at
+ * a fixed memory index based on the l2_padd_size offset. */
+BUILD_ASSERT_DECL(offsetof(struct dp_packet, l2_pad_size) +
+  MEMBER_SIZEOF(struct dp_packet, l2_pad_size) ==
+  offsetof(struct dp_packet, l2_5_ofs));
+
+BUILD_ASSERT_DECL(offsetof(struct dp_packet, l2_5_ofs) +
+  MEMBER_SIZEOF(struct dp_packet, l2_5_ofs) ==
+  offsetof(struct dp_packet, l3_ofs));
+
+BUILD_ASSERT_DECL(offsetof(struct dp_packet, l3_ofs) +
+   MEMBER_SIZEOF(struct dp_packet, l3_ofs) ==
+   offsetof(struct dp_packet, l4_ofs));
+
+/* The below build assert makes sure it's safe to read/write 128-bits starting
+ * at the l2_pad_size location. */
+BUILD_ASSERT_DECL(sizeof(struct dp_packet) -
+  offsetof(struct dp_packet, l2_pad_size) >= sizeof(__m128i));
+
+static inline void ALWAYS_INLINE
+avx512_dp_packet_resize_l2(struct dp_packet *b, int resize_by_bytes)
+{
+/* Update packet size/data pointers, same as the scalar implementation. */
+if (resize_by_bytes >= 0) {
+dp_packet_push_uninit(b, resize_by_bytes);
+} else {
+dp_packet_pull(b, -resize_by_bytes);
+}
+
+/* The next step is to update the l2_5_ofs, l3_ofs and l4_ofs fields which
+ * the scalar implementation does with the  dp_packet_adjust_layer_offset()
+ * function. */
+
+/* Set the v_zero register to all zero's. */
+const __m128i v_zeros = _mm_setzero_si128();
+
+/* Set the v_u16_max register to all one's. */
+const __m128i v_u16_max = _mm_cmpeq_epi16(v_zeros, v_zeros);
+
+/* Each lane represents 16 bits in a 128-bit register. In this case the
+ * first three 16-bit values, which will map to the l2_5_ofs, l3_ofs and
+ * l4_ofs fields. */
+const uint8_t k_lanes = 0b1110;
+
+/* Set all 16-bit words in the 128-bits v_offset register to the value we
+ * need to add/substract from the l2_5_ofs, l3_ofs, and l4_ofs fields. */
+__m128i v_offset = _mm_set1_epi16(abs(resize_by_bytes));
+
+/* Load 128 bits from the dp_packet structure starting at the l2_pad_size
+ * offset. */
+void *adjust_ptr = >l2_pad_size;
+__m128i v_adjust_src = _mm_loadu_si128(adjust_ptr);
+
+/* Here is the tricky part, we only need to update the value of the three
+ * fields if they are not UINT16_MAX. The following function will return
+ * a mask of lanes (read fields) that are not UINT16_MAX. It will do this
+ * by comparing only the lanes we requested, k_lanes, and if they match
+ * v_u16_max, the bit will be set. */
+__mmask8 k_cmp = _mm_mask_cmpneq_epu16_mask(k_lanes, v_adjust_src,
+v_u16_max);
+
+/* Based

[ovs-dev] [PATCH v9 08/10] odp-execute: Add ISA implementation of push_vlan action.

2022-07-12 Thread Harry van Haaren

From: Emma Finn 

This commit adds the AVX512 implementation of the
push_vlan action.

Signed-off-by: Emma Finn 
---
 lib/odp-execute-avx512.c | 55 
 lib/odp-execute.c| 22 +---
 2 files changed, 68 insertions(+), 9 deletions(-)

diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
index fd10f7f5c..3449acff7 100644
--- a/lib/odp-execute-avx512.c
+++ b/lib/odp-execute-avx512.c
@@ -154,12 +154,67 @@ action_avx512_pop_vlan(struct dp_packet_batch *batch,
 }
 }
 
+/* This function performs the same operation on each packet in the batch as
+ * the scalar eth_push_vlan() function. */
+static void
+action_avx512_push_vlan(struct dp_packet_batch *batch, const struct nlattr *a)
+{
+struct dp_packet *packet;
+const struct ovs_action_push_vlan *vlan = nl_attr_get(a);
+ovs_be16 tpid, tci;
+
+/* This shuffle mask is used below, and each position tells where to
+ * move the bytes to. So here, the fourth byte in v_ether is moved to
+ * byte location 0 in v_shift. The fifth is moved to 1, etc., etc.
+ * The 0xFF is special it tells to fill that position with 0.
+ */
+static const uint8_t vlan_push_shuffle_mask[16] = {
+4, 5, 6, 7, 8, 9, 10, 11,
+12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF
+};
+
+/* Load the shuffle mask in v_index. */
+__m128i v_index = _mm_loadu_si128((void *) vlan_push_shuffle_mask);
+
+DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+tpid = vlan->vlan_tpid;
+tci = vlan->vlan_tci;
+
+/* As we are about to insert the VLAN_HEADER we now need to adjust all
+ * the offsets. */
+avx512_dp_packet_resize_l2(packet, VLAN_HEADER_LEN);
+
+char *pkt_data = (char *) dp_packet_data(packet);
+
+/* Build up the VLAN TCI/TPID in a single uint32_t. */
+const uint32_t tci_proc = tci & htons(~VLAN_CFI);
+const uint32_t tpid_tci = (tci_proc << 16) | tpid;
+
+/* Load the first 128-bits of the packet into the v_ether register.
+ * Note that this includes the 4 unused bytes (VLAN_HEADER_LEN). */
+__m128i v_ether = _mm_loadu_si128((void *) pkt_data);
+
+/* Move(shuffle) the veth_dst and veth_src data to create room for
+ * the vlan header. */
+__m128i v_shift = _mm_shuffle_epi8(v_ether, v_index);
+
+/* Copy(insert) the 32-bit VLAN header, tpid_tci, at the 3rd 32-bit
+ * word offset, i.e. offsetof(vlan_eth_header, veth_type) */
+__m128i v_vlan_hdr = _mm_insert_epi32(v_shift, tpid_tci, 3);
+
+/* Write back the modified ethernet header. */
+_mm_storeu_si128((void *) pkt_data, v_vlan_hdr);
+}
+}
+
 int
 action_avx512_init(struct odp_execute_action_impl *self OVS_UNUSED)
 {
 /* Set function pointers for actions that can be applied directly, these
  * are identified by OVS_ACTION_ATTR_*. */
 self->funcs[OVS_ACTION_ATTR_POP_VLAN] = action_avx512_pop_vlan;
+self->funcs[OVS_ACTION_ATTR_PUSH_VLAN] = action_avx512_push_vlan;
+
 return 0;
 }
 
diff --git a/lib/odp-execute.c b/lib/odp-execute.c
index f713acabe..3f562eb34 100644
--- a/lib/odp-execute.c
+++ b/lib/odp-execute.c
@@ -846,6 +846,17 @@ action_pop_vlan(struct dp_packet_batch *batch,
 }
 }
 
+/* Scalar PUSH_VLAN handler.  Reads the 'struct ovs_action_push_vlan' payload
+ * of attribute 'a' and calls eth_push_vlan() with its TPID and TCI on every
+ * packet in 'batch'. */
+static void
+action_push_vlan(struct dp_packet_batch *batch, const struct nlattr *a)
+{
+    const struct ovs_action_push_vlan *push = nl_attr_get(a);
+    struct dp_packet *pkt;
+
+    DP_PACKET_BATCH_FOR_EACH (idx, pkt, batch) {
+        eth_push_vlan(pkt, push->vlan_tpid, push->vlan_tci);
+    }
+}
+
 /* Implementation of the scalar actions impl init function. Build up the
  * array of func ptrs here.
  */
@@ -855,6 +866,7 @@ odp_action_scalar_init(struct odp_execute_action_impl *self)
 /* Set function pointers for actions that can be applied directly, these
  * are identified by OVS_ACTION_ATTR_*. */
 self->funcs[OVS_ACTION_ATTR_POP_VLAN] = action_pop_vlan;
+self->funcs[OVS_ACTION_ATTR_PUSH_VLAN] = action_push_vlan;
 
 return 0;
 }
@@ -1045,15 +1057,6 @@ odp_execute_actions(void *dp, struct dp_packet_batch 
*batch, bool steal,
 break;
 }
 
-case OVS_ACTION_ATTR_PUSH_VLAN: {
-const struct ovs_action_push_vlan *vlan = nl_attr_get(a);
-
-DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
-eth_push_vlan(packet, vlan->vlan_tpid, vlan->vlan_tci);
-}
-break;
-}
-
 case OVS_ACTION_ATTR_PUSH_MPLS: {
 const struct ovs_action_push_mpls *mpls = nl_attr_get(a);
 
@@ -1206,6 +1209,7 @@ odp_execute_actions(void *dp, struct dp_packet_batch 
*batch, bool steal,
 case __OVS_ACTION_ATTR_MAX:
 /* The following actions are handled by the scalar implementation. */
 case OVS_ACTION_ATTR_POP_VLAN:
+case OVS_ACTION_ATTR_PUSH_VLAN:
 OVS_NOT_REACHED();
 }
 
-- 
2.32.0

[ovs-dev] [PATCH v9 05/10] acinclude: Add configure option to enable actions autovalidator at build time.

2022-07-12 Thread Harry van Haaren

From: Kumar Amber 

This commit adds a new configure option that lets the user make the
actions autovalidator the default implementation at build time, so that
the unit tests run with it by default.

 $ ./configure --enable-actions-default-autovalidator

Signed-off-by: Kumar Amber 
Acked-by: Harry van Haaren 

---

v9:
- rebase conflict on NEWS
- fixup missing "dnl" in comment introduced by previous line-wrapping
---
 NEWS  |  2 ++
 acinclude.m4  | 20 
 configure.ac  |  1 +
 lib/odp-execute.c |  4 
 4 files changed, 27 insertions(+)

diff --git a/NEWS b/NEWS
index cf35f4ae4..2359b6bcf 100644
--- a/NEWS
+++ b/NEWS
@@ -53,6 +53,8 @@ Post-v2.17.0
implementations against default implementation.
  * Add command line option to switch between different actions
implementations available at run time.
+ * Add build time configure command to enable auto-validator as default
+   actions implementation at build time.
- Linux datapath:
  * Add offloading meter tc police.
 
diff --git a/acinclude.m4 b/acinclude.m4
index d15f11a4e..84fad425c 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -14,6 +14,26 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+dnl Set OVS Actions Autovalidator as the default action implementation
+dnl at compile time.  This enables automatically running all unit tests
+dnl with all actions implementations.
+dnl
+dnl AC_ARG_ENABLE runs its action-if-given branch for both the --enable
+dnl and the --disable form of the option, so the user's choice is taken
+dnl from $enableval rather than being forced to "yes".  Any value other
+dnl than "yes" leaves the scalar implementation as the default.
+AC_DEFUN([OVS_CHECK_ACTIONS_AUTOVALIDATOR], [
+  AC_ARG_ENABLE([actions-default-autovalidator],
+                [AC_HELP_STRING([--enable-actions-default-autovalidator],
+                                [Enable actions autovalidator as default
+                                 ovs actions implementation.])],
+                [autovalidator=$enableval], [autovalidator=no])
+  AC_MSG_CHECKING([whether actions Autovalidator is default implementation])
+  if test "$autovalidator" != yes; then
+    AC_MSG_RESULT([no])
+  else
+    AC_DEFINE([ACTIONS_AUTOVALIDATOR_DEFAULT], [1],
+              [Autovalidator for actions is a default implementation.])
+    AC_MSG_RESULT([yes])
+  fi
+])
+
+
 dnl Set OVS MFEX Autovalidator as default miniflow extract at compile time?
 dnl This enables automatically running all unit tests with all MFEX
 dnl implementations.
diff --git a/configure.ac b/configure.ac
index 6f8679d7c..6c51e48ce 100644
--- a/configure.ac
+++ b/configure.ac
@@ -184,6 +184,7 @@ OVS_CONDITIONAL_CC_OPTION([-Wno-unused-parameter], 
[HAVE_WNO_UNUSED_PARAMETER])
 OVS_ENABLE_WERROR_TOP
 OVS_ENABLE_SPARSE
 OVS_CTAGS_IDENTIFIERS
+OVS_CHECK_ACTIONS_AUTOVALIDATOR
 OVS_CHECK_DPCLS_AUTOVALIDATOR
 OVS_CHECK_DPIF_AVX512_DEFAULT
 OVS_CHECK_MFEX_AUTOVALIDATOR
diff --git a/lib/odp-execute.c b/lib/odp-execute.c
index 64c058a75..f713acabe 100644
--- a/lib/odp-execute.c
+++ b/lib/odp-execute.c
@@ -928,7 +928,11 @@ odp_execute_init(void)
 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
 if (ovsthread_once_start()) {
 odp_execute_action_init();
+#ifdef ACTIONS_AUTOVALIDATOR_DEFAULT
+odp_actions_impl_set("autovalidator");
+#else
 odp_actions_impl_set("scalar");
+#endif
 odp_execute_unixctl_init();
 ovsthread_once_done();
 }
-- 
2.32.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v9 06/10] odp-execute: Add ISA implementation of actions.

2022-07-12 Thread Harry van Haaren

From: Emma Finn 

This commit adds the AVX512 implementation of the action functionality.

Usage:
  $ ovs-appctl odp-execute/action-impl-set avx512

Signed-off-by: Emma Finn 
Acked-by: Harry van Haaren 

---

v9: rebase conflict on NEWS
---
 Documentation/ref/ovs-actions.7.rst | 26 ++
 Documentation/topics/testing.rst| 24 
 NEWS|  1 +
 lib/cpu.c   |  1 +
 lib/cpu.h   |  1 +
 lib/odp-execute-private.c   |  8 
 lib/odp-execute-private.h   |  6 ++
 7 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/Documentation/ref/ovs-actions.7.rst 
b/Documentation/ref/ovs-actions.7.rst
index b59b7634f..2410acc4a 100644
--- a/Documentation/ref/ovs-actions.7.rst
+++ b/Documentation/ref/ovs-actions.7.rst
@@ -125,6 +125,32 @@ the one added to the set later replaces the earlier action:
 
 An action set may only contain the actions listed above.
 
+Actions Implementations (Experimental)
+--------------------------------------
+
+Actions are used in OpenFlow flows to describe what to do when the flow
+matches a packet. Just like with the datapath interface, SIMD instructions
+with the userspace datapath can be applied to the action implementation to
+improve performance.
+
+OVS provides multiple implementations of the actions.
+Available implementations can be listed with the following command::
+
+$ ovs-appctl odp-execute/action-impl-show
+Available Actions implementations:
+scalar (available: Yes, active: Yes)
+autovalidator (available: Yes, active: No)
+avx512 (available: Yes, active: No)
+
+By default, ``scalar`` is used.  Implementations can be selected by
+name::
+
+$ ovs-appctl odp-execute/action-impl-set avx512
+Action implementation set to avx512.
+
+$ ovs-appctl odp-execute/action-impl-set scalar
+Action implementation set to scalar.
+
 Error Handling
 --
 
diff --git a/Documentation/topics/testing.rst b/Documentation/topics/testing.rst
index c15d5b38f..a6c747b18 100644
--- a/Documentation/topics/testing.rst
+++ b/Documentation/topics/testing.rst
@@ -361,12 +361,12 @@ testsuite.
 Userspace datapath: Testing and Validation of CPU-specific Optimizations
 
 
-As multiple versions of the datapath classifier and packet parsing functions
-can co-exist, each with different CPU ISA optimizations, it is important to
-validate that they all give the exact same results.  To easily test all the
-implementations, an ``autovalidator`` implementation of them exists.  This
-implementation runs all other available implementations, and verifies that the
-results are identical.
+As multiple versions of the datapath classifier, packet parsing functions and
+actions can co-exist, each with different CPU ISA optimizations, it is
+important to validate that they all give the exact same results.  To easily
+test all the implementations, an ``autovalidator`` implementation of them
+exists. This implementation runs all other available implementations, and
+verifies that the results are identical.
 
 Running the OVS unit tests with the autovalidator enabled ensures all
 implementations provide the same results.  Note that the performance of the
@@ -382,18 +382,26 @@ To set the autovalidator for the packet parser, use this 
command::
 
 $ ovs-appctl dpif-netdev/miniflow-parser-set autovalidator
 
+To set the autovalidator for actions, use this command::
+
+$ ovs-appctl odp-execute/action-impl-set autovalidator
+
 To run the OVS unit test suite with the autovalidator as the default
 implementation, it is required to recompile OVS.  During the recompilation,
 the default priority of the `autovalidator` implementation is set to the
-maximum priority, ensuring every test will be run with every implementation::
+maximum priority, ensuring every test will be run with every implementation.
+The priority applies only to the MFEX autovalidator, not to the actions
+autovalidator::
 
-$ ./configure --enable-autovalidator --enable-mfex-default-autovalidator
+$ ./configure --enable-autovalidator --enable-mfex-default-autovalidator \
+--enable-actions-default-autovalidator
 
 The following line should be seen in the configuration log when the above
 options are used::
 
 checking whether DPCLS Autovalidator is default implementation... yes
 checking whether MFEX Autovalidator is default implementation... yes
+checking whether actions Autovalidator is default implementation... yes
 
 Compile OVS in debug mode to have `ovs_assert` statements error out if
 there is a mis-match in the datapath classifier lookup or packet parser
diff --git a/NEWS b/NEWS
index 2359b6bcf..fa2f7d535 100644
--- a/NEWS
+++ b/NEWS
@@ -55,6 +55,7 @@ Post-v2.17.0
implementations available at run time.
  * Add build time configure

[ovs-dev] [PATCH v9 03/10] odp-execute: Add auto validation function for actions.

2022-07-12 Thread Harry van Haaren

From: Emma Finn 

This commit introduces the auto-validation function, which
allows users to compare the batch of packets obtained from
different action implementations against the scalar
action implementation.

The autovalidator function can be triggered at runtime using the
following command:

$ ovs-appctl odp-execute/action-impl-set autovalidator

Signed-off-by: Emma Finn 
Acked-by: Harry van Haaren 
---
 NEWS  |  2 +
 lib/dp-packet.c   | 24 ++
 lib/dp-packet.h   |  4 ++
 lib/odp-execute-private.c | 99 +++
 lib/odp-execute-private.h |  6 +++
 5 files changed, 135 insertions(+)

diff --git a/NEWS b/NEWS
index 433bb1654..aeba359e5 100644
--- a/NEWS
+++ b/NEWS
@@ -49,6 +49,8 @@ Post-v2.17.0
  * 'dpif-netdev/subtable-lookup-prio-get' appctl command renamed to
'dpif-netdev/subtable-lookup-info-get' to better reflect its purpose.
The old variant is kept for backward compatibility.
+ * Add actions auto-validator function to compare different actions
+   implementations against default implementation.
- Linux datapath:
  * Add offloading meter tc police.
 
diff --git a/lib/dp-packet.c b/lib/dp-packet.c
index 35c72542a..4538d2a61 100644
--- a/lib/dp-packet.c
+++ b/lib/dp-packet.c
@@ -506,3 +506,27 @@ dp_packet_resize_l2(struct dp_packet *b, int increment)
 dp_packet_adjust_layer_offset(>l2_5_ofs, increment);
 return dp_packet_data(b);
 }
+
+/* Compares the l2_pad_size, l2_5_ofs, l3_ofs and l4_ofs fields of 'b1' and
+ * 'b2'.
+ *
+ * Returns true when all four fields are equal.  Otherwise returns false and,
+ * if 'err_str' is nonnull, appends a report listing the four values of both
+ * packets to it. */
+bool
+dp_packet_compare_offsets(struct dp_packet *b1, struct dp_packet *b2,
+                          struct ds *err_str)
+{
+    bool offsets_match = b1->l2_pad_size == b2->l2_pad_size
+                         && b1->l2_5_ofs == b2->l2_5_ofs
+                         && b1->l3_ofs == b2->l3_ofs
+                         && b1->l4_ofs == b2->l4_ofs;
+
+    if (offsets_match) {
+        return true;
+    }
+
+    /* The report is optional: callers may pass a null 'err_str'. */
+    if (err_str) {
+        ds_put_format(err_str, "Packet offset comparison failed\n");
+        ds_put_format(err_str, "Buffer 1 offsets: l2_pad_size %u,"
+                      " l2_5_ofs : %u l3_ofs %u, l4_ofs %u\n",
+                      b1->l2_pad_size, b1->l2_5_ofs,
+                      b1->l3_ofs, b1->l4_ofs);
+        ds_put_format(err_str, "Buffer 2 offsets: l2_pad_size %u,"
+                      " l2_5_ofs : %u l3_ofs %u, l4_ofs %u\n",
+                      b2->l2_pad_size, b2->l2_5_ofs,
+                      b2->l3_ofs, b2->l4_ofs);
+    }
+
+    return false;
+}
diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index eea5a9215..55eeaab2c 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -236,6 +236,10 @@ void *dp_packet_steal_data(struct dp_packet *);
 static inline bool dp_packet_equal(const struct dp_packet *,
const struct dp_packet *);
 
+bool dp_packet_compare_offsets(struct dp_packet *good,
+   struct dp_packet *test,
+   struct ds *err_str);
+
 
 /* Frees memory that 'b' points to, as well as 'b' itself. */
 static inline void
diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
index c1d153c6e..780d6d289 100644
--- a/lib/odp-execute-private.c
+++ b/lib/odp-execute-private.c
@@ -30,6 +30,12 @@ VLOG_DEFINE_THIS_MODULE(odp_execute_impl);
 static int active_action_impl_index;
 
 static struct odp_execute_action_impl action_impls[] = {
+[ACTION_IMPL_AUTOVALIDATOR] = {
+.available = false,
+.name = "autovalidator",
+.init_func = action_autoval_init,
+},
+
 [ACTION_IMPL_SCALAR] = {
 .available = false,
 .name = "scalar",
@@ -104,3 +110,96 @@ odp_execute_action_init(void)
 }
 }
 }
+
+/* The scalar implementation must be initialized first so that its default
+ * functions are picked up, allowing the optimized implementations to
+ * override them later.
+ */
+BUILD_ASSERT_DECL(ACTION_IMPL_SCALAR == 0);
+BUILD_ASSERT_DECL(ACTION_IMPL_AUTOVALIDATOR == 1);
+
+/* Loop over packets, and validate each one for the given action. */
+static void
+action_autoval_generic(struct dp_packet_batch *batch, const struct nlattr *a)
+{
+struct odp_execute_action_impl *scalar = _impls[ACTION_IMPL_SCALAR];
+enum ovs_action_attr attr_type = nl_attr_type(a);
+struct dp_packet_batch original_batch;
+bool failed = false;
+
+dp_packet_batch_clone(_batch, batch);
+
+scalar->funcs[attr_type](batch, a);
+
+for (int impl = ACTION_IMPL_BEGIN; impl < ACTION_IMPL_MAX; impl++) {
+/* Clone original batch and execute implementation under test. */
+struct dp_packet_batch test_batch;
+
+dp_packet_batch_clone(_batch, _batch);
+action_impls[impl].funcs[attr_type](_batch, a);
+
+/* Loop over implementations, checking each one. */
+for (int pidx = 0; pidx < original_batch.count; pidx++) {
+struct dp_packet *good_pkt = batch->packe

[ovs-dev] [PATCH v9 04/10] odp-execute: Add command to switch action implementation.

2022-07-12 Thread Harry van Haaren

From: Emma Finn 

This commit adds a new command to allow the user to switch
the active action implementation at runtime.

Usage:
  $ ovs-appctl odp-execute/action-impl-set scalar

This commit also adds a new command to retrieve the list of available
action implementations. This can be used to check which implementations
of actions are available and which implementation is active at runtime.

Usage:
   $ ovs-appctl odp-execute/action-impl-show

Added separate test-case for ovs-actions show/set commands:
PMD - ovs-actions configuration

Signed-off-by: Emma Finn 
Signed-off-by: Kumar Amber 
Signed-off-by: Sunil Pai G 
Co-authored-by: Kumar Amber 
Co-authored-by: Sunil Pai G 
Acked-by: Harry van Haaren 

---

v9: rebase conflict on NEWS
---
 NEWS|  2 ++
 lib/automake.mk |  1 +
 lib/odp-execute-private.c   | 12 ++
 lib/odp-execute-private.h   |  2 ++
 lib/odp-execute-unixctl.man | 10 +
 lib/odp-execute.c   | 44 +
 tests/pmd.at| 39 
 vswitchd/ovs-vswitchd.8.in  |  1 +
 8 files changed, 111 insertions(+)
 create mode 100644 lib/odp-execute-unixctl.man

diff --git a/NEWS b/NEWS
index aeba359e5..cf35f4ae4 100644
--- a/NEWS
+++ b/NEWS
@@ -51,6 +51,8 @@ Post-v2.17.0
The old variant is kept for backward compatibility.
  * Add actions auto-validator function to compare different actions
implementations against default implementation.
+ * Add command line option to switch between different actions
+   implementations available at run time.
- Linux datapath:
  * Add offloading meter tc police.
 
diff --git a/lib/automake.mk b/lib/automake.mk
index 23ba4fab0..5c3b05f6b 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -584,6 +584,7 @@ MAN_FRAGMENTS += \
lib/netdev-dpdk-unixctl.man \
lib/dpif-netdev-unixctl.man \
lib/dpif-netlink-unixctl.man \
+   lib/odp-execute-unixctl.man \
lib/ofp-version.man \
lib/ovs.tmac \
lib/ovs-replay.man \
diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
index 780d6d289..38be22ec9 100644
--- a/lib/odp-execute-private.c
+++ b/lib/odp-execute-private.c
@@ -67,6 +67,18 @@ odp_execute_action_set(const char *name)
 return NULL;
 }
 
+/* Appends to 'string' a header line followed by one line per entry of
+ * action_impls[], giving the implementation's name, whether it is available
+ * and whether it is the currently active implementation. */
+void
+odp_execute_action_get_info(struct ds *string)
+{
+    ds_put_cstr(string, "Available Actions implementations:\n");
+
+    for (int idx = 0; idx < ACTION_IMPL_MAX; idx++) {
+        const struct odp_execute_action_impl *impl = &action_impls[idx];
+        const char *available = impl->available ? "Yes" : "No";
+        const char *active = "No";
+
+        if (idx == active_action_impl_index) {
+            active = "Yes";
+        }
+
+        ds_put_format(string, "  %s (available: %s, active: %s)\n",
+                      impl->name, available, active);
+    }
+}
+
 void
 odp_execute_action_init(void)
 {
diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h
index 074a8d67e..d6eebbf37 100644
--- a/lib/odp-execute-private.h
+++ b/lib/odp-execute-private.h
@@ -84,4 +84,6 @@ struct odp_execute_action_impl * odp_execute_action_set(const 
char *name);
 
 int action_autoval_init(struct odp_execute_action_impl *self);
 
+void odp_execute_action_get_info(struct ds *name);
+
 #endif /* ODP_EXTRACT_PRIVATE */
diff --git a/lib/odp-execute-unixctl.man b/lib/odp-execute-unixctl.man
new file mode 100644
index 0..82d51e1d3
--- /dev/null
+++ b/lib/odp-execute-unixctl.man
@@ -0,0 +1,10 @@
+.SS "ODP-EXECUTE COMMANDS"
+These commands manage the "odp-execute" component.
+
+.IP "\fBodp-execute/action-impl-show\fR"
+Lists the actions implementations that are available and highlights the
+currently enabled one.
+.
+.IP "\fBodp-execute/action-impl-set\fR \fIaction_impl\fR"
+Sets the action implementation to any available implementation. By default
+"scalar" is used.
diff --git a/lib/odp-execute.c b/lib/odp-execute.c
index aa51476f4..64c058a75 100644
--- a/lib/odp-execute.c
+++ b/lib/odp-execute.c
@@ -39,6 +39,7 @@
 #include "csum.h"
 #include "conntrack.h"
 #include "openvswitch/vlog.h"
+#include "unixctl.h"
 
 VLOG_DEFINE_THIS_MODULE(odp_execute);
 COVERAGE_DEFINE(datapath_drop_sample_error);
@@ -879,6 +880,48 @@ odp_actions_impl_set(const char *name)
 
 }
 
+/* unixctl handler that switches the active action implementation.
+ *
+ * 'argv[1]' is the implementation name requested by the user.  It is passed
+ * to odp_actions_impl_set(); a nonzero result is reported to 'conn' as an
+ * error reply, otherwise a confirmation message is sent back. */
+static void
+action_impl_set(struct unixctl_conn *conn, int argc OVS_UNUSED,
+                const char *argv[], void *aux OVS_UNUSED)
+{
+    struct ds reply = DS_EMPTY_INITIALIZER;
+
+    int err = odp_actions_impl_set(argv[1]);
+    if (err) {
+        ds_put_format(&reply,
+                      "Error: unknown action implementation, %s, specified!\n",
+                      argv[1]);
+        unixctl_command_reply_error(conn, ds_cstr(&reply));
+    } else {
+        ds_put_format(&reply, "Action implementation set to %s.\n", argv[1]);
+        unixctl_command_reply(conn, ds_cstr(&reply));
+    }
+
+    /* The reply buffer is released on both the error and success paths. */
+    ds_destroy(&reply);
+}
+
+sta

[ovs-dev] [PATCH v9 02/10] odp-execute: Add function pointer for pop_vlan action.

2022-07-12 Thread Harry van Haaren

From: Emma Finn 

This commit removes the pop_vlan action from the large switch
and creates a separate function for batched processing. A function
pointer is also added to call the new batched function for the pop_vlan
action.

Signed-off-by: Emma Finn 
Acked-by: Harry van Haaren 
---
 lib/odp-execute-private.c | 16 +++-
 lib/odp-execute-private.h |  5 +
 lib/odp-execute.c | 32 ++--
 3 files changed, 46 insertions(+), 7 deletions(-)

diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
index 2c30ed05b..c1d153c6e 100644
--- a/lib/odp-execute-private.c
+++ b/lib/odp-execute-private.c
@@ -33,7 +33,7 @@ static struct odp_execute_action_impl action_impls[] = {
 [ACTION_IMPL_SCALAR] = {
 .available = false,
 .name = "scalar",
-.init_func = NULL,
+.init_func = odp_action_scalar_init,
 },
 };
 
@@ -88,5 +88,19 @@ odp_execute_action_init(void)
 
 VLOG_INFO("Action implementation %s (available: %s)",
   action_impls[i].name, avail ? "Yes" : "No");
+
+/* The following is a run-time check to make sure a scalar
+ * implementation exists for the given ISA implementation. This is to
+ * make sure the autovalidator works as expected. */
+if (avail && i != ACTION_IMPL_SCALAR) {
+for (int j = 0; j < __OVS_ACTION_ATTR_MAX; j++) {
+/* No ovs_assert(), as it can be compiled out. */
+if (action_impls[ACTION_IMPL_SCALAR].funcs[j] == NULL
+&& action_impls[i].funcs[j] != NULL) {
+ovs_assert_failure(OVS_SOURCE_LOCATOR, __func__,
+   "Missing scalar action function!");
+}
+}
+}
 }
 }
diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h
index 24126cdca..ae06fbc09 100644
--- a/lib/odp-execute-private.h
+++ b/lib/odp-execute-private.h
@@ -71,6 +71,11 @@ BUILD_ASSERT_DECL(ACTION_IMPL_SCALAR == 0);
  */
 void odp_execute_action_init(void);
 
+/* Init functions for the action implementations. Initializes the function
+ * pointers for optimized action types.
+ */
+int odp_action_scalar_init(struct odp_execute_action_impl *self);
+
 struct odp_execute_action_impl * odp_execute_action_set(const char *name);
 
 #endif /* ODP_EXTRACT_PRIVATE */
diff --git a/lib/odp-execute.c b/lib/odp-execute.c
index bf8223634..aa51476f4 100644
--- a/lib/odp-execute.c
+++ b/lib/odp-execute.c
@@ -834,6 +834,30 @@ requires_datapath_assistance(const struct nlattr *a)
 return false;
 }
 
+/* Scalar POP_VLAN handler: calls eth_pop_vlan() on every packet in 'batch'.
+ * The attribute 'a' is not used. */
+static void
+action_pop_vlan(struct dp_packet_batch *batch,
+                const struct nlattr *a OVS_UNUSED)
+{
+    struct dp_packet *pkt;
+
+    DP_PACKET_BATCH_FOR_EACH (idx, pkt, batch) {
+        eth_pop_vlan(pkt);
+    }
+}
+
+/* Init function of the scalar actions implementation.  Fills in the function
+ * pointer array of 'self' and always returns 0.
+ */
+int
+odp_action_scalar_init(struct odp_execute_action_impl *self)
+{
+    /* Slots are indexed by OVS_ACTION_ATTR_*; only actions that can be
+     * applied directly get an entry here. */
+    self->funcs[OVS_ACTION_ATTR_POP_VLAN] = action_pop_vlan;
+
+    return 0;
+}
+
 /* The active function pointers on the datapath. ISA optimized implementations
  * are enabled by plugging them into this static arary, which is consulted when
  * applying actions on the datapath.
@@ -982,12 +1006,6 @@ odp_execute_actions(void *dp, struct dp_packet_batch 
*batch, bool steal,
 break;
 }
 
-case OVS_ACTION_ATTR_POP_VLAN:
-DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
-eth_pop_vlan(packet);
-}
-break;
-
 case OVS_ACTION_ATTR_PUSH_MPLS: {
 const struct ovs_action_push_mpls *mpls = nl_attr_get(a);
 
@@ -1138,6 +1156,8 @@ odp_execute_actions(void *dp, struct dp_packet_batch 
*batch, bool steal,
 case OVS_ACTION_ATTR_CT:
 case OVS_ACTION_ATTR_UNSPEC:
 case __OVS_ACTION_ATTR_MAX:
+/* The following actions are handled by the scalar implementation. */
+case OVS_ACTION_ATTR_POP_VLAN:
 OVS_NOT_REACHED();
 }
 
-- 
2.32.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v9 01/10] odp-execute: Add function pointers to odp-execute for different action implementations.

2022-07-12 Thread Harry van Haaren

From: Emma Finn 

This commit introduces the initial infrastructure required to allow
different implementations for OvS actions. The patch introduces action
function pointers, which allow the user to switch between the different
action implementations available. This gives more performance and flexibility,
so the user can choose the action implementation that best suits their use case.

Signed-off-by: Emma Finn 
Acked-by: Harry van Haaren 

---

v9:
- rebase conflicts on NEWS
- As Actions is initialized in vswitchd/bridge.c now, logs are output on
  startup, which have been added to the tests/ofproto-macros.
---
 lib/automake.mk   |  2 +
 lib/odp-execute-private.c | 92 +++
 lib/odp-execute-private.h | 76 
 lib/odp-execute.c | 51 +-
 lib/odp-execute.h |  7 +++
 tests/ofproto-macros.at   |  1 +
 vswitchd/bridge.c |  3 ++
 7 files changed, 231 insertions(+), 1 deletion(-)
 create mode 100644 lib/odp-execute-private.c
 create mode 100644 lib/odp-execute-private.h

diff --git a/lib/automake.mk b/lib/automake.mk
index 1d00cfa20..23ba4fab0 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -216,6 +216,8 @@ lib_libopenvswitch_la_SOURCES = \
lib/object-collection.h \
lib/odp-execute.c \
lib/odp-execute.h \
+   lib/odp-execute-private.c \
+   lib/odp-execute-private.h \
lib/odp-util.c \
lib/odp-util.h \
lib/ofp-actions.c \
diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
new file mode 100644
index 0..2c30ed05b
--- /dev/null
+++ b/lib/odp-execute-private.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2022 Intel.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+#include "dpdk.h"
+#include "dp-packet.h"
+#include "odp-execute-private.h"
+#include "odp-netlink.h"
+#include "odp-util.h"
+#include "openvswitch/vlog.h"
+
+VLOG_DEFINE_THIS_MODULE(odp_execute_impl);
+static int active_action_impl_index;
+
+/* Table of action implementations, indexed by ACTION_IMPL_*.  Every entry
+ * starts out marked as not available. */
+static struct odp_execute_action_impl action_impls[] = {
+    [ACTION_IMPL_SCALAR] = {
+        .name = "scalar",
+        .init_func = NULL,
+        .available = false,
+    },
+};
+
+/* Copies every action function pointer of 'src' into the corresponding slot
+ * of 'dest'.  Each slot is written with a relaxed atomic store. */
+static void
+action_impl_copy_funcs(struct odp_execute_action_impl *dest,
+                       const struct odp_execute_action_impl *src)
+{
+    for (int i = 0; i < __OVS_ACTION_ATTR_MAX; i++) {
+        atomic_store_relaxed(&dest->funcs[i], src->funcs[i]);
+    }
+}
+
+/* Looks up the action implementation called 'name'.
+ *
+ * On a match, records the entry as the active implementation, logs the
+ * change and returns a pointer to the entry in action_impls[].  Returns NULL
+ * when no implementation has that name. */
+struct odp_execute_action_impl *
+odp_execute_action_set(const char *name)
+{
+    for (int i = 0; i < ACTION_IMPL_MAX; i++) {
+        /* Implementations are selected by exact name match. */
+        if (!strcmp(action_impls[i].name, name)) {
+            active_action_impl_index = i;
+
+            VLOG_INFO("Action implementation set to %s", name);
+            return &action_impls[i];
+        }
+    }
+    return NULL;
+}
+
+/* Initializes every entry of action_impls[] and logs, per implementation,
+ * whether it is available.
+ *
+ * An implementation is available when it has no init_func or when its
+ * init_func returns zero. */
+void
+odp_execute_action_init(void)
+{
+    /* Each impl's function array is initialized to reflect the scalar
+     * implementation. This simplifies adding optimized implementations,
+     * as the autovalidator can always compare all actions.
+     *
+     * Below will check if impl is available and copies the scalar functions
+     * to all other implementations.
+     */
+    for (int i = 0; i < ACTION_IMPL_MAX; i++) {
+        bool avail = true;
+
+        if (i != ACTION_IMPL_SCALAR) {
+            action_impl_copy_funcs(&action_impls[i],
+                                   &action_impls[ACTION_IMPL_SCALAR]);
+        }
+
+        if (action_impls[i].init_func) {
+            /* Return zero is success, non-zero means error. */
+            avail = (action_impls[i].init_func(&action_impls[i]) == 0);
+        }
+
+        action_impls[i].available = avail;
+
+        VLOG_INFO("Action implementation %s (available: %s)",
+                  action_impls[i].name, avail ? "Yes" : "No");
+    }
+}
diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h
new file mode 100644
index 0..24126cdca
--- /dev/null
+++ b/lib/odp-execute-private.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2022 Intel.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a

[ovs-dev] [PATCH v9 00/10] Actions Infrastructure + Optimizations

2022-07-12 Thread Harry van Haaren

This patchset introduces actions infrastructure changes which allows
the user to choose between different action implementations based on
CPU ISA by using different commands.  The infrastructure also
provides a way to check the correctness of the ISA optimized action
version against the scalar version.

This series also introduces optimized versions of the following
actions:
 - push_vlan
 - pop_vlan
 - set_masked eth
 - set_masked ipv4

Below is a table indicating some relative performance benefits for
these actions.
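+-----------------------------------------------+--------------------+-----------------+
| Actions                                       | Scalar with series | AVX with series |
+-----------------------------------------------+--------------------+-----------------+
| mod_dl_dst                                    | 1.01x              | 1.13x           |
+-----------------------------------------------+--------------------+-----------------+
| push_vlan                                     | 1.01x              | 1.10x           |
+-----------------------------------------------+--------------------+-----------------+
| strip_vlan                                    | 1.01x              | 1.11x           |
+-----------------------------------------------+--------------------+-----------------+
| mod_ipv4 1 x field                            | 1.01x              | 1.02x           |
+-----------------------------------------------+--------------------+-----------------+
| mod_ipv4 4 x fields                           | 1.01x              | 1.21x           |
+-----------------------------------------------+--------------------+-----------------+
| strip_vlan + mod_dl_dst + mod_ipv4 4 x fields | 1.01x              | 1.36x           |
+-----------------------------------------------+--------------------+-----------------+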

---
v9:
- Moved avx512 probe and init functions to later patch.
- Dependency on userspace datapath has been resolved.
- Fixed up comments from Sunil as posted on v8
- Note: Harry is sending this patchset, but it is Emma's rework,
  except for rebasing to latest git, and 2 very minor fixups.
---
v8
- First patch changing unit tests has been removed from series.
- AVX checksum implementation has been reworked.
---
v7:
- Fix review comments from Eelco.
---
v6:
- Rebase to master
- Add ISA implementation of set_masked eth and ipv4 actions
- Fix incorrect checksums in input packets for ofproto-dpif unit
tests
---
v5:
- Rebase to master
- Minor change to variable names
- Added Tags from Harry.
---
v4:
- Rebase to master
- Add ISA implementation of push_vlan action
---
v3:
- Refactored to fix unit test failures
- Removed some sign-off on commits
---
v2:
- Fix the CI build issues
---


Emma Finn (9):
  odp-execute: Add function pointers to odp-execute for different action
implementations.
  odp-execute: Add function pointer for pop_vlan action.
  odp-execute: Add auto validation function for actions.
  odp-execute: Add command to switch action implementation.
  odp-execute: Add ISA implementation of actions.
  odp-execute: Add ISA implementation of pop_vlan action.
  odp-execute: Add ISA implementation of push_vlan action.
  odp-execute: Add ISA implementation of set_masked ETH
  odp-execute: Add ISA implementation of set_masked IPv4 action

Kumar Amber (1):
  acinclude: Add configure option to enable actions autovalidator at
build time.

 Documentation/ref/ovs-actions.7.rst |  26 ++
 Documentation/topics/testing.rst|  24 +-
 NEWS|   7 +
 acinclude.m4|  20 ++
 configure.ac|   1 +
 lib/automake.mk |   6 +-
 lib/cpu.c   |   1 +
 lib/cpu.h   |   1 +
 lib/dp-packet.c |  24 ++
 lib/dp-packet.h |   4 +
 lib/odp-execute-avx512.c| 535 
 lib/odp-execute-private.c   | 270 ++
 lib/odp-execute-private.h   | 100 ++
 lib/odp-execute-unixctl.man |  10 +
 lib/odp-execute.c   | 202 +--
 lib/odp-execute.h   |  10 +
 tests/ofproto-macros.at |   1 +
 tests/pmd.at|  39 ++
 vswitchd/bridge.c   |   3 +
 vswitchd/ovs-vswitchd.8.in  |   1 +
 20 files changed, 1238 insertions(+), 47 deletions(-)
 create mode 100644 lib/odp-execute-avx512.c
 create mode 100644 lib/odp-execute-private.c
 create mode 100644 lib/odp-execute-private.h
 create mode 100644 lib/odp-execute-unixctl.man

-- 
2.32.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v2] dpcls: add unlisted alias for subtable lookup command

2022-06-07 Thread Harry van Haaren

This patch adds the old name "subtable-lookup-prio-get" as an unlisted command,
to restore consistency between OVS releases for testing scripts.

Fixes: 738c76a503f4 ("dpcls: Change info-get function to fetch dpcls usage stats.")
Suggested-by: Eelco Chaudron 
Suggested-by: Ilya Maximets 
Signed-off-by: Harry van Haaren 

---

v2:
- Based on discussion and push back on v1 patch here, this is a v2
  implementing the suggested "alias" method. Suggested by tags added
  for both Eelco (for alias concept) and Ilya (for unlisted concept).
  
https://patchwork.ozlabs.org/project/openvswitch/patch/20220525141014.661907-1-harry.van.haa...@intel.com/

---
 lib/dpif-netdev.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index ff57b3961..f46b9fe18 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -1609,6 +1609,9 @@ dpif_netdev_init(void)
 unixctl_command_register("dpif-netdev/subtable-lookup-info-get", "",
  0, 0, dpif_netdev_subtable_lookup_get,
  NULL);
+unixctl_command_register("dpif-netdev/subtable-lookup-prio-get", NULL,
+ 0, 0, dpif_netdev_subtable_lookup_get,
+ NULL);
 unixctl_command_register("dpif-netdev/dpif-impl-set",
  "dpif_implementation_name",
  1, 1, dpif_netdev_impl_set,
-- 
2.32.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH] dpcls: revert subtable-lookup-prio-get name change

2022-05-25 Thread Harry van Haaren

This commit reverts the name-change that was done (prio->info).
The change breaks a user visible ovs-appctl command, resulting in
breakage of tools/scripts/user-expectation outside of the OVS repo.

This commit changes the documentation, command string, and unit tests
back to the expected "prio" string, as expected in OVS 2.17 and earlier.

Signed-off-by: Harry van Haaren 

---

This name change confusion seems to have arisen from the discussion on the v5 
version of the patch:
https://patchwork.ozlabs.org/project/openvswitch/patch/20211215041511.4097090-1-kumar.am...@intel.com/

---
 Documentation/topics/dpdk/bridge.rst |  4 ++--
 lib/dpif-netdev.c|  2 +-
 tests/pmd.at | 16 
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/Documentation/topics/dpdk/bridge.rst 
b/Documentation/topics/dpdk/bridge.rst
index 1f626c7c2..314c31a47 100644
--- a/Documentation/topics/dpdk/bridge.rst
+++ b/Documentation/topics/dpdk/bridge.rst
@@ -179,7 +179,7 @@ these CPU ISA additions are available, and to allow the 
user to enable them.
 OVS provides multiple implementations of dpcls. The following command enables
 the user to check what implementations are available in a running instance::
 
-$ ovs-appctl dpif-netdev/subtable-lookup-info-get
+$ ovs-appctl dpif-netdev/subtable-lookup-prio-get
 Available dpcls implementations:
 autovalidator (Use count: 1, Priority: 5)
 generic (Use count: 0, Priority: 1)
@@ -195,7 +195,7 @@ above indicates that one subtable of one DPCLS port is has 
changed its lookup
 function due to the command being run. To verify the prioritization, re-run the
 get command, note the updated priority of the ``avx512_gather`` function::
 
-$ ovs-appctl dpif-netdev/subtable-lookup-info-get
+$ ovs-appctl dpif-netdev/subtable-lookup-prio-get
 Available dpcls implementations:
 autovalidator (Use count: 1, Priority: 5)
 generic (Use count: 0, Priority: 1)
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 0e7a7d16e..ebbd10b24 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -1605,7 +1605,7 @@ dpif_netdev_init(void)
  "[lookup_func] [prio]",
  2, 2, dpif_netdev_subtable_lookup_set,
  NULL);
-unixctl_command_register("dpif-netdev/subtable-lookup-info-get", "",
+unixctl_command_register("dpif-netdev/subtable-lookup-prio-get", "",
  0, 0, dpif_netdev_subtable_lookup_get,
  NULL);
 unixctl_command_register("dpif-netdev/dpif-impl-set",
diff --git a/tests/pmd.at b/tests/pmd.at
index e6b173dab..df7875c65 100644
--- a/tests/pmd.at
+++ b/tests/pmd.at
@@ -1130,11 +1130,11 @@ OVS_VSWITCHD_START([], [], [], [--dummy-numa 0,0])
 AT_CHECK([ovs-vsctl add-port br0 p1 -- set Interface p1 type=dummy-pmd])
 
 AT_CHECK([ovs-vsctl show], [], [stdout])
-AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-info-get | grep generic], [], 
[dnl
+AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-prio-get | grep generic], [], 
[dnl
   generic (Use count: 0, Priority: 1)
 ])
 
-AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-info-get | grep 
autovalidator], [], [dnl
+AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-prio-get | grep 
autovalidator], [], [dnl
   autovalidator (Use count: 0, Priority: 0)
 ])
 
@@ -1142,7 +1142,7 @@ AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-prio-set 
autovalidator 3], [0],
 Lookup priority change affected 0 dpcls ports and 0 subtables.
 ])
 
-AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-info-get | grep 
autovalidator], [], [dnl
+AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-prio-get | grep 
autovalidator], [], [dnl
   autovalidator (Use count: 0, Priority: 3)
 ])
 
@@ -1150,7 +1150,7 @@ AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-prio-set 
generic 4], [0], [dnl
 Lookup priority change affected 0 dpcls ports and 0 subtables.
 ])
 
-AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-info-get | grep generic], [], 
[dnl
+AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-prio-get | grep generic], [], 
[dnl
   generic (Use count: 0, Priority: 4)
 ])
 
@@ -1158,7 +1158,7 @@ AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-prio-set 
generic 8], [0], [dnl
 Lookup priority change affected 0 dpcls ports and 0 subtables.
 ])
 
-AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-info-get | grep generic], [], 
[dnl
+AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-prio-get | grep generic], [], 
[dnl
   generic (Use count: 0, Priority: 8)
 ])
 
@@ -1166,7 +1166,7 @@ AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-prio-set 
autovalidator 8], [0],
 Lookup priority change affected 0 dpcls ports and 0 subtables.
 ])
 
-AT_CHECK([ovs-appctl dpif-netdev/subtable-lookup-info-get | grep 
autovalidator], [], [dnl
+AT_CHECK([ovs-app

[ovs-dev] [PATCH v2] dpif-netdev-avx512: fix ubsan shift error in bitmasks

2022-04-22 Thread Harry van Haaren

The code changes here are to handle (1 << i) shifts where 'i' is the
packet index in the batch, and 1 << 31 is an overflow of the signed '1'.

Fixed by wrapping the 1 literal in UINT32_C(), ensuring the compiler knows
the 1 is unsigned (and 32 bits wide). The Undefined Behaviour sanitizer is
now happy with the bit-shifts at runtime.

Suggested-by: Ilya Maximets 
Signed-off-by: Harry van Haaren 

---

v2:
- Suggested improvements to change 1ULL to UINT32_C(1) (David, Eelco)
- Squashed the MFEX avx512 fixup into this patch

Thanks Ilya for the detail in the email - reworked as commit message;
https://mail.openvswitch.org/pipermail/ovs-dev/2022-April/393270.html

---
 lib/dpif-netdev-avx512.c | 10 +-
 lib/dpif-netdev-extract-avx512.c |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
index b7131ba3f..151a945a9 100644
--- a/lib/dpif-netdev-avx512.c
+++ b/lib/dpif-netdev-avx512.c
@@ -159,7 +159,7 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
*pmd,
 mf_mask = mfex_func(packets, keys, batch_size, in_port, pmd);
 }
 
-uint32_t lookup_pkts_bitmask = (1ULL << batch_size) - 1;
+uint32_t lookup_pkts_bitmask = (UINT32_C(1) << batch_size) - 1;
 uint32_t iter = lookup_pkts_bitmask;
 while (iter) {
 uint32_t i = raw_ctz(iter);
@@ -183,7 +183,7 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
*pmd,
  * classifed by vector mfex else do a scalar miniflow extract
  * for that packet.
  */
-bool mfex_hit = !!(mf_mask & (1 << i));
+bool mfex_hit = !!(mf_mask & (UINT32_C(1) << i));
 
 /* Check for a partial hardware offload match. */
 if (hwol_enabled) {
@@ -204,7 +204,7 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
*pmd,
 
 pkt_meta[i].bytes = dp_packet_size(packet);
 phwol_hits++;
-hwol_emc_smc_hitmask |= (1 << i);
+hwol_emc_smc_hitmask |= (UINT32_C(1) << i);
 continue;
 }
 }
@@ -227,7 +227,7 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
*pmd,
 if (f) {
 rules[i] = >cr;
 emc_hits++;
-hwol_emc_smc_hitmask |= (1 << i);
+hwol_emc_smc_hitmask |= (UINT32_C(1) << i);
 continue;
 }
 }
@@ -237,7 +237,7 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
*pmd,
 if (f) {
 rules[i] = >cr;
 smc_hits++;
-smc_hitmask |= (1 << i);
+smc_hitmask |= (UINT32_C(1) << i);
 continue;
 }
 }
diff --git a/lib/dpif-netdev-extract-avx512.c b/lib/dpif-netdev-extract-avx512.c
index c1c1fefb6..a0fedb137 100644
--- a/lib/dpif-netdev-extract-avx512.c
+++ b/lib/dpif-netdev-extract-avx512.c
@@ -619,7 +619,7 @@ mfex_avx512_process(struct dp_packet_batch *packets,
 };
 
 /* This packet has its miniflow created, add to hitmask. */
-hitmask |= 1 << i;
+hitmask |= UINT32_C(1) << i;
 }
 
 return hitmask;
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH] dpif-netdev/mfex avx512: fix ubsan shift on bitmask

2022-04-19 Thread Harry van Haaren

This commit ensures the compiler knows the shifted 1 is an unsigned value
at least 32 bits wide, keeping the Undefined Behaviour sanitizer happy at
runtime.

Fixes: 250ceddcc ("dpif-netdev/mfex: Add AVX512 based optimized miniflow 
extract")

Signed-off-by: Harry van Haaren 

---

 lib/dpif-netdev-extract-avx512.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/dpif-netdev-extract-avx512.c b/lib/dpif-netdev-extract-avx512.c
index c1c1fefb6..8cd8b6c6e 100644
--- a/lib/dpif-netdev-extract-avx512.c
+++ b/lib/dpif-netdev-extract-avx512.c
@@ -619,7 +619,7 @@ mfex_avx512_process(struct dp_packet_batch *packets,
 };
 
 /* This packet has its miniflow created, add to hitmask. */
-hitmask |= 1 << i;
+hitmask |= 1ULL << i;
 }
 
 return hitmask;
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH] dpif-netdev-avx512: fix ubsan shift error in bitmasks

2022-04-19 Thread Harry van Haaren

The code changes here are to handle (1 << i) shifts where 'i' is the
packet index in the batch, and 1 << 31 is an overflow of the signed '1'.

Fixed by adding the ULL suffix to the 1 literal, ensuring the compiler knows
the 1 is unsigned (and at least 32 bits wide). The Undefined Behaviour
sanitizer is now happy with the shifts at runtime.

Suggested-by: Ilya Maximets 
Signed-off-by: Harry van Haaren 

---

Thanks Ilya for the detail in the email - reworked as commit message;
https://mail.openvswitch.org/pipermail/ovs-dev/2022-April/393270.html

---
 lib/dpif-netdev-avx512.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
index b7131ba3f..fdefee230 100644
--- a/lib/dpif-netdev-avx512.c
+++ b/lib/dpif-netdev-avx512.c
@@ -183,7 +183,7 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
*pmd,
  * classifed by vector mfex else do a scalar miniflow extract
  * for that packet.
  */
-bool mfex_hit = !!(mf_mask & (1 << i));
+bool mfex_hit = !!(mf_mask & (1ULL << i));
 
 /* Check for a partial hardware offload match. */
 if (hwol_enabled) {
@@ -204,7 +204,7 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
*pmd,
 
 pkt_meta[i].bytes = dp_packet_size(packet);
 phwol_hits++;
-hwol_emc_smc_hitmask |= (1 << i);
+hwol_emc_smc_hitmask |= (1ULL << i);
 continue;
 }
 }
@@ -227,7 +227,7 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
*pmd,
 if (f) {
 rules[i] = >cr;
 emc_hits++;
-hwol_emc_smc_hitmask |= (1 << i);
+hwol_emc_smc_hitmask |= (1ULL << i);
 continue;
 }
 }
@@ -237,7 +237,7 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
*pmd,
 if (f) {
 rules[i] = >cr;
 smc_hits++;
-smc_hitmask |= (1 << i);
+smc_hitmask |= (1ULL << i);
 continue;
 }
 }
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v3] dpif-netdev: fix vlan and ipv4 parsing in avx512

2022-01-31 Thread Harry van Haaren

This commit fixes the minimum packet size for the vlan/ipv4/tcp
traffic profile, which was previously incorrectly set.

This commit also disallows any fragmented IPv4 packets from being
matched in the optimized miniflow-extract, avoiding complexity of
handling fragmented packets and using scalar fallback instead.
The DF (don't fragment) bit is now ignored, and stripped from the
resulting miniflow.

Fixes: aa85a25095 ("dpif-netdev/mfex: Add more AVX512 traffic profiles.")

Signed-off-by: Harry van Haaren 

---

Testing this patch becomes easier if the MFEX/DPIF patch by Amber here
is applied, as it ensures the AVX512 DPIF is active (and hence
MFEX-autovalidator actually executes in the datapath always, or the test
gets skipped if the ISA is not available).
https://patchwork.ozlabs.org/project/openvswitch/patch/20220131105149.1471184-1-kumar.am...@intel.com/

v3:
- Rework AVX512 impl to be more generic, adding "strip_mask" to profile
- Use #define NC for 0xFF value generation in bitmask (Eelco)
- Use previous store method (not in separate function) (Eelco/Harry)
- Handle VLAN/Dot1Q appropriately to pass MFEX Autovalidation (Amber)

v2:
- Fixup the "frag-offset" mask from incorrect value, to ignore DF bit (Eelco)
- The OVS_UNLIKELY() is added as the extra instructions/inline-func-call
  was confusing the compiler here, resulting in slow code. By marking
  the branch as unlikely, the code sequence generated is optimal again.
---
 lib/dpif-netdev-extract-avx512.c | 36 +++-
 1 file changed, 31 insertions(+), 5 deletions(-)

diff --git a/lib/dpif-netdev-extract-avx512.c b/lib/dpif-netdev-extract-avx512.c
index d23349482..c1c1fefb6 100644
--- a/lib/dpif-netdev-extract-avx512.c
+++ b/lib/dpif-netdev-extract-avx512.c
@@ -157,7 +157,7 @@ _mm512_maskz_permutexvar_epi8_wrap(__mmask64 kmask, __m512i 
idx, __m512i a)
   0, 0, 0, 0, /* Src IP */  \
   0, 0, 0, 0, /* Dst IP */
 
-#define PATTERN_IPV4_MASK PATTERN_IPV4_GEN(0xFF, 0xFE, 0xFF, 0xFF)
+#define PATTERN_IPV4_MASK PATTERN_IPV4_GEN(0xFF, 0xBF, 0xFF, 0xFF)
 #define PATTERN_IPV4_UDP PATTERN_IPV4_GEN(0x45, 0, 0, 0x11)
 #define PATTERN_IPV4_TCP PATTERN_IPV4_GEN(0x45, 0, 0, 0x06)
 
@@ -226,6 +226,25 @@ _mm512_maskz_permutexvar_epi8_wrap(__mmask64 kmask, 
__m512i idx, __m512i a)
 #define PATTERN_DT1Q_IPV4_TCP_KMASK \
 (KMASK_ETHER | (KMASK_DT1Q << 16) | (KMASK_IPV4 << 24) | (KMASK_TCP << 40))
 
+/* Miniflow Strip post-processing masks.
+ * This allows unsetting specific bits from the resulting miniflow. It is used
+ * for e.g. IPv4 where the "DF" bit is never pushed to the miniflow itself.
+ * The NC define is for "No Change", allowing the bits to pass through.
+ */
+#define NC 0xFF
+
+#define PATTERN_STRIP_IPV4_MASK \
+NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, \
+NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, 0xBF, NC, NC, NC,   \
+NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, \
+NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC
+
+#define PATTERN_STRIP_DOT1Q_IPV4_MASK   \
+NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, \
+NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, \
+NC, NC, NC, NC, 0xBF, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC,   \
+NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, NC
+
 /* This union allows initializing static data as u8, but easily loading it
  * into AVX512 registers too. The union ensures proper alignment for the zmm.
  */
@@ -250,8 +269,9 @@ struct mfex_profile {
 union mfex_data probe_mask;
 union mfex_data probe_data;
 
-/* Required for reshaping packet into miniflow. */
+/* Required for reshaping packet into miniflow and post-processing it. */
 union mfex_data store_shuf;
+union mfex_data strip_mask;
 __mmask64 store_kmsk;
 
 /* Constant data to set in mf.bits and dp_packet data on hit. */
@@ -319,6 +339,7 @@ static const struct mfex_profile 
mfex_profiles[PROFILE_COUNT] =
 .probe_data.u8_data = { PATTERN_ETHERTYPE_IPV4 PATTERN_IPV4_UDP},
 
 .store_shuf.u8_data = { PATTERN_IPV4_UDP_SHUFFLE },
+.strip_mask.u8_data = { PATTERN_STRIP_IPV4_MASK },
 .store_kmsk = PATTERN_IPV4_UDP_KMASK,
 
 .mf_bits = { 0x18a0, 0x00040401},
@@ -341,6 +362,7 @@ static const struct mfex_profile 
mfex_profiles[PROFILE_COUNT] =
 },
 
 .store_shuf.u8_data = { PATTERN_IPV4_TCP_SHUFFLE },
+.strip_mask.u8_data = { PATTERN_STRIP_IPV4_MASK },
 .store_kmsk = PATTERN_IPV4_TCP_KMASK,
 
 .mf_bits = { 0x18a0, 0x00044401},
@@ -359,6 +381,7 @@ static const struct mfex_profile 
mfex_profiles[PROFILE_COUNT] =
 },
 
 .store_shuf.u8_da

[ovs-dev] [PATCH v2] dpif-netdev: fix vlan and ipv4 parsing in avx512

2022-01-28 Thread Harry van Haaren

This commit fixes the minimum packet size for the vlan/ipv4/tcp
traffic profile, which was previously incorrectly set.

This commit also disallows any fragmented IPv4 packets from being
matched in the optimized miniflow-extract, avoiding complexity of
handling fragmented packets and using scalar fallback instead.
The DF (don't fragment) bit is now ignored, and stripped from the
resulting miniflow.

Fixes: aa85a25095 ("dpif-netdev/mfex: Add more AVX512 traffic profiles.")

Signed-off-by: Harry van Haaren 

---

v2:
- Fixup the "frag-offset" mask from incorrect value, to ignore DF bit (Eelco)
- The OVS_UNLIKELY() is added as the extra instructions/inline-func-call
  was confusing the compiler here, resulting in slow code. By marking
  the branch as unlikely, the code sequence generated is optimal again, and
  the extra AND instruction has no measurable performance impact.

---
 lib/dpif-netdev-extract-avx512.c | 30 +-
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/lib/dpif-netdev-extract-avx512.c b/lib/dpif-netdev-extract-avx512.c
index d23349482..a35c73510 100644
--- a/lib/dpif-netdev-extract-avx512.c
+++ b/lib/dpif-netdev-extract-avx512.c
@@ -157,7 +157,7 @@ _mm512_maskz_permutexvar_epi8_wrap(__mmask64 kmask, __m512i 
idx, __m512i a)
   0, 0, 0, 0, /* Src IP */  \
   0, 0, 0, 0, /* Dst IP */
 
-#define PATTERN_IPV4_MASK PATTERN_IPV4_GEN(0xFF, 0xFE, 0xFF, 0xFF)
+#define PATTERN_IPV4_MASK PATTERN_IPV4_GEN(0xFF, 0xBF, 0xFF, 0xFF)
 #define PATTERN_IPV4_UDP PATTERN_IPV4_GEN(0x45, 0, 0, 0x11)
 #define PATTERN_IPV4_TCP PATTERN_IPV4_GEN(0x45, 0, 0, 0x06)
 
@@ -389,11 +389,28 @@ static const struct mfex_profile 
mfex_profiles[PROFILE_COUNT] =
 .dp_pkt_offs = {
 14, UINT16_MAX, 18, 38,
 },
-.dp_pkt_min_size = 46,
+.dp_pkt_min_size = 58,
 },
 };
 
 
+/* Static data to strip away the DF bit from an Eth/IPv4 miniflow. */
+static union mfex_data eth_ipv4_df_strip_mask = {
+.u8_data = {
+  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xBF, -1, -1, -1,
+  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+};
+
+static void ALWAYS_INLINE
+mfex_ipv4_strip_df_bit(__m512i v_blk0, __m512i v_df_mask, uint64_t *blocks)
+{
+/* strip away ipv4 DF bit. */
+__m512i v_blk0_df_strip = _mm512_and_si512(v_blk0, v_df_mask);
+_mm512_storeu_si512([2], v_blk0_df_strip);
+}
+
 /* Protocol specific helper functions, for calculating offsets/lenghts. */
 static int32_t
 mfex_ipv4_set_l2_pad_size(struct dp_packet *pkt, struct ip_header *nh,
@@ -471,6 +488,7 @@ mfex_avx512_process(struct dp_packet_batch *packets,
 __m512i v_vals = _mm512_loadu_si512(>probe_data);
 __m512i v_mask = _mm512_loadu_si512(>probe_mask);
 __m512i v_shuf = _mm512_loadu_si512(>store_shuf);
+__m512i v_ipv4_df_mask = _mm512_loadu_si512(_ipv4_df_strip_mask);
 
 __mmask64 k_shuf = profile->store_kmsk;
 __m128i v_bits = _mm_loadu_si128((void *) >mf_bits);
@@ -498,7 +516,7 @@ mfex_avx512_process(struct dp_packet_batch *packets,
 
 __m512i v_pkt0_masked = _mm512_and_si512(v_pkt0, v_mask);
 __mmask64 k_cmp = _mm512_cmpeq_epi8_mask(v_pkt0_masked, v_vals);
-if (k_cmp != UINT64_MAX) {
+if (OVS_UNLIKELY(k_cmp != UINT64_MAX)) {
 continue;
 }
 
@@ -526,8 +544,6 @@ mfex_avx512_process(struct dp_packet_batch *packets,
 v_blk0 = _mm512_maskz_permutex2var_epi8_skx(k_shuf, v_pkt0,
 v_shuf, v512_zeros);
 }
-_mm512_storeu_si512([2], v_blk0);
-
 
 /* Perform "post-processing" per profile, handling details not easily
  * handled in the above generic AVX512 code. Examples include TCP flag
@@ -539,6 +555,7 @@ mfex_avx512_process(struct dp_packet_batch *packets,
 break;
 
 case PROFILE_ETH_VLAN_IPV4_TCP: {
+mfex_ipv4_strip_df_bit(v_blk0, v_ipv4_df_mask, blocks);
 mfex_vlan_pcp(pkt[14], [i].buf[4]);
 
 uint32_t size_from_ipv4 = size - VLAN_ETH_HEADER_LEN;
@@ -554,6 +571,7 @@ mfex_avx512_process(struct dp_packet_batch *packets,
 } break;
 
 case PROFILE_ETH_VLAN_IPV4_UDP: {
+mfex_ipv4_strip_df_bit(v_blk0, v_ipv4_df_mask, blocks);
 mfex_vlan_pcp(pkt[14], [i].buf[4]);
 
 uint32_t size_from_ipv4 = size - VLAN_ETH_HEADER_LEN;
@@ -565,6 +583,7 @@ mfex_avx512_process(struct dp_packet_batch *packets,
 } break;
 
 case PROFILE_ETH_IPV4_TCP: {
+mfex_ipv4_strip_df_bit(v_blk0, v_ipv4_df_mask, blocks);
 /* Process TCP flags, and store to blocks. */

[ovs-dev] [PATCH] dpif-netdev: fix handling of vlan and ipv4 parsing in avx512

2022-01-12 Thread Harry van Haaren

This commit fixes the minimum packet size for the vlan/ipv4/tcp
traffic profile, which was previously incorrectly set.

This commit also disallows any fragmented IPv4 packets from being
matched in the optimized miniflow-extract, avoiding complexity of
handling fragmented packets and using scalar fallback instead.

Fixes: aa85a25095 ("dpif-netdev/mfex: Add more AVX512 traffic profiles.")

Signed-off-by: Harry van Haaren 

---

This patch should be applied to 2.16 as well. I expect it applies cleanly, but
volunteer to rebase/fixup on 2.16 release and send new patch if required.

---

 lib/dpif-netdev-extract-avx512.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/dpif-netdev-extract-avx512.c b/lib/dpif-netdev-extract-avx512.c
index d23349482..7b21a3af9 100644
--- a/lib/dpif-netdev-extract-avx512.c
+++ b/lib/dpif-netdev-extract-avx512.c
@@ -157,7 +157,7 @@ _mm512_maskz_permutexvar_epi8_wrap(__mmask64 kmask, __m512i 
idx, __m512i a)
   0, 0, 0, 0, /* Src IP */  \
   0, 0, 0, 0, /* Dst IP */
 
-#define PATTERN_IPV4_MASK PATTERN_IPV4_GEN(0xFF, 0xFE, 0xFF, 0xFF)
+#define PATTERN_IPV4_MASK PATTERN_IPV4_GEN(0xFF, 0xFF, 0xFF, 0xFF)
 #define PATTERN_IPV4_UDP PATTERN_IPV4_GEN(0x45, 0, 0, 0x11)
 #define PATTERN_IPV4_TCP PATTERN_IPV4_GEN(0x45, 0, 0, 0x06)
 
@@ -389,7 +389,7 @@ static const struct mfex_profile 
mfex_profiles[PROFILE_COUNT] =
 .dp_pkt_offs = {
 14, UINT16_MAX, 18, 38,
 },
-.dp_pkt_min_size = 46,
+.dp_pkt_min_size = 58,
 },
 };
 
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v5 2/2] dpif-netdev/mfex: Optimize packet hash and enable autovalidator

2022-01-12 Thread Harry van Haaren

From: Kumar Amber 

This patch adds error checking of packet hashes to the mfex
autovalidator infrastructure, ensuring that hashes calculated by
optimized mfex implementations are identical to those of the scalar code.

This patch avoids calculating the software hash of the packet again
if the optimized miniflow-extract hit and has already calculated the
packet hash. In cases of scalar miniflow extract, the normal hashing
calculation is performed.

Signed-off-by: Kumar Amber 
Signed-off-by: Harry van Haaren 

---

v5:
- Always use SW hashing to validate optimized hash implementations
---
 lib/dpif-netdev-avx512.c  |  6 +++---
 lib/dpif-netdev-private-extract.c | 19 +++
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
index b7131ba3f..c68b79f6b 100644
--- a/lib/dpif-netdev-avx512.c
+++ b/lib/dpif-netdev-avx512.c
@@ -212,15 +212,15 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
*pmd,
 if (!mfex_hit) {
 /* Do a scalar miniflow extract into keys. */
 miniflow_extract(packet, >mf);
+key->len = netdev_flow_key_size(miniflow_n_values(>mf));
+key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet,
+ >mf);
 }
 
 /* Cache TCP and byte values for all packets. */
 pkt_meta[i].bytes = dp_packet_size(packet);
 pkt_meta[i].tcp_flags = miniflow_get_tcp_flags(>mf);
 
-key->len = netdev_flow_key_size(miniflow_n_values(>mf));
-key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet, >mf);
-
 if (emc_enabled) {
 f = emc_lookup(>emc_cache, key);
 
diff --git a/lib/dpif-netdev-private-extract.c 
b/lib/dpif-netdev-private-extract.c
index a29bdcfa7..2957c0172 100644
--- a/lib/dpif-netdev-private-extract.c
+++ b/lib/dpif-netdev-private-extract.c
@@ -252,8 +252,15 @@ dpif_miniflow_extract_autovalidator(struct dp_packet_batch 
*packets,
 
 /* Run scalar miniflow_extract to get default result. */
 DP_PACKET_BATCH_FOR_EACH (i, packet, packets) {
+
+/* remove the NIC RSS bit to force SW hashing for validation. */
+dp_packet_reset_offload(packet);
+
 pkt_metadata_init(>md, in_port);
 miniflow_extract(packet, [i].mf);
+keys[i].len = netdev_flow_key_size(miniflow_n_values([i].mf));
+keys[i].hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet,
+[i].mf);
 
 /* Store known good metadata to compare with optimized metadata. */
 good_l2_5_ofs[i] = packet->l2_5_ofs;
@@ -271,7 +278,10 @@ dpif_miniflow_extract_autovalidator(struct dp_packet_batch 
*packets,
 /* Reset keys and offsets before each implementation. */
 memset(test_keys, 0, keys_size * sizeof(struct netdev_flow_key));
 DP_PACKET_BATCH_FOR_EACH (i, packet, packets) {
+/* Ensure offsets is set by the opt impl. */
 dp_packet_reset_offsets(packet);
+/* Ensure packet hash is re-calculated by opt impl. */
+dp_packet_reset_offload(packet);
 }
 /* Call optimized miniflow for each batch of packet. */
 uint32_t hit_mask = mfex_impls[j].extract_func(packets, test_keys,
@@ -303,6 +313,15 @@ dpif_miniflow_extract_autovalidator(struct dp_packet_batch 
*packets,
 failed = 1;
 }
 
+/* Check hashes are equal. */
+if ((keys[i].hash != test_keys[i].hash) ||
+(keys[i].len != test_keys[i].len)) {
+ds_put_format(_msg, "Good hash: %d len: %d\tTest hash:%d"
+  " len:%d\n", keys[i].hash, keys[i].len,
+  test_keys[i].hash, test_keys[i].len);
+failed = 1;
+}
+
 if (!miniflow_equal([i].mf, _keys[i].mf)) {
 uint32_t block_cnt = miniflow_n_values([i].mf);
 uint32_t test_block_cnt = miniflow_n_values(_keys[i].mf);
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v5 0/2] MFEX Hashing Optimizations

2022-01-12 Thread Harry van Haaren

Following from the MFEX Optimizations IPv6 + Hashing patchset,
https://patchwork.ozlabs.org/project/openvswitch/list/?series=275590

This patchset introduces the optimization as described at OVS Conference;
https://www.openvswitch.org/support/ovscon2021/#T32
https://youtu.be/X_uPybauF3g?list=PLaJlRa-xItwARDGAUp7lXviOgOhcRxSU-=976

The optimizations allow for simpler compute to hash the packet data, and
the mfex autovalidator is updated to compare resulting hash values. This
ensures that the hash values from optimized and scalar hashing functions
are always identical.

v5:
- Force autovalidator to always calculate and validate hash value.
- Rename "len" variable in mfex profile to describe its use better.

See here for previous versions of this patchset;
https://patchwork.ozlabs.org/project/openvswitch/cover/20211207110425.3873101-1-kumar.am...@intel.com/


Kumar Amber (2):
  dpif-netdev/mfex: Add ipv4 profile based hashing
  dpif-netdev/mfex: Optimize packet hash and enable autovalidator

 NEWS  |  2 +-
 lib/dpif-netdev-avx512.c  |  6 +--
 lib/dpif-netdev-extract-avx512.c  | 65 +++
 lib/dpif-netdev-private-extract.c | 19 +
 4 files changed, 88 insertions(+), 4 deletions(-)

-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v5 1/2] dpif-netdev/mfex: Add ipv4 profile based hashing

2022-01-12 Thread Harry van Haaren

From: Kumar Amber 

This commit adds IPv4 profile specific hashing which
uses fixed offsets into the packet to improve hashing
performance.

Signed-off-by: Kumar Amber 
Signed-off-by: Harry van Haaren 
Co-authored-by: Harry van Haaren 

---

v5:
- Rename "hash_len" to "key_len" to describe its use better.
---
 NEWS |  2 +-
 lib/dpif-netdev-extract-avx512.c | 65 
 2 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/NEWS b/NEWS
index afef81b40..e70c968a6 100644
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,7 @@ Post-v2.16.0
- Userspace datapath:
  * Optimized flow lookups for datapath flows with simple match criteria.
See 'Simple Match Lookup' in Documentation/topics/dpdk/bridge.rst.
+ * Add IPv4 profile based 5tuple hashing optimizations.
- DPDK:
  * EAL argument --socket-mem is no longer configured by default upon
start-up.  If dpdk-socket-mem and dpdk-alloc-mem are not specified,
@@ -38,7 +39,6 @@ Post-v2.16.0
now dp_hash.  Previously this was limited to 64 buckets.  This change
is mainly for the benefit of OVN load balancing configurations.
 
-
 v2.16.0 - 16 Aug 2021
 -
- Removed support for 1024-bit Diffie-Hellman key exchange, which is now
diff --git a/lib/dpif-netdev-extract-avx512.c b/lib/dpif-netdev-extract-avx512.c
index d23349482..64b1c29cb 100644
--- a/lib/dpif-netdev-extract-avx512.c
+++ b/lib/dpif-netdev-extract-avx512.c
@@ -258,6 +258,10 @@ struct mfex_profile {
 uint64_t mf_bits[FLOWMAP_UNITS];
 uint16_t dp_pkt_offs[4];
 uint16_t dp_pkt_min_size;
+
+/* Constant data offsets for Hashing. */
+uint8_t hash_pkt_offs[6];
+uint32_t key_len;
 };
 
 /* Ensure dp_pkt_offs[4] is the correct size as in struct dp_packet. */
@@ -307,6 +311,13 @@ enum MFEX_PROFILES {
 PROFILE_COUNT,
 };
 
+/* Packet offsets for 5 tuple Hash function. */
+#define HASH_IPV4 \
+26, 30, 23, 34, 0, 0
+
+#define HASH_DT1Q_IPV4 \
+30, 34, 27, 38, 0, 0
+
 /* Static const instances of profiles. These are compile-time constants,
  * and are specialized into individual miniflow-extract functions.
  * NOTE: Order of the fields is significant, any change in the order must be
@@ -326,6 +337,9 @@ static const struct mfex_profile 
mfex_profiles[PROFILE_COUNT] =
 0, UINT16_MAX, 14, 34,
 },
 .dp_pkt_min_size = 42,
+
+.hash_pkt_offs = { HASH_IPV4 },
+.key_len = 72,
 },
 
 [PROFILE_ETH_IPV4_TCP] = {
@@ -348,6 +362,9 @@ static const struct mfex_profile 
mfex_profiles[PROFILE_COUNT] =
 0, UINT16_MAX, 14, 34,
 },
 .dp_pkt_min_size = 54,
+
+.hash_pkt_offs = { HASH_IPV4 },
+.key_len = 80,
 },
 
 [PROFILE_ETH_VLAN_IPV4_UDP] = {
@@ -366,6 +383,9 @@ static const struct mfex_profile 
mfex_profiles[PROFILE_COUNT] =
 14, UINT16_MAX, 18, 38,
 },
 .dp_pkt_min_size = 46,
+
+.hash_pkt_offs = { HASH_DT1Q_IPV4 },
+.key_len = 80,
 },
 
 [PROFILE_ETH_VLAN_IPV4_TCP] = {
@@ -390,10 +410,40 @@ static const struct mfex_profile 
mfex_profiles[PROFILE_COUNT] =
 14, UINT16_MAX, 18, 38,
 },
 .dp_pkt_min_size = 46,
+
+.hash_pkt_offs = { HASH_DT1Q_IPV4 },
+.key_len = 88,
 },
 };
 
 
+static inline void
+mfex_5tuple_hash_ipv4(struct dp_packet *packet, const uint8_t *pkt,
+  struct netdev_flow_key *key,
+  const uint8_t *pkt_offsets)
+{
+if (!dp_packet_rss_valid(packet)) {
+uint32_t hash = 0;
+void *ipv4_src = (void *) [pkt_offsets[0]];
+void *ipv4_dst = (void *) [pkt_offsets[1]];
+void *ports_l4 = (void *) [pkt_offsets[3]];
+
+/* IPv4 Src and Dst. */
+hash = hash_add(hash, *(uint32_t *) ipv4_src);
+hash = hash_add(hash, *(uint32_t *) ipv4_dst);
+/* IPv4 proto. */
+hash = hash_add(hash, pkt[pkt_offsets[2]]);
+/* L4 ports. */
+hash = hash_add(hash, *(uint32_t *) ports_l4);
+hash = hash_finish(hash, 42);
+
+dp_packet_set_rss_hash(packet, hash);
+key->hash = hash;
+} else {
+key->hash = dp_packet_get_rss_hash(packet);
+}
+}
+
 /* Protocol specific helper functions, for calculating offsets/lenghts. */
 static int32_t
 mfex_ipv4_set_l2_pad_size(struct dp_packet *pkt, struct ip_header *nh,
@@ -551,6 +601,10 @@ mfex_avx512_process(struct dp_packet_batch *packets,
 /* Process TCP flags, and store to blocks. */
 const struct tcp_header *tcp = (void *)[38];
 mfex_handle_tcp_flags(tcp, [7]);
+
+mfex_5tuple_hash_ipv4(packet, pkt, [i],
+  profile->hash_pkt_offs);
+keys[i].len = profile->key_len;
 } break;
 
 case PROFILE_ETH_VLAN_IPV4_UDP: {
@@ -562,6 +616,10 @@ mfex_avx512_process

[ovs-dev] [PATCH] dpif-netdev: improve loading of packet data for undersized packets

2022-01-06 Thread Harry van Haaren

This commit improves handling of packets where the allocated memory
is less than 64 bytes. In the DPDK datapath this never matters, as
an mbuf always pre-allocates enough space; however, this can occur in
test environments such as the dummy netdev. The fix is required to
ensure ASAN-enabled builds don't report errors when testing this, hence
the fix is valuable.

The solution implemented uses a mask-to-zero if the available buffer
size is less than 64 bytes, and a branch for which type of load is used.

Fixes: 250ceddcc2d0 ("dpif-netdev/mfex: Add AVX512 based optimized miniflow 
extract")

Reported-by: Ilya Maximets 
Signed-off-by: Harry van Haaren 
---
 lib/dpif-netdev-extract-avx512.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/lib/dpif-netdev-extract-avx512.c b/lib/dpif-netdev-extract-avx512.c
index e060ab14a..d23349482 100644
--- a/lib/dpif-netdev-extract-avx512.c
+++ b/lib/dpif-netdev-extract-avx512.c
@@ -488,7 +488,14 @@ mfex_avx512_process(struct dp_packet_batch *packets,
 
 /* Load packet data and probe with AVX512 mask & compare. */
 const uint8_t *pkt = dp_packet_data(packet);
-__m512i v_pkt0 = _mm512_loadu_si512(pkt);
+__m512i v_pkt0;
+if (size >= 64) {
+v_pkt0 = _mm512_loadu_si512(pkt);
+} else {
+uint64_t load_kmask = (1ULL << size) - 1;
+v_pkt0 = _mm512_maskz_loadu_epi8(load_kmask, pkt);
+}
+
 __m512i v_pkt0_masked = _mm512_and_si512(v_pkt0, v_mask);
 __mmask64 k_cmp = _mm512_cmpeq_epi8_mask(v_pkt0_masked, v_vals);
 if (k_cmp != UINT64_MAX) {
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v2 2/2] dpif-netdev: fix memory leak in dpif and mfex commands

2021-08-12 Thread Harry van Haaren

This patch fixes a memory leak in the commands for DPIF and MFEX
get and set. In order to operate, the commands require a pmd_list,
which is currently not freed after it has been used. This issue
was identified by a static analysis tool.

Fixes: 3d8f47bc ("dpif-netdev: Add command line and function pointer for 
miniflow extract")
Fixes: abb807e2 ("dpif-netdev: Add command to switch dpif implementation.")

Signed-off-by: Harry van Haaren 

---

Maintainers; these mem leaks were introduced in this release,
so are not candidates for backporting.
---
 lib/dpif-netdev.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 9e0d5c3103..37a5839684 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -1025,6 +1025,7 @@ dpif_netdev_impl_get(struct unixctl_conn *conn, int argc 
OVS_UNUSED,
  * thread. */
 sorted_poll_thread_list(dp, _list, );
 dp_netdev_impl_get(, pmd_list, n);
+free(pmd_list);
 }
 ovs_mutex_unlock(_netdev_mutex);
 unixctl_command_reply(conn, ds_cstr());
@@ -1079,6 +1080,8 @@ dpif_netdev_impl_set(struct unixctl_conn *conn, int argc 
OVS_UNUSED,
 atomic_uintptr_t *pmd_func = (void *) >netdev_input_func;
 atomic_store_relaxed(pmd_func, (uintptr_t) default_func);
 };
+
+free(pmd_list);
 }
 ovs_mutex_unlock(_netdev_mutex);
 
@@ -1109,6 +1112,7 @@ dpif_miniflow_extract_impl_get(struct unixctl_conn *conn, 
int argc OVS_UNUSED,
  * thread. */
 sorted_poll_thread_list(dp, _list, );
 dp_mfex_impl_get(, pmd_list, n);
+free(pmd_list);
 }
 ovs_mutex_unlock(_netdev_mutex);
 unixctl_command_reply(conn, ds_cstr());
@@ -1267,6 +1271,8 @@ dpif_miniflow_extract_impl_set(struct unixctl_conn *conn, 
int argc,
 atomic_uintptr_t *pmd_func = (void *) >miniflow_extract_opt;
 atomic_store_relaxed(pmd_func, (uintptr_t) mfex_func);
 };
+
+free(pmd_list);
 }
 
 ovs_mutex_unlock(_netdev_mutex);
-- 
2.30.2

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v2 1/2] dpif-netdev: fix memory leak in dpcls subtable set command

2021-08-12 Thread Harry van Haaren

This patch fixes a memory leak when the command
"dpif-netdev/subtable-lookup-prio-set" is run: the pmd_list
required to iterate the PMD threads was not being freed.
This issue was identified by a static analysis tool.

Fixes: 3d018c3e ("dpif-netdev: Add subtable lookup prio set command.")

Signed-off-by: Harry van Haaren 

---

Maintainers, please consider applying this patch to 2.14 and 2.15,
which are expected to have the same issue as fixed here.
---
 lib/dpif-netdev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 03f460c7d1..9e0d5c3103 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -994,6 +994,7 @@ dpif_netdev_subtable_lookup_set(struct unixctl_conn *conn, 
int argc OVS_UNUSED,
 
 /* release port mutex before netdev mutex. */
 ovs_mutex_unlock(>port_mutex);
+free(pmd_list);
 }
 ovs_mutex_unlock(_netdev_mutex);
 
-- 
2.30.2

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH] dpif: fix memory leak of pmd_list after usage

2021-08-12 Thread Harry van Haaren

This commit fixes a memory leak when a pmd_list is retrieved
from the sorted_poll_thread_list() function. Inside the function,
the pmd list is allocated, but it was not freed once no longer
required for the command functionality. These leaks were found
by a static analysis tool.

Fixes: 3d8f47bc04 ("dpif-netdev: Add command line and function pointer for 
miniflow extract")
Fixes: abb807e27d ("dpif-netdev: Add command to switch dpif implementation.")
Fixes: 3d018c3ea7 ("dpif-netdev: Add subtable lookup prio set command.")

Signed-off-by: Harry van Haaren 

---

 lib/dpif-netdev.c | 18 +-
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 03f460c7d1..99779bb402 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -960,12 +960,13 @@ dpif_netdev_subtable_lookup_set(struct unixctl_conn 
*conn, int argc OVS_UNUSED,
 }
 
 ovs_mutex_lock(_netdev_mutex);
+
+struct dp_netdev_pmd_thread **pmd_list = NULL;
 SHASH_FOR_EACH (node, _netdevs) {
 struct dp_netdev *dp = node->data;
 
 /* Get PMD threads list, required to get DPCLS instances. */
 size_t n;
-struct dp_netdev_pmd_thread **pmd_list;
 sorted_poll_thread_list(dp, _list, );
 
 /* take port mutex as HMAP iters over them. */
@@ -996,6 +997,7 @@ dpif_netdev_subtable_lookup_set(struct unixctl_conn *conn, 
int argc OVS_UNUSED,
 ovs_mutex_unlock(>port_mutex);
 }
 ovs_mutex_unlock(_netdev_mutex);
+free(pmd_list);
 
 struct ds reply = DS_EMPTY_INITIALIZER;
 ds_put_format(,
@@ -1013,10 +1015,10 @@ dpif_netdev_impl_get(struct unixctl_conn *conn, int 
argc OVS_UNUSED,
 {
 struct ds reply = DS_EMPTY_INITIALIZER;
 struct shash_node *node;
+struct dp_netdev_pmd_thread **pmd_list = NULL;
 
 ovs_mutex_lock(_netdev_mutex);
 SHASH_FOR_EACH (node, _netdevs) {
-struct dp_netdev_pmd_thread **pmd_list;
 struct dp_netdev *dp = node->data;
 size_t n;
 
@@ -1026,6 +1028,8 @@ dpif_netdev_impl_get(struct unixctl_conn *conn, int argc 
OVS_UNUSED,
 dp_netdev_impl_get(, pmd_list, n);
 }
 ovs_mutex_unlock(_netdev_mutex);
+free(pmd_list);
+
 unixctl_command_reply(conn, ds_cstr());
 ds_destroy();
 }
@@ -1058,12 +1062,12 @@ dpif_netdev_impl_set(struct unixctl_conn *conn, int 
argc OVS_UNUSED,
 return;
 }
 
+struct dp_netdev_pmd_thread **pmd_list = NULL;
 SHASH_FOR_EACH (node, _netdevs) {
 struct dp_netdev *dp = node->data;
 
 /* Get PMD threads list, required to get DPCLS instances. */
 size_t n;
-struct dp_netdev_pmd_thread **pmd_list;
 sorted_poll_thread_list(dp, _list, );
 
 for (size_t i = 0; i < n; i++) {
@@ -1080,6 +1084,7 @@ dpif_netdev_impl_set(struct unixctl_conn *conn, int argc 
OVS_UNUSED,
 };
 }
 ovs_mutex_unlock(_netdev_mutex);
+free(pmd_list);
 
 /* Reply with success to command. */
 struct ds reply = DS_EMPTY_INITIALIZER;
@@ -1099,8 +1104,8 @@ dpif_miniflow_extract_impl_get(struct unixctl_conn *conn, 
int argc OVS_UNUSED,
 struct shash_node *node;
 
 ovs_mutex_lock(_netdev_mutex);
+struct dp_netdev_pmd_thread **pmd_list = NULL;
 SHASH_FOR_EACH (node, _netdevs) {
-struct dp_netdev_pmd_thread **pmd_list;
 struct dp_netdev *dp = node->data;
 size_t n;
 
@@ -1110,6 +1115,8 @@ dpif_miniflow_extract_impl_get(struct unixctl_conn *conn, 
int argc OVS_UNUSED,
 dp_mfex_impl_get(, pmd_list, n);
 }
 ovs_mutex_unlock(_netdev_mutex);
+free(pmd_list);
+
 unixctl_command_reply(conn, ds_cstr());
 ds_destroy();
 }
@@ -1243,8 +1250,8 @@ dpif_miniflow_extract_impl_set(struct unixctl_conn *conn, 
int argc,
  */
 ovs_mutex_lock(_netdev_mutex);
 
+struct dp_netdev_pmd_thread **pmd_list = NULL;
 SHASH_FOR_EACH (node, _netdevs) {
-struct dp_netdev_pmd_thread **pmd_list;
 struct dp_netdev *dp = node->data;
 size_t n;
 
@@ -1269,6 +1276,7 @@ dpif_miniflow_extract_impl_set(struct unixctl_conn *conn, 
int argc,
 }
 
 ovs_mutex_unlock(_netdev_mutex);
+free(pmd_list);
 
 /* If PMD thread was specified, but it wasn't found, return error. */
 if (pmd_thread_to_change != NON_PMD_CORE_ID && !pmd_thread_update_done) {
-- 
2.30.2

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH] dpcls: fix build on compilers without AVX512-VPOPCNT

2021-07-29 Thread Harry van Haaren

This commit adds extra checks around the AVX-512 vpopcnt instruction
enabling, ensuring that in the function where the ISA is enabled the
compiler has also indicated its support for the ISA. This is achieved
by checking the __AVX512VPOPCNTDQ__ define, which the compiler sets if
it is capable of handling the vpopcnt instruction.

If the compiler is not capable of handling vpopcnt, we fall back to
the emulated vpopcnt implementation.

Reported-by: Ian Stokes 
Signed-off-by: Harry van Haaren 

---

On a very old system with GCC 7, an issue was identified where
the compiler does not support the vpopcnt ISA, which resulted
in compilation failures.

---
 lib/dpif-netdev-lookup-avx512-gather.c | 26 +-
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/lib/dpif-netdev-lookup-avx512-gather.c 
b/lib/dpif-netdev-lookup-avx512-gather.c
index ced846aa77..072831e96a 100644
--- a/lib/dpif-netdev-lookup-avx512-gather.c
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -53,15 +53,6 @@
 
 VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
 
-
-/* Wrapper function required to enable ISA. */
-static inline __m512i
-__attribute__((__target__("avx512vpopcntdq")))
-_mm512_popcnt_epi64_wrapper(__m512i v_in)
-{
-return _mm512_popcnt_epi64(v_in);
-}
-
 static inline __m512i
 _mm512_popcnt_epi64_manual(__m512i v_in)
 {
@@ -85,6 +76,23 @@ _mm512_popcnt_epi64_manual(__m512i v_in)
 return _mm512_sad_epu8(v_u8_pop, _mm512_setzero_si512());
 }
 
+/* Wrapper function required to enable ISA. First enable the ISA via the
+ * attribute target for this function, then check if the compiler actually
+ * #defines the ISA itself. If the ISA is not #define-ed by the compiler it
+ * indicates the compiler is too old or is not capable of compiling the
+ * requested ISA level, so fallback to the integer manual implementation.
+ */
+static inline __m512i
+__attribute__((__target__("avx512vpopcntdq")))
+_mm512_popcnt_epi64_wrapper(__m512i v_in)
+{
+#ifdef __AVX512VPOPCNTDQ__
+return _mm512_popcnt_epi64(v_in);
+#else
+return _mm512_popcnt_epi64_manual(v_in);
+#endif
+}
+
 static inline uint64_t
 netdev_rule_matches_key(const struct dpcls_rule *rule,
 const uint32_t mf_bits_total,
-- 
2.30.2

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v16 08/10] dpif-netdev/dpcls: Specialize more subtable signatures.

2021-07-09 Thread Harry van Haaren

This commit adds more subtables to be specialized. The traffic
pattern here being matched is VXLAN traffic subtables, which commonly
have (5,3), (9,1) and (9,4) subtable fingerprints.

Signed-off-by: Harry van Haaren 
Acked-by: Flavio Leitner 

---

v14:
- Added Flavio's Acked-by tag.

v8: Add NEWS entry.
---
 NEWS   | 2 ++
 lib/dpif-netdev-lookup-avx512-gather.c | 6 ++
 lib/dpif-netdev-lookup-generic.c   | 6 ++
 3 files changed, 14 insertions(+)

diff --git a/NEWS b/NEWS
index 3d9255f212..57bc87b44e 100644
--- a/NEWS
+++ b/NEWS
@@ -27,6 +27,8 @@ Post-v2.15.0
  * Add a partial HWOL PMD statistic counting hits similar to existing
EMC/SMC/DPCLS stats.
  * Enable AVX512 optimized DPCLS to search subtables with larger miniflows.
+ * Add more specialized DPCLS subtables to cover common rules, enhancing
+   the lookup performance.
- ovs-ctl:
  * New option '--no-record-hostname' to disable hostname configuration
in ovsdb on startup.
diff --git a/lib/dpif-netdev-lookup-avx512-gather.c 
b/lib/dpif-netdev-lookup-avx512-gather.c
index f1b320bb6e..0b51ef9dce 100644
--- a/lib/dpif-netdev-lookup-avx512-gather.c
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -314,6 +314,9 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,
 return avx512_lookup_impl(subtable, keys_map, keys, rules, U0, U1);   \
 } \
 
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 4)
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 1)
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 3)
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1)
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 1)
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 0)
@@ -346,6 +349,9 @@ dpcls_subtable_avx512_gather_probe(uint32_t u0_bits, 
uint32_t u1_bits)
 return NULL;
 }
 
+CHECK_LOOKUP_FUNCTION(9, 4);
+CHECK_LOOKUP_FUNCTION(9, 1);
+CHECK_LOOKUP_FUNCTION(5, 3);
 CHECK_LOOKUP_FUNCTION(5, 1);
 CHECK_LOOKUP_FUNCTION(4, 1);
 CHECK_LOOKUP_FUNCTION(4, 0);
diff --git a/lib/dpif-netdev-lookup-generic.c b/lib/dpif-netdev-lookup-generic.c
index e3b6be4b68..6c74ac3a1b 100644
--- a/lib/dpif-netdev-lookup-generic.c
+++ b/lib/dpif-netdev-lookup-generic.c
@@ -282,6 +282,9 @@ dpcls_subtable_lookup_generic(struct dpcls_subtable 
*subtable,
 return lookup_generic_impl(subtable, keys_map, keys, rules, U0, U1);  \
 } \
 
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 4)
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 1)
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 3)
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1)
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 1)
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 0)
@@ -303,6 +306,9 @@ dpcls_subtable_generic_probe(uint32_t u0_bits, uint32_t 
u1_bits)
 {
 dpcls_subtable_lookup_func f = NULL;
 
+CHECK_LOOKUP_FUNCTION(9, 4);
+CHECK_LOOKUP_FUNCTION(9, 1);
+CHECK_LOOKUP_FUNCTION(5, 3);
 CHECK_LOOKUP_FUNCTION(5, 1);
 CHECK_LOOKUP_FUNCTION(4, 1);
 CHECK_LOOKUP_FUNCTION(4, 0);
-- 
2.30.2

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v16 10/10] dpcls-avx512: Enable avx512 vector popcount instruction.

2021-07-09 Thread Harry van Haaren

This commit enables the AVX512-VPOPCNTDQ Vector Popcount
instruction. This instruction is not available on every CPU
that supports the AVX512-F Foundation ISA, hence it is enabled
only when the additional VPOPCNTDQ ISA check is passed.

The vector popcount instruction is used instead of the AVX512
popcount emulation code present in the avx512 optimized DPCLS today.
It provides higher performance in the SIMD miniflow processing
as that requires the popcount to calculate the miniflow block indexes.

Signed-off-by: Harry van Haaren 
Acked-by: Flavio Leitner 

---

v14:
- Added Flavio's Acked-by tag.

v13:
- Rebased and Improved comment on use_vpop variable (Ian)
---
 NEWS   |  3 +
 lib/dpdk.c |  1 +
 lib/dpif-netdev-lookup-avx512-gather.c | 85 --
 3 files changed, 71 insertions(+), 18 deletions(-)

diff --git a/NEWS b/NEWS
index 57bc87b44e..6cdccc715d 100644
--- a/NEWS
+++ b/NEWS
@@ -29,6 +29,9 @@ Post-v2.15.0
  * Enable AVX512 optimized DPCLS to search subtables with larger miniflows.
  * Add more specialized DPCLS subtables to cover common rules, enhancing
the lookup performance.
+ * Enable the AVX512 DPCLS implementation to use VPOPCNT instruction if the
+   CPU supports it. This enhances performance by using the native vpopcount
+   instructions, instead of the emulated version of vpopcount.
- ovs-ctl:
  * New option '--no-record-hostname' to disable hostname configuration
in ovsdb on startup.
diff --git a/lib/dpdk.c b/lib/dpdk.c
index 8807de54a1..9de2af58e1 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -706,6 +706,7 @@ dpdk_get_cpu_has_isa(const char *arch, const char *feature)
 #if __x86_64__
 /* CPU flags only defined for the architecture that support it. */
 CHECK_CPU_FEATURE(feature, "avx512f", RTE_CPUFLAG_AVX512F);
+CHECK_CPU_FEATURE(feature, "avx512vpopcntdq", RTE_CPUFLAG_AVX512VPOPCNTDQ);
 CHECK_CPU_FEATURE(feature, "bmi2", RTE_CPUFLAG_BMI2);
 #endif
 
diff --git a/lib/dpif-netdev-lookup-avx512-gather.c 
b/lib/dpif-netdev-lookup-avx512-gather.c
index 0b51ef9dce..bc359dc4ac 100644
--- a/lib/dpif-netdev-lookup-avx512-gather.c
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -53,6 +53,15 @@
 
 VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
 
+
+/* Wrapper function required to enable ISA. */
+static inline __m512i
+__attribute__((__target__("avx512vpopcntdq")))
+_mm512_popcnt_epi64_wrapper(__m512i v_in)
+{
+return _mm512_popcnt_epi64(v_in);
+}
+
 static inline __m512i
 _mm512_popcnt_epi64_manual(__m512i v_in)
 {
@@ -131,6 +140,7 @@ netdev_rule_matches_key(const struct dpcls_rule *rule,
  *   pkt_mf_u0_pop: population count of bits in u0 of the packet.
  *   zero_mask: bitmask of lanes to zero as packet doesn't have mf bits set.
  *   u64_lanes_mask: bitmask of lanes to process.
+ *   use_vpop: compile-time constant indicating if VPOPCNT instruction allowed.
  */
 static inline ALWAYS_INLINE __m512i
 avx512_blocks_gather(__m512i v_u0,
@@ -141,7 +151,8 @@ avx512_blocks_gather(__m512i v_u0,
  __mmask64 u1_bcast_msk,
  const uint64_t pkt_mf_u0_pop,
  __mmask64 zero_mask,
- __mmask64 u64_lanes_mask)
+ __mmask64 u64_lanes_mask,
+ const uint32_t use_vpop)
 {
 /* Suggest to compiler to load tbl blocks ahead of gather(). */
 __m512i v_tbl_blocks = _mm512_maskz_loadu_epi64(u64_lanes_mask,
@@ -155,8 +166,15 @@ avx512_blocks_gather(__m512i v_u0,
   tbl_mf_masks);
 __m512i v_masks = _mm512_and_si512(v_pkt_bits, v_tbl_masks);
 
-/* Manual AVX512 popcount for u64 lanes. */
-__m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
+/* Calculate AVX512 popcount for u64 lanes using the native instruction
+ * if available, or using emulation if not available.
+ */
+__m512i v_popcnts;
+if (use_vpop) {
+v_popcnts = _mm512_popcnt_epi64_wrapper(v_masks);
+} else {
+v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
+}
 
 /* Add popcounts and offset for u1 bits. */
 __m512i v_idx_u0_offset = _mm512_maskz_set1_epi64(u1_bcast_msk,
@@ -181,7 +199,8 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,
const struct netdev_flow_key *keys[],
struct dpcls_rule **rules,
const uint32_t bit_count_u0,
-   const uint32_t bit_count_u1)
+   const uint32_t bit_count_u1,
+   const uint32_t use_vpop)
 {
 OVS_ALIGNED_VAR(CACHE_LINE_SIZE)uint64_t block_cache[BLOCKS_CACHE_SIZE];
 uint32_t hashes[NETDEV_MAX_BURST];
@@ -233,7 +252,8 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,

[ovs-dev] [PATCH v16 09/10] dpdk: Cache result of CPU ISA checks.

2021-07-09 Thread Harry van Haaren

As a small optimization, this patch caches the result of a CPU ISA
check from DPDK. Particularly in the case of running the DPCLS
autovalidator (which repeatedly probes subtables) this reduces
the amount of CPU ISA lookups from the DPDK level.

By caching them at the OVS/dpdk.c level, the ISA checks remain
runtime for the CPU where they are executed, but subsequent checks
for the same ISA feature become much cheaper.

Signed-off-by: Harry van Haaren 
Co-authored-by: Cian Ferriter 
Signed-off-by: Cian Ferriter 
Acked-by: Flavio Leitner 

---

v14:
- Added Flavio's Acked-by tag.
---
 lib/dpdk.c | 28 
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/lib/dpdk.c b/lib/dpdk.c
index 0c910092ca..8807de54a1 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -665,13 +665,33 @@ print_dpdk_version(void)
 puts(rte_version());
 }
 
+/* Avoid calling rte_cpu_get_flag_enabled() excessively, by caching the
+ * result of the call for each CPU flag in a static variable. To avoid
+ * allocating large numbers of static variables, use a uint8 as a bitfield.
+ * Note the macro must only return if the ISA check is done and available.
+ */
+#define ISA_CHECK_DONE_BIT (1 << 0)
+#define ISA_AVAILABLE_BIT  (1 << 1)
+
 #define CHECK_CPU_FEATURE(feature, name_str, RTE_CPUFLAG)   \
 do {\
 if (strncmp(feature, name_str, strlen(name_str)) == 0) {\
-int has_isa = rte_cpu_get_flag_enabled(RTE_CPUFLAG);\
-VLOG_DBG("CPU flag %s, available %s\n", name_str,   \
-  has_isa ? "yes" : "no");  \
-return true;\
+static uint8_t isa_check_##RTE_CPUFLAG; \
+int check = isa_check_##RTE_CPUFLAG & ISA_CHECK_DONE_BIT;   \
+if (OVS_UNLIKELY(!check)) { \
+int has_isa = rte_cpu_get_flag_enabled(RTE_CPUFLAG);\
+VLOG_DBG("CPU flag %s, available %s\n", \
+ name_str, has_isa ? "yes" : "no"); \
+isa_check_##RTE_CPUFLAG = ISA_CHECK_DONE_BIT;   \
+if (has_isa) {  \
+isa_check_##RTE_CPUFLAG |= ISA_AVAILABLE_BIT;   \
+}   \
+}   \
+if (isa_check_##RTE_CPUFLAG & ISA_AVAILABLE_BIT) {  \
+return true;\
+} else {\
+return false;   \
+}   \
 }   \
 } while (0)
 
-- 
2.30.2

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v16 07/10] dpif-netdev/dpcls-avx512: Enable 16 block processing.

2021-07-09 Thread Harry van Haaren

This commit implements larger subtable searches in avx512. A limitation
of the previous implementation was that up to 8 blocks of miniflow
data could be matched on (so a subtable with 8 blocks was handled
in avx, but 9 blocks or more would fall back to scalar/generic).
This limitation is removed in this patch, where up to 16 blocks
of subtable can be matched on.

From an implementation perspective, the key to enabling 16 blocks
over 8 blocks was to do bitmask calculation up front, and then use
the pre-calculated bitmasks for 2x passes of the "blocks gather"
routine. The bitmasks need to be shifted for k-mask usage in the
upper (8-15) block range, but it is relatively trivial. This also
helps in case expanding to 24 blocks is desired in future.

The implementation of the 2nd iteration to handle > 8 blocks is
behind a conditional branch which checks the total number of bits.
This helps the specialized versions of the function that have a
miniflow fingerprint of less than or equal to 8 blocks, as the code
can be statically stripped out of those functions. Specialized
functions that do require more than 8 blocks will have the branch
removed and unconditionally execute the 2nd blocks gather routine.

Lastly, the _any() flavour will have the conditional branch, and
the branch predictor may mispredict a bit, but per burst will
likely get most packets correct (particularly towards the middle
and end of a burst).

The code has been run with unit tests under autovalidation and
passes all cases, and unit test coverage has been checked to
ensure the 16 block code paths are executing.

Signed-off-by: Harry van Haaren 
Acked-by: Flavio Leitner 

---

v14:
- Added Flavio's Acked-by tag.

v13:
- Improve function comment including variable usage (Ian)
- Comment scope bracket usage (Ian)
---
 NEWS   |   1 +
 lib/dpif-netdev-lookup-avx512-gather.c | 218 ++---
 2 files changed, 162 insertions(+), 57 deletions(-)

diff --git a/NEWS b/NEWS
index 6bd9b233e7..3d9255f212 100644
--- a/NEWS
+++ b/NEWS
@@ -26,6 +26,7 @@ Post-v2.15.0
  * Add commands to get and set the dpif implementations.
  * Add a partial HWOL PMD statistic counting hits similar to existing
EMC/SMC/DPCLS stats.
+ * Enable AVX512 optimized DPCLS to search subtables with larger miniflows.
- ovs-ctl:
  * New option '--no-record-hostname' to disable hostname configuration
in ovsdb on startup.
diff --git a/lib/dpif-netdev-lookup-avx512-gather.c 
b/lib/dpif-netdev-lookup-avx512-gather.c
index 8fc1cdfa53..f1b320bb6e 100644
--- a/lib/dpif-netdev-lookup-avx512-gather.c
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -34,7 +34,21 @@
  * AVX512 code at a time.
  */
 #define NUM_U64_IN_ZMM_REG (8)
-#define BLOCKS_CACHE_SIZE (NETDEV_MAX_BURST * NUM_U64_IN_ZMM_REG)
+
+/* This implementation of AVX512 gather allows up to 16 blocks of MF data to be
+ * present in the blocks_cache, hence the multiply by 2 in the blocks count.
+ */
+#define MF_BLOCKS_PER_PACKET (NUM_U64_IN_ZMM_REG * 2)
+
+/* Blocks cache size is the maximum number of miniflow blocks that this
+ * implementation of lookup can handle.
+ */
+#define BLOCKS_CACHE_SIZE (NETDEV_MAX_BURST * MF_BLOCKS_PER_PACKET)
+
+/* The gather instruction can handle a scale for the size of the items to
+ * gather. For uint64_t data, this scale is 8.
+ */
+#define GATHER_SCALE_8 (8)
 
 
 VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
@@ -69,22 +83,98 @@ netdev_rule_matches_key(const struct dpcls_rule *rule,
 {
 const uint64_t *keyp = miniflow_get_values(>flow.mf);
 const uint64_t *maskp = miniflow_get_values(>mask->mf);
-const uint32_t lane_mask = (1 << mf_bits_total) - 1;
+const uint32_t lane_mask = (1ULL << mf_bits_total) - 1;
 
 /* Always load a full cache line from blocks_cache. Other loads must be
  * trimmed to the amount of data required for mf_bits_total blocks.
  */
-__m512i v_blocks = _mm512_loadu_si512(_cache[0]);
-__m512i v_mask   = _mm512_maskz_loadu_epi64(lane_mask, [0]);
-__m512i v_key= _mm512_maskz_loadu_epi64(lane_mask, [0]);
+uint32_t res_mask;
 
-__m512i v_data = _mm512_and_si512(v_blocks, v_mask);
-uint32_t res_mask = _mm512_mask_cmpeq_epi64_mask(lane_mask, v_data, v_key);
+/* To avoid a loop, we have two iterations of a block of code here.
+ * Note the scope brackets { } are used to avoid accidental variable usage
+ * in the second iteration.
+ */
+{
+__m512i v_blocks = _mm512_loadu_si512(_cache[0]);
+__m512i v_mask   = _mm512_maskz_loadu_epi64(lane_mask, [0]);
+__m512i v_key= _mm512_maskz_loadu_epi64(lane_mask, [0]);
+__m512i v_data = _mm512_and_si512(v_blocks, v_mask);
+res_mask = _mm512_mask_cmpeq_epi64_mask(lane_mask, v_data, v_key);
+}
+
+if (mf_bits_total > 8) {
+uint32_t lane_mask_gt8 = lane_mask >> 8;
+__m512i v_blocks = _mm512_l

[ovs-dev] [PATCH v16 06/10] dpif-netdev: Add a partial HWOL PMD statistic.

2021-07-09 Thread Harry van Haaren

From: Cian Ferriter 

It is possible for packets traversing the userspace datapath to match a
flow before hitting on EMC by using a mark ID provided by a NIC. Add a
PMD statistic for this hit.

Signed-off-by: Cian Ferriter 
Acked-by: Flavio Leitner 

---

Cc: Gaetan Rivet 
Cc: Sriharsha Basavapatna 

v14:
- Added Flavio's Acked-by tag.

v13:
- Minor refactoring to address review comments.
- Update manpages to reflect the new format of the pmd-perf-show
  command.
---
 NEWS| 2 ++
 lib/dpif-netdev-avx512.c| 3 +++
 lib/dpif-netdev-perf.c  | 3 +++
 lib/dpif-netdev-perf.h  | 1 +
 lib/dpif-netdev-unixctl.man | 1 +
 lib/dpif-netdev.c   | 9 +++--
 tests/pmd.at| 6 --
 7 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/NEWS b/NEWS
index 471f3a1a28..6bd9b233e7 100644
--- a/NEWS
+++ b/NEWS
@@ -24,6 +24,8 @@ Post-v2.15.0
  * Add avx512 implementation of dpif which can process non recirculated
packets. It supports partial HWOL, EMC, SMC and DPCLS lookups.
  * Add commands to get and set the dpif implementations.
+ * Add a partial HWOL PMD statistic counting hits similar to existing
+   EMC/SMC/DPCLS stats.
- ovs-ctl:
  * New option '--no-record-hostname' to disable hostname configuration
in ovsdb on startup.
diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
index 1ae66ca6c5..6f9aa8284a 100644
--- a/lib/dpif-netdev-avx512.c
+++ b/lib/dpif-netdev-avx512.c
@@ -127,6 +127,7 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
*pmd,
 
 uint32_t emc_hits = 0;
 uint32_t smc_hits = 0;
+uint32_t phwol_hits = 0;
 
 /* A 1 bit in this mask indicates a hit, so no DPCLS lookup on the pkt. */
 uint32_t hwol_emc_smc_hitmask = 0;
@@ -178,6 +179,7 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
*pmd,
 rules[i] = >cr;
 pkt_meta[i].tcp_flags = parse_tcp_flags(packet);
 pkt_meta[i].bytes = dp_packet_size(packet);
+phwol_hits++;
 hwol_emc_smc_hitmask |= (1 << i);
 continue;
 }
@@ -286,6 +288,7 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
*pmd,
 
 /* At this point we don't return error anymore, so commit stats here. */
 pmd_perf_update_counter(>perf_stats, PMD_STAT_RECV, batch_size);
+pmd_perf_update_counter(>perf_stats, PMD_STAT_PHWOL_HIT, phwol_hits);
 pmd_perf_update_counter(>perf_stats, PMD_STAT_EXACT_HIT, emc_hits);
 pmd_perf_update_counter(>perf_stats, PMD_STAT_SMC_HIT, smc_hits);
 pmd_perf_update_counter(>perf_stats, PMD_STAT_MASKED_HIT,
diff --git a/lib/dpif-netdev-perf.c b/lib/dpif-netdev-perf.c
index 9560e7c3c3..7103a2d4df 100644
--- a/lib/dpif-netdev-perf.c
+++ b/lib/dpif-netdev-perf.c
@@ -246,6 +246,7 @@ pmd_perf_format_overall_stats(struct ds *str, struct 
pmd_perf_stats *s,
 ds_put_format(str,
 "  Rx packets:%12"PRIu64"  (%.0f Kpps, %.0f cycles/pkt)\n"
 "  Datapath passes:   %12"PRIu64"  (%.2f passes/pkt)\n"
+"  - PHWOL hits:  %12"PRIu64"  (%5.1f %%)\n"
 "  - EMC hits:%12"PRIu64"  (%5.1f %%)\n"
 "  - SMC hits:%12"PRIu64"  (%5.1f %%)\n"
 "  - Megaflow hits:   %12"PRIu64"  (%5.1f %%, %.2f "
@@ -255,6 +256,8 @@ pmd_perf_format_overall_stats(struct ds *str, struct 
pmd_perf_stats *s,
 rx_packets, (rx_packets / duration) / 1000,
 1.0 * stats[PMD_CYCLES_ITER_BUSY] / rx_packets,
 passes, rx_packets ? 1.0 * passes / rx_packets : 0,
+stats[PMD_STAT_PHWOL_HIT],
+100.0 * stats[PMD_STAT_PHWOL_HIT] / passes,
 stats[PMD_STAT_EXACT_HIT],
 100.0 * stats[PMD_STAT_EXACT_HIT] / passes,
 stats[PMD_STAT_SMC_HIT],
diff --git a/lib/dpif-netdev-perf.h b/lib/dpif-netdev-perf.h
index 72645b6b3c..8b1a52387c 100644
--- a/lib/dpif-netdev-perf.h
+++ b/lib/dpif-netdev-perf.h
@@ -56,6 +56,7 @@ extern "C" {
 /* Set of counter types maintained in pmd_perf_stats. */
 
 enum pmd_stat_type {
+PMD_STAT_PHWOL_HIT, /* Packets that had a partial HWOL hit (phwol). */
 PMD_STAT_EXACT_HIT, /* Packets that had an exact match (emc). */
 PMD_STAT_SMC_HIT,   /* Packets that had a sig match hit (SMC). */
 PMD_STAT_MASKED_HIT,/* Packets that matched in the flow table. */
diff --git a/lib/dpif-netdev-unixctl.man b/lib/dpif-netdev-unixctl.man
index 5f92562157..83ce4f1c50 100644
--- a/lib/dpif-netdev-unixctl.man
+++ b/lib/dpif-netdev-unixctl.man
@@ -135,6 +135,7 @@ pmd thread numa_id 0 core_id 1:
   - busy iterations:86009  ( 84.1 % of used cycles)
   Rx packets: 2399607  (2381 Kpps, 848 cycles/pkt)
   Datapath passes:3599415  (1.50 passes/pkt)
+  - PHWOL hits: 0  (  0.0 %)
   - EMC hits:  336472  (  9.3 %)
   - SMC hits:   0  ( 0.0 %)
   - Megaflow hits:

[ovs-dev] [PATCH v16 05/10] dpif-netdev: Add command to get dpif implementations.

2021-07-09 Thread Harry van Haaren

This commit adds a new command to retrieve the list of available
DPIF implementations. This can be used to check which implementations
of the DPIF are available in any given OVS binary. It also returns which
implementations are in use by the OVS PMD threads.

Usage:
 $ ovs-appctl dpif-netdev/dpif-impl-get

Signed-off-by: Harry van Haaren 
Co-authored-by: Cian Ferriter 
Signed-off-by: Cian Ferriter 
Acked-by: Flavio Leitner 

---

v16:
- Include Flavio's Ack from ML

v15:
- Address Flavio's comments from the v14 review.

v14:
- Rename command to dpif-impl-get.
- Hide more of the dpif impl details from lib/dpif-netdev.c. Pass a
  dynamic_string to return the dpif-impl-get CMD output.
- Add information about which DPIF impl is currently in use by each PMD
  thread.

v13:
- Add NEWS item about DPIF get and set commands here rather than in a
  later commit.
- Add documentation items about DPIF set commands here rather than in a
  later commit.
---
 Documentation/topics/dpdk/bridge.rst |  8 +++
 NEWS |  1 +
 lib/dpif-netdev-private-dpif.c   | 31 
 lib/dpif-netdev-private-dpif.h   |  6 ++
 lib/dpif-netdev-unixctl.man  |  3 +++
 lib/dpif-netdev.c| 26 +++
 6 files changed, 75 insertions(+)

diff --git a/Documentation/topics/dpdk/bridge.rst 
b/Documentation/topics/dpdk/bridge.rst
index 06d1f943c1..2d0850836f 100644
--- a/Documentation/topics/dpdk/bridge.rst
+++ b/Documentation/topics/dpdk/bridge.rst
@@ -226,6 +226,14 @@ stats associated with the datapath.
 Just like with the SIMD DPCLS feature above, SIMD can be applied to the DPIF to
 improve performance.
 
+OVS provides multiple implementations of the DPIF. The available
+implementations can be listed with the following command ::
+
+$ ovs-appctl dpif-netdev/dpif-impl-get
+Available DPIF implementations:
+  dpif_scalar (pmds: none)
+  dpif_avx512 (pmds: 1,2,6,7)
+
 By default, dpif_scalar is used. The DPIF implementation can be selected by
 name ::
 
diff --git a/NEWS b/NEWS
index 2625cabc85..471f3a1a28 100644
--- a/NEWS
+++ b/NEWS
@@ -23,6 +23,7 @@ Post-v2.15.0
  * Refactor lib/dpif-netdev.c to multiple header files.
  * Add avx512 implementation of dpif which can process non recirculated
packets. It supports partial HWOL, EMC, SMC and DPCLS lookups.
+ * Add commands to get and set the dpif implementations.
- ovs-ctl:
  * New option '--no-record-hostname' to disable hostname configuration
in ovsdb on startup.
diff --git a/lib/dpif-netdev-private-dpif.c b/lib/dpif-netdev-private-dpif.c
index a05a82fa11..84d4ec156e 100644
--- a/lib/dpif-netdev-private-dpif.c
+++ b/lib/dpif-netdev-private-dpif.c
@@ -79,6 +79,37 @@ dp_netdev_impl_get_default(void)
 return default_dpif_func;
 }
 
+void
+dp_netdev_impl_get(struct ds *reply, struct dp_netdev_pmd_thread **pmd_list,
+   size_t n)
+{
+/* Add all dpif functions to reply string. */
+ds_put_cstr(reply, "Available DPIF implementations:\n");
+
+for (uint32_t i = 0; i < ARRAY_SIZE(dpif_impls); i++) {
+ds_put_format(reply, "  %s (pmds: ", dpif_impls[i].name);
+
+for (size_t j = 0; j < n; j++) {
+struct dp_netdev_pmd_thread *pmd = pmd_list[j];
+if (pmd->core_id == NON_PMD_CORE_ID) {
+continue;
+}
+
+if (pmd->netdev_input_func == dpif_impls[i].input_func) {
+ds_put_format(reply, "%u,", pmd->core_id);
+}
+}
+
+ds_chomp(reply, ',');
+
+if (ds_last(reply) == ' ') {
+ds_put_cstr(reply, "none");
+}
+
+ds_put_cstr(reply, ")\n");
+}
+}
+
 /* This function checks all available DPIF implementations, and selects the
  * returns the function pointer to the one requested by "name".
  */
diff --git a/lib/dpif-netdev-private-dpif.h b/lib/dpif-netdev-private-dpif.h
index 7880647ad3..0da639c55a 100644
--- a/lib/dpif-netdev-private-dpif.h
+++ b/lib/dpif-netdev-private-dpif.h
@@ -22,6 +22,7 @@
 /* Forward declarations to avoid including files. */
 struct dp_netdev_pmd_thread;
 struct dp_packet_batch;
+struct ds;
 
 /* Typedef for DPIF functions.
  * Returns whether all packets were processed successfully.
@@ -48,6 +49,11 @@ struct dpif_netdev_impl_info_t {
 const char *name;
 };
 
+/* This function returns all available implementations to the caller. */
+void
+dp_netdev_impl_get(struct ds *reply, struct dp_netdev_pmd_thread **pmd_list,
+   size_t n);
+
 /* Returns the default DPIF which is first ./configure selected, but can be
  * overridden at runtime. */
 dp_netdev_input_func dp_netdev_impl_get_default(void);
diff --git a/lib/dpif-netdev-unixctl.man b/lib/dpif-netdev-unixctl.man
index 76cc949f9b..5f92562157 100644
--- a/lib/dpif-netdev-unixctl.man
+++ b/lib/dpif-netdev-unixctl.man
@@

[ovs-dev] [PATCH v16 04/10] dpif-netdev: Add command to switch dpif implementation.

2021-07-09 Thread Harry van Haaren

This commit adds a new command to allow the user to switch
the active DPIF implementation at runtime. A probe function
is executed before switching the DPIF implementation, to ensure
the CPU is capable of running the ISA required. For example, the
below code will switch to the AVX512 enabled DPIF assuming
that the runtime CPU is capable of running AVX512 instructions:

 $ ovs-appctl dpif-netdev/dpif-impl-set dpif_avx512

A new configuration flag is added to allow selection of the
default DPIF. This is useful for running the unit-tests against
the available DPIF implementations, without modifying each unit test.

The design of the testing & validation for ISA optimized DPIF
implementations is based around the work already upstream for DPCLS.
Note however that a DPCLS lookup has no state or side-effects, allowing
the auto-validator implementation to perform multiple lookups and
provide consistent statistic counters.

The DPIF component does have state, so running two implementations in
parallel and comparing output is not a valid testing method, as there
are changes in DPIF statistic counters (side effects). As a result, the
DPIF is tested directly against the unit-tests.

Signed-off-by: Harry van Haaren 
Co-authored-by: Cian Ferriter 
Signed-off-by: Cian Ferriter 
Acked-by: Flavio Leitner 

---

v16:
- Include Flavio's Ack from ML

v15:
- Address Flavio's comments from the v14 review.
- Move dp_netdev_impl_set_default_by_name() below
  dp_netdev_impl_get_by_name() since it relies on that function, and it
  is no longer prototyped in the .h file.

v14:
- Change command name to dpif-impl-set
- Fix the order of includes to what is laid out in the coding-style.rst
- Use bool not int to capture return value of dpdk_get_cpu_has_isa()
- Use an enum to index DPIF impls array.
- Hide more of the dpif impl details from lib/dpif-netdev.c.
- Fix comment on *dp_netdev_input_func() typedef.
- Rename dp_netdev_input_func func to input_func.
- Remove the datapath or dp argument from the dpif-impl-set CMD.
- Set the DPIF function pointer atomically.

v13:
- Add Docs items about the switch DPIF command here rather than in
  later commit.
- Document operation in manpages as well as rST.
- Minor code refactoring to address review comments.
---
 Documentation/topics/dpdk/bridge.rst |  34 
 acinclude.m4 |  15 
 configure.ac |   1 +
 lib/automake.mk  |   1 +
 lib/dpif-netdev-avx512.c |  14 +++
 lib/dpif-netdev-private-dpif.c   | 124 +++
 lib/dpif-netdev-private-dpif.h   |  41 +
 lib/dpif-netdev-private-thread.h |  10 ---
 lib/dpif-netdev-unixctl.man  |   3 +
 lib/dpif-netdev.c|  74 ++--
 10 files changed, 302 insertions(+), 15 deletions(-)
 create mode 100644 lib/dpif-netdev-private-dpif.c

diff --git a/Documentation/topics/dpdk/bridge.rst 
b/Documentation/topics/dpdk/bridge.rst
index 526d5c9590..06d1f943c1 100644
--- a/Documentation/topics/dpdk/bridge.rst
+++ b/Documentation/topics/dpdk/bridge.rst
@@ -214,3 +214,37 @@ implementation ::
 
 Compile OVS in debug mode to have `ovs_assert` statements error out if
 there is a mis-match in the DPCLS lookup implementation.
+
+Datapath Interface Performance
+------------------------------
+
+The datapath interface (DPIF) or dp_netdev_input() is responsible for taking
+packets through the major components of the userspace datapath; such as
+miniflow_extract, EMC, SMC and DPCLS lookups, and a lot of the performance
+stats associated with the datapath.
+
+Just like with the SIMD DPCLS feature above, SIMD can be applied to the DPIF to
+improve performance.
+
+By default, dpif_scalar is used. The DPIF implementation can be selected by
+name ::
+
+$ ovs-appctl dpif-netdev/dpif-impl-set dpif_avx512
+DPIF implementation set to dpif_avx512.
+
+$ ovs-appctl dpif-netdev/dpif-impl-set dpif_scalar
+DPIF implementation set to dpif_scalar.
+
+Running Unit Tests with AVX512 DPIF
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Since the AVX512 DPIF is disabled by default, a compile time option is
+available in order to test it with the OVS unit test suite. When building with
+a CPU that supports AVX512, use the following configure option ::
+
+$ ./configure --enable-dpif-default-avx512
+
+The following line should be seen in the configure output when the above option
+is used ::
+
+checking whether DPIF AVX512 is default implementation... yes
diff --git a/acinclude.m4 b/acinclude.m4
index 18c52f63a4..3433034475 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -30,6 +30,21 @@ AC_DEFUN([OVS_CHECK_DPCLS_AUTOVALIDATOR], [
   fi
 ])
 
+dnl Set OVS DPIF default implementation at configure time for running the unit
+dnl tests on the whole codebase without modifying tests per DPIF impl
+AC_DEFUN([OVS_CHECK_DPIF_AVX512_DEFAULT], [
+  AC_ARG_ENABLE([dpif-default-avx512],
+[AC_HELP_STRING([--enable-

[ovs-dev] [PATCH v16 01/10] dpif-netdev: Refactor to multiple header files.

2021-07-09 Thread Harry van Haaren

Split the very large file dpif-netdev.c and the datastructures
it contains into multiple header files. Each header file is
responsible for the datastructures of that component.

This logical split allows better reuse and modularity of the code,
and reduces the very large file dpif-netdev.c to be more manageable.

Due to dependencies between components, it is not possible to
move the components at a finer granularity than this patch does.

To explain the dependencies better, eg:

DPCLS has no deps (from dpif-netdev.c file)
FLOW depends on DPCLS (struct dpcls_rule)
DFC depends on DPCLS (netdev_flow_key) and FLOW (netdev_flow_key)
THREAD depends on DFC (struct dfc_cache)

DFC_PROC depends on THREAD (struct pmd_thread)

DPCLS lookup.h/c require only DPCLS
DPCLS implementations require only dpif-netdev-lookup.h.
- This change was made in 2.12 release with function pointers
- This commit only refactors the name to "private-dpcls.h"

netdev_flow_key_equal_mf() is renamed to emc_flow_key_equal_mf().

Rename functions specific to dpcls from netdev_* namespace to the
dpcls_* namespace, as they are only used by dpcls code.

'inline' is added to dp_netdev_flow_hash() when its definition is
moved, to fix a compiler error.

One valid checkpatch issue with the use of the
EMC_FOR_EACH_POS_WITH_HASH() macro was fixed.

Signed-off-by: Harry van Haaren 
Co-authored-by: Cian Ferriter 
Signed-off-by: Cian Ferriter 
Acked-by: Flavio Leitner 

---

Cc: Gaetan Rivet 
Cc: Sriharsha Basavapatna 

v16:
- Fix rebase conflicts on NEWS file

v15:
- Added Flavio's Acked-by tag.

v14:
- Make some functions in lib/dpif-netdev-private-dfc.c private as they
  aren't used in other files.
- Fix the order of includes to what is laid out in the coding-style.rst

v13:
- Add NEWS item in this commit rather than later.
- Add lib/dpif-netdev-private-dfc.c file and move non fast path dfc
  related functions there.
- Squash commit which renames functions specific to dpcls from netdev_*
  namespace to the dpcls_* namespace, as they are only used by dpcls
  code into this commit.
- Minor fixes from review comments.
---
 NEWS   |   1 +
 lib/automake.mk|   5 +
 lib/dpif-netdev-lookup-autovalidator.c |   1 -
 lib/dpif-netdev-lookup-avx512-gather.c |   1 -
 lib/dpif-netdev-lookup-generic.c   |   1 -
 lib/dpif-netdev-lookup.h   |   2 +-
 lib/dpif-netdev-private-dfc.c  | 110 +
 lib/dpif-netdev-private-dfc.h  | 164 
 lib/dpif-netdev-private-dpcls.h| 128 ++
 lib/dpif-netdev-private-flow.h | 163 
 lib/dpif-netdev-private-thread.h   | 206 ++
 lib/dpif-netdev-private.h  | 100 +
 lib/dpif-netdev.c  | 539 +
 13 files changed, 801 insertions(+), 620 deletions(-)
 create mode 100644 lib/dpif-netdev-private-dfc.c
 create mode 100644 lib/dpif-netdev-private-dfc.h
 create mode 100644 lib/dpif-netdev-private-dpcls.h
 create mode 100644 lib/dpif-netdev-private-flow.h
 create mode 100644 lib/dpif-netdev-private-thread.h

diff --git a/NEWS b/NEWS
index 646a4224e2..38ad891b9c 100644
--- a/NEWS
+++ b/NEWS
@@ -20,6 +20,7 @@ Post-v2.15.0
  * Added all-zero IP SNAT handling to conntrack. In case of collision,
using ct(src=0.0.0.0), the source port will be replaced with another
non-colliding port in the ephemeral range (1024, 65535).
+ * Refactor lib/dpif-netdev.c to multiple header files.
- ovs-ctl:
  * New option '--no-record-hostname' to disable hostname configuration
in ovsdb on startup.
diff --git a/lib/automake.mk b/lib/automake.mk
index 1980bbeef1..8690bfb7a2 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -111,6 +111,11 @@ lib_libopenvswitch_la_SOURCES = \
lib/dpif-netdev-lookup-generic.c \
lib/dpif-netdev.c \
lib/dpif-netdev.h \
+   lib/dpif-netdev-private-dfc.c \
+   lib/dpif-netdev-private-dfc.h \
+   lib/dpif-netdev-private-dpcls.h \
+   lib/dpif-netdev-private-flow.h \
+   lib/dpif-netdev-private-thread.h \
lib/dpif-netdev-private.h \
lib/dpif-netdev-perf.c \
lib/dpif-netdev-perf.h \
diff --git a/lib/dpif-netdev-lookup-autovalidator.c 
b/lib/dpif-netdev-lookup-autovalidator.c
index 97b59fdd00..475e1ab1ec 100644
--- a/lib/dpif-netdev-lookup-autovalidator.c
+++ b/lib/dpif-netdev-lookup-autovalidator.c
@@ -17,7 +17,6 @@
 #include 
 #include "dpif-netdev.h"
 #include "dpif-netdev-lookup.h"
-#include "dpif-netdev-private.h"
 #include "openvswitch/vlog.h"
 
 VLOG_DEFINE_THIS_MODULE(dpif_lookup_autovalidator);
diff --git a/lib/dpif-netdev-lookup-avx512-gather.c 
b/lib/dpif-netdev-lookup-avx512-gather.c
index 5e3634249d..8fc1cdfa53 100644
--- a/lib/dpif-netdev-lookup-avx512-gather.c
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -21,7 +21,6 @@
 
 #include "dpif-netdev.h"
 #include "dpif-netdev-lookup.

[ovs-dev] [PATCH v16 03/10] dpif-avx512: Add ISA implementation of dpif.

2021-07-09 Thread Harry van Haaren

This commit adds the AVX512 implementation of DPIF functionality,
specifically the dp_netdev_input_outer_avx512 function. This function
only handles outer packets (no recirculations), and is optimized to use the
AVX512 ISA for packet batching and other DPIF work.

Sparse is not able to handle the AVX512 intrinsics, causing compile
time failures, so it is disabled for this file.

Signed-off-by: Harry van Haaren 
Co-authored-by: Cian Ferriter 
Signed-off-by: Cian Ferriter 
Co-authored-by: Kumar Amber 
Signed-off-by: Kumar Amber 
Acked-by: Flavio Leitner 

---

v15:
- Added Flavio's Acked-by tag.
- Fix minor spelling mistakes and formatting.
- Fix an issue with prefetching packets ahead in AVX512 DPIF with a
  batch size of 1.

v14:
- Fix the order of includes to what is laid out in the coding-style.rst
- Update PHWOL implementation to match what's used in the scalar DPIF.
  The scalar DPIF PHWOL implementation changed since v13.
- Use raw_ctz() to wrap __builtin_ctz(). This should fix Windows build
  errors.
- Remove unnecessary if (!f) check.
- Introduce hwol_emc_smc_missmask variable to save the lookup state
  before DPCLS lookup. This fixes an issue where the DPCLS lookup would
  modify hwol_emc_smc_hitmask before the EMC and SMC inserts could use
  it.
- Move dpcls_lookup prototype from lib/dpif-netdev-private-thread.h to
  lib/dpif-netdev-private-dpcls.h
- Fix a comment.
- Move addition of *netdev_input_func_userdata to struct
  dp_netdev_pmd_thread to this patch.
- Remove dp_netdev_input_outer_avx512() prototype from
  lib/dpif-netdev-private-thread.h since it already has a prototype in
  lib/dpif-netdev-private-dpif.h.
- Prefetch 2 packets ahead when processing in AVX512 DPIF. This was
  found to perform best when testing.
- Other minor rework from Flavio's review.

v13:
- Squash "Add HWOL support" commit into this commit.
- Add NEWS item about this feature here rather than in a later commit.
- Add #define NUM_U64_IN_ZMM_REG 8.
- Add comment describing operation of while loop handling HWOL->EMC->SMC
  lookups in dp_netdev_input_outer_avx512().
- Add EMC and SMC batch insert functions for better handling of EMC and
  SMC in AVX512 DPIF.
- Minor code refactor to address review comments.
---
 NEWS |   2 +
 lib/automake.mk  |   5 +-
 lib/dpif-netdev-avx512.c | 339 +++
 lib/dpif-netdev-private-dfc.h|  25 +++
 lib/dpif-netdev-private-dpcls.h  |   7 +
 lib/dpif-netdev-private-dpif.h   |  32 +++
 lib/dpif-netdev-private-thread.h |  15 +-
 lib/dpif-netdev-private.h|  21 +-
 lib/dpif-netdev.c| 105 --
 9 files changed, 533 insertions(+), 18 deletions(-)
 create mode 100644 lib/dpif-netdev-avx512.c
 create mode 100644 lib/dpif-netdev-private-dpif.h

diff --git a/NEWS b/NEWS
index 38ad891b9c..2625cabc85 100644
--- a/NEWS
+++ b/NEWS
@@ -21,6 +21,8 @@ Post-v2.15.0
using ct(src=0.0.0.0), the source port will be replaced with another
non-colliding port in the ephemeral range (1024, 65535).
  * Refactor lib/dpif-netdev.c to multiple header files.
+ * Add avx512 implementation of dpif which can process non recirculated
+   packets. It supports partial HWOL, EMC, SMC and DPCLS lookups.
- ovs-ctl:
  * New option '--no-record-hostname' to disable hostname configuration
in ovsdb on startup.
diff --git a/lib/automake.mk b/lib/automake.mk
index 8690bfb7a2..432b98e628 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -33,11 +33,13 @@ lib_libopenvswitchavx512_la_CFLAGS = \
-mavx512f \
-mavx512bw \
-mavx512dq \
+   -mbmi \
-mbmi2 \
-fPIC \
$(AM_CFLAGS)
 lib_libopenvswitchavx512_la_SOURCES = \
-   lib/dpif-netdev-lookup-avx512-gather.c
+   lib/dpif-netdev-lookup-avx512-gather.c \
+   lib/dpif-netdev-avx512.c
 lib_libopenvswitchavx512_la_LDFLAGS = \
-static
 endif
@@ -114,6 +116,7 @@ lib_libopenvswitch_la_SOURCES = \
lib/dpif-netdev-private-dfc.c \
lib/dpif-netdev-private-dfc.h \
lib/dpif-netdev-private-dpcls.h \
+   lib/dpif-netdev-private-dpif.h \
lib/dpif-netdev-private-flow.h \
lib/dpif-netdev-private-thread.h \
lib/dpif-netdev-private.h \
diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
new file mode 100644
index 00..f59c1bbe0b
--- /dev/null
+++ b/lib/dpif-netdev-avx512.c
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) 2021 Intel Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either exp

[ovs-dev] [PATCH v16 00/10] DPIF Framework + Optimizations

2021-07-09 Thread Harry van Haaren

e TODO item's in v3
--- Add "disable" implementation to return to scalar miniflow only
--- More fixes planned for v5/future revisions:
 Rename command to better reflect usage
 Improve dynamicness of patterns
 Add more demo protocols to show usage
- Future work
--- Documentation/NEWS items
--- Statistics for optimized MFEX
- Note that this patchset will be discussed/presented at OvsConf soon :)

v3 update summary:
(Cian Ferriter helping with rebases, review and code cleanups)
- Split out partially related changes (these will be sent separately)
--- netdev output action optimization
--- avx512 dpcls 16-block support optimization
- Squash commit which moves netdev struct flow into the refactor commit:
--- Squash dpif-netdev: move netdev flow struct to header
--- Into dpif-netdev: Refactor to multiple header files
- Implement Miniflow extract for AVX-512 DPIF
--- A generic method of matching patterns and packets is implemented,
providing traffic-pattern specific miniflow-extract acceleration.
--- The patterns today are hard-coded, however in a future patchset it
is intended to make these runtime configurable, allowing users to
optimize the SIMD miniflow extract for active traffic types.
- Notes:
--- 32 bit builds will be fixed in next release by adding flexible
miniflow extract optimization selection.
--- AVX-512 VBMI ISA is not yet supported in OVS due to requiring the
DPDK 20.11 update for RTE_CPUFLAG_*. Once on a newer DPDK this will
be added.

v2 updates:
- Includes DPIF command switching at runtime
- Includes AVX512 DPIF implementation
- Includes some partially related changes (can be split out of set?)
--- netdev output action optimization
--- avx512 dpcls 16-block support optimization


This patchset is a v7 for making the DPIF components of the
userspace datapath more flexible. It has been refactored to be
more modular to encourage code-reuse, and scalable in that ISA
optimized implementations can be added and selected at runtime.

The same approach as has been previously used for DPCLS is used
here, where a function pointer allows selection of an implementation
at runtime.

Datapath features such as EMC, SMC and HWOL are shared between
implementations, hence they are refactored into separate header files.
The file splitting also improves maintainability, as dpif-netdev.c
has ~9000 LOC and is very hard to modify due to many structs defined
locally in the .c file, ruling out re-usability in other .c files.

Questions welcomed! Regards, -Harry


Cian Ferriter (1):
  dpif-netdev: Add a partial HWOL PMD statistic.

Harry van Haaren (9):
  dpif-netdev: Refactor to multiple header files.
  dpif-netdev: Add function pointer for netdev input.
  dpif-avx512: Add ISA implementation of dpif.
  dpif-netdev: Add command to switch dpif implementation.
  dpif-netdev: Add command to get dpif implementations.
  dpif-netdev/dpcls-avx512: Enable 16 block processing.
  dpif-netdev/dpcls: Specialize more subtable signatures.
  dpdk: Cache result of CPU ISA checks.
  dpcls-avx512: Enable avx512 vector popcount instruction.

 Documentation/topics/dpdk/bridge.rst   |  42 ++
 NEWS   |  12 +
 acinclude.m4   |  15 +
 configure.ac   |   1 +
 lib/automake.mk|  11 +-
 lib/dpdk.c |  29 +-
 lib/dpif-netdev-avx512.c   | 356 
 lib/dpif-netdev-lookup-autovalidator.c |   1 -
 lib/dpif-netdev-lookup-avx512-gather.c | 294 +++---
 lib/dpif-netdev-lookup-generic.c   |   7 +-
 lib/dpif-netdev-lookup.h   |   2 +-
 lib/dpif-netdev-perf.c |   3 +
 lib/dpif-netdev-perf.h |   1 +
 lib/dpif-netdev-private-dfc.c  | 110 
 lib/dpif-netdev-private-dfc.h  | 189 +++
 lib/dpif-netdev-private-dpcls.h| 135 +
 lib/dpif-netdev-private-dpif.c | 155 +
 lib/dpif-netdev-private-dpif.h |  79 +++
 lib/dpif-netdev-private-flow.h | 163 ++
 lib/dpif-netdev-private-thread.h   | 215 +++
 lib/dpif-netdev-private.h  | 117 +---
 lib/dpif-netdev-unixctl.man|   7 +
 lib/dpif-netdev.c  | 750 +++--
 tests/pmd.at   |   6 +-
 24 files changed, 1991 insertions(+), 709 deletions(-)
 create mode 100644 lib/dpif-netdev-avx512.c
 create mode 100644 lib/dpif-netdev-private-dfc.c
 create mode 100644 lib/dpif-netdev-private-dfc.h
 create mode 100644 lib/dpif-netdev-private-dpcls.h
 create mode 100644 lib/dpif-netdev-private-dpif.c
 create mode 100644 lib/dpif-netdev-private-dpif.h
 create mode 100644 lib/dpif-netdev-private-flow.h
 create mode 100644 lib/dpif-netdev-private-thread.h

-- 
2.30.2

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v16 02/10] dpif-netdev: Add function pointer for netdev input.

2021-07-09 Thread Harry van Haaren

This commit adds a function pointer to the pmd thread data structure,
giving the pmd thread flexibility in its dpif-input function choice.
This allows choosing of the implementation based on ISA capabilities
of the runtime CPU, leading to optimizations and higher performance.

Signed-off-by: Harry van Haaren 
Co-authored-by: Cian Ferriter 
Signed-off-by: Cian Ferriter 
Acked-by: Flavio Leitner 

---

v15:
- Added Flavio's Acked-by tag.

v14:
- Add ATOMIC macro to netdev_input_func function pointer in struct
  dp_netdev_pmd_thread.

v13:
- Minor code refactor to address review comments.
---
 lib/dpif-netdev-private-thread.h | 10 ++
 lib/dpif-netdev.c|  7 ++-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/lib/dpif-netdev-private-thread.h b/lib/dpif-netdev-private-thread.h
index 91f3753d19..d38a7a2c3f 100644
--- a/lib/dpif-netdev-private-thread.h
+++ b/lib/dpif-netdev-private-thread.h
@@ -47,6 +47,13 @@ struct dp_netdev_pmd_thread_ctx {
 uint32_t emc_insert_min;
 };
 
+/* Forward declaration for typedef. */
+struct dp_netdev_pmd_thread;
+
+typedef void (*dp_netdev_input_func)(struct dp_netdev_pmd_thread *pmd,
+ struct dp_packet_batch *packets,
+ odp_port_t port_no);
+
 /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
  * the performance overhead of interrupt processing.  Therefore netdev can
  * not implement rx-wait for these devices.  dpif-netdev needs to poll
@@ -101,6 +108,9 @@ struct dp_netdev_pmd_thread {
 /* Current context of the PMD thread. */
 struct dp_netdev_pmd_thread_ctx ctx;
 
+/* Function pointer to call for dp_netdev_input() functionality. */
+ATOMIC(dp_netdev_input_func) netdev_input_func;
+
 struct seq *reload_seq;
 uint64_t last_reload_seq;
 
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index e3a915a98f..ac36ae757b 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -4286,8 +4286,9 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread 
*pmd,
 }
 }
 }
+
 /* Process packet batch. */
-dp_netdev_input(pmd, , port_no);
+pmd->netdev_input_func(pmd, , port_no);
 
 /* Assign processing cycles to rx queue. */
 cycles = cycle_timer_stop(>perf_stats, );
@@ -6088,6 +6089,10 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread 
*pmd, struct dp_netdev *dp,
 hmap_init(>tnl_port_cache);
 hmap_init(>send_port_cache);
 cmap_init(>tx_bonds);
+
+/* Initialize the DPIF function pointer to the default scalar version. */
+pmd->netdev_input_func = dp_netdev_input;
+
 /* init the 'flow_cache' since there is no
  * actual thread created for NON_PMD_CORE_ID. */
 if (core_id == NON_PMD_CORE_ID) {
-- 
2.30.2

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH] configure/dpcls: allow opt-in to CPU ISA opts at compile time

2021-03-18 Thread Harry van Haaren

This commit allows "opt-in" to CPU ISA optimized implementations of
OVS SW datapath components at compile time. This can be useful in some
deployments where the CPU ISA optimized implementation is to be chosen
by default.

Note that only the default priority order of valid subtable
implementations is being modified. If a subtable implementation is
not available due to ISA not being available, it will not be selected.

With --enable-cpu-isa on an AVX512 capable CPU, the dpcls_avx512_gather
ISA optimized implementation of DPCLS is automatically enabled.

The default is off, so unless ./configure --enable-cpu-isa is passed,
the behaviour of the default OVS compile is not changed.

Signed-off-by: Harry van Haaren 
---
 acinclude.m4 | 14 ++
 configure.ac |  1 +
 lib/dpif-netdev-lookup.c |  8 +++-
 3 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/acinclude.m4 b/acinclude.m4
index 15a54d636..c8ab8cb89 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -30,6 +30,20 @@ AC_DEFUN([OVS_CHECK_DPCLS_AUTOVALIDATOR], [
   fi
 ])
 
+
+AC_DEFUN([OVS_CHECK_CPU_ISA_OPT_IN], [
+  AC_ARG_ENABLE([cpu-isa],
+[AC_HELP_STRING([--enable-cpu-isa], [Enable CPU ISA default 
enable.])],
+[isa=yes],[isa=no])
+  AC_MSG_CHECKING([whether CPU ISA should be enabled by default])
+  if test "$isa" != yes; then
+AC_MSG_RESULT([no])
+  else
+OVS_CFLAGS="$OVS_CFLAGS -DCPU_ISA_OPT_IN"
+AC_MSG_RESULT([yes])
+  fi
+])
+
 dnl OVS_ENABLE_WERROR
 AC_DEFUN([OVS_ENABLE_WERROR],
   [AC_ARG_ENABLE(
diff --git a/configure.ac b/configure.ac
index c077034d4..eb472a6b9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -184,6 +184,7 @@ OVS_CHECK_CC_OPTION([-mavx512f], [CFLAGS="$CFLAGS 
-DHAVE_AVX512F"])
 OVS_ENABLE_WERROR
 OVS_ENABLE_SPARSE
 OVS_CTAGS_IDENTIFIERS
+OVS_CHECK_CPU_ISA_OPT_IN
 OVS_CHECK_DPCLS_AUTOVALIDATOR
 OVS_CHECK_BINUTILS_AVX512
 
diff --git a/lib/dpif-netdev-lookup.c b/lib/dpif-netdev-lookup.c
index bd0a99abe..0989c6a5f 100644
--- a/lib/dpif-netdev-lookup.c
+++ b/lib/dpif-netdev-lookup.c
@@ -45,7 +45,13 @@ static struct dpcls_subtable_lookup_info_t 
subtable_lookups[] = {
 
 #if (__x86_64__ && HAVE_AVX512F && HAVE_LD_AVX512_GOOD && __SSE4_2__)
 /* Only available on x86_64 bit builds with SSE 4.2 used for OVS core. */
-{ .prio = 0,
+{
+#ifdef CPU_ISA_OPT_IN
+  /* Allow Autovalidator to override, but higher than default scalar. */
+  .prio = 100,
+#else
+  .prio = 0,
+#endif
   .probe = dpcls_subtable_avx512_gather_probe,
   .name = "avx512_gather", },
 #else
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v9 16/16] dpif-netdev: POC of future DPIF and MFEX AVX512 optimizations

2021-02-12 Thread Harry van Haaren

This is a POC patch, showing future DPIF and MFEX optimizations.

The main optimization is doing MiniflowExtract in AVX512. This speeds
up the specific protocol parsing a lot.

Other optimizations for DPIF show value in removing complexity from
the code by specialization. In particular if only DPCLS is enabled,
we can avoid rebatching packets.

Signed-off-by: Harry van Haaren 
---
 lib/automake.mk  |   1 +
 lib/dpdk.c   |   1 +
 lib/dpif-netdev-avx512.c | 178 +--
 lib/dpif-netdev-private-dpif.h   |   6 ++
 lib/dpif-netdev-private-thread.h |  10 ++
 lib/flow_avx512.h| 117 
 6 files changed, 255 insertions(+), 58 deletions(-)
 create mode 100644 lib/flow_avx512.h

diff --git a/lib/automake.mk b/lib/automake.mk
index 5e493ebaf..a5dbf7f7e 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -137,6 +137,7 @@ lib_libopenvswitch_la_SOURCES = \
lib/fatal-signal.h \
lib/flow.c \
lib/flow.h \
+   lib/flow_avx512.h \
lib/guarded-list.c \
lib/guarded-list.h \
lib/hash.c \
diff --git a/lib/dpdk.c b/lib/dpdk.c
index a9494a40f..a82ff04b6 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -655,6 +655,7 @@ dpdk_get_cpu_has_isa(const char *arch, const char *feature)
 #if __x86_64__
 /* CPU flags only defined for the architecture that support it. */
 CHECK_CPU_FEATURE(feature, "avx512f", RTE_CPUFLAG_AVX512F);
+CHECK_CPU_FEATURE(feature, "avx512vbmi", RTE_CPUFLAG_AVX512VBMI);
 CHECK_CPU_FEATURE(feature, "avx512vpopcntdq", RTE_CPUFLAG_AVX512VPOPCNTDQ);
 CHECK_CPU_FEATURE(feature, "bmi2", RTE_CPUFLAG_BMI2);
 #endif
diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
index fff469e10..29b4b856a 100644
--- a/lib/dpif-netdev-avx512.c
+++ b/lib/dpif-netdev-avx512.c
@@ -35,6 +35,8 @@
 
 #include "immintrin.h"
 
+#include "flow_avx512.h"
+
 /* Structure to contain per-packet metadata that must be attributed to the
  * dp netdev flow. This is unfortunate to have to track per packet, however
  * it's a bit awkward to maintain them in a performant way. This structure
@@ -68,15 +70,24 @@ dp_netdev_input_outer_avx512_probe(void)
 return 0;
 }
 
-int32_t
-dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
- struct dp_packet_batch *packets,
- odp_port_t in_port)
+/* Specialize DPIF based on enabled options, eg for DPCLS only. */
+static inline ALWAYS_INLINE int32_t
+dp_netdev_input_outer_avx512_impl(struct dp_netdev_pmd_thread *pmd,
+  struct dp_packet_batch *packets,
+  odp_port_t in_port,
+  uint32_t dpcls_only)
 {
-/* Allocate DPIF userdata. */
 if (OVS_UNLIKELY(!pmd->netdev_input_func_userdata)) {
 pmd->netdev_input_func_userdata =
 xmalloc_pagealign(sizeof(struct dpif_userdata));
+/* TODO: Enable MFEX selector/autovalidator as done for DPCLS.
+ *   This code shows the POC value, not final upstream code.
+ *   As the code uses AVX512-VBMI, check for ISA at runtime.
+ */
+int avx512vbmi = dpdk_get_cpu_has_isa("x86_64", "avx512vbmi");
+if (avx512vbmi) {
+pmd->mfex_func = mfex_avx512_ipv4_udp;
+}
 }
 
 struct dpif_userdata *ud = pmd->netdev_input_func_userdata;
@@ -84,6 +95,14 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
*pmd,
 struct netdev_flow_key **key_ptrs = ud->key_ptrs;
 struct pkt_flow_meta *pkt_meta = ud->pkt_meta;
 
+/* TODO: make runtime command to allow users to disable/enable.
+ * Not all users need TCP-flags or bytes per rule, and it costs performance
+ * to always calculate it. Enabling this costs ~6 cycles/pkt. It will be
+ * enabled by default for consistency & backwards compat, but disabling
+ * could be investigated by users if they so desire.
+ */
+uint32_t do_pkt_meta = 1;
+
 /* Stores the computed output: a rule pointer for each packet */
 /* The AVX512 DPIF implementation handles rules in a way that is optimized
  * for reducing data-movement between HWOL/EMC/SMC and DPCLS. This is
@@ -92,7 +111,8 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
*pmd,
  * array. Later the two arrays are merged by AVX-512 expand instructions.
  */
 struct dpcls_rule *rules[NETDEV_MAX_BURST];
-struct dpcls_rule *dpcls_rules[NETDEV_MAX_BURST];
+struct dpcls_rule *dpcls_rules_impl[NETDEV_MAX_BURST];
+struct dpcls_rule **dpcls_rules = dpcls_rules_impl;
 uint32_t dpcls_key_idx = 0;
 
 for (uint32_t i = 0; i < NETDEV_MAX_BURST; i += 8) {
@@ -100,12 +120,8 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
*pmd,
 _mm512_storeu_si512(_rules[i],

[ovs-dev] [PATCH v9 12/16] dpdk: Cache result of CPU ISA checks.

2021-02-12 Thread Harry van Haaren

As a small optimization, this patch caches the result of a CPU ISA
check from DPDK. Particularly in the case of running the DPCLS
autovalidator (which repeatedly probes subtables) this reduces
the number of CPU ISA lookups made at the DPDK level.

By caching them at the OVS/dpdk.c level, the ISA checks are still
performed at runtime on the CPU where they are executed, but subsequent
checks for the same ISA feature become much cheaper.

Signed-off-by: Harry van Haaren 
Co-authored-by: Cian Ferriter 
Signed-off-by: Cian Ferriter 

---

v8: Add NEWS entry.
---
 NEWS   |  1 +
 lib/dpdk.c | 28 
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/NEWS b/NEWS
index a5bb16da2..0a093e582 100644
--- a/NEWS
+++ b/NEWS
@@ -33,6 +33,7 @@ v2.15.0 - xx xxx 
- DPDK:
  * Removed support for vhost-user dequeue zero-copy.
  * Add support for DPDK 20.11.
+ * Cache results for CPU ISA checks, reduces overhead on repeated lookups.
- Userspace datapath:
  * Add the 'pmd' option to "ovs-appctl dpctl/dump-flows", which
restricts a flow dump to a single PMD thread if set.
diff --git a/lib/dpdk.c b/lib/dpdk.c
index 319540394..c883a4b8b 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -614,13 +614,33 @@ print_dpdk_version(void)
 puts(rte_version());
 }
 
+/* Avoid calling rte_cpu_get_flag_enabled() excessively, by caching the
+ * result of the call for each CPU flag in a static variable. To avoid
+ * allocating large numbers of static variables, use a uint8 as a bitfield.
+ * Note the macro must only return if the ISA check is done and available.
+ */
+#define ISA_CHECK_DONE_BIT (1 << 0)
+#define ISA_AVAILABLE_BIT  (1 << 1)
+
 #define CHECK_CPU_FEATURE(feature, name_str, RTE_CPUFLAG)   \
 do {\
 if (strncmp(feature, name_str, strlen(name_str)) == 0) {\
-int has_isa = rte_cpu_get_flag_enabled(RTE_CPUFLAG);\
-VLOG_DBG("CPU flag %s, available %s\n", name_str,   \
-  has_isa ? "yes" : "no");  \
-return true;\
+static uint8_t isa_check_##RTE_CPUFLAG; \
+int check = isa_check_##RTE_CPUFLAG & ISA_CHECK_DONE_BIT;   \
+if (OVS_UNLIKELY(!check)) { \
+int has_isa = rte_cpu_get_flag_enabled(RTE_CPUFLAG);\
+VLOG_DBG("CPU flag %s, available %s\n", \
+ name_str, has_isa ? "yes" : "no"); \
+isa_check_##RTE_CPUFLAG = ISA_CHECK_DONE_BIT;   \
+if (has_isa) {  \
+isa_check_##RTE_CPUFLAG |= ISA_AVAILABLE_BIT;   \
+}   \
+}   \
+if (isa_check_##RTE_CPUFLAG & ISA_AVAILABLE_BIT) {  \
+return true;\
+} else {\
+return false;   \
+}   \
 }   \
 } while (0)
 
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v9 05/16] dpif-avx512: Add HWOL support to avx512 dpif.

2021-02-12 Thread Harry van Haaren

Partial hardware offload is implemented in a very similar way to the
scalar dpif.

Signed-off-by: Harry van Haaren 
---
 lib/dpif-netdev-avx512.c | 28 +---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
index 10228aeb0..caba1fa1c 100644
--- a/lib/dpif-netdev-avx512.c
+++ b/lib/dpif-netdev-avx512.c
@@ -27,6 +27,7 @@
 #include "dpif-netdev-private-dpcls.h"
 #include "dpif-netdev-private-flow.h"
 #include "dpif-netdev-private-thread.h"
+#include "dpif-netdev-private-hwol.h"
 
 #include "dp-packet.h"
 #include "netdev.h"
@@ -111,9 +112,32 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
*pmd,
 uint32_t i = __builtin_ctz(iter);
 iter = _blsr_u64(iter);
 
-/* Initialize packet md and do miniflow extract */
+/* Get packet pointer from bitmask and packet md */
 struct dp_packet *packet = packets->packets[i];
 pkt_metadata_init(>md, in_port);
+
+struct dp_netdev_flow *f = NULL;
+
+/* Check for partial hardware offload mark */
+uint32_t mark;
+if (dp_packet_has_flow_mark(packet, )) {
+f = mark_to_flow_find(pmd, mark);
+if (f) {
+rules[i] = >cr;
+
+/* This is nasty - instead of using the HWOL provided flow,
+ * parse the packet data anyway to find the location of the TCP
+ * header to extract the TCP flags for the rule.
+ */
+pkt_meta[i].tcp_flags = parse_tcp_flags(packet);
+
+pkt_meta[i].bytes = dp_packet_size(packet);
+hwol_emc_smc_hitmask |= (1 << i);
+continue;
+}
+}
+
+/* Do miniflow extract into keys */
 struct netdev_flow_key *key = [i];
 miniflow_extract(packet, >mf);
 
@@ -124,8 +148,6 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
*pmd,
 key->len = netdev_flow_key_size(miniflow_n_values(>mf));
 key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet, >mf);
 
-struct dp_netdev_flow *f = NULL;
-
 if (emc_enabled) {
 f = emc_lookup(>emc_cache, key);
 
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v9 15/16] netdev: Optimize netdev_send_prepare_batch

2021-02-12 Thread Harry van Haaren

Optimize for the best case here where all packets will be compatible
with 'netdev_flags'.

Signed-off-by: Harry van Haaren 
Co-authored-by: Cian Ferriter 
Signed-off-by: Cian Ferriter 

---

v9: rebase 2
---
 NEWS |  2 ++
 lib/netdev.c | 31 ++-
 2 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/NEWS b/NEWS
index 2ffc155f9..cbdcf53a1 100644
--- a/NEWS
+++ b/NEWS
@@ -14,6 +14,8 @@ Post-v2.15.0
CPU supports it. This enhances performance by using the native vpopcount
instructions, instead of the emulated version of vpopcount.
  * Optimize dp_netdev_output by enhancing compiler optimization potential.
+ * Optimize netdev sending by assuming the happy case, and using fallback
+   for when the netdev doesn't meet the required HWOL needs of a packet.
 
 v2.15.0 - xx xxx 
 -
diff --git a/lib/netdev.c b/lib/netdev.c
index 91e91955c..29a5f1aa9 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -837,20 +837,33 @@ static void
 netdev_send_prepare_batch(const struct netdev *netdev,
   struct dp_packet_batch *batch)
 {
-struct dp_packet *packet;
-size_t i, size = dp_packet_batch_size(batch);
+struct dp_packet *p;
+uint32_t i, size = dp_packet_batch_size(batch);
+char *err_msg = NULL;
 
-DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) {
-char *errormsg = NULL;
+for (i = 0; i < size; i++) {
+p = batch->packets[i];
+int pkt_ok = netdev_send_prepare_packet(netdev->ol_flags, p, _msg);
 
-if (netdev_send_prepare_packet(netdev->ol_flags, packet, )) {
-dp_packet_batch_refill(batch, packet, i);
+if (OVS_UNLIKELY(!pkt_ok)) {
+goto refill_loop;
+}
+}
+
+return;
+
+refill_loop:
+/* Loop through packets from the start of the batch again. This is the
+ * exceptional case where packets aren't compatible with 'netdev_flags'. */
+DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, p, batch) {
+if (netdev_send_prepare_packet(netdev->ol_flags, p, _msg)) {
+dp_packet_batch_refill(batch, p, i);
 } else {
-dp_packet_delete(packet);
+dp_packet_delete(p);
 COVERAGE_INC(netdev_send_prepare_drops);
 VLOG_WARN_RL(, "%s: Packet dropped: %s",
- netdev_get_name(netdev), errormsg);
-free(errormsg);
+ netdev_get_name(netdev), err_msg);
+free(err_msg);
 }
 }
 }
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v9 14/16] dpif-netdev: Optimize dp output action

2021-02-12 Thread Harry van Haaren

This commit optimizes the output action, by enabling the compiler to
optimize the code better through reducing code complexity.

The core concept of this optimization is that the array-length checks
have already been performed above the copying code, so the per-packet
checks can be removed. Removing the per-packet length checks allows the
compiler to auto-vectorize the stores using SIMD registers.

Signed-off-by: Harry van Haaren 

---

v8: Add NEWS entry.
---
 NEWS  |  1 +
 lib/dpif-netdev.c | 23 ++-
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/NEWS b/NEWS
index 5f1e3b5e0..2ffc155f9 100644
--- a/NEWS
+++ b/NEWS
@@ -13,6 +13,7 @@ Post-v2.15.0
  * Enable the AVX512 DPCLS implementation to use VPOPCNT instruction if the
CPU supports it. This enhances performance by using the native vpopcount
instructions, instead of the emulated version of vpopcount.
+ * Optimize dp_netdev_output by enhancing compiler optimization potential.
 
 v2.15.0 - xx xxx 
 -
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 5e83755d7..b2cf1bd46 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -7254,12 +7254,25 @@ dp_execute_output_action(struct dp_netdev_pmd_thread 
*pmd,
 pmd->n_output_batches++;
 }
 
-struct dp_packet *packet;
-DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
-p->output_pkts_rxqs[dp_packet_batch_size(>output_pkts)] =
-pmd->ctx.last_rxq;
-dp_packet_batch_add(>output_pkts, packet);
+/* The above checks ensure that there is enough space in the output batch.
+ * Using dp_packet_batch_add() has a branch to check if the batch is full.
+ * This branch reduces the compiler's ability to optimize efficiently. The
+ * below code implements packet movement between batches without checks,
+ * with the required semantics of output batch perhaps containing packets.
+ */
+int batch_size = dp_packet_batch_size(packets_);
+int out_batch_idx = dp_packet_batch_size(>output_pkts);
+struct dp_netdev_rxq *rxq = pmd->ctx.last_rxq;
+struct dp_packet_batch *output_batch = >output_pkts;
+
+for (int i = 0; i < batch_size; i++) {
+struct dp_packet *packet = packets_->packets[i];
+p->output_pkts_rxqs[out_batch_idx] = rxq;
+output_batch->packets[out_batch_idx] = packet;
+out_batch_idx++;
 }
+output_batch->count += batch_size;
+
 return true;
 }
 
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v9 11/16] dpif-netdev/dpcls: specialize more subtable signatures.

2021-02-12 Thread Harry van Haaren

This commit adds more subtables to be specialized. The traffic
pattern being targeted here is VXLAN traffic, whose subtables commonly
have (5,3), (9,1) and (9,4) fingerprints.

Signed-off-by: Harry van Haaren 

---

v8: Add NEWS entry.
---
 NEWS   | 2 ++
 lib/dpif-netdev-lookup-avx512-gather.c | 6 ++
 lib/dpif-netdev-lookup-generic.c   | 6 ++
 3 files changed, 14 insertions(+)

diff --git a/NEWS b/NEWS
index d3b9221ed..a5bb16da2 100644
--- a/NEWS
+++ b/NEWS
@@ -8,6 +8,8 @@ Post-v2.15.0
packets. It supports partial HWOL, EMC, SMC and DPCLS lookups.
  * Add commands to get and set the dpif implementations.
  * Enable AVX512 optimized DPCLS to search subtables with larger miniflows.
+ * Add more specialized DPCLS subtables to cover common rules, enhancing
+   the lookup performance.
 
 v2.15.0 - xx xxx 
 -
diff --git a/lib/dpif-netdev-lookup-avx512-gather.c 
b/lib/dpif-netdev-lookup-avx512-gather.c
index 1f27c0536..3a684fadf 100644
--- a/lib/dpif-netdev-lookup-avx512-gather.c
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -299,6 +299,9 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,
 return avx512_lookup_impl(subtable, keys_map, keys, rules, U0, U1);   \
 } \
 
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 4)
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 1)
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 3)
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1)
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 1)
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 0)
@@ -331,6 +334,9 @@ dpcls_subtable_avx512_gather_probe(uint32_t u0_bits, 
uint32_t u1_bits)
 return NULL;
 }
 
+CHECK_LOOKUP_FUNCTION(9, 4);
+CHECK_LOOKUP_FUNCTION(9, 1);
+CHECK_LOOKUP_FUNCTION(5, 3);
 CHECK_LOOKUP_FUNCTION(5, 1);
 CHECK_LOOKUP_FUNCTION(4, 1);
 CHECK_LOOKUP_FUNCTION(4, 0);
diff --git a/lib/dpif-netdev-lookup-generic.c b/lib/dpif-netdev-lookup-generic.c
index e3b6be4b6..6c74ac3a1 100644
--- a/lib/dpif-netdev-lookup-generic.c
+++ b/lib/dpif-netdev-lookup-generic.c
@@ -282,6 +282,9 @@ dpcls_subtable_lookup_generic(struct dpcls_subtable 
*subtable,
 return lookup_generic_impl(subtable, keys_map, keys, rules, U0, U1);  \
 } \
 
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 4)
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 1)
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 3)
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1)
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 1)
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 0)
@@ -303,6 +306,9 @@ dpcls_subtable_generic_probe(uint32_t u0_bits, uint32_t 
u1_bits)
 {
 dpcls_subtable_lookup_func f = NULL;
 
+CHECK_LOOKUP_FUNCTION(9, 4);
+CHECK_LOOKUP_FUNCTION(9, 1);
+CHECK_LOOKUP_FUNCTION(5, 3);
 CHECK_LOOKUP_FUNCTION(5, 1);
 CHECK_LOOKUP_FUNCTION(4, 1);
 CHECK_LOOKUP_FUNCTION(4, 0);
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v9 13/16] dpcls-avx512: enabling avx512 vector popcount instruction.

2021-02-12 Thread Harry van Haaren

This commit enables the AVX512-VPOPCNTDQ Vector Popcount
instruction. This instruction is not available on every CPU
that supports the AVX512-F Foundation ISA, hence it is enabled
only when the additional VPOPCNTDQ ISA check is passed.

The vector popcount instruction is used instead of the AVX512
popcount emulation code present in the avx512 optimized DPCLS today.
It provides higher performance in the SIMD miniflow processing
as that requires the popcount to calculate the miniflow block indexes.

Signed-off-by: Harry van Haaren 

---

v8: Add NEWS entry.
---
 NEWS   |  3 +
 lib/dpdk.c |  1 +
 lib/dpif-netdev-lookup-avx512-gather.c | 84 --
 3 files changed, 70 insertions(+), 18 deletions(-)

diff --git a/NEWS b/NEWS
index 0a093e582..5f1e3b5e0 100644
--- a/NEWS
+++ b/NEWS
@@ -10,6 +10,9 @@ Post-v2.15.0
  * Enable AVX512 optimized DPCLS to search subtables with larger miniflows.
  * Add more specialized DPCLS subtables to cover common rules, enhancing
the lookup performance.
+ * Enable the AVX512 DPCLS implementation to use VPOPCNT instruction if the
+   CPU supports it. This enhances performance by using the native vpopcount
+   instructions, instead of the emulated version of vpopcount.
 
 v2.15.0 - xx xxx 
 -
diff --git a/lib/dpdk.c b/lib/dpdk.c
index c883a4b8b..a9494a40f 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -655,6 +655,7 @@ dpdk_get_cpu_has_isa(const char *arch, const char *feature)
 #if __x86_64__
 /* CPU flags only defined for the architecture that support it. */
 CHECK_CPU_FEATURE(feature, "avx512f", RTE_CPUFLAG_AVX512F);
+CHECK_CPU_FEATURE(feature, "avx512vpopcntdq", RTE_CPUFLAG_AVX512VPOPCNTDQ);
 CHECK_CPU_FEATURE(feature, "bmi2", RTE_CPUFLAG_BMI2);
 #endif
 
diff --git a/lib/dpif-netdev-lookup-avx512-gather.c 
b/lib/dpif-netdev-lookup-avx512-gather.c
index 3a684fadf..9a3273dc6 100644
--- a/lib/dpif-netdev-lookup-avx512-gather.c
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -53,6 +53,15 @@
 
 VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
 
+
+/* Wrapper function required to enable ISA. */
+static inline __m512i
+__attribute__((__target__("avx512vpopcntdq")))
+_mm512_popcnt_epi64_wrapper(__m512i v_in)
+{
+return _mm512_popcnt_epi64(v_in);
+}
+
 static inline __m512i
 _mm512_popcnt_epi64_manual(__m512i v_in)
 {
@@ -126,7 +135,8 @@ avx512_blocks_gather(__m512i v_u0, /* reg of u64 of all u0 
bits */
  __mmask64 u1_bcast_msk,  /* mask of u1 lanes */
  const uint64_t pkt_mf_u0_pop, /* num bits in u0 of pkt */
  __mmask64 zero_mask, /* maskz if pkt not have mf bit */
- __mmask64 u64_lanes_mask) /* total lane count to use */
+ __mmask64 u64_lanes_mask, /* total lane count to use */
+ const uint32_t use_vpop)  /* use AVX512 vpopcntdq */
 {
 /* Suggest to compiler to load tbl blocks ahead of gather() */
 __m512i v_tbl_blocks = _mm512_maskz_loadu_epi64(u64_lanes_mask,
@@ -140,8 +150,15 @@ avx512_blocks_gather(__m512i v_u0, /* reg of u64 of all u0 
bits */
   tbl_mf_masks);
 __m512i v_masks = _mm512_and_si512(v_pkt_bits, v_tbl_masks);
 
-/* Manual AVX512 popcount for u64 lanes. */
-__m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
+/* Calculate AVX512 popcount for u64 lanes using the native instruction
+ * if available, or using emulation if not available.
+ */
+__m512i v_popcnts;
+if (use_vpop) {
+v_popcnts = _mm512_popcnt_epi64_wrapper(v_masks);
+} else {
+v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
+}
 
 /* Add popcounts and offset for u1 bits. */
 __m512i v_idx_u0_offset = _mm512_maskz_set1_epi64(u1_bcast_msk,
@@ -166,7 +183,8 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,
const struct netdev_flow_key *keys[],
struct dpcls_rule **rules,
const uint32_t bit_count_u0,
-   const uint32_t bit_count_u1)
+   const uint32_t bit_count_u1,
+   const uint32_t use_vpop)
 {
 OVS_ALIGNED_VAR(CACHE_LINE_SIZE)uint64_t block_cache[BLOCKS_CACHE_SIZE];
 uint32_t hashes[NETDEV_MAX_BURST];
@@ -218,7 +236,8 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,
 u1_bcast_mask,
 pkt_mf_u0_pop,
 zero_mask,
-bit_count_total_mask);
+bit_count_total_mask,
+use_vpop);
 _mm512_storeu_si512(_c

[ovs-dev] [PATCH v9 10/16] dpif-netdev/dpcls-avx512: enable 16 block processing.

2021-02-12 Thread Harry van Haaren

This commit implements larger subtable searches in avx512. A limitation
of the previous implementation was that up to 8 blocks of miniflow
data could be matched on (so a subtable with 8 blocks was handled
in avx, but 9 blocks or more would fall back to scalar/generic).
This limitation is removed in this patch, where up to 16 blocks
of subtable can be matched on.

From an implementation perspective, the key to enabling 16 blocks
over 8 blocks was to do bitmask calculation up front, and then use
the pre-calculated bitmasks for 2x passes of the "blocks gather"
routine. The bitmasks need to be shifted for k-mask usage in the
upper (8-15) block range, but it is relatively trivial. This also
helps in case expanding to 24 blocks is desired in future.

The implementation of the 2nd iteration to handle > 8 blocks is
behind a conditional branch which checks the total number of bits.
This helps the specialized versions of the function that have a
miniflow fingerprint of less-than-or-equal 8 blocks, as the code
can be statically stripped out of those functions. Specialized
functions that do require more than 8 blocks will have the branch
removed and unconditionally execute the 2nd blocks gather routine.

Lastly, the _any() flavour will have the conditional branch, and
the branch predictor may mispredict a bit, but per burst will
likely get most packets correct (particularly towards the middle
and end of a burst).

The code has been run with unit tests under autovalidation and
passes all cases, and unit test coverage has been checked to
ensure the 16 block code paths are executing.

Signed-off-by: Harry van Haaren 

---

v9: Fixup post 2.15 rebase on NEWS
v8: Add NEWS entry
---
 NEWS   |   1 +
 lib/dpif-netdev-lookup-avx512-gather.c | 203 ++---
 2 files changed, 147 insertions(+), 57 deletions(-)

diff --git a/NEWS b/NEWS
index a03e9d7be..d3b9221ed 100644
--- a/NEWS
+++ b/NEWS
@@ -7,6 +7,7 @@ Post-v2.15.0
  * Add avx512 implementation of dpif which can process non recirculated
packets. It supports partial HWOL, EMC, SMC and DPCLS lookups.
  * Add commands to get and set the dpif implementations.
+ * Enable AVX512 optimized DPCLS to search subtables with larger miniflows.
 
 v2.15.0 - xx xxx 
 -
diff --git a/lib/dpif-netdev-lookup-avx512-gather.c 
b/lib/dpif-netdev-lookup-avx512-gather.c
index 8fc1cdfa5..1f27c0536 100644
--- a/lib/dpif-netdev-lookup-avx512-gather.c
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -34,7 +34,21 @@
  * AVX512 code at a time.
  */
 #define NUM_U64_IN_ZMM_REG (8)
-#define BLOCKS_CACHE_SIZE (NETDEV_MAX_BURST * NUM_U64_IN_ZMM_REG)
+
+/* This implementation of AVX512 gather allows up to 16 blocks of MF data to be
+ * present in the blocks_cache, hence the multiply by 2 in the blocks count.
+ */
+#define MF_BLOCKS_PER_PACKET (NUM_U64_IN_ZMM_REG * 2)
+
+/* Blocks cache size is the maximum number of miniflow blocks that this
+ * implementation of lookup can handle.
+ */
+#define BLOCKS_CACHE_SIZE (NETDEV_MAX_BURST * MF_BLOCKS_PER_PACKET)
+
+/* The gather instruction can handle a scale for the size of the items to
+ * gather. For uint64_t data, this scale is 8.
+ */
+#define GATHER_SCALE_8 (8)
 
 
 VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
@@ -69,22 +83,83 @@ netdev_rule_matches_key(const struct dpcls_rule *rule,
 {
 const uint64_t *keyp = miniflow_get_values(>flow.mf);
 const uint64_t *maskp = miniflow_get_values(>mask->mf);
-const uint32_t lane_mask = (1 << mf_bits_total) - 1;
+const uint32_t lane_mask = (1ULL << mf_bits_total) - 1;
 
 /* Always load a full cache line from blocks_cache. Other loads must be
  * trimmed to the amount of data required for mf_bits_total blocks.
  */
-__m512i v_blocks = _mm512_loadu_si512(_cache[0]);
-__m512i v_mask   = _mm512_maskz_loadu_epi64(lane_mask, [0]);
-__m512i v_key= _mm512_maskz_loadu_epi64(lane_mask, [0]);
+uint32_t res_mask;
+
+{
+__m512i v_blocks = _mm512_loadu_si512(_cache[0]);
+__m512i v_mask   = _mm512_maskz_loadu_epi64(lane_mask, [0]);
+__m512i v_key= _mm512_maskz_loadu_epi64(lane_mask, [0]);
+__m512i v_data = _mm512_and_si512(v_blocks, v_mask);
+res_mask = _mm512_mask_cmpeq_epi64_mask(lane_mask, v_data, v_key);
+}
 
-__m512i v_data = _mm512_and_si512(v_blocks, v_mask);
-uint32_t res_mask = _mm512_mask_cmpeq_epi64_mask(lane_mask, v_data, v_key);
+if (mf_bits_total > 8) {
+uint32_t lane_mask_gt8 = lane_mask >> 8;
+__m512i v_blocks = _mm512_loadu_si512(_cache[8]);
+__m512i v_mask   = _mm512_maskz_loadu_epi64(lane_mask_gt8, [8]);
+__m512i v_key= _mm512_maskz_loadu_epi64(lane_mask_gt8, [8]);
+__m512i v_data = _mm512_and_si512(v_blocks, v_mask);
+uint32_t c = _mm512_mask_cmpeq_epi64_mask(lane_mask_gt8, v_data,
+

[ovs-dev] [PATCH v9 08/16] docs/dpdk/bridge: Add dpif performance section.

2021-02-12 Thread Harry van Haaren

From: Cian Ferriter 

This section details how two new commands can be used to list and select
the different dpif implementations. It also details how a non default
dpif implementation can be tested with the OVS unit test suite.

Add NEWS updates for the dpif-netdev.c refactor and the new dpif
implementations/commands.

Signed-off-by: Cian Ferriter 

---

v8:
- Merge NEWS file items into one Userspace Datapath: heading
---
 Documentation/topics/dpdk/bridge.rst | 37 
 NEWS |  6 -
 2 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/Documentation/topics/dpdk/bridge.rst 
b/Documentation/topics/dpdk/bridge.rst
index 526d5c959..ca90d7bdb 100644
--- a/Documentation/topics/dpdk/bridge.rst
+++ b/Documentation/topics/dpdk/bridge.rst
@@ -214,3 +214,40 @@ implementation ::
 
 Compile OVS in debug mode to have `ovs_assert` statements error out if
 there is a mis-match in the DPCLS lookup implementation.
+
+Datapath Interface Performance
+--
+
+The datapath interface (DPIF) or dp_netdev_input() is responsible for taking
+packets through the major components of the userspace datapath; such as
+miniflow_extract, EMC, SMC and DPCLS lookups, and a lot of the performance
+stats associated with the datapath.
+
+Just like with the SIMD DPCLS work above, SIMD can be applied to the DPIF to
+improve performance.
+
+OVS provides multiple implementations of the DPIF. These can be listed with the
+following command ::
+
+$ ovs-appctl dpif-netdev/dpif-get
+Available DPIF implementations:
+  dpif_scalar
+  dpif_avx512
+
+By default, dpif_scalar is used. The DPIF implementation can be selected by
+name ::
+
+$ ovs-appctl dpif-netdev/dpif-set dpif_avx512
+DPIF implementation set to dpif_avx512.
+
+$ ovs-appctl dpif-netdev/dpif-set dpif_scalar
+DPIF implementation set to dpif_scalar.
+
+Running Unit Tests with AVX512 DPIF
+~~~
+
+Since the AVX512 DPIF is disabled by default, a compile time option is
+available in order to test it with the OVS unit test suite. When building with
+a CPU that supports AVX512, use the following configure option ::
+
+$ ./configure --enable-dpif-default-avx512
diff --git a/NEWS b/NEWS
index a7bffce97..a03e9d7be 100644
--- a/NEWS
+++ b/NEWS
@@ -2,7 +2,11 @@ Post-v2.15.0
 -
- In ovs-vsctl and vtep-ctl, the "find" command now accept new
  operators {in} and {not-in}.
-
+   - Userspace Datapath:
+ * Refactor lib/dpif-netdev.c to multiple header files.
+ * Add avx512 implementation of dpif which can process non recirculated
+   packets. It supports partial HWOL, EMC, SMC and DPCLS lookups.
+ * Add commands to get and set the dpif implementations.
 
 v2.15.0 - xx xxx 
 -
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v9 07/16] dpif-netdev: Add command to get dpif implementations.

2021-02-12 Thread Harry van Haaren

This commit adds a new command to retrieve the list of available
DPIF implementations. This can be used to check which implementations
of the DPIF are available in any given OVS binary.

Usage:
 $ ovs-appctl dpif-netdev/dpif-get

Signed-off-by: Harry van Haaren 
---
 lib/dpif-netdev-private-dpif.c |  7 +++
 lib/dpif-netdev-private-dpif.h |  6 ++
 lib/dpif-netdev.c  | 24 
 3 files changed, 37 insertions(+)

diff --git a/lib/dpif-netdev-private-dpif.c b/lib/dpif-netdev-private-dpif.c
index 9e1f3b8f9..c5021fe9f 100644
--- a/lib/dpif-netdev-private-dpif.c
+++ b/lib/dpif-netdev-private-dpif.c
@@ -61,6 +61,13 @@ dp_netdev_impl_get_default(void)
 return func;
 }
 
+uint32_t
+dp_netdev_impl_get(const struct dpif_netdev_impl_info_t **out_impls)
+{
+ovs_assert(out_impls);
+*out_impls = dpif_impls;
+return ARRAY_SIZE(dpif_impls);
+}
 
 /* This function checks all available DPIF implementations, and selects the
  * returns the function pointer to the one requested by "name".
diff --git a/lib/dpif-netdev-private-dpif.h b/lib/dpif-netdev-private-dpif.h
index a09f90acc..99fbda943 100644
--- a/lib/dpif-netdev-private-dpif.h
+++ b/lib/dpif-netdev-private-dpif.h
@@ -47,6 +47,12 @@ struct dpif_netdev_impl_info_t {
 const char *name;
 };
 
+/* This function returns all available implementations to the caller. The
+ * quantity of implementations is returned by the int return value.
+ */
+uint32_t
+dp_netdev_impl_get(const struct dpif_netdev_impl_info_t **out_impls);
+
 /* This function checks all available DPIF implementations, and selects the
  * returns the function pointer to the one requested by "name".
  */
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 564d94a97..dff844f99 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -992,6 +992,27 @@ dpif_netdev_subtable_lookup_set(struct unixctl_conn *conn, 
int argc,
 ds_destroy();
 }
 
+static void
+dpif_netdev_impl_get(struct unixctl_conn *conn, int argc OVS_UNUSED,
+ const char *argv[] OVS_UNUSED, void *aux OVS_UNUSED)
+{
+const struct dpif_netdev_impl_info_t *dpif_impls;
+uint32_t count = dp_netdev_impl_get(_impls);
+if (count == 0) {
+unixctl_command_reply_error(conn, "error getting dpif names");
+return;
+}
+
+/* Add all dpif functions to reply string. */
+struct ds reply = DS_EMPTY_INITIALIZER;
+ds_put_cstr(, "Available DPIF implementations:\n");
+for (uint32_t i = 0; i < count; i++) {
+ds_put_format(, "  %s\n", dpif_impls[i].name);
+}
+unixctl_command_reply(conn, ds_cstr());
+ds_destroy();
+}
+
 static void
 dpif_netdev_impl_set(struct unixctl_conn *conn, int argc,
  const char *argv[], void *aux OVS_UNUSED)
@@ -1290,6 +1311,9 @@ dpif_netdev_init(void)
  "[dpif implementation name] [dp]",
  1, 2, dpif_netdev_impl_set,
  NULL);
+unixctl_command_register("dpif-netdev/dpif-get", "",
+ 0, 0, dpif_netdev_impl_get,
+ NULL);
 return 0;
 }
 
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v9 09/16] dpif-netdev/dpcls: Refactor function names to dpcls.

2021-02-12 Thread Harry van Haaren

This commit refactors the function names from netdev_*
namespace to the dpcls_* namespace, as they are only used
by dpcls code. With the name change, it becomes more obvious
that the functions belong to dpcls functionality, and in the
dpif-netdev-private-dpcls.h header file.

Signed-off-by: Harry van Haaren 
---
 lib/dpif-netdev-private-dpcls.h |  6 ++
 lib/dpif-netdev.c   | 21 ++---
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/lib/dpif-netdev-private-dpcls.h b/lib/dpif-netdev-private-dpcls.h
index 5bc579bba..e66cae3f4 100644
--- a/lib/dpif-netdev-private-dpcls.h
+++ b/lib/dpif-netdev-private-dpcls.h
@@ -97,10 +97,8 @@ struct dpcls_subtable {
 
 /* Generates a mask for each bit set in the subtable's miniflow. */
 void
-netdev_flow_key_gen_masks(const struct netdev_flow_key *tbl,
-  uint64_t *mf_masks,
-  const uint32_t mf_bits_u0,
-  const uint32_t mf_bits_u1);
+dpcls_flow_key_gen_masks(const struct netdev_flow_key *tbl, uint64_t *mf_masks,
+ const uint32_t mf_bits_u0, const uint32_t mf_bits_u1);
 
 /* Matches a dpcls rule against the incoming packet in 'target' */
 bool dpcls_rule_matches_key(const struct dpcls_rule *rule,
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index dff844f99..5e83755d7 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -8278,7 +8278,7 @@ dpcls_create_subtable(struct dpcls *cls, const struct 
netdev_flow_key *mask)
 subtable->mf_bits_set_unit0 = unit0;
 subtable->mf_bits_set_unit1 = unit1;
 subtable->mf_masks = xmalloc(sizeof(uint64_t) * (unit0 + unit1));
-netdev_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1);
+dpcls_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1);
 
 /* Get the preferred subtable search function for this (u0,u1) subtable.
  * The function is guaranteed to always return a valid implementation, and
@@ -8453,11 +8453,10 @@ dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
 }
 }
 
-/* Inner loop for mask generation of a unit, see netdev_flow_key_gen_masks. */
+/* Inner loop for mask generation of a unit, see dpcls_flow_key_gen_masks. */
 static inline void
-netdev_flow_key_gen_mask_unit(uint64_t iter,
-  const uint64_t count,
-  uint64_t *mf_masks)
+dpcls_flow_key_gen_mask_unit(uint64_t iter, const uint64_t count,
+ uint64_t *mf_masks)
 {
 int i;
 for (i = 0; i < count; i++) {
@@ -8478,16 +8477,16 @@ netdev_flow_key_gen_mask_unit(uint64_t iter,
  * @param mf_bits_unit0 Number of bits set in unit0 of the miniflow
  */
 void
-netdev_flow_key_gen_masks(const struct netdev_flow_key *tbl,
-  uint64_t *mf_masks,
-  const uint32_t mf_bits_u0,
-  const uint32_t mf_bits_u1)
+dpcls_flow_key_gen_masks(const struct netdev_flow_key *tbl,
+ uint64_t *mf_masks,
+ const uint32_t mf_bits_u0,
+ const uint32_t mf_bits_u1)
 {
 uint64_t iter_u0 = tbl->mf.map.bits[0];
 uint64_t iter_u1 = tbl->mf.map.bits[1];
 
-netdev_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, _masks[0]);
-netdev_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, _masks[mf_bits_u0]);
+dpcls_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, _masks[0]);
+dpcls_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, _masks[mf_bits_u0]);
 }
 
 /* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v9 06/16] dpif-netdev: Add command to switch dpif implementation.

2021-02-12 Thread Harry van Haaren

This commit adds a new command to allow the user to switch
the active DPIF implementation at runtime. A probe function
is executed before switching the DPIF implementation, to ensure
the CPU is capable of running the ISA required. For example, the
below code will switch to the AVX512 enabled DPIF assuming
that the runtime CPU is capable of running AVX512 instructions:

 $ ovs-appctl dpif-netdev/dpif-set dpif_avx512

A new configuration flag is added to allow selection of the
default DPIF. This is useful for running the unit-tests against
the available DPIF implementations, without modifying each unit test.

The design of the testing & validation for ISA optimized DPIF
implementations is based around the work already upstream for DPCLS.
Note however that a DPCLS lookup has no state or side-effects, allowing
the auto-validator implementation to perform multiple lookups and
provide consistent statistic counters.

The DPIF component does have state, so running two implementations in
parallel and comparing output is not a valid testing method, as there
are changes in DPIF statistic counters (side effects). As a result, the
DPIF is tested directly against the unit-tests.

Signed-off-by: Harry van Haaren 
Co-authored-by: Cian Ferriter 
Signed-off-by: Cian Ferriter 
---
 acinclude.m4 | 15 ++
 configure.ac |  1 +
 lib/automake.mk  |  1 +
 lib/dpif-netdev-avx512.c | 14 +
 lib/dpif-netdev-private-dpif.c   | 92 
 lib/dpif-netdev-private-dpif.h   | 43 ++-
 lib/dpif-netdev-private-thread.h | 12 +
 lib/dpif-netdev.c| 86 +++--
 8 files changed, 248 insertions(+), 16 deletions(-)
 create mode 100644 lib/dpif-netdev-private-dpif.c

diff --git a/acinclude.m4 b/acinclude.m4
index 435685c93..c9b0d56d6 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -30,6 +30,21 @@ AC_DEFUN([OVS_CHECK_DPCLS_AUTOVALIDATOR], [
   fi
 ])
 
+dnl Set OVS DPIF default implementation at configure time for running the unit
+dnl tests on the whole codebase without modifying tests per DPIF impl
+AC_DEFUN([OVS_CHECK_DPIF_AVX512_DEFAULT], [
+  AC_ARG_ENABLE([dpif-default-avx512],
+[AC_HELP_STRING([--enable-dpif-default-avx512], [Enable DPIF 
AVX512 implementation as default.])],
+[dpifavx512=yes],[dpifavx512=no])
+  AC_MSG_CHECKING([whether DPIF AVX512 is default implementation])
+  if test "$dpifavx512" != yes; then
+AC_MSG_RESULT([no])
+  else
+OVS_CFLAGS="$OVS_CFLAGS -DDPIF_AVX512_DEFAULT"
+AC_MSG_RESULT([yes])
+  fi
+])
+
 dnl OVS_ENABLE_WERROR
 AC_DEFUN([OVS_ENABLE_WERROR],
   [AC_ARG_ENABLE(
diff --git a/configure.ac b/configure.ac
index c077034d4..e45685a6c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -185,6 +185,7 @@ OVS_ENABLE_WERROR
 OVS_ENABLE_SPARSE
 OVS_CTAGS_IDENTIFIERS
 OVS_CHECK_DPCLS_AUTOVALIDATOR
+OVS_CHECK_DPIF_AVX512_DEFAULT
 OVS_CHECK_BINUTILS_AVX512
 
 AC_ARG_VAR(KARCH, [Kernel Architecture String])
diff --git a/lib/automake.mk b/lib/automake.mk
index d945d935e..5e493ebaf 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -115,6 +115,7 @@ lib_libopenvswitch_la_SOURCES = \
lib/dpif-netdev.h \
lib/dpif-netdev-private-dfc.h \
lib/dpif-netdev-private-dpcls.h \
+   lib/dpif-netdev-private-dpif.c \
lib/dpif-netdev-private-dpif.h \
lib/dpif-netdev-private-flow.h \
lib/dpif-netdev-private-hwol.h \
diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
index caba1fa1c..fff469e10 100644
--- a/lib/dpif-netdev-avx512.c
+++ b/lib/dpif-netdev-avx512.c
@@ -19,6 +19,7 @@
 #if !defined(__CHECKER__)
 
 #include 
+#include 
 
 #include "dpif-netdev.h"
 #include "dpif-netdev-perf.h"
@@ -54,6 +55,19 @@ struct dpif_userdata {
 struct pkt_flow_meta pkt_meta[NETDEV_MAX_BURST];
 };
 
+int32_t
+dp_netdev_input_outer_avx512_probe(void)
+{
+int avx512f_available = dpdk_get_cpu_has_isa("x86_64", "avx512f");
+int bmi2_available = dpdk_get_cpu_has_isa("x86_64", "bmi2");
+
+if (!avx512f_available || !bmi2_available) {
+return -ENOTSUP;
+}
+
+return 0;
+}
+
 int32_t
 dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
  struct dp_packet_batch *packets,
diff --git a/lib/dpif-netdev-private-dpif.c b/lib/dpif-netdev-private-dpif.c
new file mode 100644
index 0..9e1f3b8f9
--- /dev/null
+++ b/lib/dpif-netdev-private-dpif.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2020 Intel Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licens

[ovs-dev] [PATCH v9 02/16] dpif-netdev: Split HWOL out to own header file.

2021-02-12 Thread Harry van Haaren

This commit moves the datapath lookup functions required for
hardware offload to a separate file. This allows other DPIF
implementations to access the lookup functions, encouraging
code reuse.

Signed-off-by: Harry van Haaren 
---
 lib/automake.mk|  1 +
 lib/dpif-netdev-private-hwol.h | 63 ++
 lib/dpif-netdev.c  | 39 ++---
 3 files changed, 67 insertions(+), 36 deletions(-)
 create mode 100644 lib/dpif-netdev-private-hwol.h

diff --git a/lib/automake.mk b/lib/automake.mk
index 0e83145b5..9b3e06db6 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -114,6 +114,7 @@ lib_libopenvswitch_la_SOURCES = \
lib/dpif-netdev-private-dfc.h \
lib/dpif-netdev-private-dpcls.h \
lib/dpif-netdev-private-flow.h \
+   lib/dpif-netdev-private-hwol.h \
lib/dpif-netdev-private-thread.h \
lib/dpif-netdev-private.h \
lib/dpif-netdev-perf.c \
diff --git a/lib/dpif-netdev-private-hwol.h b/lib/dpif-netdev-private-hwol.h
new file mode 100644
index 0..447010ab8
--- /dev/null
+++ b/lib/dpif-netdev-private-hwol.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2015 Nicira, Inc.
+ * Copyright (c) 2020 Intel Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef DPIF_NETDEV_PRIVATE_HWOL_H
+#define DPIF_NETDEV_PRIVATE_HWOL_H 1
+
+#include "dpif-netdev-private-flow.h"
+
+#define MAX_FLOW_MARK   (UINT32_MAX - 1)
+#define INVALID_FLOW_MARK   0
+/* Zero flow mark is used to indicate the HW to remove the mark. A packet
+ * marked with zero mark is received in SW without a mark at all, so it
+ * cannot be used as a valid mark.
+ */
+
+struct megaflow_to_mark_data {
+const struct cmap_node node;
+ovs_u128 mega_ufid;
+uint32_t mark;
+};
+
+struct flow_mark {
+struct cmap megaflow_to_mark;
+struct cmap mark_to_flow;
+struct id_pool *pool;
+};
+
+/* allocated in dpif-netdev.c */
+extern struct flow_mark flow_mark;
+
+static inline struct dp_netdev_flow *
+mark_to_flow_find(const struct dp_netdev_pmd_thread *pmd,
+  const uint32_t mark)
+{
+struct dp_netdev_flow *flow;
+
+CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
+ _mark.mark_to_flow) {
+if (flow->mark == mark && flow->pmd_id == pmd->core_id &&
+flow->dead == false) {
+return flow;
+}
+}
+
+return NULL;
+}
+
+
+#endif /* dpif-netdev-private-hwol.h */
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 395a5c29d..840298f01 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -84,6 +84,8 @@
 #include "util.h"
 #include "uuid.h"
 
+#include "dpif-netdev-private-hwol.h"
+
 VLOG_DEFINE_THIS_MODULE(dpif_netdev);
 
 /* Auto Load Balancing Defaults */
@@ -1953,26 +1955,8 @@ dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread 
*pmd,
 return cls;
 }
 
-#define MAX_FLOW_MARK   (UINT32_MAX - 1)
-#define INVALID_FLOW_MARK   0
-/* Zero flow mark is used to indicate the HW to remove the mark. A packet
- * marked with zero mark is received in SW without a mark at all, so it
- * cannot be used as a valid mark.
- */
-
-struct megaflow_to_mark_data {
-const struct cmap_node node;
-ovs_u128 mega_ufid;
-uint32_t mark;
-};
-
-struct flow_mark {
-struct cmap megaflow_to_mark;
-struct cmap mark_to_flow;
-struct id_pool *pool;
-};
 
-static struct flow_mark flow_mark = {
+struct flow_mark flow_mark = {
 .megaflow_to_mark = CMAP_INITIALIZER,
 .mark_to_flow = CMAP_INITIALIZER,
 };
@@ -2141,23 +2125,6 @@ flow_mark_flush(struct dp_netdev_pmd_thread *pmd)
 }
 }
 
-static struct dp_netdev_flow *
-mark_to_flow_find(const struct dp_netdev_pmd_thread *pmd,
-  const uint32_t mark)
-{
-struct dp_netdev_flow *flow;
-
-CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
- _mark.mark_to_flow) {
-if (flow->mark == mark && flow->pmd_id == pmd->core_id &&
-flow->dead == false) {
-return flow;
-}
-}
-
-return NULL;
-}
-
 static struct dp_flow_offload_item *
 dp_netdev_alloc_flow_offload(struct dp_netdev_pmd_thread *pmd,
  struct dp_netdev_flow *flow,
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v9 04/16] dpif-avx512: Add ISA implementation of dpif.

2021-02-12 Thread Harry van Haaren

This commit adds the AVX512 implementation of DPIF functionality,
specifically the dp_netdev_input_outer_avx512 function. This function
only handles outer (no re-circulations), and is optimized to use the
AVX512 ISA for packet batching and other DPIF work.

Sparse is not able to handle the AVX512 intrinsics, causing compile
time failures, so it is disabled for this file.

Signed-off-by: Harry van Haaren 
Co-authored-by: Cian Ferriter 
Signed-off-by: Cian Ferriter 

---

v8:
- Fixup AVX512 mask to uint32_t conversion compilation warning.
---
 lib/automake.mk  |   5 +-
 lib/dpif-netdev-avx512.c | 264 +++
 lib/dpif-netdev-private-dfc.h|   8 +
 lib/dpif-netdev-private-dpif.h   |  32 
 lib/dpif-netdev-private-thread.h |  11 +-
 lib/dpif-netdev-private.h|  25 +++
 lib/dpif-netdev.c|  70 ++--
 7 files changed, 399 insertions(+), 16 deletions(-)
 create mode 100644 lib/dpif-netdev-avx512.c
 create mode 100644 lib/dpif-netdev-private-dpif.h

diff --git a/lib/automake.mk b/lib/automake.mk
index 9b3e06db6..d945d935e 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -33,11 +33,13 @@ lib_libopenvswitchavx512_la_CFLAGS = \
-mavx512f \
-mavx512bw \
-mavx512dq \
+   -mbmi \
-mbmi2 \
-fPIC \
$(AM_CFLAGS)
 lib_libopenvswitchavx512_la_SOURCES = \
-   lib/dpif-netdev-lookup-avx512-gather.c
+   lib/dpif-netdev-lookup-avx512-gather.c \
+   lib/dpif-netdev-avx512.c
 lib_libopenvswitchavx512_la_LDFLAGS = \
-static
 endif
@@ -113,6 +115,7 @@ lib_libopenvswitch_la_SOURCES = \
lib/dpif-netdev.h \
lib/dpif-netdev-private-dfc.h \
lib/dpif-netdev-private-dpcls.h \
+   lib/dpif-netdev-private-dpif.h \
lib/dpif-netdev-private-flow.h \
lib/dpif-netdev-private-hwol.h \
lib/dpif-netdev-private-thread.h \
diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
new file mode 100644
index 0..10228aeb0
--- /dev/null
+++ b/lib/dpif-netdev-avx512.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright (c) 2020 Intel.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __x86_64__
+/* Sparse cannot handle the AVX512 instructions */
+#if !defined(__CHECKER__)
+
+#include 
+
+#include "dpif-netdev.h"
+#include "dpif-netdev-perf.h"
+
+#include "dpif-netdev-private.h"
+#include "dpif-netdev-private-dpcls.h"
+#include "dpif-netdev-private-flow.h"
+#include "dpif-netdev-private-thread.h"
+
+#include "dp-packet.h"
+#include "netdev.h"
+
+#include "immintrin.h"
+
+/* Structure to contain per-packet metadata that must be attributed to the
+ * dp netdev flow. This is unfortunate to have to track per packet, however
+ * it's a bit awkward to maintain them in a performant way. This structure
+ * helps to keep two variables on a single cache line per packet.
+ */
+struct pkt_flow_meta {
+uint16_t bytes;
+uint16_t tcp_flags;
+};
+
+/* Structure of heap allocated memory for DPIF internals. */
+struct dpif_userdata {
+OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
+struct netdev_flow_key keys[NETDEV_MAX_BURST];
+OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
+struct netdev_flow_key *key_ptrs[NETDEV_MAX_BURST];
+OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
+struct pkt_flow_meta pkt_meta[NETDEV_MAX_BURST];
+};
+
+int32_t
+dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
+ struct dp_packet_batch *packets,
+ odp_port_t in_port)
+{
+/* Allocate DPIF userdata. */
+if (OVS_UNLIKELY(!pmd->netdev_input_func_userdata)) {
+pmd->netdev_input_func_userdata =
+xmalloc_pagealign(sizeof(struct dpif_userdata));
+}
+
+struct dpif_userdata *ud = pmd->netdev_input_func_userdata;
+struct netdev_flow_key *keys = ud->keys;
+struct netdev_flow_key **key_ptrs = ud->key_ptrs;
+struct pkt_flow_meta *pkt_meta = ud->pkt_meta;
+
+/* Stores the computed output: a rule pointer for each packet */
+/* The AVX512 DPIF implementation handles rules in a way that is optimized
+ * for reducing data-movement between HWOL/EMC/SMC and DPCLS. This is
+ * achieved by separating the rule arrays. Bitmasks are kept for each
+ * packet, indicating if it matched in the HWOL/EMC/SMC array or DPCLS
+

[ovs-dev] [PATCH v9 01/16] dpif-netdev: Refactor to multiple header files.

2021-02-12 Thread Harry van Haaren

Split the very large file dpif-netdev.c and the datastructures
it contains into multiple header files. Each header file is
responsible for the datastructures of that component.

This logical split allows better reuse and modularity of the code,
and reduces the very large file dpif-netdev.c to be more manageable.

Due to dependencies between components, it is not possible to
move components in smaller granularities than this patch does.

To explain the dependencies better, eg:

DPCLS has no deps (from dpif-netdev.c file)
FLOW depends on DPCLS (struct dpcls_rule)
DFC depends on DPCLS (netdev_flow_key) and FLOW (netdev_flow_key)
THREAD depends on DFC (struct dfc_cache)

DFC_PROC depends on THREAD (struct pmd_thread)

DPCLS lookup.h/c require only DPCLS
DPCLS implementations require only dpif-netdev-lookup.h.
- This change was made in 2.12 release with function pointers
- This commit only refactors the name to "private-dpcls.h"

Signed-off-by: Harry van Haaren 
Co-authored-by: Cian Ferriter 
Signed-off-by: Cian Ferriter 
---
 lib/automake.mk|   4 +
 lib/dpif-netdev-lookup-autovalidator.c |   1 -
 lib/dpif-netdev-lookup-avx512-gather.c |   1 -
 lib/dpif-netdev-lookup-generic.c   |   1 -
 lib/dpif-netdev-lookup.h   |   2 +-
 lib/dpif-netdev-private-dfc.h  | 244 
 lib/dpif-netdev-private-dpcls.h| 129 ++
 lib/dpif-netdev-private-flow.h | 162 
 lib/dpif-netdev-private-thread.h   | 206 ++
 lib/dpif-netdev-private.h  | 100 +
 lib/dpif-netdev.c  | 519 +
 11 files changed, 760 insertions(+), 609 deletions(-)
 create mode 100644 lib/dpif-netdev-private-dfc.h
 create mode 100644 lib/dpif-netdev-private-dpcls.h
 create mode 100644 lib/dpif-netdev-private-flow.h
 create mode 100644 lib/dpif-netdev-private-thread.h

diff --git a/lib/automake.mk b/lib/automake.mk
index 39afbff9d..0e83145b5 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -111,6 +111,10 @@ lib_libopenvswitch_la_SOURCES = \
lib/dpif-netdev-lookup-generic.c \
lib/dpif-netdev.c \
lib/dpif-netdev.h \
+   lib/dpif-netdev-private-dfc.h \
+   lib/dpif-netdev-private-dpcls.h \
+   lib/dpif-netdev-private-flow.h \
+   lib/dpif-netdev-private-thread.h \
lib/dpif-netdev-private.h \
lib/dpif-netdev-perf.c \
lib/dpif-netdev-perf.h \
diff --git a/lib/dpif-netdev-lookup-autovalidator.c 
b/lib/dpif-netdev-lookup-autovalidator.c
index 97b59fdd0..475e1ab1e 100644
--- a/lib/dpif-netdev-lookup-autovalidator.c
+++ b/lib/dpif-netdev-lookup-autovalidator.c
@@ -17,7 +17,6 @@
 #include 
 #include "dpif-netdev.h"
 #include "dpif-netdev-lookup.h"
-#include "dpif-netdev-private.h"
 #include "openvswitch/vlog.h"
 
 VLOG_DEFINE_THIS_MODULE(dpif_lookup_autovalidator);
diff --git a/lib/dpif-netdev-lookup-avx512-gather.c 
b/lib/dpif-netdev-lookup-avx512-gather.c
index 5e3634249..8fc1cdfa5 100644
--- a/lib/dpif-netdev-lookup-avx512-gather.c
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -21,7 +21,6 @@
 
 #include "dpif-netdev.h"
 #include "dpif-netdev-lookup.h"
-#include "dpif-netdev-private.h"
 #include "cmap.h"
 #include "flow.h"
 #include "pvector.h"
diff --git a/lib/dpif-netdev-lookup-generic.c b/lib/dpif-netdev-lookup-generic.c
index b1a0cfc36..e3b6be4b6 100644
--- a/lib/dpif-netdev-lookup-generic.c
+++ b/lib/dpif-netdev-lookup-generic.c
@@ -17,7 +17,6 @@
 
 #include 
 #include "dpif-netdev.h"
-#include "dpif-netdev-private.h"
 #include "dpif-netdev-lookup.h"
 
 #include "bitmap.h"
diff --git a/lib/dpif-netdev-lookup.h b/lib/dpif-netdev-lookup.h
index bd72aa29b..59f51faa0 100644
--- a/lib/dpif-netdev-lookup.h
+++ b/lib/dpif-netdev-lookup.h
@@ -19,7 +19,7 @@
 
 #include 
 #include "dpif-netdev.h"
-#include "dpif-netdev-private.h"
+#include "dpif-netdev-private-dpcls.h"
 
 /* Function to perform a probe for the subtable bit fingerprint.
  * Returns NULL if not valid, or a valid function pointer to call for this
diff --git a/lib/dpif-netdev-private-dfc.h b/lib/dpif-netdev-private-dfc.h
new file mode 100644
index 0..8f6a4899e
--- /dev/null
+++ b/lib/dpif-netdev-private-dfc.h
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2015 Nicira, Inc.
+ * Copyright (c) 2019, 2020 Intel Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implie

[ovs-dev] [PATCH v9 03/16] dpif-netdev: Add function pointer for netdev input.

2021-02-12 Thread Harry van Haaren

This commit adds a function pointer to the pmd thread data structure,
giving the pmd thread flexibility in its dpif-input function choice.
This allows choosing of the implementation based on ISA capabilities
of the runtime CPU, leading to optimizations and higher performance.

Signed-off-by: Harry van Haaren 
---
 lib/dpif-netdev-private-thread.h | 12 
 lib/dpif-netdev.c|  7 ++-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/lib/dpif-netdev-private-thread.h b/lib/dpif-netdev-private-thread.h
index a5b3ae360..089223aaf 100644
--- a/lib/dpif-netdev-private-thread.h
+++ b/lib/dpif-netdev-private-thread.h
@@ -47,6 +47,13 @@ struct dp_netdev_pmd_thread_ctx {
 uint32_t emc_insert_min;
 };
 
+/* Forward declaration for typedef */
+struct dp_netdev_pmd_thread;
+
+typedef void (*dp_netdev_input_func)(struct dp_netdev_pmd_thread *pmd,
+ struct dp_packet_batch *packets,
+ odp_port_t port_no);
+
 /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
  * the performance overhead of interrupt processing.  Therefore netdev can
  * not implement rx-wait for these devices.  dpif-netdev needs to poll
@@ -101,6 +108,11 @@ struct dp_netdev_pmd_thread {
 /* Current context of the PMD thread. */
 struct dp_netdev_pmd_thread_ctx ctx;
 
+/* Function pointer to call for dp_netdev_input() functionality. */
+dp_netdev_input_func netdev_input_func;
+/* Pointer for per-DPIF implementation scratch space. */
+void *netdev_input_func_userdata;
+
 struct seq *reload_seq;
 uint64_t last_reload_seq;
 
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 840298f01..c0cf44852 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -4220,8 +4220,9 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread 
*pmd,
 }
 }
 }
+
 /* Process packet batch. */
-dp_netdev_input(pmd, , port_no);
+pmd->netdev_input_func(pmd, , port_no);
 
 /* Assign processing cycles to rx queue. */
 cycles = cycle_timer_stop(>perf_stats, );
@@ -6005,6 +6006,10 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread 
*pmd, struct dp_netdev *dp,
 hmap_init(>tnl_port_cache);
 hmap_init(>send_port_cache);
 cmap_init(>tx_bonds);
+
+/* Initialize the DPIF function pointer to the default scalar version */
+pmd->netdev_input_func = dp_netdev_input;
+
 /* init the 'flow_cache' since there is no
  * actual thread created for NON_PMD_CORE_ID. */
 if (core_id == NON_PMD_CORE_ID) {
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v9 00/16] DPIF Framework + Optimizations

2021-02-12 Thread Harry van Haaren

ce they are refactored into separate header files.
The file splitting also improves maintainability, as dpif-netdev.c
has ~9000 LOC, and is very hard to modify due to many structs defined
locally in the .c file, ruling out re-usability in other .c files.

Questions welcomed! Regards, -Harry


Cian Ferriter (1):
  docs/dpdk/bridge: Add dpif performance section.

Harry van Haaren (15):
  dpif-netdev: Refactor to multiple header files.
  dpif-netdev: Split HWOL out to own header file.
  dpif-netdev: Add function pointer for netdev input.
  dpif-avx512: Add ISA implementation of dpif.
  dpif-avx512: Add HWOL support to avx512 dpif.
  dpif-netdev: Add command to switch dpif implementation.
  dpif-netdev: Add command to get dpif implementations.
  dpif-netdev/dpcls: Refactor function names to dpcls.
  dpif-netdev/dpcls-avx512: enable 16 block processing.
  dpif-netdev/dpcls: specialize more subtable signatures.
  dpdk: Cache result of CPU ISA checks.
  dpcls-avx512: enabling avx512 vector popcount instruction.
  dpif-netdev: Optimize dp output action
  netdev: Optimize netdev_send_prepare_batch
  dpif-netdev: POC of future DPIF and MFEX AVX512 optimizations

 Documentation/topics/dpdk/bridge.rst   |  37 ++
 NEWS   |  16 +-
 acinclude.m4   |  15 +
 configure.ac   |   1 +
 lib/automake.mk|  12 +-
 lib/dpdk.c |  30 +-
 lib/dpif-netdev-avx512.c   | 362 
 lib/dpif-netdev-lookup-autovalidator.c |   1 -
 lib/dpif-netdev-lookup-avx512-gather.c | 278 ++---
 lib/dpif-netdev-lookup-generic.c   |   7 +-
 lib/dpif-netdev-lookup.h   |   2 +-
 lib/dpif-netdev-private-dfc.h  | 252 
 lib/dpif-netdev-private-dpcls.h| 127 
 lib/dpif-netdev-private-dpif.c |  99 
 lib/dpif-netdev-private-dpif.h |  85 +++
 lib/dpif-netdev-private-flow.h | 162 +
 lib/dpif-netdev-private-hwol.h |  63 ++
 lib/dpif-netdev-private-thread.h   | 225 +++
 lib/dpif-netdev-private.h  | 123 ++--
 lib/dpif-netdev.c  | 779 +++--
 lib/flow_avx512.h  | 117 
 lib/netdev.c   |  31 +-
 22 files changed, 2069 insertions(+), 755 deletions(-)
 create mode 100644 lib/dpif-netdev-avx512.c
 create mode 100644 lib/dpif-netdev-private-dfc.h
 create mode 100644 lib/dpif-netdev-private-dpcls.h
 create mode 100644 lib/dpif-netdev-private-dpif.c
 create mode 100644 lib/dpif-netdev-private-dpif.h
 create mode 100644 lib/dpif-netdev-private-flow.h
 create mode 100644 lib/dpif-netdev-private-hwol.h
 create mode 100644 lib/dpif-netdev-private-thread.h
 create mode 100644 lib/flow_avx512.h

-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v8 16/16] netdev: Optimize netdev_send_prepare_batch

2021-01-04 Thread Harry van Haaren

Optimize for the best case here where all packets will be compatible
with 'netdev_flags'.

Signed-off-by: Harry van Haaren 
Co-authored-by: Cian Ferriter 
Signed-off-by: Cian Ferriter 
---
 NEWS |  2 ++
 lib/netdev.c | 31 ++-
 2 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/NEWS b/NEWS
index b5b140512..7eeb51cfc 100644
--- a/NEWS
+++ b/NEWS
@@ -32,6 +32,8 @@ Post-v2.14.0
CPU supports it. This enhances performance by using the native vpopcount
instructions, instead of the emulated version of vpopcount.
  * Optimize dp_netdev_output by enhancing compiler optimization potential.
+ * Optimize netdev sending by assuming the happy case, and using fallback
+   for if the netdev doesnt meet the required HWOL needs of a packet.
- The environment variable OVS_UNBOUND_CONF, if set, is now used
  as the DNS resolver's (unbound) configuration file.
- Linux datapath:
diff --git a/lib/netdev.c b/lib/netdev.c
index 91e91955c..29a5f1aa9 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -837,20 +837,33 @@ static void
 netdev_send_prepare_batch(const struct netdev *netdev,
   struct dp_packet_batch *batch)
 {
-struct dp_packet *packet;
-size_t i, size = dp_packet_batch_size(batch);
+struct dp_packet *p;
+uint32_t i, size = dp_packet_batch_size(batch);
+char *err_msg = NULL;
 
-DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) {
-char *errormsg = NULL;
+for (i = 0; i < size; i++) {
+p = batch->packets[i];
+int pkt_ok = netdev_send_prepare_packet(netdev->ol_flags, p, _msg);
 
-if (netdev_send_prepare_packet(netdev->ol_flags, packet, )) {
-dp_packet_batch_refill(batch, packet, i);
+if (OVS_UNLIKELY(!pkt_ok)) {
+goto refill_loop;
+}
+}
+
+return;
+
+refill_loop:
+/* Loop through packets from the start of the batch again. This is the
+ * exceptional case where packets aren't compatible with 'netdev_flags'. */
+DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, p, batch) {
+if (netdev_send_prepare_packet(netdev->ol_flags, p, _msg)) {
+dp_packet_batch_refill(batch, p, i);
 } else {
-dp_packet_delete(packet);
+dp_packet_delete(p);
 COVERAGE_INC(netdev_send_prepare_drops);
 VLOG_WARN_RL(, "%s: Packet dropped: %s",
- netdev_get_name(netdev), errormsg);
-free(errormsg);
+ netdev_get_name(netdev), err_msg);
+free(err_msg);
 }
 }
 }
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v8 15/16] dpif-netdev: Optimize dp output action

2021-01-04 Thread Harry van Haaren

This commit optimizes the output action, by enabling the compiler to
optimize the code better through reducing code complexity.

The core concept of this optimization is that the array-length checks
have already been performed above the copying code, so can be removed.
Removing of the per-packet length checks allows the compiler to auto-vectorize
the stores using SIMD registers.

Signed-off-by: Harry van Haaren 

---

v8: Add NEWS entry.
---
 NEWS  |  1 +
 lib/dpif-netdev.c | 23 ++-
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/NEWS b/NEWS
index a75ea900c..b5b140512 100644
--- a/NEWS
+++ b/NEWS
@@ -31,6 +31,7 @@ Post-v2.14.0
  * Enable the AVX512 DPCLS implementation to use VPOPCNT instruction if the
CPU supports it. This enhances performance by using the native vpopcount
instructions, instead of the emulated version of vpopcount.
+ * Optimize dp_netdev_output by enhancing compiler optimization potential.
- The environment variable OVS_UNBOUND_CONF, if set, is now used
  as the DNS resolver's (unbound) configuration file.
- Linux datapath:
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 3168f153b..009e986fc 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -7284,12 +7284,25 @@ dp_execute_output_action(struct dp_netdev_pmd_thread 
*pmd,
 pmd->n_output_batches++;
 }
 
-struct dp_packet *packet;
-DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
-p->output_pkts_rxqs[dp_packet_batch_size(>output_pkts)] =
-pmd->ctx.last_rxq;
-dp_packet_batch_add(>output_pkts, packet);
+/* The above checks ensure that there is enough space in the output batch.
+ * Using dp_packet_batch_add() has a branch to check if the batch is full.
+ * This branch reduces the compiler's ability to optimize efficiently. The
+ * below code implements packet movement between batches without checks,
+ * with the required semantics of output batch perhaps containing packets.
+ */
+int batch_size = dp_packet_batch_size(packets_);
+int out_batch_idx = dp_packet_batch_size(>output_pkts);
+struct dp_netdev_rxq *rxq = pmd->ctx.last_rxq;
+struct dp_packet_batch *output_batch = >output_pkts;
+
+for (int i = 0; i < batch_size; i++) {
+struct dp_packet *packet = packets_->packets[i];
+p->output_pkts_rxqs[out_batch_idx] = rxq;
+output_batch->packets[out_batch_idx] = packet;
+out_batch_idx++;
 }
+output_batch->count += batch_size;
+
 return true;
 }
 
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v8 09/16] dpif-netdev: Move pmd_try_optimize function in file.

2021-01-04 Thread Harry van Haaren

This commit moves the pmd_try_optimize function to a more
appropriate location in the file - currently it sits in the
DPCLS section, which is not its correct home.

Signed-off-by: Harry van Haaren 
---
 lib/dpif-netdev.c | 146 +++---
 1 file changed, 73 insertions(+), 73 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 4c074995c..eea6c11f0 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -5638,6 +5638,79 @@ reload:
 return NULL;
 }
 
+static inline void
+dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
+   struct polled_queue *poll_list, int poll_cnt)
+{
+struct dpcls *cls;
+uint64_t tot_idle = 0, tot_proc = 0;
+unsigned int pmd_load = 0;
+
+if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
+uint64_t curr_tsc;
+struct pmd_auto_lb *pmd_alb = >dp->pmd_alb;
+if (pmd_alb->is_enabled && !pmd->isolated
+&& (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
+   pmd->prev_stats[PMD_CYCLES_ITER_IDLE])
+&& (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
+pmd->prev_stats[PMD_CYCLES_ITER_BUSY]))
+{
+tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
+   pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
+tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
+   pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
+
+if (tot_proc) {
+pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
+}
+
+if (pmd_load >= ALB_PMD_LOAD_THRESHOLD) {
+atomic_count_inc(>pmd_overloaded);
+} else {
+atomic_count_set(>pmd_overloaded, 0);
+}
+}
+
+pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
+pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
+pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
+pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
+
+/* Get the cycles that were used to process each queue and store. */
+for (unsigned i = 0; i < poll_cnt; i++) {
+uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
+RXQ_CYCLES_PROC_CURR);
+dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
+dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
+ 0);
+}
+curr_tsc = cycles_counter_update(>perf_stats);
+if (pmd->intrvl_tsc_prev) {
+/* There is a prev timestamp, store a new intrvl cycle count. */
+atomic_store_relaxed(>intrvl_cycles,
+ curr_tsc - pmd->intrvl_tsc_prev);
+}
+pmd->intrvl_tsc_prev = curr_tsc;
+/* Start new measuring interval */
+pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
+}
+
+if (pmd->ctx.now > pmd->next_optimization) {
+/* Try to obtain the flow lock to block out revalidator threads.
+ * If not possible, just try next time. */
+if (!ovs_mutex_trylock(>flow_mutex)) {
+/* Optimize each classifier */
+CMAP_FOR_EACH (cls, node, >classifiers) {
+dpcls_sort_subtable_vector(cls);
+}
+ovs_mutex_unlock(>flow_mutex);
+/* Start new measuring interval */
+pmd->next_optimization = pmd->ctx.now
+ + DPCLS_OPTIMIZATION_INTERVAL;
+}
+}
+}
+
 static void
 dp_netdev_disable_upcall(struct dp_netdev *dp)
 OVS_ACQUIRES(dp->upcall_rwlock)
@@ -8304,79 +8377,6 @@ dpcls_sort_subtable_vector(struct dpcls *cls)
 pvector_publish(pvec);
 }
 
-static inline void
-dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
-   struct polled_queue *poll_list, int poll_cnt)
-{
-struct dpcls *cls;
-uint64_t tot_idle = 0, tot_proc = 0;
-unsigned int pmd_load = 0;
-
-if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
-uint64_t curr_tsc;
-struct pmd_auto_lb *pmd_alb = >dp->pmd_alb;
-if (pmd_alb->is_enabled && !pmd->isolated
-&& (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
-   pmd->prev_stats[PMD_CYCLES_ITER_IDLE])
-&& (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
-pmd->prev_stats[PMD_CYCLES_ITER_BUSY]))
-{
-tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
-

[ovs-dev] [PATCH v8 14/16] dpcls-avx512: enabling avx512 vector popcount instruction.

2021-01-04 Thread Harry van Haaren

This commit enables the AVX512-VPOPCNTDQ Vector Popcount
instruction. This instruction is not available on every CPU
that supports the AVX512-F Foundation ISA, hence it is enabled
only when the additional VPOPCNTDQ ISA check is passed.

The vector popcount instruction is used instead of the AVX512
popcount emulation code present in the avx512 optimized DPCLS today.
It provides higher performance in the SIMD miniflow processing
as that requires the popcount to calculate the miniflow block indexes.

Signed-off-by: Harry van Haaren 

---

v8: Add NEWS entry.
---
 NEWS   |  3 +
 lib/dpdk.c |  1 +
 lib/dpif-netdev-lookup-avx512-gather.c | 84 --
 3 files changed, 70 insertions(+), 18 deletions(-)

diff --git a/NEWS b/NEWS
index 345cd2696..a75ea900c 100644
--- a/NEWS
+++ b/NEWS
@@ -28,6 +28,9 @@ Post-v2.14.0
  * Enable AVX512 optimized DPCLS to search subtables with larger miniflows.
  * Add more specialized DPCLS subtables to cover common rules, enhancing
the lookup performance.
+ * Enable the AVX512 DPCLS implementation to use VPOPCNT instruction if the
+   CPU supports it. This enhances performance by using the native vpopcount
+   instructions, instead of the emulated version of vpopcount.
- The environment variable OVS_UNBOUND_CONF, if set, is now used
  as the DNS resolver's (unbound) configuration file.
- Linux datapath:
diff --git a/lib/dpdk.c b/lib/dpdk.c
index c883a4b8b..a9494a40f 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -655,6 +655,7 @@ dpdk_get_cpu_has_isa(const char *arch, const char *feature)
 #if __x86_64__
 /* CPU flags only defined for the architecture that support it. */
 CHECK_CPU_FEATURE(feature, "avx512f", RTE_CPUFLAG_AVX512F);
+CHECK_CPU_FEATURE(feature, "avx512vpopcntdq", RTE_CPUFLAG_AVX512VPOPCNTDQ);
 CHECK_CPU_FEATURE(feature, "bmi2", RTE_CPUFLAG_BMI2);
 #endif
 
diff --git a/lib/dpif-netdev-lookup-avx512-gather.c 
b/lib/dpif-netdev-lookup-avx512-gather.c
index 3a684fadf..9a3273dc6 100644
--- a/lib/dpif-netdev-lookup-avx512-gather.c
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -53,6 +53,15 @@
 
 VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
 
+
+/* Wrapper function required to enable ISA. */
+static inline __m512i
+__attribute__((__target__("avx512vpopcntdq")))
+_mm512_popcnt_epi64_wrapper(__m512i v_in)
+{
+return _mm512_popcnt_epi64(v_in);
+}
+
 static inline __m512i
 _mm512_popcnt_epi64_manual(__m512i v_in)
 {
@@ -126,7 +135,8 @@ avx512_blocks_gather(__m512i v_u0, /* reg of u64 of all u0 
bits */
  __mmask64 u1_bcast_msk,  /* mask of u1 lanes */
  const uint64_t pkt_mf_u0_pop, /* num bits in u0 of pkt */
  __mmask64 zero_mask, /* maskz if pkt not have mf bit */
- __mmask64 u64_lanes_mask) /* total lane count to use */
+ __mmask64 u64_lanes_mask, /* total lane count to use */
+ const uint32_t use_vpop)  /* use AVX512 vpopcntdq */
 {
 /* Suggest to compiler to load tbl blocks ahead of gather() */
 __m512i v_tbl_blocks = _mm512_maskz_loadu_epi64(u64_lanes_mask,
@@ -140,8 +150,15 @@ avx512_blocks_gather(__m512i v_u0, /* reg of u64 of all u0 
bits */
   tbl_mf_masks);
 __m512i v_masks = _mm512_and_si512(v_pkt_bits, v_tbl_masks);
 
-/* Manual AVX512 popcount for u64 lanes. */
-__m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
+/* Calculate AVX512 popcount for u64 lanes using the native instruction
+ * if available, or using emulation if not available.
+ */
+__m512i v_popcnts;
+if (use_vpop) {
+v_popcnts = _mm512_popcnt_epi64_wrapper(v_masks);
+} else {
+v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
+}
 
 /* Add popcounts and offset for u1 bits. */
 __m512i v_idx_u0_offset = _mm512_maskz_set1_epi64(u1_bcast_msk,
@@ -166,7 +183,8 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,
const struct netdev_flow_key *keys[],
struct dpcls_rule **rules,
const uint32_t bit_count_u0,
-   const uint32_t bit_count_u1)
+   const uint32_t bit_count_u1,
+   const uint32_t use_vpop)
 {
 OVS_ALIGNED_VAR(CACHE_LINE_SIZE)uint64_t block_cache[BLOCKS_CACHE_SIZE];
 uint32_t hashes[NETDEV_MAX_BURST];
@@ -218,7 +236,8 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,
 u1_bcast_mask,
 pkt_mf_u0_pop,
 zero_mask,
-bit_count_total_mask);
+

[ovs-dev] [PATCH v8 10/16] dpif-netdev/dpcls: Refactor function names to dpcls.

2021-01-04 Thread Harry van Haaren

This commit refactors the function names from netdev_*
namespace to the dpcls_* namespace, as they are only used
by dpcls code. With the name change, it becomes more obvious
that the functions belong to dpcls functionality, and in the
dpif-netdev-private-dpcls.h header file.

Signed-off-by: Harry van Haaren 
---
 lib/dpif-netdev-private-dpcls.h |  6 ++
 lib/dpif-netdev.c   | 21 ++---
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/lib/dpif-netdev-private-dpcls.h b/lib/dpif-netdev-private-dpcls.h
index 5bc579bba..e66cae3f4 100644
--- a/lib/dpif-netdev-private-dpcls.h
+++ b/lib/dpif-netdev-private-dpcls.h
@@ -97,10 +97,8 @@ struct dpcls_subtable {
 
 /* Generates a mask for each bit set in the subtable's miniflow. */
 void
-netdev_flow_key_gen_masks(const struct netdev_flow_key *tbl,
-  uint64_t *mf_masks,
-  const uint32_t mf_bits_u0,
-  const uint32_t mf_bits_u1);
+dpcls_flow_key_gen_masks(const struct netdev_flow_key *tbl, uint64_t *mf_masks,
+ const uint32_t mf_bits_u0, const uint32_t mf_bits_u1);
 
 /* Matches a dpcls rule against the incoming packet in 'target' */
 bool dpcls_rule_matches_key(const struct dpcls_rule *rule,
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index eea6c11f0..3168f153b 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -8308,7 +8308,7 @@ dpcls_create_subtable(struct dpcls *cls, const struct 
netdev_flow_key *mask)
 subtable->mf_bits_set_unit0 = unit0;
 subtable->mf_bits_set_unit1 = unit1;
 subtable->mf_masks = xmalloc(sizeof(uint64_t) * (unit0 + unit1));
-netdev_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1);
+dpcls_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1);
 
 /* Get the preferred subtable search function for this (u0,u1) subtable.
  * The function is guaranteed to always return a valid implementation, and
@@ -8407,11 +8407,10 @@ dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
 }
 }
 
-/* Inner loop for mask generation of a unit, see netdev_flow_key_gen_masks. */
+/* Inner loop for mask generation of a unit, see dpcls_flow_key_gen_masks. */
 static inline void
-netdev_flow_key_gen_mask_unit(uint64_t iter,
-  const uint64_t count,
-  uint64_t *mf_masks)
+dpcls_flow_key_gen_mask_unit(uint64_t iter, const uint64_t count,
+ uint64_t *mf_masks)
 {
 int i;
 for (i = 0; i < count; i++) {
@@ -8432,16 +8431,16 @@ netdev_flow_key_gen_mask_unit(uint64_t iter,
  * @param mf_bits_unit0 Number of bits set in unit0 of the miniflow
  */
 void
-netdev_flow_key_gen_masks(const struct netdev_flow_key *tbl,
-  uint64_t *mf_masks,
-  const uint32_t mf_bits_u0,
-  const uint32_t mf_bits_u1)
+dpcls_flow_key_gen_masks(const struct netdev_flow_key *tbl,
+ uint64_t *mf_masks,
+ const uint32_t mf_bits_u0,
+ const uint32_t mf_bits_u1)
 {
 uint64_t iter_u0 = tbl->mf.map.bits[0];
 uint64_t iter_u1 = tbl->mf.map.bits[1];
 
-netdev_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, &mf_masks[0]);
-netdev_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, &mf_masks[mf_bits_u0]);
+dpcls_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, &mf_masks[0]);
+dpcls_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, &mf_masks[mf_bits_u0]);
 }
 
 /* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v8 13/16] dpdk: Cache result of CPU ISA checks.

2021-01-04 Thread Harry van Haaren

As a small optimization, this patch caches the result of a CPU ISA
check from DPDK. Particularly in the case of running the DPCLS
autovalidator (which repeatedly probes subtables) this reduces
the amount of CPU ISA lookups from the DPDK level.

By caching them at the OVS/dpdk.c level, the ISA checks remain
runtime for the CPU where they are executed, but subsequent checks
for the same ISA feature become much cheaper.

Signed-off-by: Harry van Haaren 
Co-authored-by: Cian Ferriter 
Signed-off-by: Cian Ferriter 

---

v8: Add NEWS entry.
---
 NEWS   |  1 +
 lib/dpdk.c | 28 
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/NEWS b/NEWS
index 40a395789..345cd2696 100644
--- a/NEWS
+++ b/NEWS
@@ -12,6 +12,7 @@ Post-v2.14.0
- DPDK:
  * Removed support for vhost-user dequeue zero-copy.
  * Add support for DPDK 20.11.
+ * Cache results for CPU ISA checks, reduces overhead on repeated lookups.
- Userspace datapath:
  * Add the 'pmd' option to "ovs-appctl dpctl/dump-flows", which
restricts a flow dump to a single PMD thread if set.
diff --git a/lib/dpdk.c b/lib/dpdk.c
index 319540394..c883a4b8b 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -614,13 +614,33 @@ print_dpdk_version(void)
 puts(rte_version());
 }
 
+/* Avoid calling rte_cpu_get_flag_enabled() excessively, by caching the
+ * result of the call for each CPU flag in a static variable. To avoid
+ * allocating large numbers of static variables, use a uint8 as a bitfield.
+ * Note the macro must only return if the ISA check is done and available.
+ */
+#define ISA_CHECK_DONE_BIT (1 << 0)
+#define ISA_AVAILABLE_BIT  (1 << 1)
+
 #define CHECK_CPU_FEATURE(feature, name_str, RTE_CPUFLAG)   \
 do {\
 if (strncmp(feature, name_str, strlen(name_str)) == 0) {\
-int has_isa = rte_cpu_get_flag_enabled(RTE_CPUFLAG);\
-VLOG_DBG("CPU flag %s, available %s\n", name_str,   \
-  has_isa ? "yes" : "no");  \
-return true;\
+static uint8_t isa_check_##RTE_CPUFLAG; \
+int check = isa_check_##RTE_CPUFLAG & ISA_CHECK_DONE_BIT;   \
+if (OVS_UNLIKELY(!check)) { \
+int has_isa = rte_cpu_get_flag_enabled(RTE_CPUFLAG);\
+VLOG_DBG("CPU flag %s, available %s\n", \
+ name_str, has_isa ? "yes" : "no"); \
+isa_check_##RTE_CPUFLAG = ISA_CHECK_DONE_BIT;   \
+if (has_isa) {  \
+isa_check_##RTE_CPUFLAG |= ISA_AVAILABLE_BIT;   \
+}   \
+}   \
+if (isa_check_##RTE_CPUFLAG & ISA_AVAILABLE_BIT) {  \
+return true;\
+} else {\
+return false;   \
+}   \
 }   \
 } while (0)
 
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v8 12/16] dpif-netdev/dpcls: specialize more subtable signatures.

2021-01-04 Thread Harry van Haaren

This commit adds more subtables to be specialized. The traffic
pattern here being matched is VXLAN traffic subtables, which commonly
have (5,3), (9,1) and (9,4) subtable fingerprints.

Signed-off-by: Harry van Haaren 

---

v8: Add NEWS entry.
---
 NEWS   | 2 ++
 lib/dpif-netdev-lookup-avx512-gather.c | 6 ++
 lib/dpif-netdev-lookup-generic.c   | 6 ++
 3 files changed, 14 insertions(+)

diff --git a/NEWS b/NEWS
index 86733312d..40a395789 100644
--- a/NEWS
+++ b/NEWS
@@ -25,6 +25,8 @@ Post-v2.14.0
packets. It supports partial HWOL, EMC, SMC and DPCLS lookups.
  * Add commands to get and set the dpif implementations.
  * Enable AVX512 optimized DPCLS to search subtables with larger miniflows.
+ * Add more specialized DPCLS subtables to cover common rules, enhancing
+   the lookup performance.
- The environment variable OVS_UNBOUND_CONF, if set, is now used
  as the DNS resolver's (unbound) configuration file.
- Linux datapath:
diff --git a/lib/dpif-netdev-lookup-avx512-gather.c 
b/lib/dpif-netdev-lookup-avx512-gather.c
index 1f27c0536..3a684fadf 100644
--- a/lib/dpif-netdev-lookup-avx512-gather.c
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -299,6 +299,9 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,
 return avx512_lookup_impl(subtable, keys_map, keys, rules, U0, U1);   \
 } \
 
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 4)
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 1)
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 3)
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1)
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 1)
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 0)
@@ -331,6 +334,9 @@ dpcls_subtable_avx512_gather_probe(uint32_t u0_bits, 
uint32_t u1_bits)
 return NULL;
 }
 
+CHECK_LOOKUP_FUNCTION(9, 4);
+CHECK_LOOKUP_FUNCTION(9, 1);
+CHECK_LOOKUP_FUNCTION(5, 3);
 CHECK_LOOKUP_FUNCTION(5, 1);
 CHECK_LOOKUP_FUNCTION(4, 1);
 CHECK_LOOKUP_FUNCTION(4, 0);
diff --git a/lib/dpif-netdev-lookup-generic.c b/lib/dpif-netdev-lookup-generic.c
index e3b6be4b6..6c74ac3a1 100644
--- a/lib/dpif-netdev-lookup-generic.c
+++ b/lib/dpif-netdev-lookup-generic.c
@@ -282,6 +282,9 @@ dpcls_subtable_lookup_generic(struct dpcls_subtable 
*subtable,
 return lookup_generic_impl(subtable, keys_map, keys, rules, U0, U1);  \
 } \
 
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 4)
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 1)
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 3)
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1)
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 1)
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 0)
@@ -303,6 +306,9 @@ dpcls_subtable_generic_probe(uint32_t u0_bits, uint32_t 
u1_bits)
 {
 dpcls_subtable_lookup_func f = NULL;
 
+CHECK_LOOKUP_FUNCTION(9, 4);
+CHECK_LOOKUP_FUNCTION(9, 1);
+CHECK_LOOKUP_FUNCTION(5, 3);
 CHECK_LOOKUP_FUNCTION(5, 1);
 CHECK_LOOKUP_FUNCTION(4, 1);
 CHECK_LOOKUP_FUNCTION(4, 0);
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v8 08/16] docs/dpdk/bridge: Add dpif performance section.

2021-01-04 Thread Harry van Haaren

From: Cian Ferriter 

This section details how two new commands can be used to list and select
the different dpif implementations. It also details how a non default
dpif implementation can be tested with the OVS unit test suite.

Add NEWS updates for the dpif-netdev.c refactor and the new dpif
implementations/commands.

Signed-off-by: Cian Ferriter 

---

v8:
- Merge NEWS file items into one Userspace Datapath: heading
---
 Documentation/topics/dpdk/bridge.rst | 37 
 NEWS |  4 +++
 2 files changed, 41 insertions(+)

diff --git a/Documentation/topics/dpdk/bridge.rst 
b/Documentation/topics/dpdk/bridge.rst
index 526d5c959..ca90d7bdb 100644
--- a/Documentation/topics/dpdk/bridge.rst
+++ b/Documentation/topics/dpdk/bridge.rst
@@ -214,3 +214,40 @@ implementation ::
 
 Compile OVS in debug mode to have `ovs_assert` statements error out if
 there is a mis-match in the DPCLS lookup implementation.
+
+Datapath Interface Performance
+--
+
+The datapath interface (DPIF) or dp_netdev_input() is responsible for taking
+packets through the major components of the userspace datapath; such as
+miniflow_extract, EMC, SMC and DPCLS lookups, and a lot of the performance
+stats associated with the datapath.
+
+Just like with the SIMD DPCLS work above, SIMD can be applied to the DPIF to
+improve performance.
+
+OVS provides multiple implementations of the DPIF. These can be listed with the
+following command ::
+
+$ ovs-appctl dpif-netdev/dpif-get
+Available DPIF implementations:
+  dpif_scalar
+  dpif_avx512
+
+By default, dpif_scalar is used. The DPIF implementation can be selected by
+name ::
+
+$ ovs-appctl dpif-netdev/dpif-set dpif_avx512
+DPIF implementation set to dpif_avx512.
+
+$ ovs-appctl dpif-netdev/dpif-set dpif_scalar
+DPIF implementation set to dpif_scalar.
+
+Running Unit Tests with AVX512 DPIF
+~~~
+
+Since the AVX512 DPIF is disabled by default, a compile time option is
+available in order to test it with the OVS unit test suite. When building with
+a CPU that supports AVX512, use the following configure option ::
+
+$ ./configure --enable-dpif-default-avx512
diff --git a/NEWS b/NEWS
index d357da31d..f7e5f7f7d 100644
--- a/NEWS
+++ b/NEWS
@@ -20,6 +20,10 @@ Post-v2.14.0
  * Add generic IP protocol support to conntrack. With this change, all
none UDP, TCP, and ICMP traffic will be treated as general L3
traffic, i.e. using 3 tupples.
+ * Refactor lib/dpif-netdev.c to multiple header files.
+ * Add avx512 implementation of dpif which can process non recirculated
+   packets. It supports partial HWOL, EMC, SMC and DPCLS lookups.
+ * Add commands to get and set the dpif implementations.
- The environment variable OVS_UNBOUND_CONF, if set, is now used
  as the DNS resolver's (unbound) configuration file.
- Linux datapath:
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v8 01/16] dpif-netdev: Refactor to multiple header files.

2021-01-04 Thread Harry van Haaren

Split the very large file dpif-netdev.c and the datastructures
it contains into multiple header files. Each header file is
responsible for the datastructures of that component.

This logical split allows better reuse and modularity of the code,
and reduces the very large file dpif-netdev.c to be more manageable.

Due to dependencies between components, it is not possible to
move components at a finer granularity than this patch does.

To explain the dependencies better, eg:

DPCLS has no deps (from dpif-netdev.c file)
FLOW depends on DPCLS (struct dpcls_rule)
DFC depends on DPCLS (netdev_flow_key) and FLOW (netdev_flow_key)
THREAD depends on DFC (struct dfc_cache)

DFC_PROC depends on THREAD (struct pmd_thread)

DPCLS lookup.h/c require only DPCLS
DPCLS implementations require only dpif-netdev-lookup.h.
- This change was made in 2.12 release with function pointers
- This commit only refactors the name to "private-dpcls.h"

Signed-off-by: Harry van Haaren 
Co-authored-by: Cian Ferriter 
Signed-off-by: Cian Ferriter 
---
 lib/automake.mk|   4 +
 lib/dpif-netdev-lookup-autovalidator.c |   1 -
 lib/dpif-netdev-lookup-avx512-gather.c |   1 -
 lib/dpif-netdev-lookup-generic.c   |   1 -
 lib/dpif-netdev-lookup.h   |   2 +-
 lib/dpif-netdev-private-dfc.h  | 244 
 lib/dpif-netdev-private-dpcls.h| 129 ++
 lib/dpif-netdev-private-flow.h | 162 
 lib/dpif-netdev-private-thread.h   | 206 ++
 lib/dpif-netdev-private.h  | 100 +
 lib/dpif-netdev.c  | 519 +
 11 files changed, 760 insertions(+), 609 deletions(-)
 create mode 100644 lib/dpif-netdev-private-dfc.h
 create mode 100644 lib/dpif-netdev-private-dpcls.h
 create mode 100644 lib/dpif-netdev-private-flow.h
 create mode 100644 lib/dpif-netdev-private-thread.h

diff --git a/lib/automake.mk b/lib/automake.mk
index 380a67228..22a281fcc 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -111,6 +111,10 @@ lib_libopenvswitch_la_SOURCES = \
lib/dpif-netdev-lookup-generic.c \
lib/dpif-netdev.c \
lib/dpif-netdev.h \
+   lib/dpif-netdev-private-dfc.h \
+   lib/dpif-netdev-private-dpcls.h \
+   lib/dpif-netdev-private-flow.h \
+   lib/dpif-netdev-private-thread.h \
lib/dpif-netdev-private.h \
lib/dpif-netdev-perf.c \
lib/dpif-netdev-perf.h \
diff --git a/lib/dpif-netdev-lookup-autovalidator.c 
b/lib/dpif-netdev-lookup-autovalidator.c
index 97b59fdd0..475e1ab1e 100644
--- a/lib/dpif-netdev-lookup-autovalidator.c
+++ b/lib/dpif-netdev-lookup-autovalidator.c
@@ -17,7 +17,6 @@
 #include 
 #include "dpif-netdev.h"
 #include "dpif-netdev-lookup.h"
-#include "dpif-netdev-private.h"
 #include "openvswitch/vlog.h"
 
 VLOG_DEFINE_THIS_MODULE(dpif_lookup_autovalidator);
diff --git a/lib/dpif-netdev-lookup-avx512-gather.c 
b/lib/dpif-netdev-lookup-avx512-gather.c
index 5e3634249..8fc1cdfa5 100644
--- a/lib/dpif-netdev-lookup-avx512-gather.c
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -21,7 +21,6 @@
 
 #include "dpif-netdev.h"
 #include "dpif-netdev-lookup.h"
-#include "dpif-netdev-private.h"
 #include "cmap.h"
 #include "flow.h"
 #include "pvector.h"
diff --git a/lib/dpif-netdev-lookup-generic.c b/lib/dpif-netdev-lookup-generic.c
index b1a0cfc36..e3b6be4b6 100644
--- a/lib/dpif-netdev-lookup-generic.c
+++ b/lib/dpif-netdev-lookup-generic.c
@@ -17,7 +17,6 @@
 
 #include 
 #include "dpif-netdev.h"
-#include "dpif-netdev-private.h"
 #include "dpif-netdev-lookup.h"
 
 #include "bitmap.h"
diff --git a/lib/dpif-netdev-lookup.h b/lib/dpif-netdev-lookup.h
index bd72aa29b..59f51faa0 100644
--- a/lib/dpif-netdev-lookup.h
+++ b/lib/dpif-netdev-lookup.h
@@ -19,7 +19,7 @@
 
 #include 
 #include "dpif-netdev.h"
-#include "dpif-netdev-private.h"
+#include "dpif-netdev-private-dpcls.h"
 
 /* Function to perform a probe for the subtable bit fingerprint.
  * Returns NULL if not valid, or a valid function pointer to call for this
diff --git a/lib/dpif-netdev-private-dfc.h b/lib/dpif-netdev-private-dfc.h
new file mode 100644
index 0..8f6a4899e
--- /dev/null
+++ b/lib/dpif-netdev-private-dfc.h
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2015 Nicira, Inc.
+ * Copyright (c) 2019, 2020 Intel Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implie

[ovs-dev] [PATCH v8 11/16] dpif-netdev/dpcls-avx512: enable 16 block processing.

2021-01-04 Thread Harry van Haaren

This commit implements larger subtable searches in avx512. A limitation
of the previous implementation was that up to 8 blocks of miniflow
data could be matched on (so a subtable with 8 blocks was handled
in avx, but 9 blocks or more would fall back to scalar/generic).
This limitation is removed in this patch, where up to 16 blocks
of subtable can be matched on.

From an implementation perspective, the key to enabling 16 blocks
over 8 blocks was to do bitmask calculation up front, and then use
the pre-calculated bitmasks for 2x passes of the "blocks gather"
routine. The bitmasks need to be shifted for k-mask usage in the
upper (8-15) block range, but it is relatively trivial. This also
helps in case expanding to 24 blocks is desired in future.

The implementation of the 2nd iteration to handle > 8 blocks is
behind a conditional branch which checks the total number of bits.
This helps the specialized versions of the function that have a
miniflow fingerprint of less-than-or-equal 8 blocks, as the code
can be statically stripped out of those functions. Specialized
functions that do require more than 8 blocks will have the branch
removed and unconditionally execute the 2nd blocks gather routine.

Lastly, the _any() flavour will have the conditional branch, and
the branch predictor may mispredict a bit, but per burst will
likely get most packets correct (particularly towards the middle
and end of a burst).

The code has been run with unit tests under autovalidation and
passes all cases, and unit test coverage has been checked to
ensure the 16 block code paths are executing.

Signed-off-by: Harry van Haaren 

---

v8: Add NEWS entry
---
 NEWS   |   1 +
 lib/dpif-netdev-lookup-avx512-gather.c | 203 ++---
 2 files changed, 147 insertions(+), 57 deletions(-)

diff --git a/NEWS b/NEWS
index f7e5f7f7d..86733312d 100644
--- a/NEWS
+++ b/NEWS
@@ -24,6 +24,7 @@ Post-v2.14.0
  * Add avx512 implementation of dpif which can process non recirculated
packets. It supports partial HWOL, EMC, SMC and DPCLS lookups.
  * Add commands to get and set the dpif implementations.
+ * Enable AVX512 optimized DPCLS to search subtables with larger miniflows.
- The environment variable OVS_UNBOUND_CONF, if set, is now used
  as the DNS resolver's (unbound) configuration file.
- Linux datapath:
diff --git a/lib/dpif-netdev-lookup-avx512-gather.c 
b/lib/dpif-netdev-lookup-avx512-gather.c
index 8fc1cdfa5..1f27c0536 100644
--- a/lib/dpif-netdev-lookup-avx512-gather.c
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -34,7 +34,21 @@
  * AVX512 code at a time.
  */
 #define NUM_U64_IN_ZMM_REG (8)
-#define BLOCKS_CACHE_SIZE (NETDEV_MAX_BURST * NUM_U64_IN_ZMM_REG)
+
+/* This implementation of AVX512 gather allows up to 16 blocks of MF data to be
+ * present in the blocks_cache, hence the multiply by 2 in the blocks count.
+ */
+#define MF_BLOCKS_PER_PACKET (NUM_U64_IN_ZMM_REG * 2)
+
+/* Blocks cache size is the maximum number of miniflow blocks that this
+ * implementation of lookup can handle.
+ */
+#define BLOCKS_CACHE_SIZE (NETDEV_MAX_BURST * MF_BLOCKS_PER_PACKET)
+
+/* The gather instruction can handle a scale for the size of the items to
+ * gather. For uint64_t data, this scale is 8.
+ */
+#define GATHER_SCALE_8 (8)
 
 
 VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
@@ -69,22 +83,83 @@ netdev_rule_matches_key(const struct dpcls_rule *rule,
 {
 const uint64_t *keyp = miniflow_get_values(>flow.mf);
 const uint64_t *maskp = miniflow_get_values(>mask->mf);
-const uint32_t lane_mask = (1 << mf_bits_total) - 1;
+const uint32_t lane_mask = (1ULL << mf_bits_total) - 1;
 
 /* Always load a full cache line from blocks_cache. Other loads must be
  * trimmed to the amount of data required for mf_bits_total blocks.
  */
-__m512i v_blocks = _mm512_loadu_si512(_cache[0]);
-__m512i v_mask   = _mm512_maskz_loadu_epi64(lane_mask, [0]);
-__m512i v_key= _mm512_maskz_loadu_epi64(lane_mask, [0]);
+uint32_t res_mask;
+
+{
+__m512i v_blocks = _mm512_loadu_si512(_cache[0]);
+__m512i v_mask   = _mm512_maskz_loadu_epi64(lane_mask, [0]);
+__m512i v_key= _mm512_maskz_loadu_epi64(lane_mask, [0]);
+__m512i v_data = _mm512_and_si512(v_blocks, v_mask);
+res_mask = _mm512_mask_cmpeq_epi64_mask(lane_mask, v_data, v_key);
+}
 
-__m512i v_data = _mm512_and_si512(v_blocks, v_mask);
-uint32_t res_mask = _mm512_mask_cmpeq_epi64_mask(lane_mask, v_data, v_key);
+if (mf_bits_total > 8) {
+uint32_t lane_mask_gt8 = lane_mask >> 8;
+__m512i v_blocks = _mm512_loadu_si512(_cache[8]);
+__m512i v_mask   = _mm512_maskz_loadu_epi64(lane_mask_gt8, [8]);
+__m512i v_key= _mm512_maskz_loadu_epi64(lane_mask_gt8, [8]);
+__m512i v_data = _mm512_and_si512(v_blocks, v_mask);
+uint32_t c = _mm5

[ovs-dev] [PATCH v8 04/16] dpif-avx512: Add ISA implementation of dpif.

2021-01-04 Thread Harry van Haaren

This commit adds the AVX512 implementation of DPIF functionality,
specifically the dp_netdev_input_outer_avx512 function. This function
only handles outer (no re-circulations), and is optimized to use the
AVX512 ISA for packet batching and other DPIF work.

Sparse is not able to handle the AVX512 intrinsics, causing compile
time failures, so it is disabled for this file.

Signed-off-by: Harry van Haaren 
Co-authored-by: Cian Ferriter 
Signed-off-by: Cian Ferriter 

---

v8:
- Fixup AVX512 mask to uint32_t conversion compilation warning.
---
 lib/automake.mk  |   5 +-
 lib/dpif-netdev-avx512.c | 264 +++
 lib/dpif-netdev-private-dfc.h|   8 +
 lib/dpif-netdev-private-dpif.h   |  32 
 lib/dpif-netdev-private-thread.h |  11 +-
 lib/dpif-netdev-private.h|  25 +++
 lib/dpif-netdev.c|  70 ++--
 7 files changed, 399 insertions(+), 16 deletions(-)
 create mode 100644 lib/dpif-netdev-avx512.c
 create mode 100644 lib/dpif-netdev-private-dpif.h

diff --git a/lib/automake.mk b/lib/automake.mk
index eccfaf3e3..650207940 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -33,11 +33,13 @@ lib_libopenvswitchavx512_la_CFLAGS = \
-mavx512f \
-mavx512bw \
-mavx512dq \
+   -mbmi \
-mbmi2 \
-fPIC \
$(AM_CFLAGS)
 lib_libopenvswitchavx512_la_SOURCES = \
-   lib/dpif-netdev-lookup-avx512-gather.c
+   lib/dpif-netdev-lookup-avx512-gather.c \
+   lib/dpif-netdev-avx512.c
 lib_libopenvswitchavx512_la_LDFLAGS = \
-static
 endif
@@ -113,6 +115,7 @@ lib_libopenvswitch_la_SOURCES = \
lib/dpif-netdev.h \
lib/dpif-netdev-private-dfc.h \
lib/dpif-netdev-private-dpcls.h \
+   lib/dpif-netdev-private-dpif.h \
lib/dpif-netdev-private-flow.h \
lib/dpif-netdev-private-hwol.h \
lib/dpif-netdev-private-thread.h \
diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
new file mode 100644
index 0..10228aeb0
--- /dev/null
+++ b/lib/dpif-netdev-avx512.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright (c) 2020 Intel.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __x86_64__
+/* Sparse cannot handle the AVX512 instructions */
+#if !defined(__CHECKER__)
+
+#include 
+
+#include "dpif-netdev.h"
+#include "dpif-netdev-perf.h"
+
+#include "dpif-netdev-private.h"
+#include "dpif-netdev-private-dpcls.h"
+#include "dpif-netdev-private-flow.h"
+#include "dpif-netdev-private-thread.h"
+
+#include "dp-packet.h"
+#include "netdev.h"
+
+#include "immintrin.h"
+
+/* Structure to contain per-packet metadata that must be attributed to the
+ * dp netdev flow. This is unfortunate to have to track per packet, however
+ * it's a bit awkward to maintain them in a performant way. This structure
+ * helps to keep two variables on a single cache line per packet.
+ */
+struct pkt_flow_meta {
+uint16_t bytes;
+uint16_t tcp_flags;
+};
+
+/* Structure of heap allocated memory for DPIF internals. */
+struct dpif_userdata {
+OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
+struct netdev_flow_key keys[NETDEV_MAX_BURST];
+OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
+struct netdev_flow_key *key_ptrs[NETDEV_MAX_BURST];
+OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
+struct pkt_flow_meta pkt_meta[NETDEV_MAX_BURST];
+};
+
+int32_t
+dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
+ struct dp_packet_batch *packets,
+ odp_port_t in_port)
+{
+/* Allocate DPIF userdata. */
+if (OVS_UNLIKELY(!pmd->netdev_input_func_userdata)) {
+pmd->netdev_input_func_userdata =
+xmalloc_pagealign(sizeof(struct dpif_userdata));
+}
+
+struct dpif_userdata *ud = pmd->netdev_input_func_userdata;
+struct netdev_flow_key *keys = ud->keys;
+struct netdev_flow_key **key_ptrs = ud->key_ptrs;
+struct pkt_flow_meta *pkt_meta = ud->pkt_meta;
+
+/* Stores the computed output: a rule pointer for each packet */
+/* The AVX512 DPIF implementation handles rules in a way that is optimized
+ * for reducing data-movement between HWOL/EMC/SMC and DPCLS. This is
+ * achieved by separating the rule arrays. Bitmasks are kept for each
+ * packet, indicating if it matched in the HWOL/EMC/SMC array or DPCLS
+

[ovs-dev] [PATCH v8 06/16] dpif-netdev: Add command to switch dpif implementation.

2021-01-04 Thread Harry van Haaren

This commit adds a new command to allow the user to switch
the active DPIF implementation at runtime. A probe function
is executed before switching the DPIF implementation, to ensure
the CPU is capable of running the ISA required. For example, the
below code will switch to the AVX512 enabled DPIF assuming
that the runtime CPU is capable of running AVX512 instructions:

 $ ovs-appctl dpif-netdev/dpif-set dpif_avx512

A new configuration flag is added to allow selection of the
default DPIF. This is useful for running the unit-tests against
the available DPIF implementations, without modifying each unit test.

The design of the testing & validation for ISA optimized DPIF
implementations is based around the work already upstream for DPCLS.
Note however that a DPCLS lookup has no state or side-effects, allowing
the auto-validator implementation to perform multiple lookups and
provide consistent statistic counters.

The DPIF component does have state, so running two implementations in
parallel and comparing output is not a valid testing method, as there
are changes in DPIF statistic counters (side effects). As a result, the
DPIF is tested directly against the unit-tests.

Signed-off-by: Harry van Haaren 
Co-authored-by: Cian Ferriter 
Signed-off-by: Cian Ferriter 
---
 acinclude.m4 | 15 ++
 configure.ac |  1 +
 lib/automake.mk  |  1 +
 lib/dpif-netdev-avx512.c | 14 +
 lib/dpif-netdev-private-dpif.c   | 92 
 lib/dpif-netdev-private-dpif.h   | 43 ++-
 lib/dpif-netdev-private-thread.h | 12 +
 lib/dpif-netdev.c| 86 +++--
 8 files changed, 248 insertions(+), 16 deletions(-)
 create mode 100644 lib/dpif-netdev-private-dpif.c

diff --git a/acinclude.m4 b/acinclude.m4
index 60871f67a..bf7f29825 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -30,6 +30,21 @@ AC_DEFUN([OVS_CHECK_DPCLS_AUTOVALIDATOR], [
   fi
 ])
 
+dnl Set OVS DPIF default implementation at configure time for running the unit
+dnl tests on the whole codebase without modifying tests per DPIF impl
+AC_DEFUN([OVS_CHECK_DPIF_AVX512_DEFAULT], [
+  AC_ARG_ENABLE([dpif-default-avx512],
+[AC_HELP_STRING([--enable-dpif-default-avx512], [Enable DPIF 
AVX512 implementation as default.])],
+[dpifavx512=yes],[dpifavx512=no])
+  AC_MSG_CHECKING([whether DPIF AVX512 is default implementation])
+  if test "$dpifavx512" != yes; then
+AC_MSG_RESULT([no])
+  else
+OVS_CFLAGS="$OVS_CFLAGS -DDPIF_AVX512_DEFAULT"
+AC_MSG_RESULT([yes])
+  fi
+])
+
 dnl OVS_ENABLE_WERROR
 AC_DEFUN([OVS_ENABLE_WERROR],
   [AC_ARG_ENABLE(
diff --git a/configure.ac b/configure.ac
index 126a1d9d1..76b1e4fec 100644
--- a/configure.ac
+++ b/configure.ac
@@ -185,6 +185,7 @@ OVS_ENABLE_WERROR
 OVS_ENABLE_SPARSE
 OVS_CTAGS_IDENTIFIERS
 OVS_CHECK_DPCLS_AUTOVALIDATOR
+OVS_CHECK_DPIF_AVX512_DEFAULT
 OVS_CHECK_BINUTILS_AVX512
 
 AC_ARG_VAR(KARCH, [Kernel Architecture String])
diff --git a/lib/automake.mk b/lib/automake.mk
index 650207940..2a41f7ab5 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -115,6 +115,7 @@ lib_libopenvswitch_la_SOURCES = \
lib/dpif-netdev.h \
lib/dpif-netdev-private-dfc.h \
lib/dpif-netdev-private-dpcls.h \
+   lib/dpif-netdev-private-dpif.c \
lib/dpif-netdev-private-dpif.h \
lib/dpif-netdev-private-flow.h \
lib/dpif-netdev-private-hwol.h \
diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
index caba1fa1c..fff469e10 100644
--- a/lib/dpif-netdev-avx512.c
+++ b/lib/dpif-netdev-avx512.c
@@ -19,6 +19,7 @@
 #if !defined(__CHECKER__)
 
 #include 
+#include 
 
 #include "dpif-netdev.h"
 #include "dpif-netdev-perf.h"
@@ -54,6 +55,19 @@ struct dpif_userdata {
 struct pkt_flow_meta pkt_meta[NETDEV_MAX_BURST];
 };
 
+int32_t
+dp_netdev_input_outer_avx512_probe(void)
+{
+int avx512f_available = dpdk_get_cpu_has_isa("x86_64", "avx512f");
+int bmi2_available = dpdk_get_cpu_has_isa("x86_64", "bmi2");
+
+if (!avx512f_available || !bmi2_available) {
+return -ENOTSUP;
+}
+
+return 0;
+}
+
 int32_t
 dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
  struct dp_packet_batch *packets,
diff --git a/lib/dpif-netdev-private-dpif.c b/lib/dpif-netdev-private-dpif.c
new file mode 100644
index 0..9e1f3b8f9
--- /dev/null
+++ b/lib/dpif-netdev-private-dpif.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2020 Intel Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licens

[ovs-dev] [PATCH v8 07/16] dpif-netdev: Add command to get dpif implementations.

2021-01-04 Thread Harry van Haaren

This commit adds a new command to retrieve the list of available
DPIF implementations. This can be used to check which implementations
of the DPIF are available in any given OVS binary.

Usage:
 $ ovs-appctl dpif-netdev/dpif-get

Signed-off-by: Harry van Haaren 
---
 lib/dpif-netdev-private-dpif.c |  7 +++
 lib/dpif-netdev-private-dpif.h |  6 ++
 lib/dpif-netdev.c  | 24 
 3 files changed, 37 insertions(+)

diff --git a/lib/dpif-netdev-private-dpif.c b/lib/dpif-netdev-private-dpif.c
index 9e1f3b8f9..c5021fe9f 100644
--- a/lib/dpif-netdev-private-dpif.c
+++ b/lib/dpif-netdev-private-dpif.c
@@ -61,6 +61,13 @@ dp_netdev_impl_get_default(void)
 return func;
 }
 
+uint32_t
+dp_netdev_impl_get(const struct dpif_netdev_impl_info_t **out_impls)
+{
+ovs_assert(out_impls);
+*out_impls = dpif_impls;
+return ARRAY_SIZE(dpif_impls);
+}
 
 /* This function checks all available DPIF implementations, and selects the
  * returns the function pointer to the one requested by "name".
diff --git a/lib/dpif-netdev-private-dpif.h b/lib/dpif-netdev-private-dpif.h
index a09f90acc..99fbda943 100644
--- a/lib/dpif-netdev-private-dpif.h
+++ b/lib/dpif-netdev-private-dpif.h
@@ -47,6 +47,12 @@ struct dpif_netdev_impl_info_t {
 const char *name;
 };
 
+/* This function returns all available implementations to the caller. The
+ * quantity of implementations is returned by the int return value.
+ */
+uint32_t
+dp_netdev_impl_get(const struct dpif_netdev_impl_info_t **out_impls);
+
 /* This function checks all available DPIF implementations, and selects the
  * returns the function pointer to the one requested by "name".
  */
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 2a0b103c3..4c074995c 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -990,6 +990,27 @@ dpif_netdev_subtable_lookup_set(struct unixctl_conn *conn, 
int argc,
 ds_destroy();
 }
 
+static void
+dpif_netdev_impl_get(struct unixctl_conn *conn, int argc OVS_UNUSED,
+ const char *argv[] OVS_UNUSED, void *aux OVS_UNUSED)
+{
+const struct dpif_netdev_impl_info_t *dpif_impls;
+uint32_t count = dp_netdev_impl_get(_impls);
+if (count == 0) {
+unixctl_command_reply_error(conn, "error getting dpif names");
+return;
+}
+
+/* Add all dpif functions to reply string. */
+struct ds reply = DS_EMPTY_INITIALIZER;
+ds_put_cstr(, "Available DPIF implementations:\n");
+for (uint32_t i = 0; i < count; i++) {
+ds_put_format(, "  %s\n", dpif_impls[i].name);
+}
+unixctl_command_reply(conn, ds_cstr());
+ds_destroy();
+}
+
 static void
 dpif_netdev_impl_set(struct unixctl_conn *conn, int argc,
  const char *argv[], void *aux OVS_UNUSED)
@@ -1288,6 +1309,9 @@ dpif_netdev_init(void)
  "[dpif implementation name] [dp]",
  1, 2, dpif_netdev_impl_set,
  NULL);
+unixctl_command_register("dpif-netdev/dpif-get", "",
+ 0, 0, dpif_netdev_impl_get,
+ NULL);
 return 0;
 }
 
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v8 05/16] dpif-avx512: Add HWOL support to avx512 dpif.

2021-01-04 Thread Harry van Haaren

Partial hardware offload is implemented in a very similar way to the
scalar dpif.

Signed-off-by: Harry van Haaren 
---
 lib/dpif-netdev-avx512.c | 28 +---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
index 10228aeb0..caba1fa1c 100644
--- a/lib/dpif-netdev-avx512.c
+++ b/lib/dpif-netdev-avx512.c
@@ -27,6 +27,7 @@
 #include "dpif-netdev-private-dpcls.h"
 #include "dpif-netdev-private-flow.h"
 #include "dpif-netdev-private-thread.h"
+#include "dpif-netdev-private-hwol.h"
 
 #include "dp-packet.h"
 #include "netdev.h"
@@ -111,9 +112,32 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
*pmd,
 uint32_t i = __builtin_ctz(iter);
 iter = _blsr_u64(iter);
 
-/* Initialize packet md and do miniflow extract */
+/* Get packet pointer from bitmask and packet md */
 struct dp_packet *packet = packets->packets[i];
 pkt_metadata_init(>md, in_port);
+
+struct dp_netdev_flow *f = NULL;
+
+/* Check for partial hardware offload mark */
+uint32_t mark;
+if (dp_packet_has_flow_mark(packet, )) {
+f = mark_to_flow_find(pmd, mark);
+if (f) {
+rules[i] = >cr;
+
+/* This is nasty - instead of using the HWOL provided flow,
+ * parse the packet data anyway to find the location of the TCP
+ * header to extract the TCP flags for the rule.
+ */
+pkt_meta[i].tcp_flags = parse_tcp_flags(packet);
+
+pkt_meta[i].bytes = dp_packet_size(packet);
+hwol_emc_smc_hitmask |= (1 << i);
+continue;
+}
+}
+
+/* Do miniflow extract into keys */
 struct netdev_flow_key *key = [i];
 miniflow_extract(packet, >mf);
 
@@ -124,8 +148,6 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
*pmd,
 key->len = netdev_flow_key_size(miniflow_n_values(>mf));
 key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet, >mf);
 
-struct dp_netdev_flow *f = NULL;
-
 if (emc_enabled) {
 f = emc_lookup(>emc_cache, key);
 
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v8 03/16] dpif-netdev: Add function pointer for netdev input.

2021-01-04 Thread Harry van Haaren

This commit adds a function pointer to the pmd thread data structure,
giving the pmd thread flexibility in its dpif-input function choice.
This allows choosing of the implementation based on ISA capabilities
of the runtime CPU, leading to optimizations and higher performance.

Signed-off-by: Harry van Haaren 
---
 lib/dpif-netdev-private-thread.h | 12 
 lib/dpif-netdev.c|  7 ++-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/lib/dpif-netdev-private-thread.h b/lib/dpif-netdev-private-thread.h
index a5b3ae360..089223aaf 100644
--- a/lib/dpif-netdev-private-thread.h
+++ b/lib/dpif-netdev-private-thread.h
@@ -47,6 +47,13 @@ struct dp_netdev_pmd_thread_ctx {
 uint32_t emc_insert_min;
 };
 
+/* Forward declaration for typedef */
+struct dp_netdev_pmd_thread;
+
+typedef void (*dp_netdev_input_func)(struct dp_netdev_pmd_thread *pmd,
+ struct dp_packet_batch *packets,
+ odp_port_t port_no);
+
 /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
  * the performance overhead of interrupt processing.  Therefore netdev can
  * not implement rx-wait for these devices.  dpif-netdev needs to poll
@@ -101,6 +108,11 @@ struct dp_netdev_pmd_thread {
 /* Current context of the PMD thread. */
 struct dp_netdev_pmd_thread_ctx ctx;
 
+/* Function pointer to call for dp_netdev_input() functionality. */
+dp_netdev_input_func netdev_input_func;
+/* Pointer for per-DPIF implementation scratch space. */
+void *netdev_input_func_userdata;
+
 struct seq *reload_seq;
 uint64_t last_reload_seq;
 
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 55fbf0f16..dfdcf6218 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -4177,8 +4177,9 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread 
*pmd,
 }
 }
 }
+
 /* Process packet batch. */
-dp_netdev_input(pmd, , port_no);
+pmd->netdev_input_func(pmd, , port_no);
 
 /* Assign processing cycles to rx queue. */
 cycles = cycle_timer_stop(>perf_stats, );
@@ -5962,6 +5963,10 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread 
*pmd, struct dp_netdev *dp,
 hmap_init(>tnl_port_cache);
 hmap_init(>send_port_cache);
 cmap_init(>tx_bonds);
+
+/* Initialize the DPIF function pointer to the default scalar version */
+pmd->netdev_input_func = dp_netdev_input;
+
 /* init the 'flow_cache' since there is no
  * actual thread created for NON_PMD_CORE_ID. */
 if (core_id == NON_PMD_CORE_ID) {
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v8 02/16] dpif-netdev: Split HWOL out to own header file.

2021-01-04 Thread Harry van Haaren

This commit moves the datapath lookup functions required for
hardware offload to a separate file. This allows other DPIF
implementations to access the lookup functions, encouraging
code reuse.

Signed-off-by: Harry van Haaren 
---
 lib/automake.mk|  1 +
 lib/dpif-netdev-private-hwol.h | 63 ++
 lib/dpif-netdev.c  | 39 ++---
 3 files changed, 67 insertions(+), 36 deletions(-)
 create mode 100644 lib/dpif-netdev-private-hwol.h

diff --git a/lib/automake.mk b/lib/automake.mk
index 22a281fcc..eccfaf3e3 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -114,6 +114,7 @@ lib_libopenvswitch_la_SOURCES = \
lib/dpif-netdev-private-dfc.h \
lib/dpif-netdev-private-dpcls.h \
lib/dpif-netdev-private-flow.h \
+   lib/dpif-netdev-private-hwol.h \
lib/dpif-netdev-private-thread.h \
lib/dpif-netdev-private.h \
lib/dpif-netdev-perf.c \
diff --git a/lib/dpif-netdev-private-hwol.h b/lib/dpif-netdev-private-hwol.h
new file mode 100644
index 0..447010ab8
--- /dev/null
+++ b/lib/dpif-netdev-private-hwol.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2015 Nicira, Inc.
+ * Copyright (c) 2020 Intel Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef DPIF_NETDEV_PRIVATE_HWOL_H
+#define DPIF_NETDEV_PRIVATE_HWOL_H 1
+
+#include "dpif-netdev-private-flow.h"
+
+#define MAX_FLOW_MARK   (UINT32_MAX - 1)
+#define INVALID_FLOW_MARK   0
+/* Zero flow mark is used to indicate the HW to remove the mark. A packet
+ * marked with zero mark is received in SW without a mark at all, so it
+ * cannot be used as a valid mark.
+ */
+
+struct megaflow_to_mark_data {
+const struct cmap_node node;
+ovs_u128 mega_ufid;
+uint32_t mark;
+};
+
+struct flow_mark {
+struct cmap megaflow_to_mark;
+struct cmap mark_to_flow;
+struct id_pool *pool;
+};
+
+/* allocated in dpif-netdev.c */
+extern struct flow_mark flow_mark;
+
+static inline struct dp_netdev_flow *
+mark_to_flow_find(const struct dp_netdev_pmd_thread *pmd,
+  const uint32_t mark)
+{
+struct dp_netdev_flow *flow;
+
+CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
+ _mark.mark_to_flow) {
+if (flow->mark == mark && flow->pmd_id == pmd->core_id &&
+flow->dead == false) {
+return flow;
+}
+}
+
+return NULL;
+}
+
+
+#endif /* dpif-netdev-private-hwol.h */
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 7be9d664e..55fbf0f16 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -84,6 +84,8 @@
 #include "util.h"
 #include "uuid.h"
 
+#include "dpif-netdev-private-hwol.h"
+
 VLOG_DEFINE_THIS_MODULE(dpif_netdev);
 
 /* Auto Load Balancing Defaults */
@@ -1949,26 +1951,8 @@ dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread 
*pmd,
 return cls;
 }
 
-#define MAX_FLOW_MARK   (UINT32_MAX - 1)
-#define INVALID_FLOW_MARK   0
-/* Zero flow mark is used to indicate the HW to remove the mark. A packet
- * marked with zero mark is received in SW without a mark at all, so it
- * cannot be used as a valid mark.
- */
-
-struct megaflow_to_mark_data {
-const struct cmap_node node;
-ovs_u128 mega_ufid;
-uint32_t mark;
-};
-
-struct flow_mark {
-struct cmap megaflow_to_mark;
-struct cmap mark_to_flow;
-struct id_pool *pool;
-};
 
-static struct flow_mark flow_mark = {
+struct flow_mark flow_mark = {
 .megaflow_to_mark = CMAP_INITIALIZER,
 .mark_to_flow = CMAP_INITIALIZER,
 };
@@ -2137,23 +2121,6 @@ flow_mark_flush(struct dp_netdev_pmd_thread *pmd)
 }
 }
 
-static struct dp_netdev_flow *
-mark_to_flow_find(const struct dp_netdev_pmd_thread *pmd,
-  const uint32_t mark)
-{
-struct dp_netdev_flow *flow;
-
-CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
- _mark.mark_to_flow) {
-if (flow->mark == mark && flow->pmd_id == pmd->core_id &&
-flow->dead == false) {
-return flow;
-}
-}
-
-return NULL;
-}
-
 static struct dp_flow_offload_item *
 dp_netdev_alloc_flow_offload(struct dp_netdev_pmd_thread *pmd,
  struct dp_netdev_flow *flow,
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v8 00/16] DPIF Generic Framework

2021-01-04 Thread Harry van Haaren

-usability in other .c files.

Questions welcomed! Regards, -Harry



Cian Ferriter (1):
  docs/dpdk/bridge: Add dpif performance section.

Harry van Haaren (15):
  dpif-netdev: Refactor to multiple header files.
  dpif-netdev: Split HWOL out to own header file.
  dpif-netdev: Add function pointer for netdev input.
  dpif-avx512: Add ISA implementation of dpif.
  dpif-avx512: Add HWOL support to avx512 dpif.
  dpif-netdev: Add command to switch dpif implementation.
  dpif-netdev: Add command to get dpif implementations.
  dpif-netdev: Move pmd_try_optimize function in file.
  dpif-netdev/dpcls: Refactor function names to dpcls.
  dpif-netdev/dpcls-avx512: enable 16 block processing.
  dpif-netdev/dpcls: specialize more subtable signatures.
  dpdk: Cache result of CPU ISA checks.
  dpcls-avx512: enabling avx512 vector popcount instruction.
  dpif-netdev: Optimize dp output action
  netdev: Optimize netdev_send_prepare_batch

 Documentation/topics/dpdk/bridge.rst   |  37 +
 NEWS   |  14 +
 acinclude.m4   |  15 +
 configure.ac   |   1 +
 lib/automake.mk|  11 +-
 lib/dpdk.c |  29 +-
 lib/dpif-netdev-avx512.c   | 300 
 lib/dpif-netdev-lookup-autovalidator.c |   1 -
 lib/dpif-netdev-lookup-avx512-gather.c | 278 ++--
 lib/dpif-netdev-lookup-generic.c   |   7 +-
 lib/dpif-netdev-lookup.h   |   2 +-
 lib/dpif-netdev-private-dfc.h  | 252 +++
 lib/dpif-netdev-private-dpcls.h| 127 
 lib/dpif-netdev-private-dpif.c |  99 +++
 lib/dpif-netdev-private-dpif.h |  79 +++
 lib/dpif-netdev-private-flow.h | 162 +
 lib/dpif-netdev-private-hwol.h |  63 ++
 lib/dpif-netdev-private-thread.h   | 215 ++
 lib/dpif-netdev-private.h  | 123 +---
 lib/dpif-netdev.c  | 925 -
 lib/netdev.c   |  31 +-
 21 files changed, 1944 insertions(+), 827 deletions(-)
 create mode 100644 lib/dpif-netdev-avx512.c
 create mode 100644 lib/dpif-netdev-private-dfc.h
 create mode 100644 lib/dpif-netdev-private-dpcls.h
 create mode 100644 lib/dpif-netdev-private-dpif.c
 create mode 100644 lib/dpif-netdev-private-dpif.h
 create mode 100644 lib/dpif-netdev-private-flow.h
 create mode 100644 lib/dpif-netdev-private-hwol.h
 create mode 100644 lib/dpif-netdev-private-thread.h

-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v7 11/14] dpif-netdev/dpcls-avx512: enable 16 block processing.

2020-12-16 Thread Harry van Haaren

This commit implements larger subtable searches in avx512. A limitation
of the previous implementation was that up to 8 blocks of miniflow
data could be matched on (so a subtable with 8 blocks was handled
in avx, but 9 blocks or more would fall back to scalar/generic).
This limitation is removed in this patch, where up to 16 blocks
of subtable can be matched on.

From an implementation perspective, the key to enabling 16 blocks
over 8 blocks was to do bitmask calculation up front, and then use
the pre-calculated bitmasks for 2x passes of the "blocks gather"
routine. The bitmasks need to be shifted for k-mask usage in the
upper (8-15) block range, but it is relatively trivial. This also
helps in case expanding to 24 blocks is desired in future.

The implementation of the 2nd iteration to handle > 8 blocks is
behind a conditional branch which checks the total number of bits.
This helps the specialized versions of the function that have a
miniflow fingerprint of less-than-or-equal 8 blocks, as the code
can be statically stripped out of those functions. Specialized
functions that do require more than 8 blocks will have the branch
removed and unconditionally execute the 2nd blocks gather routine.

Lastly, the _any() flavour will have the conditional branch, and
the branch predictor may mispredict a bit, but per burst will
likely get most packets correct (particularly towards the middle
and end of a burst).

The code has been run with unit tests under autovalidation and
passes all cases, and unit test coverage has been checked to
ensure the 16 block code paths are executing.

Signed-off-by: Harry van Haaren 

---

v7:
- Fixup long line (0day build bot)
---
 lib/dpif-netdev-lookup-avx512-gather.c | 203 ++---
 1 file changed, 146 insertions(+), 57 deletions(-)

diff --git a/lib/dpif-netdev-lookup-avx512-gather.c 
b/lib/dpif-netdev-lookup-avx512-gather.c
index 8fc1cdfa5..1f27c0536 100644
--- a/lib/dpif-netdev-lookup-avx512-gather.c
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -34,7 +34,21 @@
  * AVX512 code at a time.
  */
 #define NUM_U64_IN_ZMM_REG (8)
-#define BLOCKS_CACHE_SIZE (NETDEV_MAX_BURST * NUM_U64_IN_ZMM_REG)
+
+/* This implementation of AVX512 gather allows up to 16 blocks of MF data to be
+ * present in the blocks_cache, hence the multiply by 2 in the blocks count.
+ */
+#define MF_BLOCKS_PER_PACKET (NUM_U64_IN_ZMM_REG * 2)
+
+/* Blocks cache size is the maximum number of miniflow blocks that this
+ * implementation of lookup can handle.
+ */
+#define BLOCKS_CACHE_SIZE (NETDEV_MAX_BURST * MF_BLOCKS_PER_PACKET)
+
+/* The gather instruction can handle a scale for the size of the items to
+ * gather. For uint64_t data, this scale is 8.
+ */
+#define GATHER_SCALE_8 (8)
 
 
 VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
@@ -69,22 +83,83 @@ netdev_rule_matches_key(const struct dpcls_rule *rule,
 {
 const uint64_t *keyp = miniflow_get_values(>flow.mf);
 const uint64_t *maskp = miniflow_get_values(>mask->mf);
-const uint32_t lane_mask = (1 << mf_bits_total) - 1;
+const uint32_t lane_mask = (1ULL << mf_bits_total) - 1;
 
 /* Always load a full cache line from blocks_cache. Other loads must be
  * trimmed to the amount of data required for mf_bits_total blocks.
  */
-__m512i v_blocks = _mm512_loadu_si512(_cache[0]);
-__m512i v_mask   = _mm512_maskz_loadu_epi64(lane_mask, [0]);
-__m512i v_key= _mm512_maskz_loadu_epi64(lane_mask, [0]);
+uint32_t res_mask;
+
+{
+__m512i v_blocks = _mm512_loadu_si512(_cache[0]);
+__m512i v_mask   = _mm512_maskz_loadu_epi64(lane_mask, [0]);
+__m512i v_key= _mm512_maskz_loadu_epi64(lane_mask, [0]);
+__m512i v_data = _mm512_and_si512(v_blocks, v_mask);
+res_mask = _mm512_mask_cmpeq_epi64_mask(lane_mask, v_data, v_key);
+}
 
-__m512i v_data = _mm512_and_si512(v_blocks, v_mask);
-uint32_t res_mask = _mm512_mask_cmpeq_epi64_mask(lane_mask, v_data, v_key);
+if (mf_bits_total > 8) {
+uint32_t lane_mask_gt8 = lane_mask >> 8;
+__m512i v_blocks = _mm512_loadu_si512(_cache[8]);
+__m512i v_mask   = _mm512_maskz_loadu_epi64(lane_mask_gt8, [8]);
+__m512i v_key= _mm512_maskz_loadu_epi64(lane_mask_gt8, [8]);
+__m512i v_data = _mm512_and_si512(v_blocks, v_mask);
+uint32_t c = _mm512_mask_cmpeq_epi64_mask(lane_mask_gt8, v_data,
+  v_key);
+res_mask |= (c << 8);
+}
 
-/* returns 1 assuming result of SIMD compare is all blocks. */
+/* returns 1 assuming result of SIMD compare is all blocks matching. */
 return res_mask == lane_mask;
 }
 
+/* Takes u0 and u1 inputs, and gathers the next 8 blocks to be stored
+ * contiguously into the blocks cache. Note that the pointers and bitmasks
+ * passed into this function must be incremented for handling next 8 blocks.
+ */
+static inline ALWAYS_INLIN

[ovs-dev] [PATCH v7 08/14] docs/dpdk/bridge: Add dpif performance section.

2020-12-16 Thread Harry van Haaren

From: Cian Ferriter 

This section details how two new commands can be used to list and select
the different dpif implementations. It also details how a non default
dpif implementation can be tested with the OVS unit test suite.

Add NEWS updates for the dpif-netdev.c refactor and the new dpif
implementations/commands.

Signed-off-by: Cian Ferriter 
---
 Documentation/topics/dpdk/bridge.rst | 37 
 NEWS |  5 
 2 files changed, 42 insertions(+)

diff --git a/Documentation/topics/dpdk/bridge.rst 
b/Documentation/topics/dpdk/bridge.rst
index 526d5c959..ca90d7bdb 100644
--- a/Documentation/topics/dpdk/bridge.rst
+++ b/Documentation/topics/dpdk/bridge.rst
@@ -214,3 +214,40 @@ implementation ::
 
 Compile OVS in debug mode to have `ovs_assert` statements error out if
 there is a mis-match in the DPCLS lookup implementation.
+
+Datapath Interface Performance
+------------------------------
+
+The datapath interface (DPIF) or dp_netdev_input() is responsible for taking
+packets through the major components of the userspace datapath; such as
+miniflow_extract, EMC, SMC and DPCLS lookups, and a lot of the performance
+stats associated with the datapath.
+
+Just like with the SIMD DPCLS work above, SIMD can be applied to the DPIF to
+improve performance.
+
+OVS provides multiple implementations of the DPIF. These can be listed with the
+following command ::
+
+$ ovs-appctl dpif-netdev/dpif-get
+Available DPIF implementations:
+  dpif_scalar
+  dpif_avx512
+
+By default, dpif_scalar is used. The DPIF implementation can be selected by
+name ::
+
+$ ovs-appctl dpif-netdev/dpif-set dpif_avx512
+DPIF implementation set to dpif_avx512.
+
+$ ovs-appctl dpif-netdev/dpif-set dpif_scalar
+DPIF implementation set to dpif_scalar.
+
+Running Unit Tests with AVX512 DPIF
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Since the AVX512 DPIF is disabled by default, a compile time option is
+available in order to test it with the OVS unit test suite. When building with
+a CPU that supports AVX512, use the following configure option ::
+
+$ ./configure --enable-dpif-default-avx512
diff --git a/NEWS b/NEWS
index 1a39cc661..30a3d8dbb 100644
--- a/NEWS
+++ b/NEWS
@@ -28,6 +28,11 @@ Post-v2.14.0
OpenFlow bundle actions.
- Support for GitHub Actions based continuous integration builds has been
  added.
+   - Userspace datapath:
+ * Refactor lib/dpif-netdev.c to multiple header files.
+ * Add avx512 implementation of dpif which can process non recirculated
+   packets. It supports partial HWOL, EMC, SMC and DPCLS lookups.
+ * Add commands to get and set the dpif implementations.
 
 
 v2.14.0 - 17 Aug 2020
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v7 14/14] dpcls-avx512: enabling avx512 vector popcount instruction.

2020-12-16 Thread Harry van Haaren

This commit enables the AVX512-VPOPCNTDQ Vector Popcount
instruction. This instruction is not available on every CPU
that supports the AVX512-F Foundation ISA, hence it is enabled
only when the additional VPOPCNTDQ ISA check is passed.

The vector popcount instruction is used instead of the AVX512
popcount emulation code present in the avx512 optimized DPCLS today.
It provides higher performance in the SIMD miniflow processing
as that requires the popcount to calculate the miniflow block indexes.

Signed-off-by: Harry van Haaren 

---

v7:
- Remove code that handled unused variable (0day build bot)

v6:
- Now that the DPDK 20.11 dependency exists, it is possible to use the
  RTE_CPUFLAG_* defines to enable the AVX512 vectorized popcount instruction.
---
 lib/dpdk.c |  1 +
 lib/dpif-netdev-lookup-avx512-gather.c | 84 --
 2 files changed, 67 insertions(+), 18 deletions(-)

diff --git a/lib/dpdk.c b/lib/dpdk.c
index c883a4b8b..a9494a40f 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -655,6 +655,7 @@ dpdk_get_cpu_has_isa(const char *arch, const char *feature)
 #if __x86_64__
 /* CPU flags only defined for the architecture that support it. */
 CHECK_CPU_FEATURE(feature, "avx512f", RTE_CPUFLAG_AVX512F);
+CHECK_CPU_FEATURE(feature, "avx512vpopcntdq", RTE_CPUFLAG_AVX512VPOPCNTDQ);
 CHECK_CPU_FEATURE(feature, "bmi2", RTE_CPUFLAG_BMI2);
 #endif
 
diff --git a/lib/dpif-netdev-lookup-avx512-gather.c 
b/lib/dpif-netdev-lookup-avx512-gather.c
index 3a684fadf..9a3273dc6 100644
--- a/lib/dpif-netdev-lookup-avx512-gather.c
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -53,6 +53,15 @@
 
 VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
 
+
+/* Wrapper function required to enable ISA. */
+static inline __m512i
+__attribute__((__target__("avx512vpopcntdq")))
+_mm512_popcnt_epi64_wrapper(__m512i v_in)
+{
+return _mm512_popcnt_epi64(v_in);
+}
+
 static inline __m512i
 _mm512_popcnt_epi64_manual(__m512i v_in)
 {
@@ -126,7 +135,8 @@ avx512_blocks_gather(__m512i v_u0, /* reg of u64 of all u0 
bits */
  __mmask64 u1_bcast_msk,  /* mask of u1 lanes */
  const uint64_t pkt_mf_u0_pop, /* num bits in u0 of pkt */
  __mmask64 zero_mask, /* maskz if pkt not have mf bit */
- __mmask64 u64_lanes_mask) /* total lane count to use */
+ __mmask64 u64_lanes_mask, /* total lane count to use */
+ const uint32_t use_vpop)  /* use AVX512 vpopcntdq */
 {
 /* Suggest to compiler to load tbl blocks ahead of gather() */
 __m512i v_tbl_blocks = _mm512_maskz_loadu_epi64(u64_lanes_mask,
@@ -140,8 +150,15 @@ avx512_blocks_gather(__m512i v_u0, /* reg of u64 of all u0 
bits */
   tbl_mf_masks);
 __m512i v_masks = _mm512_and_si512(v_pkt_bits, v_tbl_masks);
 
-/* Manual AVX512 popcount for u64 lanes. */
-__m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
+/* Calculate AVX512 popcount for u64 lanes using the native instruction
+ * if available, or using emulation if not available.
+ */
+__m512i v_popcnts;
+if (use_vpop) {
+v_popcnts = _mm512_popcnt_epi64_wrapper(v_masks);
+} else {
+v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
+}
 
 /* Add popcounts and offset for u1 bits. */
 __m512i v_idx_u0_offset = _mm512_maskz_set1_epi64(u1_bcast_msk,
@@ -166,7 +183,8 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,
const struct netdev_flow_key *keys[],
struct dpcls_rule **rules,
const uint32_t bit_count_u0,
-   const uint32_t bit_count_u1)
+   const uint32_t bit_count_u1,
+   const uint32_t use_vpop)
 {
 OVS_ALIGNED_VAR(CACHE_LINE_SIZE)uint64_t block_cache[BLOCKS_CACHE_SIZE];
 uint32_t hashes[NETDEV_MAX_BURST];
@@ -218,7 +236,8 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,
 u1_bcast_mask,
 pkt_mf_u0_pop,
 zero_mask,
-bit_count_total_mask);
+bit_count_total_mask,
+use_vpop);
 _mm512_storeu_si512(_cache[i * MF_BLOCKS_PER_PACKET], v_blocks);
 
 if (bit_count_total > 8) {
@@ -239,7 +258,8 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,
 u1_bcast_mask_gt8,
 pkt_mf_u0_pop,
 zero_mask_gt8,
-

[ovs-dev] [PATCH v7 10/14] dpif-netdev/dpcls: Refactor function names to dpcls.

2020-12-16 Thread Harry van Haaren

This commit refactors the function names from netdev_*
namespace to the dpcls_* namespace, as they are only used
by dpcls code. With the name change, it becomes more obvious
that the functions belong to dpcls functionality, and in the
dpif-netdev-private-dpcls.h header file.

Signed-off-by: Harry van Haaren 
---
 lib/dpif-netdev-private-dpcls.h |  6 ++
 lib/dpif-netdev.c   | 21 ++---
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/lib/dpif-netdev-private-dpcls.h b/lib/dpif-netdev-private-dpcls.h
index 5bc579bba..e66cae3f4 100644
--- a/lib/dpif-netdev-private-dpcls.h
+++ b/lib/dpif-netdev-private-dpcls.h
@@ -97,10 +97,8 @@ struct dpcls_subtable {
 
 /* Generates a mask for each bit set in the subtable's miniflow. */
 void
-netdev_flow_key_gen_masks(const struct netdev_flow_key *tbl,
-  uint64_t *mf_masks,
-  const uint32_t mf_bits_u0,
-  const uint32_t mf_bits_u1);
+dpcls_flow_key_gen_masks(const struct netdev_flow_key *tbl, uint64_t *mf_masks,
+ const uint32_t mf_bits_u0, const uint32_t mf_bits_u1);
 
 /* Matches a dpcls rule against the incoming packet in 'target' */
 bool dpcls_rule_matches_key(const struct dpcls_rule *rule,
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index eea6c11f0..3168f153b 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -8308,7 +8308,7 @@ dpcls_create_subtable(struct dpcls *cls, const struct 
netdev_flow_key *mask)
 subtable->mf_bits_set_unit0 = unit0;
 subtable->mf_bits_set_unit1 = unit1;
 subtable->mf_masks = xmalloc(sizeof(uint64_t) * (unit0 + unit1));
-netdev_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1);
+dpcls_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1);
 
 /* Get the preferred subtable search function for this (u0,u1) subtable.
  * The function is guaranteed to always return a valid implementation, and
@@ -8407,11 +8407,10 @@ dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
 }
 }
 
-/* Inner loop for mask generation of a unit, see netdev_flow_key_gen_masks. */
+/* Inner loop for mask generation of a unit, see dpcls_flow_key_gen_masks. */
 static inline void
-netdev_flow_key_gen_mask_unit(uint64_t iter,
-  const uint64_t count,
-  uint64_t *mf_masks)
+dpcls_flow_key_gen_mask_unit(uint64_t iter, const uint64_t count,
+ uint64_t *mf_masks)
 {
 int i;
 for (i = 0; i < count; i++) {
@@ -8432,16 +8431,16 @@ netdev_flow_key_gen_mask_unit(uint64_t iter,
  * @param mf_bits_unit0 Number of bits set in unit0 of the miniflow
  */
 void
-netdev_flow_key_gen_masks(const struct netdev_flow_key *tbl,
-  uint64_t *mf_masks,
-  const uint32_t mf_bits_u0,
-  const uint32_t mf_bits_u1)
+dpcls_flow_key_gen_masks(const struct netdev_flow_key *tbl,
+ uint64_t *mf_masks,
+ const uint32_t mf_bits_u0,
+ const uint32_t mf_bits_u1)
 {
 uint64_t iter_u0 = tbl->mf.map.bits[0];
 uint64_t iter_u1 = tbl->mf.map.bits[1];
 
-netdev_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, _masks[0]);
-netdev_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, _masks[mf_bits_u0]);
+dpcls_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, _masks[0]);
+dpcls_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, _masks[mf_bits_u0]);
 }
 
 /* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v7 09/14] dpif-netdev: Move pmd_try_optimize function in file.

2020-12-16 Thread Harry van Haaren

This commit moves the pmd_try_optimize function to a more
appropriate location in the file - currently it sits in the
DPCLS section, which is not its correct home.

Signed-off-by: Harry van Haaren 
---
 lib/dpif-netdev.c | 146 +++---
 1 file changed, 73 insertions(+), 73 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 4c074995c..eea6c11f0 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -5638,6 +5638,79 @@ reload:
 return NULL;
 }
 
+static inline void
+dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
+   struct polled_queue *poll_list, int poll_cnt)
+{
+struct dpcls *cls;
+uint64_t tot_idle = 0, tot_proc = 0;
+unsigned int pmd_load = 0;
+
+if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
+uint64_t curr_tsc;
+struct pmd_auto_lb *pmd_alb = >dp->pmd_alb;
+if (pmd_alb->is_enabled && !pmd->isolated
+&& (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
+   pmd->prev_stats[PMD_CYCLES_ITER_IDLE])
+&& (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
+pmd->prev_stats[PMD_CYCLES_ITER_BUSY]))
+{
+tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
+   pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
+tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
+   pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
+
+if (tot_proc) {
+pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
+}
+
+if (pmd_load >= ALB_PMD_LOAD_THRESHOLD) {
+atomic_count_inc(>pmd_overloaded);
+} else {
+atomic_count_set(>pmd_overloaded, 0);
+}
+}
+
+pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
+pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
+pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
+pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
+
+/* Get the cycles that were used to process each queue and store. */
+for (unsigned i = 0; i < poll_cnt; i++) {
+uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
+RXQ_CYCLES_PROC_CURR);
+dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
+dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
+ 0);
+}
+curr_tsc = cycles_counter_update(>perf_stats);
+if (pmd->intrvl_tsc_prev) {
+/* There is a prev timestamp, store a new intrvl cycle count. */
+atomic_store_relaxed(>intrvl_cycles,
+ curr_tsc - pmd->intrvl_tsc_prev);
+}
+pmd->intrvl_tsc_prev = curr_tsc;
+/* Start new measuring interval */
+pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
+}
+
+if (pmd->ctx.now > pmd->next_optimization) {
+/* Try to obtain the flow lock to block out revalidator threads.
+ * If not possible, just try next time. */
+if (!ovs_mutex_trylock(>flow_mutex)) {
+/* Optimize each classifier */
+CMAP_FOR_EACH (cls, node, >classifiers) {
+dpcls_sort_subtable_vector(cls);
+}
+ovs_mutex_unlock(>flow_mutex);
+/* Start new measuring interval */
+pmd->next_optimization = pmd->ctx.now
+ + DPCLS_OPTIMIZATION_INTERVAL;
+}
+}
+}
+
 static void
 dp_netdev_disable_upcall(struct dp_netdev *dp)
 OVS_ACQUIRES(dp->upcall_rwlock)
@@ -8304,79 +8377,6 @@ dpcls_sort_subtable_vector(struct dpcls *cls)
 pvector_publish(pvec);
 }
 
-static inline void
-dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
-   struct polled_queue *poll_list, int poll_cnt)
-{
-struct dpcls *cls;
-uint64_t tot_idle = 0, tot_proc = 0;
-unsigned int pmd_load = 0;
-
-if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
-uint64_t curr_tsc;
-struct pmd_auto_lb *pmd_alb = >dp->pmd_alb;
-if (pmd_alb->is_enabled && !pmd->isolated
-&& (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
-   pmd->prev_stats[PMD_CYCLES_ITER_IDLE])
-&& (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
-pmd->prev_stats[PMD_CYCLES_ITER_BUSY]))
-{
-tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
-

[ovs-dev] [PATCH v7 13/14] dpdk: Cache result of CPU ISA checks.

2020-12-16 Thread Harry van Haaren

As a small optimization, this patch caches the result of a CPU ISA
check from DPDK. Particularly in the case of running the DPCLS
autovalidator (which repeatedly probes subtables) this reduces
the number of CPU ISA lookups made at the DPDK level.

By caching them at the OVS/dpdk.c level, the ISA checks are still
performed at runtime on the CPU where they are executed, but subsequent
checks for the same ISA feature become much cheaper.

Signed-off-by: Harry van Haaren 
Co-authored-by: Cian Ferriter 
Signed-off-by: Cian Ferriter 

---

v7:
- Improve debug output if CPU ISA is not available.
---
 lib/dpdk.c | 28 
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/lib/dpdk.c b/lib/dpdk.c
index 319540394..c883a4b8b 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -614,13 +614,33 @@ print_dpdk_version(void)
 puts(rte_version());
 }
 
+/* Avoid calling rte_cpu_get_flag_enabled() excessively, by caching the
+ * result of the call for each CPU flag in a static variable. To avoid
+ * allocating large numbers of static variables, use a uint8 as a bitfield.
+ * Note the macro must only return if the ISA check is done and available.
+ */
+#define ISA_CHECK_DONE_BIT (1 << 0)
+#define ISA_AVAILABLE_BIT  (1 << 1)
+
 #define CHECK_CPU_FEATURE(feature, name_str, RTE_CPUFLAG)   \
 do {\
 if (strncmp(feature, name_str, strlen(name_str)) == 0) {\
-int has_isa = rte_cpu_get_flag_enabled(RTE_CPUFLAG);\
-VLOG_DBG("CPU flag %s, available %s\n", name_str,   \
-  has_isa ? "yes" : "no");  \
-return true;\
+static uint8_t isa_check_##RTE_CPUFLAG; \
+int check = isa_check_##RTE_CPUFLAG & ISA_CHECK_DONE_BIT;   \
+if (OVS_UNLIKELY(!check)) { \
+int has_isa = rte_cpu_get_flag_enabled(RTE_CPUFLAG);\
+VLOG_DBG("CPU flag %s, available %s\n", \
+ name_str, has_isa ? "yes" : "no"); \
+isa_check_##RTE_CPUFLAG = ISA_CHECK_DONE_BIT;   \
+if (has_isa) {  \
+isa_check_##RTE_CPUFLAG |= ISA_AVAILABLE_BIT;   \
+}   \
+}   \
+if (isa_check_##RTE_CPUFLAG & ISA_AVAILABLE_BIT) {  \
+return true;\
+} else {\
+return false;   \
+}   \
 }   \
 } while (0)
 
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v7 12/14] dpif-netdev/dpcls: specialize more subtable signatures.

2020-12-16 Thread Harry van Haaren

This commit adds specialized lookup functions for more subtables. The
traffic pattern being targeted here is VXLAN traffic, whose subtables
commonly have (5,3), (9,1) and (9,4) subtable fingerprints.

Signed-off-by: Harry van Haaren 
---
 lib/dpif-netdev-lookup-avx512-gather.c | 6 ++
 lib/dpif-netdev-lookup-generic.c   | 6 ++
 2 files changed, 12 insertions(+)

diff --git a/lib/dpif-netdev-lookup-avx512-gather.c 
b/lib/dpif-netdev-lookup-avx512-gather.c
index 1f27c0536..3a684fadf 100644
--- a/lib/dpif-netdev-lookup-avx512-gather.c
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -299,6 +299,9 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,
 return avx512_lookup_impl(subtable, keys_map, keys, rules, U0, U1);   \
 } \
 
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 4)
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 1)
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 3)
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1)
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 1)
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 0)
@@ -331,6 +334,9 @@ dpcls_subtable_avx512_gather_probe(uint32_t u0_bits, 
uint32_t u1_bits)
 return NULL;
 }
 
+CHECK_LOOKUP_FUNCTION(9, 4);
+CHECK_LOOKUP_FUNCTION(9, 1);
+CHECK_LOOKUP_FUNCTION(5, 3);
 CHECK_LOOKUP_FUNCTION(5, 1);
 CHECK_LOOKUP_FUNCTION(4, 1);
 CHECK_LOOKUP_FUNCTION(4, 0);
diff --git a/lib/dpif-netdev-lookup-generic.c b/lib/dpif-netdev-lookup-generic.c
index e3b6be4b6..6c74ac3a1 100644
--- a/lib/dpif-netdev-lookup-generic.c
+++ b/lib/dpif-netdev-lookup-generic.c
@@ -282,6 +282,9 @@ dpcls_subtable_lookup_generic(struct dpcls_subtable 
*subtable,
 return lookup_generic_impl(subtable, keys_map, keys, rules, U0, U1);  \
 } \
 
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 4)
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 1)
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 3)
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1)
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 1)
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 0)
@@ -303,6 +306,9 @@ dpcls_subtable_generic_probe(uint32_t u0_bits, uint32_t 
u1_bits)
 {
 dpcls_subtable_lookup_func f = NULL;
 
+CHECK_LOOKUP_FUNCTION(9, 4);
+CHECK_LOOKUP_FUNCTION(9, 1);
+CHECK_LOOKUP_FUNCTION(5, 3);
 CHECK_LOOKUP_FUNCTION(5, 1);
 CHECK_LOOKUP_FUNCTION(4, 1);
 CHECK_LOOKUP_FUNCTION(4, 0);
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v7 07/14] dpif-netdev: Add command to get dpif implementations.

2020-12-16 Thread Harry van Haaren

This commit adds a new command to retrieve the list of available
DPIF implementations. This can be used to check which implementations
of the DPIF are available in any given OVS binary.

Usage:
 $ ovs-appctl dpif-netdev/dpif-get

Signed-off-by: Harry van Haaren 
---
 lib/dpif-netdev-private-dpif.c |  7 +++
 lib/dpif-netdev-private-dpif.h |  6 ++
 lib/dpif-netdev.c  | 24 
 3 files changed, 37 insertions(+)

diff --git a/lib/dpif-netdev-private-dpif.c b/lib/dpif-netdev-private-dpif.c
index 9e1f3b8f9..c5021fe9f 100644
--- a/lib/dpif-netdev-private-dpif.c
+++ b/lib/dpif-netdev-private-dpif.c
@@ -61,6 +61,13 @@ dp_netdev_impl_get_default(void)
 return func;
 }
 
+uint32_t
+dp_netdev_impl_get(const struct dpif_netdev_impl_info_t **out_impls)
+{
+ovs_assert(out_impls);
+*out_impls = dpif_impls;
+return ARRAY_SIZE(dpif_impls);
+}
 
 /* This function checks all available DPIF implementations, and selects the
  * returns the function pointer to the one requested by "name".
diff --git a/lib/dpif-netdev-private-dpif.h b/lib/dpif-netdev-private-dpif.h
index a09f90acc..99fbda943 100644
--- a/lib/dpif-netdev-private-dpif.h
+++ b/lib/dpif-netdev-private-dpif.h
@@ -47,6 +47,12 @@ struct dpif_netdev_impl_info_t {
 const char *name;
 };
 
+/* This function returns all available implementations to the caller. The
+ * quantity of implementations is returned by the int return value.
+ */
+uint32_t
+dp_netdev_impl_get(const struct dpif_netdev_impl_info_t **out_impls);
+
 /* This function checks all available DPIF implementations, and selects the
  * returns the function pointer to the one requested by "name".
  */
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 2a0b103c3..4c074995c 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -990,6 +990,27 @@ dpif_netdev_subtable_lookup_set(struct unixctl_conn *conn, 
int argc,
 ds_destroy();
 }
 
+static void
+dpif_netdev_impl_get(struct unixctl_conn *conn, int argc OVS_UNUSED,
+ const char *argv[] OVS_UNUSED, void *aux OVS_UNUSED)
+{
+const struct dpif_netdev_impl_info_t *dpif_impls;
+uint32_t count = dp_netdev_impl_get(_impls);
+if (count == 0) {
+unixctl_command_reply_error(conn, "error getting dpif names");
+return;
+}
+
+/* Add all dpif functions to reply string. */
+struct ds reply = DS_EMPTY_INITIALIZER;
+ds_put_cstr(, "Available DPIF implementations:\n");
+for (uint32_t i = 0; i < count; i++) {
+ds_put_format(, "  %s\n", dpif_impls[i].name);
+}
+unixctl_command_reply(conn, ds_cstr());
+ds_destroy();
+}
+
 static void
 dpif_netdev_impl_set(struct unixctl_conn *conn, int argc,
  const char *argv[], void *aux OVS_UNUSED)
@@ -1288,6 +1309,9 @@ dpif_netdev_init(void)
  "[dpif implementation name] [dp]",
  1, 2, dpif_netdev_impl_set,
  NULL);
+unixctl_command_register("dpif-netdev/dpif-get", "",
+ 0, 0, dpif_netdev_impl_get,
+ NULL);
 return 0;
 }
 
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v7 04/14] dpif-avx512: Add ISA implementation of dpif.

2020-12-16 Thread Harry van Haaren

This commit adds the AVX512 implementation of DPIF functionality,
specifically the dp_netdev_input_outer_avx512 function. This function
only handles outer packets (no re-circulations), and is optimized to use
the AVX512 ISA for packet batching and other DPIF work.

Sparse is not able to handle the AVX512 intrinsics, causing compile
time failures, so it is disabled for this file.

Signed-off-by: Harry van Haaren 
Co-authored-by: Cian Ferriter 
Signed-off-by: Cian Ferriter 

---

v7:
- Improve DPIF scratch by allocating from heap
- Simplify k-mask handling in AVX512 flow* compare batching
---
 lib/automake.mk  |   5 +-
 lib/dpif-netdev-avx512.c | 264 +++
 lib/dpif-netdev-private-dfc.h|   8 +
 lib/dpif-netdev-private-dpif.h   |  32 
 lib/dpif-netdev-private-thread.h |  11 +-
 lib/dpif-netdev-private.h|  25 +++
 lib/dpif-netdev.c|  70 ++--
 7 files changed, 399 insertions(+), 16 deletions(-)
 create mode 100644 lib/dpif-netdev-avx512.c
 create mode 100644 lib/dpif-netdev-private-dpif.h

diff --git a/lib/automake.mk b/lib/automake.mk
index eccfaf3e3..650207940 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -33,11 +33,13 @@ lib_libopenvswitchavx512_la_CFLAGS = \
-mavx512f \
-mavx512bw \
-mavx512dq \
+   -mbmi \
-mbmi2 \
-fPIC \
$(AM_CFLAGS)
 lib_libopenvswitchavx512_la_SOURCES = \
-   lib/dpif-netdev-lookup-avx512-gather.c
+   lib/dpif-netdev-lookup-avx512-gather.c \
+   lib/dpif-netdev-avx512.c
 lib_libopenvswitchavx512_la_LDFLAGS = \
-static
 endif
@@ -113,6 +115,7 @@ lib_libopenvswitch_la_SOURCES = \
lib/dpif-netdev.h \
lib/dpif-netdev-private-dfc.h \
lib/dpif-netdev-private-dpcls.h \
+   lib/dpif-netdev-private-dpif.h \
lib/dpif-netdev-private-flow.h \
lib/dpif-netdev-private-hwol.h \
lib/dpif-netdev-private-thread.h \
diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
new file mode 100644
index 0..d83eaf001
--- /dev/null
+++ b/lib/dpif-netdev-avx512.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright (c) 2020 Intel.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __x86_64__
+/* Sparse cannot handle the AVX512 instructions */
+#if !defined(__CHECKER__)
+
+#include 
+
+#include "dpif-netdev.h"
+#include "dpif-netdev-perf.h"
+
+#include "dpif-netdev-private.h"
+#include "dpif-netdev-private-dpcls.h"
+#include "dpif-netdev-private-flow.h"
+#include "dpif-netdev-private-thread.h"
+
+#include "dp-packet.h"
+#include "netdev.h"
+
+#include "immintrin.h"
+
+/* Structure to contain per-packet metadata that must be attributed to the
+ * dp netdev flow. This is unfortunate to have to track per packet, however
+ * it's a bit awkward to maintain them in a performant way. This structure
+ * helps to keep two variables on a single cache line per packet.
+ */
+struct pkt_flow_meta {
+uint16_t bytes;
+uint16_t tcp_flags;
+};
+
+/* Structure of heap allocated memory for DPIF internals. */
+struct dpif_userdata {
+OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
+struct netdev_flow_key keys[NETDEV_MAX_BURST];
+OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
+struct netdev_flow_key *key_ptrs[NETDEV_MAX_BURST];
+OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
+struct pkt_flow_meta pkt_meta[NETDEV_MAX_BURST];
+};
+
+int32_t
+dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
+ struct dp_packet_batch *packets,
+ odp_port_t in_port)
+{
+/* Allocate DPIF userdata. */
+if (OVS_UNLIKELY(!pmd->netdev_input_func_userdata)) {
+pmd->netdev_input_func_userdata =
+xmalloc_pagealign(sizeof(struct dpif_userdata));
+}
+
+struct dpif_userdata *ud = pmd->netdev_input_func_userdata;
+struct netdev_flow_key *keys = ud->keys;
+struct netdev_flow_key **key_ptrs = ud->key_ptrs;
+struct pkt_flow_meta *pkt_meta = ud->pkt_meta;
+
+/* Stores the computed output: a rule pointer for each packet */
+/* The AVX512 DPIF implementation handles rules in a way that is optimized
+ * for reducing data-movement between HWOL/EMC/SMC and DPCLS. This is
+ * achieved by separating the rule arrays. Bitmasks are kept for each
+ * packet, indicatin

[ovs-dev] [PATCH v7 06/14] dpif-netdev: Add command to switch dpif implementation.

2020-12-16 Thread Harry van Haaren

This commit adds a new command to allow the user to switch
the active DPIF implementation at runtime. A probe function
is executed before switching the DPIF implementation, to ensure
the CPU is capable of running the ISA required. For example, the
below code will switch to the AVX512 enabled DPIF assuming
that the runtime CPU is capable of running AVX512 instructions:

 $ ovs-appctl dpif-netdev/dpif-set dpif_avx512

A new configuration flag is added to allow selection of the
default DPIF. This is useful for running the unit-tests against
the available DPIF implementations, without modifying each unit test.

The design of the testing & validation for ISA optimized DPIF
implementations is based around the work already upstream for DPCLS.
Note however that a DPCLS lookup has no state or side-effects, allowing
the auto-validator implementation to perform multiple lookups and
provide consistent statistic counters.

The DPIF component does have state, so running two implementations in
parallel and comparing output is not a valid testing method, as there
are changes in DPIF statistic counters (side effects). As a result, the
DPIF is tested directly against the unit-tests.

Signed-off-by: Harry van Haaren 
Co-authored-by: Cian Ferriter 
Signed-off-by: Cian Ferriter 
---
 acinclude.m4 | 15 ++
 configure.ac |  1 +
 lib/automake.mk  |  1 +
 lib/dpif-netdev-avx512.c | 14 +
 lib/dpif-netdev-private-dpif.c   | 92 
 lib/dpif-netdev-private-dpif.h   | 43 ++-
 lib/dpif-netdev-private-thread.h | 12 +
 lib/dpif-netdev.c| 86 +++--
 8 files changed, 248 insertions(+), 16 deletions(-)
 create mode 100644 lib/dpif-netdev-private-dpif.c

diff --git a/acinclude.m4 b/acinclude.m4
index eb0496c25..6fe9fdf60 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -30,6 +30,21 @@ AC_DEFUN([OVS_CHECK_DPCLS_AUTOVALIDATOR], [
   fi
 ])
 
+dnl Set OVS DPIF default implementation at configure time for running the unit
+dnl tests on the whole codebase without modifying tests per DPIF impl
+AC_DEFUN([OVS_CHECK_DPIF_AVX512_DEFAULT], [
+  AC_ARG_ENABLE([dpif-default-avx512],
+[AC_HELP_STRING([--enable-dpif-default-avx512], [Enable DPIF 
AVX512 implementation as default.])],
+[dpifavx512=yes],[dpifavx512=no])
+  AC_MSG_CHECKING([whether DPIF AVX512 is default implementation])
+  if test "$dpifavx512" != yes; then
+AC_MSG_RESULT([no])
+  else
+OVS_CFLAGS="$OVS_CFLAGS -DDPIF_AVX512_DEFAULT"
+AC_MSG_RESULT([yes])
+  fi
+])
+
 dnl OVS_ENABLE_WERROR
 AC_DEFUN([OVS_ENABLE_WERROR],
   [AC_ARG_ENABLE(
diff --git a/configure.ac b/configure.ac
index 126a1d9d1..76b1e4fec 100644
--- a/configure.ac
+++ b/configure.ac
@@ -185,6 +185,7 @@ OVS_ENABLE_WERROR
 OVS_ENABLE_SPARSE
 OVS_CTAGS_IDENTIFIERS
 OVS_CHECK_DPCLS_AUTOVALIDATOR
+OVS_CHECK_DPIF_AVX512_DEFAULT
 OVS_CHECK_BINUTILS_AVX512
 
 AC_ARG_VAR(KARCH, [Kernel Architecture String])
diff --git a/lib/automake.mk b/lib/automake.mk
index 650207940..2a41f7ab5 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -115,6 +115,7 @@ lib_libopenvswitch_la_SOURCES = \
lib/dpif-netdev.h \
lib/dpif-netdev-private-dfc.h \
lib/dpif-netdev-private-dpcls.h \
+   lib/dpif-netdev-private-dpif.c \
lib/dpif-netdev-private-dpif.h \
lib/dpif-netdev-private-flow.h \
lib/dpif-netdev-private-hwol.h \
diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
index 24d82864b..dfe2c853a 100644
--- a/lib/dpif-netdev-avx512.c
+++ b/lib/dpif-netdev-avx512.c
@@ -19,6 +19,7 @@
 #if !defined(__CHECKER__)
 
 #include 
+#include 
 
 #include "dpif-netdev.h"
 #include "dpif-netdev-perf.h"
@@ -54,6 +55,19 @@ struct dpif_userdata {
 struct pkt_flow_meta pkt_meta[NETDEV_MAX_BURST];
 };
 
+int32_t
+dp_netdev_input_outer_avx512_probe(void)
+{
+int avx512f_available = dpdk_get_cpu_has_isa("x86_64", "avx512f");
+int bmi2_available = dpdk_get_cpu_has_isa("x86_64", "bmi2");
+
+if (!avx512f_available || !bmi2_available) {
+return -ENOTSUP;
+}
+
+return 0;
+}
+
 int32_t
 dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
  struct dp_packet_batch *packets,
diff --git a/lib/dpif-netdev-private-dpif.c b/lib/dpif-netdev-private-dpif.c
new file mode 100644
index 0..9e1f3b8f9
--- /dev/null
+++ b/lib/dpif-netdev-private-dpif.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2020 Intel Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licens

[ovs-dev] [PATCH v7 01/14] dpif-netdev: Refactor to multiple header files.

2020-12-16 Thread Harry van Haaren

Split the very large file dpif-netdev.c and the datastructures
it contains into multiple header files. Each header file is
responsible for the datastructures of that component.

This logical split allows better reuse and modularity of the code,
and reduces the very large file dpif-netdev.c to be more manageable.

Due to dependencies between components, it is not possible to
move components at a finer granularity than this patch does.

To explain the dependencies better, eg:

DPCLS has no deps (from dpif-netdev.c file)
FLOW depends on DPCLS (struct dpcls_rule)
DFC depends on DPCLS (netdev_flow_key) and FLOW (netdev_flow_key)
THREAD depends on DFC (struct dfc_cache)

DFC_PROC depends on THREAD (struct pmd_thread)

DPCLS lookup.h/c require only DPCLS
DPCLS implementations require only dpif-netdev-lookup.h.
- This change was made in 2.12 release with function pointers
- This commit only refactors the name to "private-dpcls.h"

Signed-off-by: Harry van Haaren 
Co-authored-by: Cian Ferriter 
Signed-off-by: Cian Ferriter 
---
 lib/automake.mk|   4 +
 lib/dpif-netdev-lookup-autovalidator.c |   1 -
 lib/dpif-netdev-lookup-avx512-gather.c |   1 -
 lib/dpif-netdev-lookup-generic.c   |   1 -
 lib/dpif-netdev-lookup.h   |   2 +-
 lib/dpif-netdev-private-dfc.h  | 244 
 lib/dpif-netdev-private-dpcls.h| 129 ++
 lib/dpif-netdev-private-flow.h | 162 
 lib/dpif-netdev-private-thread.h   | 206 ++
 lib/dpif-netdev-private.h  | 100 +
 lib/dpif-netdev.c  | 519 +
 11 files changed, 760 insertions(+), 609 deletions(-)
 create mode 100644 lib/dpif-netdev-private-dfc.h
 create mode 100644 lib/dpif-netdev-private-dpcls.h
 create mode 100644 lib/dpif-netdev-private-flow.h
 create mode 100644 lib/dpif-netdev-private-thread.h

diff --git a/lib/automake.mk b/lib/automake.mk
index 380a67228..22a281fcc 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -111,6 +111,10 @@ lib_libopenvswitch_la_SOURCES = \
lib/dpif-netdev-lookup-generic.c \
lib/dpif-netdev.c \
lib/dpif-netdev.h \
+   lib/dpif-netdev-private-dfc.h \
+   lib/dpif-netdev-private-dpcls.h \
+   lib/dpif-netdev-private-flow.h \
+   lib/dpif-netdev-private-thread.h \
lib/dpif-netdev-private.h \
lib/dpif-netdev-perf.c \
lib/dpif-netdev-perf.h \
diff --git a/lib/dpif-netdev-lookup-autovalidator.c 
b/lib/dpif-netdev-lookup-autovalidator.c
index 97b59fdd0..475e1ab1e 100644
--- a/lib/dpif-netdev-lookup-autovalidator.c
+++ b/lib/dpif-netdev-lookup-autovalidator.c
@@ -17,7 +17,6 @@
 #include 
 #include "dpif-netdev.h"
 #include "dpif-netdev-lookup.h"
-#include "dpif-netdev-private.h"
 #include "openvswitch/vlog.h"
 
 VLOG_DEFINE_THIS_MODULE(dpif_lookup_autovalidator);
diff --git a/lib/dpif-netdev-lookup-avx512-gather.c 
b/lib/dpif-netdev-lookup-avx512-gather.c
index 5e3634249..8fc1cdfa5 100644
--- a/lib/dpif-netdev-lookup-avx512-gather.c
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -21,7 +21,6 @@
 
 #include "dpif-netdev.h"
 #include "dpif-netdev-lookup.h"
-#include "dpif-netdev-private.h"
 #include "cmap.h"
 #include "flow.h"
 #include "pvector.h"
diff --git a/lib/dpif-netdev-lookup-generic.c b/lib/dpif-netdev-lookup-generic.c
index b1a0cfc36..e3b6be4b6 100644
--- a/lib/dpif-netdev-lookup-generic.c
+++ b/lib/dpif-netdev-lookup-generic.c
@@ -17,7 +17,6 @@
 
 #include 
 #include "dpif-netdev.h"
-#include "dpif-netdev-private.h"
 #include "dpif-netdev-lookup.h"
 
 #include "bitmap.h"
diff --git a/lib/dpif-netdev-lookup.h b/lib/dpif-netdev-lookup.h
index bd72aa29b..59f51faa0 100644
--- a/lib/dpif-netdev-lookup.h
+++ b/lib/dpif-netdev-lookup.h
@@ -19,7 +19,7 @@
 
 #include 
 #include "dpif-netdev.h"
-#include "dpif-netdev-private.h"
+#include "dpif-netdev-private-dpcls.h"
 
 /* Function to perform a probe for the subtable bit fingerprint.
  * Returns NULL if not valid, or a valid function pointer to call for this
diff --git a/lib/dpif-netdev-private-dfc.h b/lib/dpif-netdev-private-dfc.h
new file mode 100644
index 0..8f6a4899e
--- /dev/null
+++ b/lib/dpif-netdev-private-dfc.h
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2015 Nicira, Inc.
+ * Copyright (c) 2019, 2020 Intel Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implie

[ovs-dev] [PATCH v7 03/14] dpif-netdev: Add function pointer for netdev input.

2020-12-16 Thread Harry van Haaren

This commit adds a function pointer to the pmd thread data structure,
giving the pmd thread flexibility in its dpif-input function choice.
This allows choosing of the implementation based on ISA capabilities
of the runtime CPU, leading to optimizations and higher performance.

Signed-off-by: Harry van Haaren 

---

v7:
- Add void* for per-DPIF userdata
---
 lib/dpif-netdev-private-thread.h | 12 
 lib/dpif-netdev.c|  7 ++-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/lib/dpif-netdev-private-thread.h b/lib/dpif-netdev-private-thread.h
index a5b3ae360..089223aaf 100644
--- a/lib/dpif-netdev-private-thread.h
+++ b/lib/dpif-netdev-private-thread.h
@@ -47,6 +47,13 @@ struct dp_netdev_pmd_thread_ctx {
 uint32_t emc_insert_min;
 };
 
+/* Forward declaration for typedef */
+struct dp_netdev_pmd_thread;
+
+typedef void (*dp_netdev_input_func)(struct dp_netdev_pmd_thread *pmd,
+ struct dp_packet_batch *packets,
+ odp_port_t port_no);
+
 /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
  * the performance overhead of interrupt processing.  Therefore netdev can
  * not implement rx-wait for these devices.  dpif-netdev needs to poll
@@ -101,6 +108,11 @@ struct dp_netdev_pmd_thread {
 /* Current context of the PMD thread. */
 struct dp_netdev_pmd_thread_ctx ctx;
 
+/* Function pointer to call for dp_netdev_input() functionality. */
+dp_netdev_input_func netdev_input_func;
+/* Pointer for per-DPIF implementation scratch space. */
+void *netdev_input_func_userdata;
+
 struct seq *reload_seq;
 uint64_t last_reload_seq;
 
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 55fbf0f16..dfdcf6218 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -4177,8 +4177,9 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread 
*pmd,
 }
 }
 }
+
 /* Process packet batch. */
-dp_netdev_input(pmd, , port_no);
+pmd->netdev_input_func(pmd, , port_no);
 
 /* Assign processing cycles to rx queue. */
 cycles = cycle_timer_stop(>perf_stats, );
@@ -5962,6 +5963,10 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread 
*pmd, struct dp_netdev *dp,
 hmap_init(>tnl_port_cache);
 hmap_init(>send_port_cache);
 cmap_init(>tx_bonds);
+
+/* Initialize the DPIF function pointer to the default scalar version */
+pmd->netdev_input_func = dp_netdev_input;
+
 /* init the 'flow_cache' since there is no
  * actual thread created for NON_PMD_CORE_ID. */
 if (core_id == NON_PMD_CORE_ID) {
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v7 05/14] dpif-avx512: Add HWOL support to avx512 dpif.

2020-12-16 Thread Harry van Haaren

Partial hardware offload is implemented in a very similar way to the
scalar dpif.

Signed-off-by: Harry van Haaren 
---
 lib/dpif-netdev-avx512.c | 28 +---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
index d83eaf001..24d82864b 100644
--- a/lib/dpif-netdev-avx512.c
+++ b/lib/dpif-netdev-avx512.c
@@ -27,6 +27,7 @@
 #include "dpif-netdev-private-dpcls.h"
 #include "dpif-netdev-private-flow.h"
 #include "dpif-netdev-private-thread.h"
+#include "dpif-netdev-private-hwol.h"
 
 #include "dp-packet.h"
 #include "netdev.h"
@@ -111,9 +112,32 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
*pmd,
 uint32_t i = __builtin_ctz(iter);
 iter = _blsr_u64(iter);
 
-/* Initialize packet md and do miniflow extract */
+/* Get packet pointer from bitmask and packet md */
 struct dp_packet *packet = packets->packets[i];
 pkt_metadata_init(>md, in_port);
+
+struct dp_netdev_flow *f = NULL;
+
+/* Check for partial hardware offload mark */
+uint32_t mark;
+if (dp_packet_has_flow_mark(packet, )) {
+f = mark_to_flow_find(pmd, mark);
+if (f) {
+rules[i] = >cr;
+
+/* This is nasty - instead of using the HWOL provided flow,
+ * parse the packet data anyway to find the location of the TCP
+ * header to extract the TCP flags for the rule.
+ */
+pkt_meta[i].tcp_flags = parse_tcp_flags(packet);
+
+pkt_meta[i].bytes = dp_packet_size(packet);
+hwol_emc_smc_hitmask |= (1 << i);
+continue;
+}
+}
+
+/* Do miniflow extract into keys */
 struct netdev_flow_key *key = [i];
 miniflow_extract(packet, >mf);
 
@@ -124,8 +148,6 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
*pmd,
 key->len = netdev_flow_key_size(miniflow_n_values(>mf));
 key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet, >mf);
 
-struct dp_netdev_flow *f = NULL;
-
 if (emc_enabled) {
 f = emc_lookup(>emc_cache, key);
 
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v7 02/14] dpif-netdev: Split HWOL out to own header file.

2020-12-16 Thread Harry van Haaren

This commit moves the datapath lookup functions required for
hardware offload to a separate file. This allows other DPIF
implementations to access the lookup functions, encouraging
code reuse.

Signed-off-by: Harry van Haaren 
---
 lib/automake.mk|  1 +
 lib/dpif-netdev-private-hwol.h | 63 ++
 lib/dpif-netdev.c  | 39 ++---
 3 files changed, 67 insertions(+), 36 deletions(-)
 create mode 100644 lib/dpif-netdev-private-hwol.h

diff --git a/lib/automake.mk b/lib/automake.mk
index 22a281fcc..eccfaf3e3 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -114,6 +114,7 @@ lib_libopenvswitch_la_SOURCES = \
lib/dpif-netdev-private-dfc.h \
lib/dpif-netdev-private-dpcls.h \
lib/dpif-netdev-private-flow.h \
+   lib/dpif-netdev-private-hwol.h \
lib/dpif-netdev-private-thread.h \
lib/dpif-netdev-private.h \
lib/dpif-netdev-perf.c \
diff --git a/lib/dpif-netdev-private-hwol.h b/lib/dpif-netdev-private-hwol.h
new file mode 100644
index 0..447010ab8
--- /dev/null
+++ b/lib/dpif-netdev-private-hwol.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2015 Nicira, Inc.
+ * Copyright (c) 2020 Intel Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef DPIF_NETDEV_PRIVATE_HWOL_H
+#define DPIF_NETDEV_PRIVATE_HWOL_H 1
+
+#include "dpif-netdev-private-flow.h"
+
+#define MAX_FLOW_MARK   (UINT32_MAX - 1)
+#define INVALID_FLOW_MARK   0
+/* Zero flow mark is used to indicate the HW to remove the mark. A packet
+ * marked with zero mark is received in SW without a mark at all, so it
+ * cannot be used as a valid mark.
+ */
+
+struct megaflow_to_mark_data {
+const struct cmap_node node;
+ovs_u128 mega_ufid;
+uint32_t mark;
+};
+
+struct flow_mark {
+struct cmap megaflow_to_mark;
+struct cmap mark_to_flow;
+struct id_pool *pool;
+};
+
+/* allocated in dpif-netdev.c */
+extern struct flow_mark flow_mark;
+
+static inline struct dp_netdev_flow *
+mark_to_flow_find(const struct dp_netdev_pmd_thread *pmd,
+  const uint32_t mark)
+{
+struct dp_netdev_flow *flow;
+
+CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
+ _mark.mark_to_flow) {
+if (flow->mark == mark && flow->pmd_id == pmd->core_id &&
+flow->dead == false) {
+return flow;
+}
+}
+
+return NULL;
+}
+
+
+#endif /* dpif-netdev-private-hwol.h */
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 7be9d664e..55fbf0f16 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -84,6 +84,8 @@
 #include "util.h"
 #include "uuid.h"
 
+#include "dpif-netdev-private-hwol.h"
+
 VLOG_DEFINE_THIS_MODULE(dpif_netdev);
 
 /* Auto Load Balancing Defaults */
@@ -1949,26 +1951,8 @@ dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread 
*pmd,
 return cls;
 }
 
-#define MAX_FLOW_MARK   (UINT32_MAX - 1)
-#define INVALID_FLOW_MARK   0
-/* Zero flow mark is used to indicate the HW to remove the mark. A packet
- * marked with zero mark is received in SW without a mark at all, so it
- * cannot be used as a valid mark.
- */
-
-struct megaflow_to_mark_data {
-const struct cmap_node node;
-ovs_u128 mega_ufid;
-uint32_t mark;
-};
-
-struct flow_mark {
-struct cmap megaflow_to_mark;
-struct cmap mark_to_flow;
-struct id_pool *pool;
-};
 
-static struct flow_mark flow_mark = {
+struct flow_mark flow_mark = {
 .megaflow_to_mark = CMAP_INITIALIZER,
 .mark_to_flow = CMAP_INITIALIZER,
 };
@@ -2137,23 +2121,6 @@ flow_mark_flush(struct dp_netdev_pmd_thread *pmd)
 }
 }
 
-static struct dp_netdev_flow *
-mark_to_flow_find(const struct dp_netdev_pmd_thread *pmd,
-  const uint32_t mark)
-{
-struct dp_netdev_flow *flow;
-
-CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
- &flow_mark.mark_to_flow) {
-if (flow->mark == mark && flow->pmd_id == pmd->core_id &&
-flow->dead == false) {
-return flow;
-}
-}
-
-return NULL;
-}
-
 static struct dp_flow_offload_item *
 dp_netdev_alloc_flow_offload(struct dp_netdev_pmd_thread *pmd,
  struct dp_netdev_flow *flow,
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

[ovs-dev] [PATCH v7 00/14] DPIF Generic Framework

2020-12-16 Thread Harry van Haaren

v7 summary:
- OVS Conference included DPIF overview, youtube link:
--- https://youtu.be/5dWyPxiXEhg
- Rebased and tested on the DPDK 20.11 v4 patch
--- Link: https://patchwork.ozlabs.org/project/openvswitch/list/?series=220645
--- Tested this series for shared/static builds
--- Tested this series with/without -march=
- Minor code improvements in DPIF component (see commits for details)
- Improved CPU ISA checks, caching results
- Commit message improvements (.'s etc)
- Added performance data of patchset
--- Note that the benchmark below does not utilize the AVX512-vpopcntdq
--- optimizations, and performance is expected to improve when used.
--- Further optimizations are planned that continue.

Benchmark Details & Results
===

Intel® Xeon® Gold 6230 CPU @2.10GHz
OVS*-DPDK* Phy-Phy Performance 4x 25G Ports - Total 1 million flows
1C1T-4P, 64-byte frame size, performance in mpps:

Results Table:
---
DPIF  | Scalar | Scalar | AVX512 | AVX512 |
DPCLS | Scalar | AVX512 | Scalar | AVX512 |
---
mpps  |  6.955 |  7.530 |  7.530 |  7.962 |

By enabling both AVX512 DPIF and DPCLS, packet forwarding
is  7.962 / 6.955 = 1.1447x faster, aka 14% speedup.



v6 summary:
- Rebase to DPDK 20.11 enabling patch
--- This creates a dependency, expect CI build failures on the last
patch in this series if it is not applied!
- Small improvements to DPIF layer
--- EMC/SMC enabling in AVX512 DPIF cleanups
- CPU ISA flags are cached, lowering overhead
- Wildcard Classifier DPCLS
--- Refactor and cleanups for function names
--- Enable more subtable specializations
--- Enable AVX512 vpopcount instruction


v5 summary:
- Dropped MFEX optimizations, re-targeting to a later release
--- This allows focus of community reviews & development on DPIF
--- Note OVS Conference talk still introduces both DPIF and MFEX topics
- DPIF improvements
--- Better EMC/SMC handling
--- HWOL is enabled in the avx512 DPIF
--- Documentation & NEWS items added
--- Various smaller improvements

v4 summary:
- Updated and improved DPIF component
--- SMC now implemented
--- EMC handling improved
--- Novel batching method using AVX512 implemented
--- see commits for details
- Updated Miniflow Extract component
--- Improved AVX512 code path performance
--- Implemented multiple TODO items in v3
--- Add "disable" implementation to return to scalar miniflow only
--- More fixes planned for v5/future revisions:
 Rename command to better reflect usage
 Improve dynamicness of patterns
 Add more demo protocols to show usage
- Future work
--- Documentation/NEWS items
--- Statistics for optimized MFEX
- Note that this patchset will be discussed/presented at OvsConf soon :)

v3 update summary:
(Cian Ferriter helping with rebases, review and code cleanups)
- Split out partially related changes (these will be sent separately)
--- netdev output action optimization
--- avx512 dpcls 16-block support optimization
- Squash commit which moves netdev struct flow into the refactor commit:
--- Squash dpif-netdev: move netdev flow struct to header
--- Into dpif-netdev: Refactor to multiple header files
- Implement Miniflow extract for AVX-512 DPIF
--- A generic method of matching patterns and packets is implemented,
providing traffic-pattern specific miniflow-extract acceleration.
--- The patterns today are hard-coded, however in a future patchset it
is intended to make these runtime configurable, allowing users to
optimize the SIMD miniflow extract for active traffic types.
- Notes:
--- 32 bit builds will be fixed in next release by adding flexible
miniflow extract optimization selection.
--- AVX-512 VBMI ISA is not yet supported in OVS due to requiring the
DPDK 20.11 update for RTE_CPUFLAG_*. Once on a newer DPDK this will
be added.

v2 updates:
- Includes DPIF command switching at runtime
- Includes AVX512 DPIF implementation
- Includes some partially related changes (can be split out of set?)
--- netdev output action optimization
--- avx512 dpcls 16-block support optimization


This patchset is a v7 for making the DPIF components of the
userspace datapath more flexible. It has been refactored to be
more modular to encourage code-reuse, and scalable in that ISA
optimized implementations can be added and selected at runtime.

The same approach as has been previously used for DPCLS is used
here, where a function pointer allows selection of an implementation
at runtime.

Datapath features such as EMC, SMC and HWOL are shared between
implementations, hence they are refactored into separate header files.
The file splitting also improves maintainability, as dpif_netdev.c
has ~9000 LOC, and is very hard to modify due to many structs defined
locally in the .c file, ruling out re-usability in other .c files.

Questions welcomed! Regards, -Harry


Cian Ferriter (1):
  docs/dpdk/bridge: Add dpif performance section.

Harry van Haar

[ovs-dev] [PATCH v6 14/15] dpdk: Cache result of CPU ISA checks

2020-12-08 Thread Harry van Haaren

As a small optimization, this patch caches the result of a CPU ISA
check from DPDK. Particularly in the case of running the DPCLS
autovalidator (which repeatedly probes subtables) this reduces
the amount of CPU ISA lookups from the DPDK level.

By caching them at the OVS/dpdk.c level, the ISA checks remain
runtime for the CPU where they are executed, but subsequent checks
for the same ISA feature become much cheaper.

Signed-off-by: Harry van Haaren 
Co-authored-by: Cian Ferriter 
Signed-off-by: Cian Ferriter 
---
 lib/dpdk.c | 26 ++
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/lib/dpdk.c b/lib/dpdk.c
index 319540394..703602603 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -614,13 +614,31 @@ print_dpdk_version(void)
 puts(rte_version());
 }
 
+/* Avoid calling rte_cpu_get_flag_enabled() excessively, by caching the
+ * result of the call for each CPU flag in a static variable. To avoid
+ * allocating large numbers of static variables, use a uint8 as a bitfield.
+ * Note the macro must only return if the ISA check is done and available.
+ */
+#define ISA_CHECK_DONE_BIT (1 << 0)
+#define ISA_AVAILABLE_BIT  (1 << 1)
+
 #define CHECK_CPU_FEATURE(feature, name_str, RTE_CPUFLAG)   \
 do {\
 if (strncmp(feature, name_str, strlen(name_str)) == 0) {\
-int has_isa = rte_cpu_get_flag_enabled(RTE_CPUFLAG);\
-VLOG_DBG("CPU flag %s, available %s\n", name_str,   \
-  has_isa ? "yes" : "no");  \
-return true;\
+static uint8_t isa_check_##RTE_CPUFLAG; \
+int check = isa_check_##RTE_CPUFLAG & ISA_CHECK_DONE_BIT;   \
+if (OVS_UNLIKELY(!check)) { \
+int has_isa = rte_cpu_get_flag_enabled(RTE_CPUFLAG);\
+VLOG_DBG("CPU flag %s, available %s\n", \
+ name_str, has_isa ? "yes" : "no"); \
+isa_check_##RTE_CPUFLAG = ISA_CHECK_DONE_BIT;   \
+if (has_isa) {  \
+isa_check_##RTE_CPUFLAG |= ISA_AVAILABLE_BIT;   \
+}   \
+}   \
+if (isa_check_##RTE_CPUFLAG & ISA_AVAILABLE_BIT) {  \
+return true;\
+}   \
 }   \
 } while (0)
 
-- 
2.25.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

1 2 3 >

1 - 100 of 294 matches

Mail list logo