On 14 Jun 2022, at 13:57, Emma Finn wrote:
> This commit adds the AVX512 implementation of the
> pop_vlan action.
>
> Signed-off-by: Emma Finn <[email protected]>
> ---
> lib/odp-execute-avx512.c | 91 ++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 91 insertions(+)
>
> diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
> index 1fb334689..f9e2b1727 100644
> --- a/lib/odp-execute-avx512.c
> +++ b/lib/odp-execute-avx512.c
> @@ -14,6 +14,11 @@
> * limitations under the License.
> */
>
> +#ifdef __x86_64__
> +/* Sparse cannot handle the AVX512 instructions. */
> +#if !defined(__CHECKER__)
> +
> +
> #include <config.h>
> #include <errno.h>
>
> @@ -24,6 +29,86 @@
> #include "odp-netlink.h"
> #include "openvswitch/vlog.h"
>
> +VLOG_DEFINE_THIS_MODULE(odp_execute_avx512);
I'd asked to add comment so that anyone without avx512 knowledge can read and
understand the code. Guess this is not fully accomplished. I've attached a diff
at the end, with what I think would help people to understand what's going on.
Also you do not catch the issue if some one adds a member between l2_pad_size
and l2_5_ofs. I'll add some code for this to my patch also.
> +BUILD_ASSERT_DECL(offsetof(struct dp_packet, l2_5_ofs) +
> + MEMBER_SIZEOF(struct dp_packet, l2_5_ofs) ==
> + offsetof(struct dp_packet, l3_ofs));
> +
> +BUILD_ASSERT_DECL(offsetof(struct dp_packet, l3_ofs) +
> + MEMBER_SIZEOF(struct dp_packet, l3_ofs) ==
> + offsetof(struct dp_packet, l4_ofs));
> +
We also need a build assert to make sure we can safely read 128 bytes from
l2_pad_size.
I will add it together with my comment changes below.
> +/* Adjust the size of the l2 portion of the dp_packet, updating the l2
> + * pointer and the layer offsets. The function will broadcast resize_by_bytes
> + * across a register and uses a kmask to identify which lanes should be
> + * incremented/decremented. Either an add or subtract will be performed
> + * and the result is stored back to the original packet. */
> +static inline void ALWAYS_INLINE
> +avx512_dp_packet_resize_l2(struct dp_packet *b, int resize_by_bytes)
> +{
> + /* Update packet size/data pointers */
> + if (resize_by_bytes >= 0) {
> + dp_packet_prealloc_headroom(b, resize_by_bytes);
> + } else {
> + ovs_assert(dp_packet_size(b) - dp_packet_l2_pad_size(b) >=
> + -resize_by_bytes);
> + }
> +
> + dp_packet_set_data(b, (char *) dp_packet_data(b) - resize_by_bytes);
> + dp_packet_set_size(b, dp_packet_size(b) + resize_by_bytes);
> +
Please use the available macro's for this as suggested earlier to make the code
more common:
@@ -48,15 +48,11 @@ avx512_dp_packet_resize_l2(struct dp_packet *b, int
resize_by_bytes)
{
/* Update packet size/data pointers */
if (resize_by_bytes >= 0) {
- dp_packet_prealloc_headroom(b, resize_by_bytes);
+ dp_packet_push_uninit(b, resize_by_bytes);
} else {
- ovs_assert(dp_packet_size(b) - dp_packet_l2_pad_size(b) >=
- -resize_by_bytes);
+ dp_packet_pull(b, -resize_by_bytes);
}
- dp_packet_set_data(b, (char *) dp_packet_data(b) - resize_by_bytes);
- dp_packet_set_size(b, dp_packet_size(b) + resize_by_bytes);
-
> + const __m128i v_zeros = _mm_setzero_si128();
> + const __m128i v_u16_max = _mm_cmpeq_epi16(v_zeros, v_zeros);
> +
> + const uint8_t k_lanes = 0b1110;
> + __m128i v_offset = _mm_set1_epi16(abs(resize_by_bytes));
> +
> + /* Load 128 bits from the dp_packet structure starting at the l2_pad_size
> + * offset. */
> + void *adjust_ptr = &b->l2_pad_size;
> + __m128i v_adjust_src = _mm_loadu_si128(adjust_ptr);
> +
> + __mmask8 k_cmp = _mm_mask_cmpneq_epu16_mask(k_lanes, v_adjust_src,
> + v_u16_max);
> +
> + __m128i v_adjust_wip;
> +
> + if (resize_by_bytes >= 0) {
> + v_adjust_wip = _mm_mask_add_epi16(v_adjust_src, k_cmp,
> + v_adjust_src, v_offset);
> + } else {
> + v_adjust_wip = _mm_mask_sub_epi16(v_adjust_src, k_cmp,
> + v_adjust_src, v_offset);
> + }
> +
> + _mm_storeu_si128(adjust_ptr, v_adjust_wip);
> +}
> +
> +/* This function will load the entire vlan_eth_header into a 128-bit wide
> + * register. Then use an 8-byte realign to shift the header right by 12 bytes
> + * to remove the vlan header and store the results back to the orginal
> header.
> + */
> +static void
> +action_avx512_pop_vlan(struct dp_packet_batch *batch,
> + const struct nlattr *a OVS_UNUSED)
> +{
> + struct dp_packet *packet;
> +
> + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> + struct vlan_eth_header *veh = dp_packet_eth(packet);
> +
> + if (veh && dp_packet_size(packet) >= sizeof *veh &&
> + eth_type_vlan(veh->veth_type)) {
> +
> + __m128i v_ether = _mm_loadu_si128((void *) veh);
> + __m128i v_realign = _mm_alignr_epi8(v_ether, _mm_setzero_si128(),
As I'm not a AVX expert, but would it be more effective to load a register with
_mm_setzero_si128(), and reuse it in every iteration?
> + 16 - VLAN_HEADER_LEN);
I think we should replace the 16 with sizeof(__m128i) as it give an idea what
it represents, i.e. number of bits from b.
> + _mm_storeu_si128((void *) veh, v_realign);
> + avx512_dp_packet_resize_l2(packet, -VLAN_HEADER_LEN);
> + }
> + }
> +}
> +
> /* Probe functions to check ISA requirements. */
> static bool
> avx512_isa_probe(void)
> @@ -52,5 +137,11 @@ action_avx512_init(struct odp_execute_action_impl *self)
> return -ENOTSUP;
> }
>
> + /* Set function pointers for actions that can be applied directly, these
> + * are identified by OVS_ACTION_ATTR_*. */
> + self->funcs[OVS_ACTION_ATTR_POP_VLAN] = action_avx512_pop_vlan;
> return 0;
> }
> +
> +#endif
> +#endif
> --
> 2.32.0
Here is my full diff with the enhanced comments:
diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
index 473d2c666..9b68ee6b3 100644
--- a/lib/odp-execute-avx512.c
+++ b/lib/odp-execute-avx512.c
@@ -30,6 +30,15 @@
#include "openvswitch/vlog.h"
VLOG_DEFINE_THIS_MODULE(odp_execute_avx512);
+
+/* The below three build asserts make sure that l2_5_ofs, l3_ofs, and l4_ofs
+ * fields remain in the same order and offset to l2_padd_size. This is needed
+ * as the avx512_dp_packet_resize_l2() function will manipulate those fields at
+ * a fixed memory index based on the l2_padd_size offset. */
+BUILD_ASSERT_DECL(offsetof(struct dp_packet, l2_pad_size) +
+ MEMBER_SIZEOF(struct dp_packet, l2_pad_size) ==
+ offsetof(struct dp_packet, l2_5_ofs));
+
BUILD_ASSERT_DECL(offsetof(struct dp_packet, l2_5_ofs) +
MEMBER_SIZEOF(struct dp_packet, l2_5_ofs) ==
offsetof(struct dp_packet, l3_ofs));
@@ -38,39 +47,61 @@ BUILD_ASSERT_DECL(offsetof(struct dp_packet, l3_ofs) +
MEMBER_SIZEOF(struct dp_packet, l3_ofs) ==
offsetof(struct dp_packet, l4_ofs));
-/* Adjust the size of the l2 portion of the dp_packet, updating the l2
- * pointer and the layer offsets. The function will broadcast resize_by_bytes
- * across a register and uses a kmask to identify which lanes should be
- * incremented/decremented. Either an add or subtract will be performed
- * and the result is stored back to the original packet. */
+/* The below build assert makes sure it's safe to read/write 128-bits starting
+ * at the l2_pad_size location. */
+BUILD_ASSERT_DECL(sizeof(struct dp_packet) -
+ offsetof(struct dp_packet, l2_pad_size) >= sizeof(__m128i));
+
+
static inline void ALWAYS_INLINE
avx512_dp_packet_resize_l2(struct dp_packet *b, int resize_by_bytes)
{
- /* Update packet size/data pointers */
+ /* Update packet size/data pointers, same as the scalar implementation. */
if (resize_by_bytes >= 0) {
- dp_packet_prealloc_headroom(b, resize_by_bytes);
+ dp_packet_push_uninit(b, resize_by_bytes);
} else {
- ovs_assert(dp_packet_size(b) - dp_packet_l2_pad_size(b) >=
- -resize_by_bytes);
+ dp_packet_pull(b, -resize_by_bytes);
}
- dp_packet_set_data(b, (char *) dp_packet_data(b) - resize_by_bytes);
- dp_packet_set_size(b, dp_packet_size(b) + resize_by_bytes);
+ /* The next step is to update the l2_5_ofs, l3_ofs and l4_ofs fields which
+ * the scalar implementation does with the dp_packet_adjust_layer_offset()
+ * function. */
+ /* Set the v_zero register to all zero's. */
const __m128i v_zeros = _mm_setzero_si128();
+
+ /* Set the v_u16_max register to all one's. */
const __m128i v_u16_max = _mm_cmpeq_epi16(v_zeros, v_zeros);
+ /* Each lane represents 16 bits in a 12- bit register. In this case the
+ * first three 16-bit values, which will map to the l2_5_ofs, l3_ofs and
+ * l4_ofs fields. */
const uint8_t k_lanes = 0b1110;
+
+ /* Set all 16-bit words in the 128-bits v_offset register to the value we
+ * need to add/substract from the l2_5_ofs, l3_ofs, and l4_ofs fields. */
__m128i v_offset = _mm_set1_epi16(abs(resize_by_bytes));
- /* Load 128 bits from the dp_packet structure starting at the l2_pad_size
+ /* Load 128-bits from the dp_packet structure starting at the l2_pad_size
* offset. */
void *adjust_ptr = &b->l2_pad_size;
__m128i v_adjust_src = _mm_loadu_si128(adjust_ptr);
+ /* Here is the tricky part, we only need to update the value of the three
+ * fields if they are not UINT16_MAX. The following function will return
+ * a mask of lanes (read fields) that are not UINT16_MAX. It will do this
+ * by comparing only the lanes we requested, k_lanes, and if they match
+ * v_u16_max, the bit will be set. */
__mmask8 k_cmp = _mm_mask_cmpneq_epu16_mask(k_lanes, v_adjust_src,
v_u16_max);
+ /* Based on the bytes adjust (positive, or negative) it will do the actual
+ * add or subtraction. These functions will only operate on the lanes
+ * (fields) requested based on k_cmp, i.e:
+ * k_cmp = [l2_5_ofs, l3_ofs, l4_ofs]
+ * for field in kcmp
+ * v_adjust_src[field] = v_adjust_src[field] + v_offset
+ */
__m128i v_adjust_wip;
if (resize_by_bytes >= 0) {
@@ -81,13 +112,12 @@ avx512_dp_packet_resize_l2(struct dp_packet *b, int
resize_by_bytes)
v_adjust_src, v_offset);
}
+ /* Here we write back the full 128-bits. */
_mm_storeu_si128(adjust_ptr, v_adjust_wip);
}
-/* This function will load the entire vlan_eth_header into a 128-bit wide
- * register. Then use an 8-byte realign to shift the header right by 12 bytes
- * to remove the vlan header and store the results back to the orginal header.
- */
+/* This function performance the same operation on each packet in the batch as
+ * the scalar eth_pop_vlan() function. */
static void
action_avx512_pop_vlan(struct dp_packet_batch *batch,
const struct nlattr *a OVS_UNUSED)
@@ -100,10 +130,25 @@ action_avx512_pop_vlan(struct dp_packet_batch *batch,
if (veh && dp_packet_size(packet) >= sizeof *veh &&
eth_type_vlan(veh->veth_type)) {
+ /* Load the first 128-bits of l2 header into the v_ether register.
+ * This result in the veth_dst/src and veth_type/tci of the
+ * vlan_eth_header structure to be loaded. */
__m128i v_ether = _mm_loadu_si128((void *) veh);
+
+ /* This creates a 256-bit value containing the first four fields
+ * of the vlan_eth_header plus 128 zero-bit. The result will be the
+ * lowest 128-bits after the right shift, hence we shift the data
+ * 128(zero)-bits minus the VLAN_HEADER_LEN, so we are left with
+ * only the veth_dst and veth_src fields. */
__m128i v_realign = _mm_alignr_epi8(v_ether, _mm_setzero_si128(),
- 16 - VLAN_HEADER_LEN);
+ sizeof(__m128i) -
+ VLAN_HEADER_LEN);
+
+ /* Write back the modified ethernet header. */
_mm_storeu_si128((void *) veh, v_realign);
+
+ /* As we removed the VLAN_HEADER we now need to adjust all the
+ * offsets. */
avx512_dp_packet_resize_l2(packet, -VLAN_HEADER_LEN);
}
}
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev