Re: [ovs-dev] [PATCH v7 08/11] odp-execute: Add ISA implementation of pop_vlan action.

Eelco Chaudron Thu, 23 Jun 2022 08:38:11 -0700

On 14 Jun 2022, at 13:57, Emma Finn wrote:

> This commit adds the AVX512 implementation of the
> pop_vlan action.
>
> Signed-off-by: Emma Finn <[email protected]>
> ---
>  lib/odp-execute-avx512.c | 91 ++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 91 insertions(+)
>
> diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
> index 1fb334689..f9e2b1727 100644
> --- a/lib/odp-execute-avx512.c
> +++ b/lib/odp-execute-avx512.c
> @@ -14,6 +14,11 @@
>   * limitations under the License.
>   */
>
> +#ifdef __x86_64__
> +/* Sparse cannot handle the AVX512 instructions. */
> +#if !defined(__CHECKER__)
> +
> +
>  #include <config.h>
>  #include <errno.h>
>
> @@ -24,6 +29,86 @@
>  #include "odp-netlink.h"
>  #include "openvswitch/vlog.h"
>
> +VLOG_DEFINE_THIS_MODULE(odp_execute_avx512);


I'd asked to add comment so that anyone without avx512 knowledge can read and 
understand the code. Guess this is not fully accomplished. I've attached a diff 
at the end, with what I think would help people to understand what's going on.

Also you do not catch the issue if some one adds a member between l2_pad_size 
and l2_5_ofs. I'll add some code for this to my patch also.

> +BUILD_ASSERT_DECL(offsetof(struct dp_packet, l2_5_ofs) +
> +                  MEMBER_SIZEOF(struct dp_packet, l2_5_ofs) ==
> +                  offsetof(struct dp_packet, l3_ofs));
> +
> +BUILD_ASSERT_DECL(offsetof(struct dp_packet, l3_ofs) +
> +                           MEMBER_SIZEOF(struct dp_packet, l3_ofs) ==
> +                           offsetof(struct dp_packet, l4_ofs));
> +

We also need a build assert to make sure we can safely read 128 bytes from 
l2_pad_size.
I will add it together with my comment changes below.

> +/* Adjust the size of the l2 portion of the dp_packet, updating the l2
> + * pointer and the layer offsets. The function will broadcast resize_by_bytes
> + * across a register and uses a kmask to identify which lanes should be
> + * incremented/decremented. Either an add or subtract will be performed
> + * and the result is stored back to the original packet. */
> +static inline void ALWAYS_INLINE
> +avx512_dp_packet_resize_l2(struct dp_packet *b, int resize_by_bytes)
> +{
> +    /* Update packet size/data pointers */
> +    if (resize_by_bytes >= 0) {
> +        dp_packet_prealloc_headroom(b, resize_by_bytes);
> +    } else {
> +        ovs_assert(dp_packet_size(b) - dp_packet_l2_pad_size(b) >=
> +                    -resize_by_bytes);
> +    }
> +
> +    dp_packet_set_data(b, (char *) dp_packet_data(b) - resize_by_bytes);
> +    dp_packet_set_size(b, dp_packet_size(b) + resize_by_bytes);
> +

Please use the available macro's for this as suggested earlier to make the code 
more common:

@@ -48,15 +48,11 @@ avx512_dp_packet_resize_l2(struct dp_packet *b, int 
resize_by_bytes)
 {
     /* Update packet size/data pointers */
     if (resize_by_bytes >= 0) {
-        dp_packet_prealloc_headroom(b, resize_by_bytes);
+        dp_packet_push_uninit(b, resize_by_bytes);
     } else {
-        ovs_assert(dp_packet_size(b) - dp_packet_l2_pad_size(b) >=
-                    -resize_by_bytes);
+        dp_packet_pull(b, -resize_by_bytes);
     }

-    dp_packet_set_data(b, (char *) dp_packet_data(b) - resize_by_bytes);
-    dp_packet_set_size(b, dp_packet_size(b) + resize_by_bytes);
-

> +    const __m128i v_zeros = _mm_setzero_si128();
> +    const __m128i v_u16_max = _mm_cmpeq_epi16(v_zeros, v_zeros);
> +
> +    const uint8_t k_lanes = 0b1110;
> +    __m128i v_offset = _mm_set1_epi16(abs(resize_by_bytes));
> +
> +    /* Load 128 bits from the dp_packet structure starting at the l2_pad_size
> +     * offset. */
> +    void *adjust_ptr = &b->l2_pad_size;
> +    __m128i v_adjust_src = _mm_loadu_si128(adjust_ptr);
> +
> +    __mmask8 k_cmp = _mm_mask_cmpneq_epu16_mask(k_lanes, v_adjust_src,
> +                                                v_u16_max);
> +
> +    __m128i v_adjust_wip;
> +
> +    if (resize_by_bytes >= 0) {
> +        v_adjust_wip = _mm_mask_add_epi16(v_adjust_src, k_cmp,
> +                                          v_adjust_src, v_offset);
> +    } else {
> +        v_adjust_wip = _mm_mask_sub_epi16(v_adjust_src, k_cmp,
> +                                          v_adjust_src, v_offset);
> +    }
> +
> +    _mm_storeu_si128(adjust_ptr, v_adjust_wip);
> +}
> +
> +/* This function will load the entire vlan_eth_header into a 128-bit wide
> + * register. Then use an 8-byte realign to shift the header right by 12 bytes
> + * to remove the vlan header and store the results back to the orginal 
> header.
> + */
> +static void
> +action_avx512_pop_vlan(struct dp_packet_batch *batch,
> +                       const struct nlattr *a OVS_UNUSED)
> +{
> +    struct dp_packet *packet;
> +
> +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> +        struct vlan_eth_header *veh = dp_packet_eth(packet);
> +
> +        if (veh && dp_packet_size(packet) >= sizeof *veh &&
> +            eth_type_vlan(veh->veth_type)) {
> +
> +            __m128i v_ether = _mm_loadu_si128((void *) veh);
> +            __m128i v_realign = _mm_alignr_epi8(v_ether, _mm_setzero_si128(),

As I'm not a AVX expert, but would it be more effective to load a register with 
_mm_setzero_si128(), and reuse it in every iteration?

> +                                                16 - VLAN_HEADER_LEN);

I think we should replace the 16 with sizeof(__m128i) as it give an idea what 
it represents, i.e. number of bits from b.

> +            _mm_storeu_si128((void *) veh, v_realign);
> +            avx512_dp_packet_resize_l2(packet, -VLAN_HEADER_LEN);
> +        }
> +    }
> +}
> +
>  /* Probe functions to check ISA requirements. */
>  static bool
>  avx512_isa_probe(void)
> @@ -52,5 +137,11 @@ action_avx512_init(struct odp_execute_action_impl *self)
>          return -ENOTSUP;
>      }
>
> +    /* Set function pointers for actions that can be applied directly, these
> +     * are identified by OVS_ACTION_ATTR_*. */
> +    self->funcs[OVS_ACTION_ATTR_POP_VLAN] = action_avx512_pop_vlan;
>      return 0;
>  }
> +
> +#endif
> +#endif
> -- 
> 2.32.0

Here is my full diff with the enhanced comments:


diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
index 473d2c666..9b68ee6b3 100644
--- a/lib/odp-execute-avx512.c
+++ b/lib/odp-execute-avx512.c
@@ -30,6 +30,15 @@
 #include "openvswitch/vlog.h"

 VLOG_DEFINE_THIS_MODULE(odp_execute_avx512);
+
+/* The below three build asserts make sure that l2_5_ofs, l3_ofs, and l4_ofs
+ * fields remain in the same order and offset to l2_padd_size. This is needed
+ * as the avx512_dp_packet_resize_l2() function will manipulate those fields at
+ * a fixed memory index based on the l2_padd_size offset. */
+BUILD_ASSERT_DECL(offsetof(struct dp_packet, l2_pad_size) +
+                  MEMBER_SIZEOF(struct dp_packet, l2_pad_size) ==
+                  offsetof(struct dp_packet, l2_5_ofs));
+
 BUILD_ASSERT_DECL(offsetof(struct dp_packet, l2_5_ofs) +
                   MEMBER_SIZEOF(struct dp_packet, l2_5_ofs) ==
                   offsetof(struct dp_packet, l3_ofs));
@@ -38,39 +47,61 @@ BUILD_ASSERT_DECL(offsetof(struct dp_packet, l3_ofs) +
                            MEMBER_SIZEOF(struct dp_packet, l3_ofs) ==
                            offsetof(struct dp_packet, l4_ofs));

-/* Adjust the size of the l2 portion of the dp_packet, updating the l2
- * pointer and the layer offsets. The function will broadcast resize_by_bytes
- * across a register and uses a kmask to identify which lanes should be
- * incremented/decremented. Either an add or subtract will be performed
- * and the result is stored back to the original packet. */
+/* The below build assert makes sure it's safe to read/write 128-bits starting
+ * at the l2_pad_size location. */
+BUILD_ASSERT_DECL(sizeof(struct dp_packet) -
+                  offsetof(struct dp_packet, l2_pad_size) >= sizeof(__m128i));
+
+
 static inline void ALWAYS_INLINE
 avx512_dp_packet_resize_l2(struct dp_packet *b, int resize_by_bytes)
 {
-    /* Update packet size/data pointers */
+    /* Update packet size/data pointers, same as the scalar implementation. */
     if (resize_by_bytes >= 0) {
-        dp_packet_prealloc_headroom(b, resize_by_bytes);
+        dp_packet_push_uninit(b, resize_by_bytes);
     } else {
-        ovs_assert(dp_packet_size(b) - dp_packet_l2_pad_size(b) >=
-                    -resize_by_bytes);
+        dp_packet_pull(b, -resize_by_bytes);
     }

-    dp_packet_set_data(b, (char *) dp_packet_data(b) - resize_by_bytes);
-    dp_packet_set_size(b, dp_packet_size(b) + resize_by_bytes);
+    /* The next step is to update the l2_5_ofs, l3_ofs and l4_ofs fields which
+     * the scalar implementation does with the  dp_packet_adjust_layer_offset()
+     * function. */

+    /* Set the v_zero register to all zero's. */
     const __m128i v_zeros = _mm_setzero_si128();
+
+    /* Set the v_u16_max register to all one's. */
     const __m128i v_u16_max = _mm_cmpeq_epi16(v_zeros, v_zeros);

+    /* Each lane represents 16 bits in a 12- bit register. In this case the
+     * first three 16-bit values, which will map to the l2_5_ofs, l3_ofs and
+     * l4_ofs fields. */
     const uint8_t k_lanes = 0b1110;
+
+    /* Set all 16-bit words in the 128-bits v_offset register to the value we
+     * need to add/substract from the l2_5_ofs, l3_ofs, and l4_ofs fields. */
     __m128i v_offset = _mm_set1_epi16(abs(resize_by_bytes));

-    /* Load 128 bits from the dp_packet structure starting at the l2_pad_size
+    /* Load 128-bits from the dp_packet structure starting at the l2_pad_size
      * offset. */
     void *adjust_ptr = &b->l2_pad_size;
     __m128i v_adjust_src = _mm_loadu_si128(adjust_ptr);

+    /* Here is the tricky part, we only need to update the value of the three
+     * fields if they are not UINT16_MAX. The following function will return
+     * a mask of lanes (read fields) that are not UINT16_MAX. It will do this
+     * by comparing only the lanes we requested, k_lanes, and if they match
+     * v_u16_max, the bit will be set. */
     __mmask8 k_cmp = _mm_mask_cmpneq_epu16_mask(k_lanes, v_adjust_src,
                                                 v_u16_max);

+    /* Based on the bytes adjust (positive, or negative) it will do the actual
+     * add or subtraction. These functions will only operate on the lanes
+     * (fields) requested based on k_cmp, i.e:
+     *   k_cmp = [l2_5_ofs, l3_ofs, l4_ofs]
+     *   for field in kcmp
+     *       v_adjust_src[field] = v_adjust_src[field] + v_offset
+     */
     __m128i v_adjust_wip;

     if (resize_by_bytes >= 0) {
@@ -81,13 +112,12 @@ avx512_dp_packet_resize_l2(struct dp_packet *b, int 
resize_by_bytes)
                                           v_adjust_src, v_offset);
     }

+    /* Here we write back the full 128-bits. */
     _mm_storeu_si128(adjust_ptr, v_adjust_wip);
 }

-/* This function will load the entire vlan_eth_header into a 128-bit wide
- * register. Then use an 8-byte realign to shift the header right by 12 bytes
- * to remove the vlan header and store the results back to the orginal header.
- */
+/* This function performance the same operation on each packet in the batch as
+ * the scalar eth_pop_vlan() function. */
 static void
 action_avx512_pop_vlan(struct dp_packet_batch *batch,
                        const struct nlattr *a OVS_UNUSED)
@@ -100,10 +130,25 @@ action_avx512_pop_vlan(struct dp_packet_batch *batch,
         if (veh && dp_packet_size(packet) >= sizeof *veh &&
             eth_type_vlan(veh->veth_type)) {

+            /* Load the first 128-bits of l2 header into the v_ether register.
+             * This result in the veth_dst/src and veth_type/tci of the
+             * vlan_eth_header structure to be loaded. */
             __m128i v_ether = _mm_loadu_si128((void *) veh);
+
+            /* This creates a 256-bit value containing the first four fields
+             * of the vlan_eth_header plus 128 zero-bit. The result will be the
+             * lowest 128-bits after the right shift, hence we shift the data
+             * 128(zero)-bits minus the VLAN_HEADER_LEN, so we are left with
+             * only the veth_dst and veth_src fields. */
             __m128i v_realign = _mm_alignr_epi8(v_ether, _mm_setzero_si128(),
-                                                16 - VLAN_HEADER_LEN);
+                                                sizeof(__m128i) -
+                                                VLAN_HEADER_LEN);
+
+            /* Write back the modified ethernet header. */
             _mm_storeu_si128((void *) veh, v_realign);
+
+            /* As we removed the VLAN_HEADER we now need to adjust all the
+             * offsets. */
             avx512_dp_packet_resize_l2(packet, -VLAN_HEADER_LEN);
         }
     }

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Re: [ovs-dev] [PATCH v7 08/11] odp-execute: Add ISA implementation of pop_vlan action.

Reply via email to