This commit adds an AVX512 implementation of the ipv4_set_addrs action,
together with AVX512 implementations of the IPv4 header checksum
recalculation and the TCP/UDP checksum update.

Signed-off-by: Emma Finn <[email protected]>
---
 lib/odp-execute-avx512.c  | 194 ++++++++++++++++++++++++++++++++++++++
 lib/odp-execute-private.c |   1 +
 lib/odp-execute.c         |  21 ++++-
 3 files changed, 211 insertions(+), 5 deletions(-)

diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
index ede00b750..618fa37a7 100644
--- a/lib/odp-execute-avx512.c
+++ b/lib/odp-execute-avx512.c
@@ -22,6 +22,7 @@
 #include <config.h>
 #include <errno.h>
 
+#include "csum.h"
 #include "cpu.h"
 #include "dp-packet.h"
 #include "immintrin.h"
@@ -175,6 +176,197 @@ action_avx512_eth_set_addrs(void *dp OVS_UNUSED, struct 
dp_packet_batch *batch,
     }
 }
 
+/* Computes the header sum used to patch the L4 checksum after an IPv4
+ * address rewrite.
+ *
+ * Loads 32 bytes starting at 'old_header', replaces bytes 12-19 (the IPv4
+ * source and destination address fields) with the same bytes of 'res', and
+ * adds up the 16-bit words of the first 20 bytes, skipping the checksum word
+ * at bytes 10-11.  Words are summed as they sit in memory, without byte
+ * swapping.
+ *
+ * Returns the complement of that sum folded to 16 bits with end-around
+ * carry, i.e. the result of csum_finish() on the 32-bit total. */
+static inline uint16_t ALWAYS_INLINE
+avx512_l4_update_csum(struct ip_header *old_header, __m256i res)
+{
+    __m256i v_zeros = _mm256_setzero_si256();
+
+    /* Byte shuffle controls that zero-extend 16-bit words into 32-bit
+     * elements; 0xffff selects zero.  _mm256_shuffle_epi8() works per
+     * 128-bit lane, so the upper half of 'v_swap16a' picks bytes 16-19. */
+    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xffff, 0x0302, 0xffff,
+                                          0x0504, 0xffff, 0x0706, 0xffff,
+                                          0x0100, 0xffff, 0x0302, 0xffff,
+                                          0xffff, 0xffff, 0xffff, 0xffff);
+    /* Bytes 8-9 and 12-15; the checksum word at bytes 10-11 is left out. */
+    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xffff, 0xffff, 0xffff,
+                                          0x0d0c, 0xffff, 0x0f0e, 0xffff,
+                                          0xffff, 0xffff, 0xffff, 0xffff,
+                                          0xffff, 0xffff, 0xffff, 0xffff);
+    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
+                                          0xF, 0xF, 0xF, 0xF);
+
+    /* Old header with only the addresses (16-bit elements 6-9) taken from
+     * the rewritten header. */
+    __m256i oh = _mm256_loadu_si256((void *) old_header);
+    oh = _mm256_mask_blend_epi16(0x3C0, oh, res);
+    __m256i v_shuf1 = _mm256_shuffle_epi8(oh, v_swap16a);
+    __m256i v_shuf2 = _mm256_shuffle_epi8(oh, v_swap16b);
+
+    /* Add field values. */
+    __m256i v_sum = _mm256_add_epi32(v_shuf1, v_shuf2);
+
+    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
+    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
+    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
+
+    /* Move the partial sum of the upper 128-bit lane (32-bit element 4) next
+     * to the lower one, then add them into element 0. */
+    v_sum = _mm256_permutexvar_epi32(v_swap32a, v_sum);
+    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
+
+    /* Fold the 32-bit total to 16 bits with end-around carry.  A 16-bit
+     * horizontal add is not enough here: it wraps and loses the carry when
+     * the two halves add up to more than 0xffff. */
+    uint32_t sum = _mm256_extract_epi32(v_sum, 0);
+
+    return csum_finish(sum);
+}
+
+/* Computes the IPv4 header checksum of the 20-byte header held in the low
+ * bytes of 'res'.
+ *
+ * The 16-bit words of bytes 0-9 and 12-19 are added up as they sit in memory
+ * (no byte swapping); the checksum word at bytes 10-11 is skipped, so its
+ * current contents do not matter.  Bytes 20-31 of 'res' are ignored.
+ *
+ * Returns the complement of the sum folded to 16 bits with end-around carry,
+ * ready to be stored in the checksum field. */
+static inline uint16_t ALWAYS_INLINE
+avx512_ipv4_recalc_csum(__m256i res)
+{
+    __m256i v_zeros = _mm256_setzero_si256();
+
+    /* Byte shuffle controls that zero-extend 16-bit words into 32-bit
+     * elements; 0xffff selects zero.  _mm256_shuffle_epi8() works per
+     * 128-bit lane, so the upper half of 'v_swap16a' picks bytes 16-19. */
+    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xffff, 0x0302, 0xffff,
+                                          0x0504, 0xffff, 0x0706, 0xffff,
+                                          0x0100, 0xffff, 0x0302, 0xffff,
+                                          0xffff, 0xffff, 0xffff, 0xffff);
+
+    /* Bytes 8-9 and 12-15; the checksum word at bytes 10-11 is left out. */
+    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xffff, 0xffff, 0xffff,
+                                          0x0d0c, 0xffff, 0x0f0e, 0xffff,
+                                          0xffff, 0xffff, 0xffff, 0xffff,
+                                          0xffff, 0xffff, 0xffff, 0xffff);
+
+    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
+                                          0xF, 0xF, 0xF, 0xF);
+
+    __m256i v_shuf1 = _mm256_shuffle_epi8(res, v_swap16a);
+    __m256i v_shuf2 = _mm256_shuffle_epi8(res, v_swap16b);
+
+    /* Add field values. */
+    __m256i v_sum = _mm256_add_epi32(v_shuf1, v_shuf2);
+
+    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
+    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
+    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
+
+    /* Move the partial sum of the upper 128-bit lane (32-bit element 4) next
+     * to the lower one, then add them into element 0. */
+    v_sum = _mm256_permutexvar_epi32(v_swap32a, v_sum);
+    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
+
+    /* Fold the 32-bit total to 16 bits with end-around carry.  A 16-bit
+     * horizontal add is not enough here: it wraps and loses the carry when
+     * the two halves add up to more than 0xffff, which yields a wrong
+     * checksum. */
+    uint32_t sum = _mm256_extract_epi32(v_sum, 0);
+
+    return csum_finish(sum);
+}
+
+/* action_avx512_ipv4_set_addrs() picks fields out of struct ovs_key_ipv4 by
+ * fixed byte position in its shuffle controls.  Verify at build time that
+ * ipv4_src, ipv4_dst, ipv4_proto, ipv4_tos and ipv4_ttl follow each other in
+ * that order with no padding in between. */
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_src) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_src) ==
+                  offsetof(struct ovs_key_ipv4, ipv4_dst));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_dst) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_dst) ==
+                  offsetof(struct ovs_key_ipv4, ipv4_proto));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_proto) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_proto) ==
+                  offsetof(struct ovs_key_ipv4, ipv4_tos));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_tos) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_tos) ==
+                  offsetof(struct ovs_key_ipv4, ipv4_ttl));
+
+/* AVX512 implementation of the masked set action for OVS_KEY_ATTR_IPV4.
+ *
+ * For every packet in 'batch', rewrites the 32 bytes at the L3 offset: the
+ * key is OR-ed in and the bits selected by the mask are cleared first, for
+ * header bytes 1, 8, 12-15 and 16-19 (tos, ttl, source and destination in
+ * the standard IPv4 layout).  All other bytes are written back unchanged.
+ * The IPv4 header checksum is recomputed, and if the mask touches the source
+ * or destination address the TCP or UDP checksum is adjusted as well.
+ *
+ * 'a' is the set-masked action attribute; the key and mask are taken from
+ * its nested attribute with nl_attr_get() and get_mask().  As in the OR
+ * above, the key is expected to have no bits set outside the mask.
+ *
+ * NOTE(review): the code assumes a 20-byte IPv4 header.  Both checksum
+ * helpers sum only the first 20 bytes and the UDP checksum is inserted at
+ * byte 26 of the stored vector, so packets carrying IPv4 options look
+ * unsupported here; confirm they cannot reach this function.
+ *
+ * NOTE(review): the result of dp_packet_l4() is dereferenced without a
+ * length or NULL check; confirm that is guaranteed by the caller. */
+static void
+action_avx512_ipv4_set_addrs(void *dp OVS_UNUSED,
+                             struct dp_packet_batch *batch,
+                             const struct nlattr *a,
+                             bool should_steal OVS_UNUSED)
+{
+    a = nl_attr_get(a);
+    const struct ovs_key_ipv4 *key = nl_attr_get(a);
+    const struct ovs_key_ipv4 *mask = get_mask(a, struct ovs_key_ipv4);
+    struct dp_packet *packet;
+
+    /* Shuffle key and mask to match ip_header struct layout.  The 32-bit
+     * permute first spreads the key over both 128-bit lanes, because the
+     * byte shuffle that follows cannot cross lanes; 0xFF in the byte
+     * shuffle produces a zero byte. */
+    static const uint8_t ip_shuffle_mask[32] = {
+        0xFF, 5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        6, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
+        0, 1, 2, 3, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
+    __m256i v_shuf32 = _mm256_setr_epi32(0x0, 0x2, 0xF, 0xF,
+                                         0x1, 0xF, 0xF, 0xF);
+    __m256i v_shuffle = _mm256_loadu_si256((void *) ip_shuffle_mask);
+
+    /* Key and mask are the same for the whole batch, so shuffle them once.
+     *
+     * NOTE(review): these are full 32-byte loads, while struct ovs_key_ipv4
+     * appears to be smaller than that; the extra bytes are discarded by the
+     * shuffle, but confirm that reading past the key and mask is safe. */
+    __m256i v_key = _mm256_loadu_si256((void *) key);
+    __m256i v_mask = _mm256_loadu_si256((void *) mask);
+
+    __m256i v_key_shuf = _mm256_permutexvar_epi32(v_shuf32, v_key);
+    v_key_shuf = _mm256_shuffle_epi8(v_key_shuf, v_shuffle);
+
+    __m256i v_mask_shuf = _mm256_permutexvar_epi32(v_shuf32, v_mask);
+    v_mask_shuf = _mm256_shuffle_epi8(v_mask_shuf, v_shuffle);
+
+    /* If ip_src or ip_dst is modified, the L4 checksum needs to be updated
+     * too.  Test the whole fields: looking only at the top bit of each mask
+     * byte would miss partial masks such as 0.0.0.1. */
+    bool update_l4 = mask->ipv4_src || mask->ipv4_dst;
+
+    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+        struct ip_header *nh = dp_packet_l3(packet);
+        ovs_be16 old_csum = nh->ip_csum;
+
+        __m256i v_packet = _mm256_loadu_si256((void *) nh);
+        __m256i v_pkt_masked = _mm256_andnot_si256(v_mask_shuf, v_packet);
+        __m256i v_res = _mm256_or_si256(v_key_shuf, v_pkt_masked);
+
+        /* Recompute the IPv4 header checksum and insert it at bytes 10-11
+         * (16-bit element 5). */
+        uint16_t checksum = avx512_ipv4_recalc_csum(v_res);
+        v_res = _mm256_insert_epi16(v_res, checksum, 5);
+
+        if (update_l4) {
+            /* Sum of the old header with only the addresses replaced. */
+            uint16_t hdr_sum = ~avx512_l4_update_csum(nh, v_res);
+
+            /* Adding the old header checksum (the complement of the old
+             * header sum) leaves the change caused by the new addresses;
+             * csum_finish() folds and complements, so undo the complement.
+             * This relies on the old header checksum being correct. */
+            uint32_t folded = csum_finish(old_csum + hdr_sum);
+            uint16_t addr_delta = ~folded;
+
+            if (nh->ip_proto == IPPROTO_UDP) {
+                struct udp_header *uh = dp_packet_l4(packet);
+
+                /* A zero UDP checksum means "not computed": leave it. */
+                if (uh->udp_csum) {
+                    uint16_t old_l4 = ~uh->udp_csum;
+                    uint16_t new_l4 = csum_finish(addr_delta + old_l4);
+
+                    /* A computed checksum of zero has to be sent as
+                     * 0xffff, otherwise the receiver would take it for
+                     * "no checksum". */
+                    if (!new_l4) {
+                        new_l4 = 0xffff;
+                    }
+
+                    /* The UDP checksum lies inside the 32 bytes stored
+                     * below (bytes 26-27, 16-bit element 13), so it has to
+                     * go into the vector rather than straight to memory. */
+                    v_res = _mm256_insert_epi16(v_res, new_l4, 13);
+                }
+            } else if (nh->ip_proto == IPPROTO_TCP) {
+                struct tcp_header *th = dp_packet_l4(packet);
+                uint16_t old_l4 = ~th->tcp_csum;
+
+                /* The TCP checksum lies beyond the 32 bytes stored below,
+                 * so it is written to the packet directly. */
+                th->tcp_csum = csum_finish(addr_delta + old_l4);
+            }
+        }
+
+        /* Store new IP header. */
+        _mm256_storeu_si256((void *) nh, v_res);
+    }
+}
+
 static void
 action_avx512_set_masked(void *dp OVS_UNUSED,
                          struct dp_packet_batch *batch OVS_UNUSED,
@@ -233,6 +425,8 @@ action_avx512_init(struct odp_execute_action_impl *self)
     self->funcs[OVS_ACTION_ATTR_SET_MASKED] = action_avx512_set_masked;
     self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] =
                             action_avx512_eth_set_addrs;
+    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] =
+                            action_avx512_ipv4_set_addrs;
     active_impl = *self;
 
     return 0;
diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
index 34f13523a..cb77bab31 100644
--- a/lib/odp-execute-private.c
+++ b/lib/odp-execute-private.c
@@ -284,6 +284,7 @@ action_autoval_init(struct odp_execute_action_impl *self)
     self->funcs[OVS_ACTION_ATTR_PUSH_VLAN] = action_autoval_generic;
     self->funcs[OVS_ACTION_ATTR_SET_MASKED] = action_set_masked_init;
     self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_autoval_generic;
+    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_autoval_generic;
     active_impl = *self;
 
     return 0;
diff --git a/lib/odp-execute.c b/lib/odp-execute.c
index 5c4dd8e33..cbf528f93 100644
--- a/lib/odp-execute.c
+++ b/lib/odp-execute.c
@@ -585,11 +585,6 @@ odp_execute_masked_set_action(struct dp_packet *packet,
         break;
     }
 
-    case OVS_KEY_ATTR_IPV4:
-        odp_set_ipv4(packet, nl_attr_get(a),
-                     get_mask(a, struct ovs_key_ipv4));
-        break;
-
     case OVS_KEY_ATTR_IPV6:
         odp_set_ipv6(packet, nl_attr_get(a),
                      get_mask(a, struct ovs_key_ipv6));
@@ -657,6 +652,7 @@ odp_execute_masked_set_action(struct dp_packet *packet,
     case OVS_KEY_ATTR_ETHERNET:
     case OVS_KEY_ATTR_ETHERTYPE:
     case OVS_KEY_ATTR_IN_PORT:
+    case OVS_KEY_ATTR_IPV4:
     case OVS_KEY_ATTR_VLAN:
     case OVS_KEY_ATTR_ICMP:
     case OVS_KEY_ATTR_ICMPV6:
@@ -892,6 +888,20 @@ action_mod_eth(void *dp OVS_UNUSED, struct dp_packet_batch 
*batch,
     }
 }
 
+/* Scalar implementation of the masked set action for OVS_KEY_ATTR_IPV4:
+ * calls odp_set_ipv4() on every packet in 'batch' with the key and mask
+ * carried by the attribute nested inside 'a'. */
+static void
+action_mod_ipv4(void *dp OVS_UNUSED, struct dp_packet_batch *batch,
+                const struct nlattr *a, bool should_steal OVS_UNUSED)
+{
+    /* Key and mask do not change from packet to packet, so look them up
+     * once instead of on every iteration. */
+    a = nl_attr_get(a);
+    const struct ovs_key_ipv4 *key = nl_attr_get(a);
+    const struct ovs_key_ipv4 *mask = get_mask(a, struct ovs_key_ipv4);
+    struct dp_packet *packet;
+
+    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+        odp_set_ipv4(packet, key, mask);
+    }
+}
+
 /* Implementation of the scalar actions impl init function. Build up the
  * array of func ptrs here.
  */
@@ -902,6 +912,7 @@ odp_action_scalar_init(struct odp_execute_action_impl *self)
     self->funcs[OVS_ACTION_ATTR_PUSH_VLAN] = action_push_vlan;
     self->funcs[OVS_ACTION_ATTR_SET_MASKED] = action_set_masked;
     self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_mod_eth;
+    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_mod_ipv4;
     actions_active_impl = *self;
 
     return 0;
-- 
2.25.1

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to