This patch supports RxDID #22 by the following changes:
-add structure and macro definition for RxDID #22,
-support RxDID #22 format in normal path,
-change RSS hash parsing from RxDID #22 in AVX/SSE data path.

Signed-off-by: Junyu Jiang <junyux.ji...@intel.com>
---
 drivers/net/ice/ice_rxtx.c          | 16 ++---
 drivers/net/ice/ice_rxtx.h          | 42 +++++++++++++
 drivers/net/ice/ice_rxtx_vec_avx2.c | 98 +++++++++++++++++++++++++++--
 drivers/net/ice/ice_rxtx_vec_sse.c  | 89 +++++++++++++++++++++-----
 4 files changed, 218 insertions(+), 27 deletions(-)

diff --git a/drivers/net/ice/ice_rxtx.c b/drivers/net/ice/ice_rxtx.c
index 2e1f06d2c..a31a976a1 100644
--- a/drivers/net/ice/ice_rxtx.c
+++ b/drivers/net/ice/ice_rxtx.c
@@ -50,7 +50,7 @@ static inline uint8_t
 ice_proto_xtr_type_to_rxdid(uint8_t xtr_type)
 {
        static uint8_t rxdid_map[] = {
-               [PROTO_XTR_NONE]      = ICE_RXDID_COMMS_GENERIC,
+               [PROTO_XTR_NONE]      = ICE_RXDID_COMMS_OVS,
                [PROTO_XTR_VLAN]      = ICE_RXDID_COMMS_AUX_VLAN,
                [PROTO_XTR_IPV4]      = ICE_RXDID_COMMS_AUX_IPV4,
                [PROTO_XTR_IPV6]      = ICE_RXDID_COMMS_AUX_IPV6,
@@ -59,7 +59,7 @@ ice_proto_xtr_type_to_rxdid(uint8_t xtr_type)
        };
 
        return xtr_type < RTE_DIM(rxdid_map) ?
-                               rxdid_map[xtr_type] : ICE_RXDID_COMMS_GENERIC;
+                               rxdid_map[xtr_type] : ICE_RXDID_COMMS_OVS;
 }
 
 static enum ice_status
@@ -72,7 +72,7 @@ ice_program_hw_rx_queue(struct ice_rx_queue *rxq)
        enum ice_status err;
        uint16_t buf_size, len;
        struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
-       uint32_t rxdid = ICE_RXDID_COMMS_GENERIC;
+       uint32_t rxdid = ICE_RXDID_COMMS_OVS;
        uint32_t regval;
 
        /* Set buffer size as the head split is disabled. */
@@ -1309,7 +1309,7 @@ ice_rxd_to_vlan_tci(struct rte_mbuf *mb, volatile union 
ice_rx_flex_desc *rxdp)
 
 static void
 ice_rxd_to_proto_xtr(struct rte_mbuf *mb,
-                    volatile struct ice_32b_rx_flex_desc_comms *desc)
+                    volatile struct ice_32b_rx_flex_desc_comms_ovs *desc)
 {
        uint16_t stat_err = rte_le_to_cpu_16(desc->status_error1);
        uint32_t metadata;
@@ -1338,8 +1338,9 @@ static inline void
 ice_rxd_to_pkt_fields(struct rte_mbuf *mb,
                      volatile union ice_rx_flex_desc *rxdp)
 {
-       volatile struct ice_32b_rx_flex_desc_comms *desc =
-                       (volatile struct ice_32b_rx_flex_desc_comms *)rxdp;
+       volatile struct ice_32b_rx_flex_desc_comms_ovs *desc =
+                       (volatile struct ice_32b_rx_flex_desc_comms_ovs *)rxdp;
+#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
        uint16_t stat_err;
 
        stat_err = rte_le_to_cpu_16(desc->status_error0);
@@ -1347,13 +1348,14 @@ ice_rxd_to_pkt_fields(struct rte_mbuf *mb,
                mb->ol_flags |= PKT_RX_RSS_HASH;
                mb->hash.rss = rte_le_to_cpu_32(desc->rss_hash);
        }
+#endif
 
-#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
        if (desc->flow_id != 0xFFFFFFFF) {
                mb->ol_flags |= PKT_RX_FDIR | PKT_RX_FDIR_ID;
                mb->hash.fdir.hi = rte_le_to_cpu_32(desc->flow_id);
        }
 
+#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
        if (unlikely(rte_net_ice_dynf_proto_xtr_metadata_avail()))
                ice_rxd_to_proto_xtr(mb, desc);
 #endif
diff --git a/drivers/net/ice/ice_rxtx.h b/drivers/net/ice/ice_rxtx.h
index 2fdcfb7d0..e21ba152d 100644
--- a/drivers/net/ice/ice_rxtx.h
+++ b/drivers/net/ice/ice_rxtx.h
@@ -38,6 +38,8 @@
 
 #define ICE_FDIR_PKT_LEN       512
 
+#define ICE_RXDID_COMMS_OVS    22
+
 typedef void (*ice_rx_release_mbufs_t)(struct ice_rx_queue *rxq);
 typedef void (*ice_tx_release_mbufs_t)(struct ice_tx_queue *txq);
 
@@ -135,6 +137,46 @@ union ice_tx_offload {
        };
 };
 
+/* Rx Flex Descriptor for Comms Package Profile
+ * RxDID Profile ID 22 (swap Hash and FlowID)
+ * Flex-field 0: Flow ID lower 16-bits
+ * Flex-field 1: Flow ID upper 16-bits
+ * Flex-field 2: RSS hash lower 16-bits
+ * Flex-field 3: RSS hash upper 16-bits
+ * Flex-field 4: AUX0
+ * Flex-field 5: AUX1
+ */
+struct ice_32b_rx_flex_desc_comms_ovs {
+       /* Qword 0 */
+       u8 rxdid;
+       u8 mir_id_umb_cast;
+       __le16 ptype_flexi_flags0;
+       __le16 pkt_len;
+       __le16 hdr_len_sph_flex_flags1;
+
+       /* Qword 1 */
+       __le16 status_error0;
+       __le16 l2tag1;
+       __le32 flow_id;
+
+       /* Qword 2 */
+       __le16 status_error1;
+       u8 flexi_flags2;
+       u8 ts_low;
+       __le16 l2tag2_1st;
+       __le16 l2tag2_2nd;
+
+       /* Qword 3 */
+       __le32 rss_hash;
+       union {
+               struct {
+                       __le16 aux0;
+                       __le16 aux1;
+               } flex;
+               __le32 ts_high;
+       } flex_ts;
+};
+
 int ice_rx_queue_setup(struct rte_eth_dev *dev,
                       uint16_t queue_idx,
                       uint16_t nb_desc,
diff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c 
b/drivers/net/ice/ice_rxtx_vec_avx2.c
index be50677c2..07d129e3f 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx2.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx2.c
@@ -191,8 +191,8 @@ _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, 
struct rte_mbuf **rx_pkts,
        const __m256i shuf_msk =
                _mm256_set_epi8
                        (/* first descriptor */
-                        15, 14,
-                        13, 12,        /* octet 12~15, 32 bits rss */
+                        0xFF, 0xFF,
+                        0xFF, 0xFF,    /* rss hash parsed separately */
                         11, 10,        /* octet 10~11, 16 bits vlan_macip */
                         5, 4,          /* octet 4~5, 16 bits data_len */
                         0xFF, 0xFF,    /* skip hi 16 bits pkt_len, zero out */
@@ -200,8 +200,8 @@ _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, 
struct rte_mbuf **rx_pkts,
                         0xFF, 0xFF,    /* pkt_type set as unknown */
                         0xFF, 0xFF,    /*pkt_type set as unknown */
                         /* second descriptor */
-                        15, 14,
-                        13, 12,        /* octet 12~15, 32 bits rss */
+                        0xFF, 0xFF,
+                        0xFF, 0xFF,    /* rss hash parsed separately */
                         11, 10,        /* octet 10~11, 16 bits vlan_macip */
                         5, 4,          /* octet 4~5, 16 bits data_len */
                         0xFF, 0xFF,    /* skip hi 16 bits pkt_len, zero out */
@@ -461,6 +461,96 @@ _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, 
struct rte_mbuf **rx_pkts,
                /* merge flags */
                const __m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
                                rss_vlan_flags);
+
+#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
+               /**
+                * needs to load 2nd 16B of each desc for RSS hash parsing,
+                * will cause performance drop to get into this context.
+                */
+               if (rxq->vsi->adapter->eth_dev->data->dev_conf.rxmode.offloads &
+                               DEV_RX_OFFLOAD_RSS_HASH) {
+                       /* load bottom half of every 32B desc */
+                       const __m128i raw_desc_bh7 =
+                               _mm_load_si128
+                                       ((void *)(&rxdp[7].wb.status_error1));
+                       rte_compiler_barrier();
+                       const __m128i raw_desc_bh6 =
+                               _mm_load_si128
+                                       ((void *)(&rxdp[6].wb.status_error1));
+                       rte_compiler_barrier();
+                       const __m128i raw_desc_bh5 =
+                               _mm_load_si128
+                                       ((void *)(&rxdp[5].wb.status_error1));
+                       rte_compiler_barrier();
+                       const __m128i raw_desc_bh4 =
+                               _mm_load_si128
+                                       ((void *)(&rxdp[4].wb.status_error1));
+                       rte_compiler_barrier();
+                       const __m128i raw_desc_bh3 =
+                               _mm_load_si128
+                                       ((void *)(&rxdp[3].wb.status_error1));
+                       rte_compiler_barrier();
+                       const __m128i raw_desc_bh2 =
+                               _mm_load_si128
+                                       ((void *)(&rxdp[2].wb.status_error1));
+                       rte_compiler_barrier();
+                       const __m128i raw_desc_bh1 =
+                               _mm_load_si128
+                                       ((void *)(&rxdp[1].wb.status_error1));
+                       rte_compiler_barrier();
+                       const __m128i raw_desc_bh0 =
+                               _mm_load_si128
+                                       ((void *)(&rxdp[0].wb.status_error1));
+
+                       __m256i raw_desc_bh6_7 =
+                               _mm256_inserti128_si256
+                                       (_mm256_castsi128_si256(raw_desc_bh6),
+                                       raw_desc_bh7, 1);
+                       __m256i raw_desc_bh4_5 =
+                               _mm256_inserti128_si256
+                                       (_mm256_castsi128_si256(raw_desc_bh4),
+                                       raw_desc_bh5, 1);
+                       __m256i raw_desc_bh2_3 =
+                               _mm256_inserti128_si256
+                                       (_mm256_castsi128_si256(raw_desc_bh2),
+                                       raw_desc_bh3, 1);
+                       __m256i raw_desc_bh0_1 =
+                               _mm256_inserti128_si256
+                                       (_mm256_castsi128_si256(raw_desc_bh0),
+                                       raw_desc_bh1, 1);
+
+                       /**
+                        * to shift the 32b RSS hash value to the
+                        * highest 32b of each 128b before mask
+                        */
+                       __m256i rss_hash6_7 =
+                               _mm256_slli_epi64(raw_desc_bh6_7, 32);
+                       __m256i rss_hash4_5 =
+                               _mm256_slli_epi64(raw_desc_bh4_5, 32);
+                       __m256i rss_hash2_3 =
+                               _mm256_slli_epi64(raw_desc_bh2_3, 32);
+                       __m256i rss_hash0_1 =
+                               _mm256_slli_epi64(raw_desc_bh0_1, 32);
+
+                       __m256i rss_hash_msk =
+                               _mm256_set_epi32(0xFFFFFFFF, 0, 0, 0,
+                                                0xFFFFFFFF, 0, 0, 0);
+
+                       rss_hash6_7 = _mm256_and_si256
+                                       (rss_hash6_7, rss_hash_msk);
+                       rss_hash4_5 = _mm256_and_si256
+                                       (rss_hash4_5, rss_hash_msk);
+                       rss_hash2_3 = _mm256_and_si256
+                                       (rss_hash2_3, rss_hash_msk);
+                       rss_hash0_1 = _mm256_and_si256
+                                       (rss_hash0_1, rss_hash_msk);
+
+                       mb6_7 = _mm256_or_si256(mb6_7, rss_hash6_7);
+                       mb4_5 = _mm256_or_si256(mb4_5, rss_hash4_5);
+                       mb2_3 = _mm256_or_si256(mb2_3, rss_hash2_3);
+                       mb0_1 = _mm256_or_si256(mb0_1, rss_hash0_1);
+               } /* if() on RSS hash parsing */
+#endif
                /**
                 * At this point, we have the 8 sets of flags in the low 16-bits
                 * of each 32-bit value in vlan0.
diff --git a/drivers/net/ice/ice_rxtx_vec_sse.c 
b/drivers/net/ice/ice_rxtx_vec_sse.c
index 382ef31f3..fffb27138 100644
--- a/drivers/net/ice/ice_rxtx_vec_sse.c
+++ b/drivers/net/ice/ice_rxtx_vec_sse.c
@@ -230,7 +230,8 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct 
rte_mbuf **rx_pkts,
        const __m128i zero = _mm_setzero_si128();
        /* mask to shuffle from desc. to mbuf */
        const __m128i shuf_msk = _mm_set_epi8
-                       (15, 14, 13, 12,  /* octet 12~15, 32 bits rss */
+                       (0xFF, 0xFF,
+                        0xFF, 0xFF,  /* rss hash parsed separately */
                         11, 10,      /* octet 10~11, 16 bits vlan_macip */
                         5, 4,        /* octet 4~5, 16 bits data_len */
                         0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
@@ -321,7 +322,7 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct 
rte_mbuf **rx_pkts,
             pos += ICE_DESCS_PER_LOOP,
             rxdp += ICE_DESCS_PER_LOOP) {
                __m128i descs[ICE_DESCS_PER_LOOP];
-               __m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
+               __m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
                __m128i staterr, sterr_tmp1, sterr_tmp2;
                /* 2 64 bit or 4 32 bit mbuf pointers in one XMM reg. */
                __m128i mbp1;
@@ -367,8 +368,12 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct 
rte_mbuf **rx_pkts,
                rte_compiler_barrier();
 
                /* D.1 pkt 3,4 convert format from desc to pktmbuf */
-               pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);
-               pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk);
+               pkt_mb3 = _mm_shuffle_epi8(descs[3], shuf_msk);
+               pkt_mb2 = _mm_shuffle_epi8(descs[2], shuf_msk);
+
+               /* D.1 pkt 1,2 convert format from desc to pktmbuf */
+               pkt_mb1 = _mm_shuffle_epi8(descs[1], shuf_msk);
+               pkt_mb0 = _mm_shuffle_epi8(descs[0], shuf_msk);
 
                /* C.1 4=>2 filter staterr info only */
                sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);
@@ -378,12 +383,68 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct 
rte_mbuf **rx_pkts,
                ice_rx_desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);
 
                /* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
-               pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust);
                pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);
+               pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);
 
-               /* D.1 pkt 1,2 convert format from desc to pktmbuf */
-               pkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk);
-               pkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk);
+               /* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
+               pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);
+               pkt_mb0 = _mm_add_epi16(pkt_mb0, crc_adjust);
+
+#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
+               /**
+                * needs to load 2nd 16B of each desc for RSS hash parsing,
+                * will cause performance drop to get into this context.
+                */
+               if (rxq->vsi->adapter->eth_dev->data->dev_conf.rxmode.offloads &
+                               DEV_RX_OFFLOAD_RSS_HASH) {
+                       /* load bottom half of every 32B desc */
+                       const __m128i raw_desc_bh3 =
+                               _mm_load_si128
+                                       ((void *)(&rxdp[3].wb.status_error1));
+                       rte_compiler_barrier();
+                       const __m128i raw_desc_bh2 =
+                               _mm_load_si128
+                                       ((void *)(&rxdp[2].wb.status_error1));
+                       rte_compiler_barrier();
+                       const __m128i raw_desc_bh1 =
+                               _mm_load_si128
+                                       ((void *)(&rxdp[1].wb.status_error1));
+                       rte_compiler_barrier();
+                       const __m128i raw_desc_bh0 =
+                               _mm_load_si128
+                                       ((void *)(&rxdp[0].wb.status_error1));
+
+                       /**
+                        * to shift the 32b RSS hash value to the
+                        * highest 32b of each 128b before mask
+                        */
+                       __m128i rss_hash3 =
+                               _mm_slli_epi64(raw_desc_bh3, 32);
+                       __m128i rss_hash2 =
+                               _mm_slli_epi64(raw_desc_bh2, 32);
+                       __m128i rss_hash1 =
+                               _mm_slli_epi64(raw_desc_bh1, 32);
+                       __m128i rss_hash0 =
+                               _mm_slli_epi64(raw_desc_bh0, 32);
+
+                       __m128i rss_hash_msk =
+                               _mm_set_epi32(0xFFFFFFFF, 0, 0, 0);
+
+                       rss_hash3 = _mm_and_si128
+                                       (rss_hash3, rss_hash_msk);
+                       rss_hash2 = _mm_and_si128
+                                       (rss_hash2, rss_hash_msk);
+                       rss_hash1 = _mm_and_si128
+                                       (rss_hash1, rss_hash_msk);
+                       rss_hash0 = _mm_and_si128
+                                       (rss_hash0, rss_hash_msk);
+
+                       pkt_mb3 = _mm_or_si128(pkt_mb3, rss_hash3);
+                       pkt_mb2 = _mm_or_si128(pkt_mb2, rss_hash2);
+                       pkt_mb1 = _mm_or_si128(pkt_mb1, rss_hash1);
+                       pkt_mb0 = _mm_or_si128(pkt_mb0, rss_hash0);
+               } /* if() on RSS hash parsing */
+#endif
 
                /* C.2 get 4 pkts staterr value  */
                staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);
@@ -391,14 +452,10 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct 
rte_mbuf **rx_pkts,
                /* D.3 copy final 3,4 data to rx_pkts */
                _mm_storeu_si128
                        ((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,
-                        pkt_mb4);
+                        pkt_mb3);
                _mm_storeu_si128
                        ((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,
-                        pkt_mb3);
-
-               /* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
-               pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);
-               pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);
+                        pkt_mb2);
 
                /* C* extract and record EOP bit */
                if (split_packet) {
@@ -422,9 +479,9 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct 
rte_mbuf **rx_pkts,
                /* D.3 copy final 1,2 data to rx_pkts */
                _mm_storeu_si128
                        ((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1,
-                        pkt_mb2);
+                        pkt_mb1);
                _mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
-                                pkt_mb1);
+                                pkt_mb0);
                ice_rx_desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);
                /* C.4 calc avaialbe number of desc */
                var = __builtin_popcountll(_mm_cvtsi128_si64(staterr));
-- 
2.17.1

Reply via email to