From: Keegan Freyhof <[email protected]> Fixed some dead code and some variable names to make them more descriptive. Also optimized the V3 path to stay under 16 registers.
Signed-off-by: Keegan Freyhof <[email protected]> Signed-off-by: Mohammad Shuab Siddique <[email protected]> --- drivers/net/bnxt/bnxt_rxtx_vec_avx2.c | 240 +++++++++++------------- drivers/net/bnxt/bnxt_rxtx_vec_common.h | 2 +- 2 files changed, 114 insertions(+), 128 deletions(-) diff --git a/drivers/net/bnxt/bnxt_rxtx_vec_avx2.c b/drivers/net/bnxt/bnxt_rxtx_vec_avx2.c index 46b51b20e4..5e22b4fc11 100644 --- a/drivers/net/bnxt/bnxt_rxtx_vec_avx2.c +++ b/drivers/net/bnxt/bnxt_rxtx_vec_avx2.c @@ -187,15 +187,12 @@ recv_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) t0 = _mm256_unpacklo_epi32(rxcmp0_1, rxcmp2_3); t1 = _mm256_unpacklo_epi32(rxcmp4_5, rxcmp6_7); flags_type = _mm256_unpacklo_epi64(t0, t1); + flags2 = _mm256_unpackhi_epi64(t0, t1); ptype_idx = _mm256_and_si256(flags_type, flags_type_mask); ptype_idx = _mm256_srli_epi32(ptype_idx, RX_PKT_CMPL_FLAGS_ITYPE_SFT - BNXT_PTYPE_TBL_TYPE_SFT); - t0 = _mm256_unpacklo_epi32(rxcmp0_1, rxcmp2_3); - t1 = _mm256_unpacklo_epi32(rxcmp4_5, rxcmp6_7); - flags2 = _mm256_unpackhi_epi64(t0, t1); - t0 = _mm256_srli_epi32(_mm256_and_si256(flags2, flags2_mask1), RX_PKT_CMPL_FLAGS2_META_FORMAT_SFT - BNXT_PTYPE_TBL_VLAN_SFT); @@ -251,9 +248,6 @@ recv_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) * bits and count the number of set bits in order to determine * the number of valid descriptors. */ - const __m256i perm_msk = - _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); - info3_v = _mm256_permutevar8x32_epi32(errors_v2, perm_msk); info3_v = _mm256_and_si256(errors_v2, info3_v_mask); info3_v = _mm256_xor_si256(info3_v, valid_target); @@ -904,7 +898,6 @@ bnxt_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts, return nb_sent; } - /* * V3 (Thor2) RX burst processing - AVX2 vectorized implementation * @@ -924,7 +917,6 @@ recv_burst_vec_avx2_v3(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pk uint16_t rx_ring_size = rxr->rx_ring_struct->ring_size; struct cmpl_base *cp_desc_ring = cpr->cp_desc_ring; uint64_t valid, desc_valid_mask = ~0ULL; - const __m256i info3_v_mask = _mm256_set1_epi32(CMPL_BASE_V); uint32_t raw_cons = cpr->cp_raw_cons; uint32_t cons, mbcons; int nb_rx_pkts = 0; @@ -937,12 +929,12 @@ recv_burst_vec_avx2_v3(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pk */ const __m256i shuf_msk = _mm256_set_epi8(15, 14, 13, 12, /* rss */ - 0xFF, 0xFF, /* vlan_tci (filled separately) */ + 11, 10, /* vlan_tci */ 3, 2, /* data_len */ 0xFF, 0xFF, 3, 2, /* pkt_len */ 0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type (zeroes) */ 15, 14, 13, 12, /* rss */ - 0xFF, 0xFF, /* vlan_tci (filled separately) */ + 11, 10, /* vlan_tci */ 3, 2, /* data_len */ 0xFF, 0xFF, 3, 2, /* pkt_len */ 0xFF, 0xFF, 0xFF, 0xFF); /* pkt_type (zeroes) */ @@ -952,40 +944,24 @@ recv_burst_vec_avx2_v3(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pk _mm256_set_epi8(0xff, 0xff, 0xff, 0xff, /* Zeroes */ 11, 10, /* metadata0 (vlan_tci) */ 9, 8, /* errors_v2 */ - 5, 4, /* metadata1 (payload_offset) */ + 5, 4, /* metadata2 */ 1, 0, /* flags2 low */ 0xff, 0xff, 0xff, 0xff, /* Zeroes */ 0xff, 0xff, 0xff, 0xff, /* Zeroes */ 11, 10, /* metadata0 (vlan_tci) */ 9, 8, /* errors_v2 */ - 5, 4, /* metadata1 (payload_offset) */ + 5, 4, /* metadata2 */ 1, 0, /* flags2 low */ 0xff, 0xff, 0xff, 0xff); /* Zeroes */ + const __m256i mask_1s = + _mm256_set1_epi32(0x1); + const __m256i mask_fs = + _mm256_set1_epi32(0xf); - const __m256i flags_type_mask = - _mm256_set1_epi32(RX_PKT_V3_CMPL_FLAGS_ITYPE_MASK); - const __m256i flags2_ip_type_mask = - _mm256_set1_epi32(RX_PKT_V3_CMPL_HI_FLAGS2_IP_TYPE); - const __m256i rss_mask = - _mm256_set1_epi32(RX_PKT_V3_CMPL_FLAGS_RSS_VALID); - const __m256i metadata1_valid_mask = - _mm256_set1_epi32(RX_PKT_V3_CMPL_METADATA1_VALID); - const __m256i vlan_tci_mask = - _mm256_set1_epi32(RX_PKT_V3_CMPL_HI_METADATA0_VID_MASK | - RX_PKT_V3_CMPL_HI_METADATA0_DE | - RX_PKT_V3_CMPL_HI_METADATA0_PRI_MASK); - const __m256i cs_err_mask = - _mm256_set1_epi32(RX_PKT_CMPL_ERRORS_T_L4_CS_ERROR | - RX_PKT_CMPL_ERRORS_T_IP_CS_ERROR | - RX_PKT_CMPL_ERRORS_L4_CS_ERROR | - RX_PKT_CMPL_ERRORS_IP_CS_ERROR); - const __m256i cs_calc_mask = - _mm256_set1_epi32(RX_PKT_CMPL_CALC); - - __m256i t0, t1, flags_type, flags2, errors, metadata1; - __m256i ptype_idx, ptypes, vlan_tci, vlan_flags; - __m256i mbuf01, mbuf23, mbuf45, mbuf67; __m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5, rearm6, rearm7; + __m256i t0, t1, flags_type, flags2, errors_csum_idx, metadata1; + __m256i mbuf01, mbuf23, mbuf45, mbuf67; + __m256i ptype_idx, ptypes, vlan_flags; __m256i ol_flags, ol_flags_hi; __m256i rss_flags; @@ -1025,7 +1001,8 @@ recv_burst_vec_avx2_v3(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pk mbcons += BNXT_RX_DESCS_PER_LOOP_VEC256) { __m256i desc0, desc1, desc2, desc3, desc4, desc5, desc6, desc7; __m256i rxcmp0_1, rxcmp2_3, rxcmp4_5, rxcmp6_7, info3_v; - __m256i errors_v2, meta0_err, cs_calc, cs_valid; + __m256i md1_0123, lo2_3, md1_4567, lo6_7; + __m256i errors_v2, cs_calc, cs_valid; uint32_t num_valid; t0 = _mm256_loadu_si256((void *)&rxr->rx_buf_ring[mbcons]); @@ -1057,119 +1034,134 @@ recv_burst_vec_avx2_v3(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pk /* * Pack needed fields from each descriptor pair. - * For V3: extract rxcmp (low) for flags_type, len, rss + * extract rxcmp (low) for flags_type, len, rss * and rxcmp1 (hi) for flags2, metadata0, metadata1, errors_v2 + * metadata1 is incrementally extracted to save on + * register pressure */ t0 = _mm256_permute2f128_si256(desc6, desc7, 0x20); t1 = _mm256_permute2f128_si256(desc6, desc7, 0x31); t1 = _mm256_shuffle_epi8(t1, dsc_shuf_msk); rxcmp6_7 = _mm256_blend_epi32(t0, t1, 0x66); + lo6_7 = t0; t0 = _mm256_permute2f128_si256(desc4, desc5, 0x20); t1 = _mm256_permute2f128_si256(desc4, desc5, 0x31); t1 = _mm256_shuffle_epi8(t1, dsc_shuf_msk); rxcmp4_5 = _mm256_blend_epi32(t0, t1, 0x66); + md1_4567 = _mm256_unpackhi_epi32(t0, lo6_7); t0 = _mm256_permute2f128_si256(desc2, desc3, 0x20); t1 = _mm256_permute2f128_si256(desc2, desc3, 0x31); t1 = _mm256_shuffle_epi8(t1, dsc_shuf_msk); rxcmp2_3 = _mm256_blend_epi32(t0, t1, 0x66); + lo2_3 = t0; t0 = _mm256_permute2f128_si256(desc0, desc1, 0x20); t1 = _mm256_permute2f128_si256(desc0, desc1, 0x31); t1 = _mm256_shuffle_epi8(t1, dsc_shuf_msk); rxcmp0_1 = _mm256_blend_epi32(t0, t1, 0x66); + md1_0123 = _mm256_unpackhi_epi32(t0, lo2_3); + metadata1 = _mm256_unpacklo_epi64(md1_0123, md1_4567); - /* Extract flags_type from low completion for eight packets */ - t0 = _mm256_unpacklo_epi32(rxcmp0_1, rxcmp2_3); - t1 = _mm256_unpacklo_epi32(rxcmp4_5, rxcmp6_7); - flags_type = _mm256_unpacklo_epi64(t0, t1); - - /* Compute ptype_idx from flags_type itype field */ - ptype_idx = _mm256_and_si256(flags_type, flags_type_mask); - ptype_idx = _mm256_srli_epi32(ptype_idx, - RX_PKT_V3_CMPL_FLAGS_ITYPE_SFT - - BNXT_PTYPE_TBL_TYPE_SFT); - - /* Extract flags2 from high completion */ + /* Extract flags2 from high completion for eight packets */ t0 = _mm256_unpacklo_epi32(rxcmp0_1, rxcmp2_3); t1 = _mm256_unpacklo_epi32(rxcmp4_5, rxcmp6_7); flags2 = _mm256_unpackhi_epi64(t0, t1); + /* fs mask used for RX_PKT_CMPL_CALC */ + cs_calc = _mm256_and_si256(flags2, mask_fs); + cs_valid = _mm256_cmpeq_epi32(cs_calc, _mm256_setzero_si256()); - t0 = _mm256_srli_epi32(_mm256_and_si256(flags2, flags2_ip_type_mask), - RX_PKT_V3_CMPL_FLAGS2_IP_TYPE_SFT - - BNXT_PTYPE_TBL_IP_VER_SFT); - ptype_idx = _mm256_or_si256(ptype_idx, t0); - - /* - * Extract metadata1 (contains VLAN valid bit) from LOW completion. - * metadata1_payload_offset is at word 2 of rxcmp (low 128 bits of desc). + /* Extract metadata0 and errors from high completion */ + t0 = _mm256_unpackhi_epi32(rxcmp0_1, rxcmp2_3); + t1 = _mm256_unpackhi_epi32(rxcmp4_5, rxcmp6_7); + errors_v2 = _mm256_unpacklo_epi64(t0, t1); + /* mask_fs used in place of RX_PKT_CMPL_ERRORS_T_L4_CS_ERROR | + * RX_PKT_CMPL_ERRORS_T_IP_CS_ERROR | RX_PKT_CMPL_ERRORS_L4_CS_ERROR | + * RX_PKT_CMPL_ERRORS_IP_CS_ERROR */ - { - __m128i m01, m23, hi; - hi = - _mm_unpacklo_epi64(_mm_unpackhi_epi32(_mm256_castsi256_si128(desc4), - _mm256_castsi256_si128(desc5)), - _mm_unpackhi_epi32(_mm256_castsi256_si128(desc6), - _mm256_castsi256_si128(desc7))); - m01 = _mm_unpackhi_epi32(_mm256_castsi256_si128(desc0), - _mm256_castsi256_si128(desc1)); - m23 = _mm_unpackhi_epi32(_mm256_castsi256_si128(desc2), - _mm256_castsi256_si128(desc3)); - metadata1 = - _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi64(m01, - m23)), hi, 1); - } - metadata1 = _mm256_srli_epi32(metadata1, 16); - - t0 = _mm256_srli_epi32(_mm256_and_si256(metadata1, metadata1_valid_mask), - RX_PKT_V3_CMPL_METADATA1_VALID_SFT - - BNXT_PTYPE_TBL_VLAN_SFT); - ptype_idx = _mm256_or_si256(ptype_idx, t0); + errors_csum_idx = _mm256_srli_epi32(_mm256_and_si256(errors_v2, + _mm256_slli_epi32(mask_fs, 4)), 4); + errors_csum_idx = _mm256_andnot_si256(cs_valid, errors_csum_idx); /* - * Load ptypes for eight packets using gather. + * Load ol_flags for eight packets using gather. Gather + * operations have extremely high latency (~19 cycles), + * execution and use of result should be separated as much + * as possible. */ - ptypes = _mm256_i32gather_epi32((int *)bnxt_ptype_table, - ptype_idx, sizeof(uint32_t)); - - /* Extract RSS valid flags for eight packets */ - rss_flags = _mm256_and_si256(flags_type, rss_mask); - rss_flags = _mm256_srli_epi32(rss_flags, 9); - - /* Extract metadata0 (contains vlan_tci) and errors from high completion */ - t0 = _mm256_unpackhi_epi32(rxcmp0_1, rxcmp2_3); - t1 = _mm256_unpackhi_epi32(rxcmp4_5, rxcmp6_7); - meta0_err = _mm256_unpacklo_epi64(t0, t1); + ol_flags = _mm256_i32gather_epi32((const int *)errors_to_olflags_v3, + errors_csum_idx, sizeof(uint32_t)); - /* Extract vlan_tci from high 16 bits of meta0_err (metadata0) */ - vlan_tci = _mm256_and_si256(_mm256_srli_epi32(meta0_err, 16), vlan_tci_mask); + /* Exctract if the packet is VLAN and the VLAN tci */ + metadata1 = _mm256_srli_epi32(metadata1, 16); + /* mask_1s used in place of RX_PKT_V3_CMPL_METADATA1_VALID */ + ptype_idx = _mm256_srli_epi32(_mm256_and_si256(metadata1, + _mm256_slli_epi32(mask_1s, 15)), + RX_PKT_V3_CMPL_METADATA1_VALID_SFT - + BNXT_PTYPE_TBL_VLAN_SFT); - vlan_flags = _mm256_and_si256(metadata1, metadata1_valid_mask); - vlan_flags = _mm256_min_epu32(vlan_flags, _mm256_set1_epi32(1)); + vlan_flags = _mm256_and_si256(metadata1, _mm256_slli_epi32(mask_1s, 15)); + vlan_flags = _mm256_min_epu32(vlan_flags, mask_1s); if (vnic->vlan_strip) { vlan_flags = _mm256_or_si256(vlan_flags, - _mm256_slli_epi32(vlan_flags, 6)); + _mm256_slli_epi32(vlan_flags, 6)); } - errors_v2 = meta0_err; + /* Extract flags_type from low completion for eight packets */ + t0 = _mm256_unpacklo_epi32(rxcmp0_1, rxcmp2_3); + t1 = _mm256_unpacklo_epi32(rxcmp4_5, rxcmp6_7); + flags_type = _mm256_unpacklo_epi64(t0, t1); - errors = _mm256_srli_epi32(_mm256_and_si256(meta0_err, cs_err_mask), 4); + /* Compute ptype_idx from flags_type itype field + * mask_fs is used in place of + * RX_PKT_V3_CMPL_FLAGS_ITYPE_MASK + */ + t0 = _mm256_and_si256(flags_type, + _mm256_slli_epi32(mask_fs, 12)); + t0 = _mm256_srli_epi32(t0, RX_PKT_V3_CMPL_FLAGS_ITYPE_SFT - + BNXT_PTYPE_TBL_TYPE_SFT); + ptype_idx = _mm256_or_si256(ptype_idx, t0); - cs_calc = _mm256_and_si256(flags2, cs_calc_mask); + /* Extract flags2 from low completion for eight packets + * flags2 is re-extracted to save on registers + */ + t0 = _mm256_unpacklo_epi32(rxcmp0_1, rxcmp2_3); + t1 = _mm256_unpacklo_epi32(rxcmp4_5, rxcmp6_7); + flags2 = _mm256_unpackhi_epi64(t0, t1); + + /* mask_fs is being used in place of + * RX_PKT_CMPL_CALC + */ + cs_calc = _mm256_and_si256(flags2, mask_fs); cs_valid = _mm256_cmpeq_epi32(cs_calc, _mm256_setzero_si256()); - errors = _mm256_andnot_si256(cs_valid, errors); - ol_flags = _mm256_i32gather_epi32((const int *)errors_to_olflags_v3, - errors, sizeof(uint32_t)); - __m256i unknown_flags = _mm256_and_si256(cs_valid, - _mm256_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_UNKNOWN)); - ol_flags = _mm256_or_si256(ol_flags, unknown_flags); + ol_flags = _mm256_andnot_si256(cs_valid, ol_flags); - const __m256i perm_msk = - _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); - info3_v = _mm256_permutevar8x32_epi32(errors_v2, perm_msk); - info3_v = _mm256_and_si256(errors_v2, info3_v_mask); + /* mask_1s is being used in place of + * RX_PKT_V3_CMPL_HI_FLAGS2_IP_TYPE + */ + t0 = _mm256_srli_epi32(_mm256_and_si256(flags2, + _mm256_slli_epi32(mask_1s, 8)), + RX_PKT_V3_CMPL_FLAGS2_IP_TYPE_SFT - + BNXT_PTYPE_TBL_IP_VER_SFT); + ptype_idx = _mm256_or_si256(ptype_idx, t0); + + /* + * Load ptypes for eight packets using gather. Gather operations + * have extremely high latency (~19 cycles), execution and use + * of result should be separated as much as possible. + */ + ptypes = _mm256_i32gather_epi32((int *)bnxt_ptype_table, + ptype_idx, sizeof(uint32_t)); + + /* + * Pack the 128-bit array of valid descriptor flags into 64 + * bits and count the number of set bits in order to determine + * the number of valid descriptors. + * mask_1s is used in place of CMPL_BASE_V + */ + info3_v = _mm256_and_si256(errors_v2, mask_1s); info3_v = _mm256_xor_si256(info3_v, valid_target); info3_v = _mm256_packs_epi32(info3_v, _mm256_setzero_si256()); @@ -1181,6 +1173,11 @@ recv_burst_vec_avx2_v3(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pk if (num_valid == 0) break; + /* Extract flags_type from low completion for eight packets*/ + t0 = _mm256_unpacklo_epi32(rxcmp0_1, rxcmp2_3); + t1 = _mm256_unpacklo_epi32(rxcmp4_5, rxcmp6_7); + flags_type = _mm256_unpacklo_epi64(t0, t1); + mbuf01 = _mm256_shuffle_epi8(rxcmp0_1, shuf_msk); mbuf23 = _mm256_shuffle_epi8(rxcmp2_3, shuf_msk); mbuf45 = _mm256_shuffle_epi8(rxcmp4_5, shuf_msk); @@ -1194,29 +1191,18 @@ recv_burst_vec_avx2_v3(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pk mbuf67 = _mm256_blend_epi32(mbuf67, _mm256_srli_si256(ptypes, 12), 0x11); - const __m256i tci_perm_01 = _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0); - const __m256i tci_perm_23 = _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2); - const __m256i tci_perm_45 = _mm256_set_epi32(5, 5, 5, 5, 4, 4, 4, 4); - const __m256i tci_perm_67 = _mm256_set_epi32(7, 7, 7, 7, 6, 6, 6, 6); - - mbuf01 = _mm256_blend_epi16(mbuf01, - _mm256_slli_si256(_mm256_permutevar8x32_epi32(vlan_tci, - tci_perm_01), 10), 0x20); - mbuf23 = _mm256_blend_epi16(mbuf23, - _mm256_slli_si256(_mm256_permutevar8x32_epi32(vlan_tci, - tci_perm_23), 10), 0x20); - mbuf45 = _mm256_blend_epi16(mbuf45, - _mm256_slli_si256(_mm256_permutevar8x32_epi32(vlan_tci, - tci_perm_45), 10), 0x20); - mbuf67 = _mm256_blend_epi16(mbuf67, - _mm256_slli_si256(_mm256_permutevar8x32_epi32(vlan_tci, - tci_perm_67), 10), 0x20); - rearm0 = _mm256_permute2f128_si256(mbuf_init, mbuf01, 0x20); rearm1 = _mm256_blend_epi32(mbuf_init, mbuf01, 0xF0); rearm2 = _mm256_permute2f128_si256(mbuf_init, mbuf23, 0x20); rearm3 = _mm256_blend_epi32(mbuf_init, mbuf23, 0xF0); + /* Extract RSS valid flags for eight packets + * mask_1s is being used in place of + * RX_PKT_V3_CMPL_FLAGS_RSS_VALID + */ + rss_flags = _mm256_and_si256(flags_type, + _mm256_slli_epi32(mask_1s, 10)); + rss_flags = _mm256_srli_epi32(rss_flags, 9); ol_flags = _mm256_or_si256(ol_flags, rss_flags); ol_flags = _mm256_or_si256(ol_flags, vlan_flags); ol_flags_hi = _mm256_permute2f128_si256(ol_flags, diff --git a/drivers/net/bnxt/bnxt_rxtx_vec_common.h b/drivers/net/bnxt/bnxt_rxtx_vec_common.h index e8da010dc3..d8659d1001 100644 --- a/drivers/net/bnxt/bnxt_rxtx_vec_common.h +++ b/drivers/net/bnxt/bnxt_rxtx_vec_common.h @@ -178,7 +178,7 @@ bnxt_tx_cmp_vec(struct bnxt_tx_queue *txq, uint32_t nr_pkts) txr->tx_raw_cons = raw_cons; } -static const uint64_t errors_to_olflags_v3[16] = { +static const uint32_t errors_to_olflags_v3[16] = { RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD, RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | -- 2.47.3

