Add a 64-byte loop that maintains 4 fold registers and processes
64 bytes at a time. The 4x fold registers is then reduced to 16 byte
single fold, similar to AVX512 implementation. This technique is
described in the paper by Intel:
"Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"

This results in roughly 50% performance improvement due to better ILP
for large input sizes like 1024.

Signed-off-by: Shreesh Adiga <[email protected]>
---
 lib/net/net_crc_sse.c | 59 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 48 insertions(+), 11 deletions(-)

diff --git a/lib/net/net_crc_sse.c b/lib/net/net_crc_sse.c
index 3b6fbfecac..dfef8ecc59 100644
--- a/lib/net/net_crc_sse.c
+++ b/lib/net/net_crc_sse.c
@@ -14,6 +14,7 @@
 /** PCLMULQDQ CRC computation context structure */
 struct crc_pclmulqdq_ctx {
        __m128i rk1_rk2;
+       __m128i rk3_rk4;
        __m128i rk5_rk6;
        __m128i rk7_rk8;
 };
@@ -150,9 +151,36 @@ crc32_eth_calc_pclmulqdq(
        temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);
 
        /**
-        * Folding all data into single 16 byte data block
-        * Assumes: fold holds first 16 bytes of data
+        * Folding all data into 4 parallel 16 byte data block
+        * Later folds 4 parallel blocks into single fold block
         */
+       if (likely(data_len >= 64)) {
+               __m128i fold1, fold2, fold3, fold4;
+               __m128i temp1, temp2, temp3, temp4;
+               fold1 = _mm_loadu_si128((const __m128i *)(data +  0));
+               fold2 = _mm_loadu_si128((const __m128i *)(data + 16));
+               fold3 = _mm_loadu_si128((const __m128i *)(data + 32));
+               fold4 = _mm_loadu_si128((const __m128i *)(data + 48));
+               fold1 = _mm_xor_si128(fold1, temp);
+               k = params->rk1_rk2;
+
+               for (n = 64; (n + 64) <= data_len; n += 64) {
+                       temp1 = _mm_loadu_si128((const __m128i *)&data[n]);
+                       temp2 = _mm_loadu_si128((const __m128i *)&data[n + 16]);
+                       temp3 = _mm_loadu_si128((const __m128i *)&data[n + 32]);
+                       temp4 = _mm_loadu_si128((const __m128i *)&data[n + 48]);
+                       fold1 = crcr32_folding_round(temp1, k, fold1);
+                       fold2 = crcr32_folding_round(temp2, k, fold2);
+                       fold3 = crcr32_folding_round(temp3, k, fold3);
+                       fold4 = crcr32_folding_round(temp4, k, fold4);
+               }
+
+               k = params->rk3_rk4;
+               fold1 = crcr32_folding_round(fold2, k, fold1);
+               fold1 = crcr32_folding_round(fold3, k, fold1);
+               fold = crcr32_folding_round(fold4, k, fold1);
+               goto single_fold_loop;
+       }
 
        if (unlikely(data_len < 32)) {
                if (unlikely(data_len == 16)) {
@@ -182,7 +210,7 @@ crc32_eth_calc_pclmulqdq(
                fold = _mm_loadu_si128((const __m128i *)data);
                fold = _mm_xor_si128(fold, temp);
                n = 16;
-               k = params->rk1_rk2;
+               k = params->rk3_rk4;
                goto partial_bytes;
        }
 
@@ -191,9 +219,12 @@ crc32_eth_calc_pclmulqdq(
        fold = _mm_loadu_si128((const __m128i *)data);
        fold = _mm_xor_si128(fold, temp);
 
-       /** Main folding loop - the last 16 bytes is processed separately */
-       k = params->rk1_rk2;
-       for (n = 16; (n + 16) <= data_len; n += 16) {
+       /** Single folding loop - the last 16 bytes is processed separately */
+       k = params->rk3_rk4;
+       n = 16;
+
+single_fold_loop:
+       for (; (n + 16) <= data_len; n += 16) {
                temp = _mm_loadu_si128((const __m128i *)&data[n]);
                fold = crcr32_folding_round(temp, k, fold);
        }
@@ -236,12 +267,14 @@ crc32_eth_calc_pclmulqdq(
 void
 rte_net_crc_sse42_init(void)
 {
-       uint64_t k1, k2, k5, k6;
+       uint64_t k1, k2, k3, k4, k5, k6;
        uint64_t p = 0, q = 0;
 
        /** Initialize CRC16 data */
-       k1 = 0x189aeLLU;
-       k2 = 0x8e10LLU;
+       k1 = 0x14ff2LLU;
+       k2 = 0x19a3cLLU;
+       k3 = 0x189aeLLU;
+       k4 = 0x8e10LLU;
        k5 = 0x189aeLLU;
        k6 = 0x114aaLLU;
        q =  0x11c581910LLU;
@@ -249,12 +282,15 @@ rte_net_crc_sse42_init(void)
 
        /** Save the params in context structure */
        crc16_ccitt_pclmulqdq.rk1_rk2 = _mm_set_epi64x(k2, k1);
+       crc16_ccitt_pclmulqdq.rk3_rk4 = _mm_set_epi64x(k4, k3);
        crc16_ccitt_pclmulqdq.rk5_rk6 = _mm_set_epi64x(k6, k5);
        crc16_ccitt_pclmulqdq.rk7_rk8 = _mm_set_epi64x(p, q);
 
        /** Initialize CRC32 data */
-       k1 = 0xccaa009eLLU;
-       k2 = 0x1751997d0LLU;
+       k1 = 0x1c6e41596LLU;
+       k2 = 0x154442bd4LLU;
+       k3 = 0xccaa009eLLU;
+       k4 = 0x1751997d0LLU;
        k5 = 0xccaa009eLLU;
        k6 = 0x163cd6124LLU;
        q =  0x1f7011640LLU;
@@ -262,6 +298,7 @@ rte_net_crc_sse42_init(void)
 
        /** Save the params in context structure */
        crc32_eth_pclmulqdq.rk1_rk2 = _mm_set_epi64x(k2, k1);
+       crc32_eth_pclmulqdq.rk3_rk4 = _mm_set_epi64x(k4, k3);
        crc32_eth_pclmulqdq.rk5_rk6 = _mm_set_epi64x(k6, k5);
        crc32_eth_pclmulqdq.rk7_rk8 = _mm_set_epi64x(p, q);
 }
-- 
2.53.0

Reply via email to