PR #22479 opened by Shreesh Adiga (tantei3)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22479
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22479.patch

This was previously submitted as https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20751,
but it was not included in https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21119,
which merged only the SSE4.2 CLMUL implementation.
I'm therefore resubmitting the AVX512ICL implementation.
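The change is transparent to callers of av_crc(): the AVX512ICL folding
constants are selected at table-init time, so existing code picks up the fast
path automatically on capable CPUs. A minimal sketch of a caller that
exercises this path (plain libavutil public API, nothing specific to this PR):

```c
#include <stdio.h>
#include <string.h>
#include <libavutil/crc.h>

int main(void)
{
    static const char buf[] = "123456789";
    // av_crc_get_table() initializes the table once; with this patch the
    // x86 init stores the AVX512ICL folding constants when the CPU supports
    // them, and av_crc() then dispatches to the ZMM implementation.
    const AVCRC *tab = av_crc_get_table(AV_CRC_32_IEEE_LE);
    uint32_t crc = av_crc(tab, 0xffffffff, (const uint8_t *)buf,
                          strlen(buf)) ^ 0xffffffff;
    printf("crc32: 0x%08x\n", crc); // 0xcbf43926, the standard check value
    return 0;
}
```

checkasm --bench results: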
```
benchmarking with Linux Perf Monitoring API
nop: 92.0
checkasm: using random seed 1665032634
checkasm: bench runs 4096 (1 << 12)
CLMUL:
 - crc.crc [OK]
AVX-512ICL:
 - crc.crc [OK]
checkasm: all 18 tests passed
crc_8_ATM_c:                                            60.0 ( 1.00x)
crc_8_ATM_clmul:                                        21.1 ( 2.84x)
crc_8_ATM_avx512icl:                                    20.0 ( 3.01x)
crc_8_EBU_c:                                         11680.9 ( 1.00x)
crc_8_EBU_clmul:                                      1253.8 ( 9.32x)
crc_8_EBU_avx512icl:                                   365.0 (32.00x)
crc_16_ANSI_c:                                       15852.4 ( 1.00x)
crc_16_ANSI_clmul:                                    1693.8 ( 9.36x)
crc_16_ANSI_avx512icl:                                 524.9 (30.20x)
crc_16_ANSI_LE_c:                                     2619.3 ( 1.00x)
crc_16_ANSI_LE_clmul:                                  274.8 ( 9.53x)
crc_16_ANSI_LE_avx512icl:                              105.9 (24.74x)
crc_16_CCITT_c:                                       8389.6 ( 1.00x)
crc_16_CCITT_clmul:                                    903.1 ( 9.29x)
crc_16_CCITT_avx512icl:                                269.9 (31.09x)
crc_24_IEEE_c:                                       21281.5 ( 1.00x)
crc_24_IEEE_clmul:                                    2273.4 ( 9.36x)
crc_24_IEEE_avx512icl:                                 614.7 (34.62x)
crc_32_IEEE_c:                                       12903.3 ( 1.00x)
crc_32_IEEE_clmul:                                    1379.9 ( 9.35x)
crc_32_IEEE_avx512icl:                                 401.5 (32.14x)
crc_32_IEEE_LE_c:                                    11729.0 ( 1.00x)
crc_32_IEEE_LE_clmul:                                 1188.8 ( 9.87x)
crc_32_IEEE_LE_avx512icl:                              327.5 (35.81x)
crc_custom_polynomial_c:                             17736.0 ( 1.00x)
crc_custom_polynomial_clmul:                          1792.1 ( 9.90x)
crc_custom_polynomial_avx512icl:                       471.1 (37.65x)
```
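One detail worth highlighting in the patch below: on the AVX512 path the
sub-16-byte tail no longer goes through a stack memcpy, but uses a
zero-masked byte load instead. In intrinsics terms the trick looks roughly
like this (an illustrative sketch of the shlx/kmovw/vmovdqu8 {k}{z}
sequence, not code from the patch):

```c
#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>

// Zero-masked load of n < 16 bytes (AVX512BW + AVX512VL). Masked-out
// bytes are not read, so the load cannot fault past the buffer end,
// and the masked lanes arrive already zeroed.
static inline __m128i load_tail(const uint8_t *p, size_t n)
{
    __mmask16 m = (__mmask16)((1u << n) - 1);
    return _mm_maskz_loadu_epi8(m, p);
}
```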


From 71b7e6745597e0edf067f919a7c4b8a4e591c7f1 Mon Sep 17 00:00:00 2001
From: Shreesh Adiga <[email protected]>
Date: Wed, 11 Mar 2026 21:12:47 +0530
Subject: [PATCH] avutil/crc: add x86 AVX512ICL SIMD implementation for av_crc

Extends the SSE4.2 implementation to use AVX512 ZMM registers.
The main loop performs a 4x 512-bit parallel fold over 256-byte
blocks, followed by a single-ZMM 512-bit fold over 64-byte blocks.
The 512-bit accumulator is then reduced to 128 bits, after which
the 128-bit reduction loop and tail handling proceed as in the
SSE4.2 CLMUL implementation.
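For readers more comfortable with intrinsics than asm, the ZMM branch of the
FOLD_SINGLE macro corresponds roughly to the following sketch (illustrative
only; the function name is made up):

```c
#include <immintrin.h>

// One 512-bit fold step (VPCLMULQDQ + AVX512F). In each 128-bit lane,
// carryless-multiply the two 64-bit halves of the accumulator by the
// precomputed constants k, then XOR both products with the next block
// of input in a single vpternlogq (imm8 0x96 = three-way XOR).
static inline __m512i fold512(__m512i acc, __m512i k, __m512i data)
{
    __m512i t = _mm512_clmulepi64_epi128(acc, k, 0x01);
    acc       = _mm512_clmulepi64_epi128(acc, k, 0x10);
    return _mm512_ternarylogic_epi64(acc, t, data, 0x96);
}
```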

This results in an overall speedup of roughly 3-4x over the CLMUL implementation:
CLMUL:
 - crc.crc [OK]
AVX-512ICL:
 - crc.crc [OK]
checkasm: all 18 tests passed
crc_8_ATM_c:                                            60.0 ( 1.00x)
crc_8_ATM_clmul:                                        21.1 ( 2.84x)
crc_8_ATM_avx512icl:                                    20.0 ( 3.01x)
crc_8_EBU_c:                                         11680.9 ( 1.00x)
crc_8_EBU_clmul:                                      1253.8 ( 9.32x)
crc_8_EBU_avx512icl:                                   365.0 (32.00x)
crc_16_ANSI_c:                                       15852.4 ( 1.00x)
crc_16_ANSI_clmul:                                    1693.8 ( 9.36x)
crc_16_ANSI_avx512icl:                                 524.9 (30.20x)
crc_16_ANSI_LE_c:                                     2619.3 ( 1.00x)
crc_16_ANSI_LE_clmul:                                  274.8 ( 9.53x)
crc_16_ANSI_LE_avx512icl:                              105.9 (24.74x)
crc_16_CCITT_c:                                       8389.6 ( 1.00x)
crc_16_CCITT_clmul:                                    903.1 ( 9.29x)
crc_16_CCITT_avx512icl:                                269.9 (31.09x)
crc_24_IEEE_c:                                       21281.5 ( 1.00x)
crc_24_IEEE_clmul:                                    2273.4 ( 9.36x)
crc_24_IEEE_avx512icl:                                 614.7 (34.62x)
crc_32_IEEE_c:                                       12903.3 ( 1.00x)
crc_32_IEEE_clmul:                                    1379.9 ( 9.35x)
crc_32_IEEE_avx512icl:                                 401.5 (32.14x)
crc_32_IEEE_LE_c:                                    11729.0 ( 1.00x)
crc_32_IEEE_LE_clmul:                                 1188.8 ( 9.87x)
crc_32_IEEE_LE_avx512icl:                              327.5 (35.81x)
crc_custom_polynomial_c:                             17736.0 ( 1.00x)
crc_custom_polynomial_clmul:                          1792.1 ( 9.90x)
crc_custom_polynomial_avx512icl:                       471.1 (37.65x)
---
 libavutil/x86/crc.asm | 253 +++++++++++++++++++++++++++++-------------
 libavutil/x86/crc.h   | 156 ++++++++++++++++++++++++--
 2 files changed, 321 insertions(+), 88 deletions(-)

diff --git a/libavutil/x86/crc.asm b/libavutil/x86/crc.asm
index 4f5673fbd7..7153a7247b 100644
--- a/libavutil/x86/crc.asm
+++ b/libavutil/x86/crc.asm
@@ -56,7 +56,11 @@ SECTION .text
 ; %1 LE ; %2 128 bit fold reg ; %3 pre-computed constant reg ; %4 tmp reg
 %if %1 == 1
     pxor      %4, %4
+    %if mmsize == 64
+    vmovss    %4, %2, %4
+    %else
     pblendw   %4, %2, 0xfc
+    %endif
     mova      %2, %4
     pclmulqdq %4, %3, 0x00
     pxor      %4, %2
@@ -75,11 +79,17 @@ SECTION .text
 
 %macro FOLD_SINGLE 4
 ; %1 temp ; %2 fold reg ; %3 pre-computed constants ; %4 input data block
+%if mmsize == 64
+    pclmulqdq  %1, %2, %3, 0x01
+    pclmulqdq  %2, %2, %3, 0x10
+    vpternlogq %2, %1, %4, 0x96
+%else
     mova      %1, %2
     pclmulqdq %1, %3, 0x01
     pxor      %1, %4
     pclmulqdq %2, %3, 0x10
     pxor      %2, %1
+%endif
 %endmacro
 
 %macro XMM_SHIFT_LEFT 4
@@ -137,16 +147,33 @@ SECTION .text
     ; fall through, %6 label is expected to be next instruction
 %endmacro
 
+%macro VBROADCASTI32x4 3
+; %1 dst reg ; %2 address for AVX512ICL ; %3 address for SSE4.2
+    %if mmsize == 64
+        vbroadcasti32x4 %1, [%2]
+    %else
+        movu            %1, [%3]
+    %endif
+%endmacro
+
 %macro CRC 1
 %define CTX r0+4
 
;-----------------------------------------------------------------------------------------------
 ; ff_crc[_le]_clmul(const uint8_t *ctx, uint32_t crc, const uint8_t *buffer, size_t length

;-----------------------------------------------------------------------------------------------
 ; %1 == 1 - LE format
-%if %1 == 1
-cglobal crc_le, 4, 6, 6+4*ARCH_X86_64, 0x10
+%if mmsize == 64
+    %if %1 == 1
+    cglobal crc_le, 4, 6, 7+4*ARCH_X86_64, 0
+    %else
+    cglobal crc,    4, 6, 6+4*ARCH_X86_64, 0
+    %endif
 %else
-cglobal crc,    4, 6, 7+4*ARCH_X86_64, 0x10
+    %if %1 == 1
+    cglobal crc_le, 4, 6, 7+4*ARCH_X86_64, 0x10
+    %else
+    cglobal crc,    4, 6, 6+4*ARCH_X86_64, 0x10
+    %endif
 %endif
 
 %if ARCH_X86_32
@@ -154,37 +181,41 @@ cglobal crc,    4, 6, 7+4*ARCH_X86_64, 0x10
 %endif
 
 %if %1 == 0
-    mova  m10, [reverse_shuffle]
+    VBROADCASTI32x4 m10, reverse_shuffle, reverse_shuffle
 %endif
 
-    movd   m4, r1d
+%if mmsize == 64
+    pxor    m4, m4
+%endif
+    movd   xm4, r1d
+
 %if ARCH_X86_32
     ; skip 4x unrolled loop due to only 8 XMM reg being available in X86_32
-    jmp   .less_than_64bytes
+    jmp            .less_than_4x_mmsize
 %else
-    cmp    r3, 64
-    jb    .less_than_64bytes
-    movu   m1, [r2 +  0]
-    movu   m3, [r2 + 16]
-    movu   m2, [r2 + 32]
-    movu   m0, [r2 + 48]
-    pxor   m1, m4
+    cmp             r3, 4 * mmsize
+    jb             .less_than_4x_mmsize
+    movu            m1, [r2 + 0 * mmsize]
+    movu            m3, [r2 + 1 * mmsize]
+    movu            m2, [r2 + 2 * mmsize]
+    movu            m0, [r2 + 3 * mmsize]
+    pxor            m1, m4
 %if %1 == 0
-    pshufb m0, m10
-    pshufb m1, m10
-    pshufb m2, m10
-    pshufb m3, m10
+    pshufb          m0, m10
+    pshufb          m1, m10
+    pshufb          m2, m10
+    pshufb          m3, m10
 %endif
-    mov    r4, 64
-    cmp    r3, 128
-    jb    .reduce_4x_to_1
-    movu   m4, [CTX]
+    mov             r4, 4 * mmsize
+    cmp             r3, 8 * mmsize
+    jb             .reduce_4x_to_1
+    VBROADCASTI32x4 m4, CTX + 64, CTX
 
 .fold_4x_loop:
-        movu        m6, [r2 + r4 +  0]
-        movu        m7, [r2 + r4 + 16]
-        movu        m8, [r2 + r4 + 32]
-        movu        m9, [r2 + r4 + 48]
+        movu        m6, [r2 + r4 + 0 * mmsize]
+        movu        m7, [r2 + r4 + 1 * mmsize]
+        movu        m8, [r2 + r4 + 2 * mmsize]
+        movu        m9, [r2 + r4 + 3 * mmsize]
 %if %1 == 0
         pshufb      m6, m10
         pshufb      m7, m10
@@ -195,22 +226,26 @@ cglobal crc,    4, 6, 7+4*ARCH_X86_64, 0x10
         FOLD_SINGLE m5, m3, m4, m7
         FOLD_SINGLE m5, m2, m4, m8
         FOLD_SINGLE m5, m0, m4, m9
-        add         r4, 64
-        lea         r5, [r4 + 64]
+        add         r4, 4 * mmsize
+        lea         r5, [r4 + 4 * mmsize]
         cmp         r5, r3
         jbe        .fold_4x_loop
 
 .reduce_4x_to_1:
-    movu        m4, [CTX + 16]
-    FOLD_SINGLE m5, m1, m4, m3
-    FOLD_SINGLE m5, m1, m4, m2
-    FOLD_SINGLE m5, m1, m4, m0
+    VBROADCASTI32x4 m4, CTX, CTX + 16
+    FOLD_SINGLE     m5,  m1, m4, m3
+    FOLD_SINGLE     m5,  m1, m4, m2
+    FOLD_SINGLE     m5,  m1, m4, m0
 %endif
 
 .fold_1x_pre:
-    lea  r5, [r4 + 16]
+    lea  r5, [r4 + mmsize]
     cmp  r5, r3
+%if mmsize == 64
+    ja  .fold_zmm_to_xmm
+%else
     ja  .partial_block
+%endif
 
 .fold_1x_loop:
         movu        m2, [r2 + r4]
@@ -218,81 +253,141 @@ cglobal crc,    4, 6, 7+4*ARCH_X86_64, 0x10
         pshufb      m2, m10
 %endif
         FOLD_SINGLE m5, m1, m4, m2
-        add         r4, 16
-        lea         r5, [r4 + 16]
+        add         r4, mmsize
+        lea         r5, [r4 + mmsize]
         cmp         r5, r3
         jbe        .fold_1x_loop
 
+%if mmsize == 64
+.fold_zmm_to_xmm:
+    movu            xm4, [CTX + 16]
+    vextracti32x4   xm0,  m1, 1
+    vextracti32x4   xm2,  m1, 2
+    vextracti32x4   xm3,  m1, 3
+    FOLD_SINGLE     xm5, xm1, xm4, xm0
+    FOLD_SINGLE     xm5, xm1, xm4, xm2
+    FOLD_SINGLE     xm5, xm1, xm4, xm3
+
+.fold_16b_pre:
+    lea r5, [r4 + 16]
+    cmp r5, r3
+    ja .partial_block
+
+.fold_16b_loop:
+        movu        xm2, [r2 + r4]
+%if %1 == 0
+        pshufb      xm2, xm10
+%endif
+        FOLD_SINGLE xm5, xm1, xm4, xm2
+        add          r4, 16
+        lea          r5, [r4 + 16]
+        cmp          r5, r3
+        jbe         .fold_16b_loop
+%endif
+
 .partial_block:
-    cmp         r4, r3
-    jae        .reduce_128_to_64
-    movu        m2, [r2 + r3 - 16]
-    and         r3, 0xf
-    lea         r4, [partial_bytes_shuf_tab]
-    movu        m0, [r3 + r4]
+    cmp             r4, r3
+    jae           .reduce_128_to_64
+    movu           xm2, [r2 + r3 - 16]
+    and             r3, 0xf
+    lea             r4, [partial_bytes_shuf_tab]
+    movu           xm0, [r3 + r4]
 %if %1 == 0
-    pshufb      m1, m10
+    pshufb         xm1, xm10
 %endif
-    mova        m3, m1
-    pcmpeqd     m5, m5 ; m5 = _mm_set1_epi8(0xff)
-    pxor        m5, m0
-    pshufb      m3, m5
-    pblendvb    m2, m3, m0
-    pshufb      m1, m0
+    mova           xm3, xm1
+%if mmsize == 64
+    mova           xm5, xm0
+    vpternlogq     xm5, xm0, xm0, 0xf ; xm5 = ~xm0
+    vpmovb2m        k1, xm0
+    pshufb         xm3, xm5
+    vpblendmb  xm2{k1}, xm2, xm3
+%else
+    pcmpeqd        xm5, xm5 ; m5 = _mm_set1_epi8(0xff)
+    pxor           xm5, xm0
+    pshufb         xm3, xm5
+    pblendvb       xm2, xm3, xm0
+%endif
+    pshufb         xm1, xm0
 %if %1 == 0
-    pshufb      m1, m10
-    pshufb      m2, m10
+    pshufb         xm1, xm10
+    pshufb         xm2, xm10
 %endif
-    FOLD_SINGLE m5, m1, m4, m2
+    FOLD_SINGLE    xm5, xm1, xm4, xm2
 
 .reduce_128_to_64:
-    movu           m4, [CTX + 32]
-    FOLD_128_TO_64 %1, m1, m4, m5
+    movu           xm4, [CTX + 32]
+    FOLD_128_TO_64  %1, xm1, xm4, xm5
 .reduce_64_to_32:
-    movu           m4, [CTX + 48]
-    FOLD_64_TO_32  %1, m1, m4, m5
+    movu           xm4, [CTX + 48]
+    FOLD_64_TO_32   %1, xm1, xm4, xm5
     RET
 
-.less_than_64bytes:
-    cmp    r3, 16
-    jb    .less_than_16bytes
-    movu   m1, [r2]
-    pxor   m1, m4
+.less_than_4x_mmsize:
+    cmp             r3, mmsize
+    jb             .less_than_mmsize
+    movu            m1, [r2]
+    pxor            m1, m4
 %if %1 == 0
-    pshufb m1, m10
+    pshufb          m1, m10
 %endif
-    mov    r4, 16
-    movu   m4, [CTX + 16]
-    jmp   .fold_1x_pre
+    mov             r4, mmsize
+    VBROADCASTI32x4 m4, CTX, CTX + 16
+    jmp            .fold_1x_pre
+
+.less_than_mmsize:
+%if mmsize == 64
+    cmp     r3, 16
+    jb    .less_than_16bytes
+    movu   xm1, [r2]
+    pxor   xm1, xm4
+%if %1 == 0
+    pshufb xm1, xm10
+%endif
+    mov     r4, 16
+    movu   xm4, [CTX + 16]
+    jmp   .fold_16b_pre
 
 .less_than_16bytes:
-    pxor           m1, m1
-    movu        [rsp], m1
-    MEMCPY_0_15   rsp, r2, r3, r1, r4, .memcpy_done
+    mov                  r4d, -1
+    shlx                 r4d, r4d, r3d
+    not                  r4d
+    kmovw                 k1, r4d
+    vmovdqu8      xm1{k1}{z}, [r2]
+%else
+    pxor                  m1, m1
+    movu               [rsp], m1
+    MEMCPY_0_15          rsp, r2, r3, r1, r4, .memcpy_done
 
 .memcpy_done:
-    movu           m1, [rsp]
-    pxor           m1, m4
-    cmp            r3, 5
-    jb            .less_than_5bytes
-    XMM_SHIFT_LEFT m1, (16 - r3), m2, r4
-%if %1 == 0
-    pshufb         m1, m10
+    movu                  m1, [rsp]
 %endif
-    jmp           .reduce_128_to_64
+
+    pxor                 xm1, xm4
+    cmp                   r3, 5
+    jb                  .less_than_5bytes
+    XMM_SHIFT_LEFT       xm1, (16 - r3), xm2, r4
+%if %1 == 0
+    pshufb               xm1, xm10
+%endif
+    jmp                 .reduce_128_to_64
 
 .less_than_5bytes:
 %if %1 == 0
-    XMM_SHIFT_LEFT m1, (4 - r3), m2, r4
-    movq          m10, [reverse_shuffle + 8] ; 0x0001020304050607
-    pshufb         m1, m10
+    XMM_SHIFT_LEFT       xm1, (4 - r3), xm2, r4
+    movq                xm10, [reverse_shuffle + 8] ; 0x0001020304050607
+    pshufb               xm1, xm10
 %else
-    XMM_SHIFT_LEFT m1, (8 - r3), m2, r4
+    XMM_SHIFT_LEFT       xm1, (8 - r3), xm2, r4
 %endif
-    jmp .reduce_64_to_32
+    jmp                 .reduce_64_to_32
 
 %endmacro
 
 INIT_XMM clmul
 CRC 0
 CRC 1
+
+INIT_ZMM avx512icl
+CRC 0
+CRC 1
diff --git a/libavutil/x86/crc.h b/libavutil/x86/crc.h
index ef98ed318d..443fe753ac 100644
--- a/libavutil/x86/crc.h
+++ b/libavutil/x86/crc.h
@@ -30,6 +30,14 @@
 #include "libavutil/intreadwrite.h"
 #include "libavutil/x86/cpu.h"
 
+enum {
+    CRC_C    = 0,
+    CLMUL_BE,
+    CLMUL_LE,
+    CLMUL_BE_AVX512ICL,
+    CLMUL_LE_AVX512ICL,
+};
+
 #if HAVE_CLMUL_EXTERNAL
 #include "libavutil/crc_internal.h"
 
@@ -40,12 +48,6 @@ uint32_t ff_crc_le_clmul(const AVCRC *ctx, uint32_t crc,
                          const uint8_t *buffer, size_t length);
 FF_VISIBILITY_POP_HIDDEN
 
-enum {
-    CRC_C    = 0,
-    CLMUL_BE,
-    CLMUL_LE,
-};
-
 static const AVCRC crc_table_clmul[AV_CRC_MAX][17] = {
     [AV_CRC_8_ATM] = {
         CLMUL_BE,
@@ -142,11 +144,135 @@ static inline void crc_init_x86(AVCRC *ctx, int le, int bits, uint32_t poly, int
 }
 #endif
 
+#if HAVE_AVX512ICL_EXTERNAL
+#include "libavutil/crc_internal.h"
+
+FF_VISIBILITY_PUSH_HIDDEN
+uint32_t ff_crc_avx512icl(const AVCRC *ctx, uint32_t crc,
+                          const uint8_t *buffer, size_t length);
+uint32_t ff_crc_le_avx512icl(const AVCRC *ctx, uint32_t crc,
+                             const uint8_t *buffer, size_t length);
+FF_VISIBILITY_POP_HIDDEN
+
+static const AVCRC crc_table_avx512icl[AV_CRC_MAX][21] = {
+    [AV_CRC_8_ATM] = {
+        CLMUL_BE_AVX512ICL,
+        0x32000000, 0x0, 0xbc000000, 0x0,
+        0xc4000000, 0x0, 0x94000000, 0x0,
+        0x62000000, 0x0, 0x79000000, 0x0,
+        0x07156a16, 0x1, 0x07000000, 0x1,
+        0xdf000000, 0x0, 0xd9000000, 0x0,
+    },
+    [AV_CRC_8_EBU] = {
+        CLMUL_BE_AVX512ICL,
+        0xb5000000, 0x0, 0xf3000000, 0x0,
+        0xfc000000, 0x0, 0x0d000000, 0x0,
+        0x6a000000, 0x0, 0x65000000, 0x0,
+        0x1c4b8192, 0x1, 0x1d000000, 0x1,
+        0x46000000, 0x0, 0x16000000, 0x0,
+    },
+    [AV_CRC_16_ANSI] = {
+        CLMUL_BE_AVX512ICL,
+        0xf9e30000, 0x0, 0x807d0000, 0x0,
+        0xf9130000, 0x0, 0xff830000, 0x0,
+        0x807b0000, 0x0, 0x86630000, 0x0,
+        0xfffbffe7, 0x1, 0x80050000, 0x1,
+        0xfe630000, 0x0, 0x7f870000, 0x0,
+    },
+    [AV_CRC_16_CCITT] = {
+        CLMUL_BE_AVX512ICL,
+        0x60190000, 0x0, 0x59b00000, 0x0,
+        0xd5f60000, 0x0, 0x45630000, 0x0,
+        0xaa510000, 0x0, 0xeb230000, 0x0,
+        0x11303471, 0x1, 0x10210000, 0x1,
+        0xcacd0000, 0x0, 0x16270000, 0x0,
+    },
+    [AV_CRC_24_IEEE] = {
+        CLMUL_BE_AVX512ICL,
+        0x1f428700, 0x0, 0x467d2400, 0x0,
+        0x2c8c9d00, 0x0, 0x64e4d700, 0x0,
+        0xd9fe8c00, 0x0, 0xfd7e0c00, 0x0,
+        0xf845fe24, 0x1, 0x864cfb00, 0x1,
+        0x09e45400, 0x0, 0xa79dfd00, 0x0,
+    },
+    [AV_CRC_32_IEEE] = {
+        CLMUL_BE_AVX512ICL,
+        0x8833794c, 0x0, 0xe6228b11, 0x0,
+        0xc5b9cd4c, 0x0, 0xe8a45605, 0x0,
+        0x490d678d, 0x0, 0xf200aa66, 0x0,
+        0x04d101df, 0x1, 0x04c11db7, 0x1,
+        0xcbcf3bcb, 0x0, 0x88fe2237, 0x0,
+    },
+    [AV_CRC_32_IEEE_LE] = {
+        CLMUL_LE_AVX512ICL,
+        0xc6e41596, 0x1, 0x54442bd4, 0x1,
+        0xccaa009e, 0x0, 0x751997d0, 0x1,
+        0xccaa009e, 0x0, 0x63cd6124, 0x1,
+        0xf7011640, 0x1, 0xdb710641, 0x1,
+        0x322d1430, 0x1, 0x1542778a, 0x1,
+    },
+    [AV_CRC_16_ANSI_LE] = {
+        CLMUL_LE_AVX512ICL,
+        0x0000bffa, 0x0, 0x1b0c2, 0x0,
+        0x00018cc2, 0x0, 0x1d0c2, 0x0,
+        0x00018cc2, 0x0, 0x1bc02, 0x0,
+        0xcfffbffe, 0x1, 0x14003, 0x0,
+        0x0001d99e, 0x0, 0x1bcc2, 0x0,
+    },
+};
+
+static inline void crc_init_x86_avx512icl(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size)
+{
+    uint64_t poly_;
+    if (le) {
+        // convert the reversed representation to regular form
+        poly = reverse(poly, bits) >> 1;
+    }
+    // convert to 32 degree polynomial
+    poly_ = ((uint64_t)poly) << (32 - bits);
+
+    uint64_t div;
+    uint8_t *dst = (uint8_t*)(ctx + 1);
+    if (le) {
+        ctx[0] = CLMUL_LE_AVX512ICL;
+        AV_WN64(dst,      xnmodp(4 * 128 - 32, poly_, 32, &div, le));
+        AV_WN64(dst +  8, xnmodp(4 * 128 + 32, poly_, 32, &div, le));
+        uint64_t tmp = xnmodp(128 - 32, poly_, 32, &div, le);
+        AV_WN64(dst + 16, tmp);
+        AV_WN64(dst + 24, xnmodp(128 + 32, poly_, 32, &div, le));
+        AV_WN64(dst + 32, tmp);
+        AV_WN64(dst + 40, xnmodp(64, poly_, 32, &div, le));
+        AV_WN64(dst + 48, div);
+        AV_WN64(dst + 56, reverse(poly_ | (1ULL << 32), 32));
+        AV_WN64(dst + 64, xnmodp(4 * 512 - 32, poly_, 32, &div, le));
+        AV_WN64(dst + 72, xnmodp(4 * 512 + 32, poly_, 32, &div, le));
+    } else {
+        ctx[0] = CLMUL_BE_AVX512ICL;
+        AV_WN64(dst,      xnmodp(4 * 128 + 64, poly_, 32, &div, le));
+        AV_WN64(dst +  8, xnmodp(4 * 128, poly_, 32, &div, le));
+        AV_WN64(dst + 16, xnmodp(128 + 64, poly_, 32, &div, le));
+        AV_WN64(dst + 24, xnmodp(128, poly_, 32, &div, le));
+        AV_WN64(dst + 32, xnmodp(64, poly_, 32, &div, le));
+        AV_WN64(dst + 48, div);
+        AV_WN64(dst + 40, xnmodp(96, poly_, 32, &div, le));
+        AV_WN64(dst + 56, poly_ | (1ULL << 32));
+        AV_WN64(dst + 64, xnmodp(4 * 512 + 64, poly_, 32, &div, le));
+        AV_WN64(dst + 72, xnmodp(4 * 512, poly_, 32, &div, le));
+    }
+}
+#endif
+
 static inline const AVCRC *ff_crc_get_table_x86(AVCRCId crc_id)
 {
-#if HAVE_CLMUL_EXTERNAL
     int cpu_flags = av_get_cpu_flags();
 
+#if HAVE_AVX512ICL_EXTERNAL
+    if (EXTERNAL_AVX512ICL(cpu_flags)) {
+        return crc_table_avx512icl[crc_id];
+    }
+#endif
+
+#if HAVE_CLMUL_EXTERNAL
     if (EXTERNAL_CLMUL(cpu_flags)) {
         return crc_table_clmul[crc_id];
     }
@@ -156,9 +282,16 @@ static inline const AVCRC *ff_crc_get_table_x86(AVCRCId crc_id)
 
 static inline av_cold int ff_crc_init_x86(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size)
 {
-#if HAVE_CLMUL_EXTERNAL
     int cpu_flags = av_get_cpu_flags();
 
+#if HAVE_AVX512ICL_EXTERNAL
+    if (EXTERNAL_AVX512ICL(cpu_flags)) {
+        crc_init_x86_avx512icl(ctx, le, bits, poly, ctx_size);
+        return 1;
+    }
+#endif
+
+#if HAVE_CLMUL_EXTERNAL
     if (EXTERNAL_CLMUL(cpu_flags)) {
         crc_init_x86(ctx, le, bits, poly, ctx_size);
         return 1;
@@ -175,7 +308,12 @@ static inline uint32_t ff_crc_x86(const AVCRC *ctx, uint32_t crc,
     case CLMUL_BE: return ff_crc_clmul(ctx, crc, buffer, length);
     case CLMUL_LE: return ff_crc_le_clmul(ctx, crc, buffer, length);
 #endif
-    default: av_unreachable("x86 CRC only uses CLMUL_BE and CLMUL_LE");
+
+#if HAVE_AVX512ICL_EXTERNAL
+    case CLMUL_BE_AVX512ICL: return ff_crc_avx512icl(ctx, crc, buffer, length);
+    case CLMUL_LE_AVX512ICL: return ff_crc_le_avx512icl(ctx, crc, buffer, length);
+#endif
+    default: av_unreachable("x86 CRC only uses CLMUL_BE, CLMUL_LE, CLMUL_BE_AVX512ICL and CLMUL_LE_AVX512ICL");
     }
     return 0;
 }
-- 
2.52.0
