PR #22479 opened by Shreesh Adiga (tantei3) URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22479 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22479.patch
This was previously submitted in https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20751; however, it was not part of https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21119, which only merged the SSE4.2 clmul implementation. Thus I'm resubmitting the AVX512ICL implementation. ``` benchmarking with Linux Perf Monitoring API nop: 92.0 checkasm: using random seed 1665032634 checkasm: bench runs 4096 (1 << 12) CLMUL: - crc.crc [OK] AVX-512ICL: - crc.crc [OK] checkasm: all 18 tests passed crc_8_ATM_c: 60.0 ( 1.00x) crc_8_ATM_clmul: 21.1 ( 2.84x) crc_8_ATM_avx512icl: 20.0 ( 3.01x) crc_8_EBU_c: 11680.9 ( 1.00x) crc_8_EBU_clmul: 1253.8 ( 9.32x) crc_8_EBU_avx512icl: 365.0 (32.00x) crc_16_ANSI_c: 15852.4 ( 1.00x) crc_16_ANSI_clmul: 1693.8 ( 9.36x) crc_16_ANSI_avx512icl: 524.9 (30.20x) crc_16_ANSI_LE_c: 2619.3 ( 1.00x) crc_16_ANSI_LE_clmul: 274.8 ( 9.53x) crc_16_ANSI_LE_avx512icl: 105.9 (24.74x) crc_16_CCITT_c: 8389.6 ( 1.00x) crc_16_CCITT_clmul: 903.1 ( 9.29x) crc_16_CCITT_avx512icl: 269.9 (31.09x) crc_24_IEEE_c: 21281.5 ( 1.00x) crc_24_IEEE_clmul: 2273.4 ( 9.36x) crc_24_IEEE_avx512icl: 614.7 (34.62x) crc_32_IEEE_c: 12903.3 ( 1.00x) crc_32_IEEE_clmul: 1379.9 ( 9.35x) crc_32_IEEE_avx512icl: 401.5 (32.14x) crc_32_IEEE_LE_c: 11729.0 ( 1.00x) crc_32_IEEE_LE_clmul: 1188.8 ( 9.87x) crc_32_IEEE_LE_avx512icl: 327.5 (35.81x) crc_custom_polynomial_c: 17736.0 ( 1.00x) crc_custom_polynomial_clmul: 1792.1 ( 9.90x) crc_custom_polynomial_avx512icl: 471.1 (37.65x) ``` From 71b7e6745597e0edf067f919a7c4b8a4e591c7f1 Mon Sep 17 00:00:00 2001 From: Shreesh Adiga <[email protected]> Date: Wed, 11 Mar 2026 21:12:47 +0530 Subject: [PATCH] avutil/crc: add x86 AVX512ICL SIMD implementation for av_crc Extends the SSE4.2 implementation to use AVX512 ZMM registers. It first performs a 4x-parallel 512-bit fold over 256 bytes per iteration, followed by a single 512-bit ZMM fold over 64 bytes per iteration. The 512-bit state is then reduced to 128 bits, followed by a further 128-bit reduction loop and tail handling similar to the SSE4.2 clmul implementation.
This results in overall 4x speedup compared to clmul implementation: CLMUL: - crc.crc [OK] AVX-512ICL: - crc.crc [OK] checkasm: all 18 tests passed crc_8_ATM_c: 60.0 ( 1.00x) crc_8_ATM_clmul: 21.1 ( 2.84x) crc_8_ATM_avx512icl: 20.0 ( 3.01x) crc_8_EBU_c: 11680.9 ( 1.00x) crc_8_EBU_clmul: 1253.8 ( 9.32x) crc_8_EBU_avx512icl: 365.0 (32.00x) crc_16_ANSI_c: 15852.4 ( 1.00x) crc_16_ANSI_clmul: 1693.8 ( 9.36x) crc_16_ANSI_avx512icl: 524.9 (30.20x) crc_16_ANSI_LE_c: 2619.3 ( 1.00x) crc_16_ANSI_LE_clmul: 274.8 ( 9.53x) crc_16_ANSI_LE_avx512icl: 105.9 (24.74x) crc_16_CCITT_c: 8389.6 ( 1.00x) crc_16_CCITT_clmul: 903.1 ( 9.29x) crc_16_CCITT_avx512icl: 269.9 (31.09x) crc_24_IEEE_c: 21281.5 ( 1.00x) crc_24_IEEE_clmul: 2273.4 ( 9.36x) crc_24_IEEE_avx512icl: 614.7 (34.62x) crc_32_IEEE_c: 12903.3 ( 1.00x) crc_32_IEEE_clmul: 1379.9 ( 9.35x) crc_32_IEEE_avx512icl: 401.5 (32.14x) crc_32_IEEE_LE_c: 11729.0 ( 1.00x) crc_32_IEEE_LE_clmul: 1188.8 ( 9.87x) crc_32_IEEE_LE_avx512icl: 327.5 (35.81x) crc_custom_polynomial_c: 17736.0 ( 1.00x) crc_custom_polynomial_clmul: 1792.1 ( 9.90x) crc_custom_polynomial_avx512icl: 471.1 (37.65x) --- libavutil/x86/crc.asm | 253 +++++++++++++++++++++++++++++------------- libavutil/x86/crc.h | 156 ++++++++++++++++++++++++-- 2 files changed, 321 insertions(+), 88 deletions(-) diff --git a/libavutil/x86/crc.asm b/libavutil/x86/crc.asm index 4f5673fbd7..7153a7247b 100644 --- a/libavutil/x86/crc.asm +++ b/libavutil/x86/crc.asm @@ -56,7 +56,11 @@ SECTION .text ; %1 LE ; %2 128 bit fold reg ; %3 pre-computed constant reg ; %4 tmp reg %if %1 == 1 pxor %4, %4 + %if mmsize == 64 + vmovss %4, %2, %4 + %else pblendw %4, %2, 0xfc + %endif mova %2, %4 pclmulqdq %4, %3, 0x00 pxor %4, %2 @@ -75,11 +79,17 @@ SECTION .text %macro FOLD_SINGLE 4 ; %1 temp ; %2 fold reg ; %3 pre-computed constants ; %4 input data block +%if mmsize == 64 + pclmulqdq %1, %2, %3, 0x01 + pclmulqdq %2, %2, %3, 0x10 + vpternlogq %2, %1, %4, 0x96 +%else mova %1, %2 pclmulqdq %1, %3, 0x01 pxor %1, %4 
pclmulqdq %2, %3, 0x10 pxor %2, %1 +%endif %endmacro %macro XMM_SHIFT_LEFT 4 @@ -137,16 +147,33 @@ SECTION .text ; fall through, %6 label is expected to be next instruction %endmacro +%macro VBROADCASTI32x4 3 +; %1 dst reg ; %2 address for AVX512ICL ; %3 address for SSE4.2 + %if mmsize == 64 + vbroadcasti32x4 %1, [%2] + %else + movu %1, [%3] + %endif +%endmacro + %macro CRC 1 %define CTX r0+4 ;----------------------------------------------------------------------------------------------- ; ff_crc[_le]_clmul(const uint8_t *ctx, uint32_t crc, const uint8_t *buffer, size_t length ;----------------------------------------------------------------------------------------------- ; %1 == 1 - LE format -%if %1 == 1 -cglobal crc_le, 4, 6, 6+4*ARCH_X86_64, 0x10 +%if mmsize == 64 + %if %1 == 1 + cglobal crc_le, 4, 6, 6+4*ARCH_X86_64, 0 + %else + cglobal crc, 4, 6, 7+4*ARCH_X86_64, 0 ; BE also uses m10 (reverse_shuffle) + %endif %else -cglobal crc, 4, 6, 7+4*ARCH_X86_64, 0x10 + %if %1 == 1 + cglobal crc_le, 4, 6, 6+4*ARCH_X86_64, 0x10 + %else + cglobal crc, 4, 6, 7+4*ARCH_X86_64, 0x10 ; BE also uses m10 (reverse_shuffle) + %endif %endif %if ARCH_X86_32 @@ -154,37 +181,41 @@ cglobal crc, 4, 6, 7+4*ARCH_X86_64, 0x10 %endif %if %1 == 0 - mova m10, [reverse_shuffle] + VBROADCASTI32x4 m10, reverse_shuffle, reverse_shuffle %endif - movd m4, r1d +%if mmsize == 64 + pxor m4, m4 +%endif + movd xm4, r1d + %if ARCH_X86_32 ; skip 4x unrolled loop due to only 8 XMM reg being available in X86_32 - jmp .less_than_64bytes + jmp .less_than_4x_mmsize %else - cmp r3, 64 - jb .less_than_64bytes - movu m1, [r2 + 0] - movu m3, [r2 + 16] - movu m2, [r2 + 32] - movu m0, [r2 + 48] - pxor m1, m4 + cmp r3, 4 * mmsize + jb .less_than_4x_mmsize + movu m1, [r2 + 0 * mmsize] + movu m3, [r2 + 1 * mmsize] + movu m2, [r2 + 2 * mmsize] + movu m0, [r2 + 3 * mmsize] + pxor m1, m4 %if %1 == 0 - pshufb m0, m10 - pshufb m1, m10 - pshufb m2, m10 - pshufb m3, m10 + pshufb m0, m10 + pshufb m1, m10 + pshufb m2, m10 + pshufb m3, m10 %endif - mov r4, 64 - cmp r3, 128 - jb .reduce_4x_to_1 - movu
m4, [CTX] + mov r4, 4 * mmsize + cmp r3, 8 * mmsize + jb .reduce_4x_to_1 + VBROADCASTI32x4 m4, CTX + 64, CTX .fold_4x_loop: - movu m6, [r2 + r4 + 0] - movu m7, [r2 + r4 + 16] - movu m8, [r2 + r4 + 32] - movu m9, [r2 + r4 + 48] + movu m6, [r2 + r4 + 0 * mmsize] + movu m7, [r2 + r4 + 1 * mmsize] + movu m8, [r2 + r4 + 2 * mmsize] + movu m9, [r2 + r4 + 3 * mmsize] %if %1 == 0 pshufb m6, m10 pshufb m7, m10 @@ -195,22 +226,26 @@ cglobal crc, 4, 6, 7+4*ARCH_X86_64, 0x10 FOLD_SINGLE m5, m3, m4, m7 FOLD_SINGLE m5, m2, m4, m8 FOLD_SINGLE m5, m0, m4, m9 - add r4, 64 - lea r5, [r4 + 64] + add r4, 4 * mmsize + lea r5, [r4 + 4 * mmsize] cmp r5, r3 jbe .fold_4x_loop .reduce_4x_to_1: - movu m4, [CTX + 16] - FOLD_SINGLE m5, m1, m4, m3 - FOLD_SINGLE m5, m1, m4, m2 - FOLD_SINGLE m5, m1, m4, m0 + VBROADCASTI32x4 m4, CTX, CTX + 16 + FOLD_SINGLE m5, m1, m4, m3 + FOLD_SINGLE m5, m1, m4, m2 + FOLD_SINGLE m5, m1, m4, m0 %endif .fold_1x_pre: - lea r5, [r4 + 16] + lea r5, [r4 + mmsize] cmp r5, r3 +%if mmsize == 64 + ja .fold_zmm_to_xmm +%else ja .partial_block +%endif .fold_1x_loop: movu m2, [r2 + r4] @@ -218,81 +253,141 @@ cglobal crc, 4, 6, 7+4*ARCH_X86_64, 0x10 pshufb m2, m10 %endif FOLD_SINGLE m5, m1, m4, m2 - add r4, 16 - lea r5, [r4 + 16] + add r4, mmsize + lea r5, [r4 + mmsize] cmp r5, r3 jbe .fold_1x_loop +%if mmsize == 64 +.fold_zmm_to_xmm: + movu xm4, [CTX + 16] + vextracti32x4 xm0, m1, 1 + vextracti32x4 xm2, m1, 2 + vextracti32x4 xm3, m1, 3 + FOLD_SINGLE xm5, xm1, xm4, xm0 + FOLD_SINGLE xm5, xm1, xm4, xm2 + FOLD_SINGLE xm5, xm1, xm4, xm3 + +.fold_16b_pre: + lea r5, [r4 + 16] + cmp r5, r3 + ja .partial_block + +.fold_16b_loop: + movu xm2, [r2 + r4] +%if %1 == 0 + pshufb xm2, xm10 +%endif + FOLD_SINGLE xm5, xm1, xm4, xm2 + add r4, 16 + lea r5, [r4 + 16] + cmp r5, r3 + jbe .fold_16b_loop +%endif + .partial_block: - cmp r4, r3 - jae .reduce_128_to_64 - movu m2, [r2 + r3 - 16] - and r3, 0xf - lea r4, [partial_bytes_shuf_tab] - movu m0, [r3 + r4] + cmp r4, r3 + jae .reduce_128_to_64 + 
movu xm2, [r2 + r3 - 16] + and r3, 0xf + lea r4, [partial_bytes_shuf_tab] + movu xm0, [r3 + r4] %if %1 == 0 - pshufb m1, m10 + pshufb xm1, xm10 %endif - mova m3, m1 - pcmpeqd m5, m5 ; m5 = _mm_set1_epi8(0xff) - pxor m5, m0 - pshufb m3, m5 - pblendvb m2, m3, m0 - pshufb m1, m0 + mova xm3, xm1 +%if mmsize == 64 + mova xm5, xm0 + vpternlogq xm5, xm0, xm0, 0xf ; xm5 = ~xm0 + vpmovb2m k1, xm0 + pshufb xm3, xm5 + vpblendmb xm2{k1}, xm2, xm3 +%else + pcmpeqd xm5, xm5 ; m5 = _mm_set1_epi8(0xff) + pxor xm5, xm0 + pshufb xm3, xm5 + pblendvb xm2, xm3, xm0 +%endif + pshufb xm1, xm0 %if %1 == 0 - pshufb m1, m10 - pshufb m2, m10 + pshufb xm1, xm10 + pshufb xm2, xm10 %endif - FOLD_SINGLE m5, m1, m4, m2 + FOLD_SINGLE xm5, xm1, xm4, xm2 .reduce_128_to_64: - movu m4, [CTX + 32] - FOLD_128_TO_64 %1, m1, m4, m5 + movu xm4, [CTX + 32] + FOLD_128_TO_64 %1, xm1, xm4, xm5 .reduce_64_to_32: - movu m4, [CTX + 48] - FOLD_64_TO_32 %1, m1, m4, m5 + movu xm4, [CTX + 48] + FOLD_64_TO_32 %1, xm1, xm4, xm5 RET -.less_than_64bytes: - cmp r3, 16 - jb .less_than_16bytes - movu m1, [r2] - pxor m1, m4 +.less_than_4x_mmsize: + cmp r3, mmsize + jb .less_than_mmsize + movu m1, [r2] + pxor m1, m4 %if %1 == 0 - pshufb m1, m10 + pshufb m1, m10 %endif - mov r4, 16 - movu m4, [CTX + 16] - jmp .fold_1x_pre + mov r4, mmsize + VBROADCASTI32x4 m4, CTX, CTX + 16 + jmp .fold_1x_pre + +.less_than_mmsize: +%if mmsize == 64 + cmp r3, 16 + jb .less_than_16bytes + movu xm1, [r2] + pxor xm1, xm4 +%if %1 == 0 + pshufb xm1, xm10 +%endif + mov r4, 16 + movu xm4, [CTX + 16] + jmp .fold_16b_pre .less_than_16bytes: - pxor m1, m1 - movu [rsp], m1 - MEMCPY_0_15 rsp, r2, r3, r1, r4, .memcpy_done + mov r4d, -1 + shlx r4d, r4d, r3d + not r4d + kmovw k1, r4d + vmovdqu8 xm1{k1}{z}, [r2] +%else + pxor m1, m1 + movu [rsp], m1 + MEMCPY_0_15 rsp, r2, r3, r1, r4, .memcpy_done .memcpy_done: - movu m1, [rsp] - pxor m1, m4 - cmp r3, 5 - jb .less_than_5bytes - XMM_SHIFT_LEFT m1, (16 - r3), m2, r4 -%if %1 == 0 - pshufb m1, m10 + movu m1, [rsp] 
%endif - jmp .reduce_128_to_64 + + pxor xm1, xm4 + cmp r3, 5 + jb .less_than_5bytes + XMM_SHIFT_LEFT xm1, (16 - r3), xm2, r4 +%if %1 == 0 + pshufb xm1, xm10 +%endif + jmp .reduce_128_to_64 .less_than_5bytes: %if %1 == 0 - XMM_SHIFT_LEFT m1, (4 - r3), m2, r4 - movq m10, [reverse_shuffle + 8] ; 0x0001020304050607 - pshufb m1, m10 + XMM_SHIFT_LEFT xm1, (4 - r3), xm2, r4 + movq xm10, [reverse_shuffle + 8] ; 0x0001020304050607 + pshufb xm1, xm10 %else - XMM_SHIFT_LEFT m1, (8 - r3), m2, r4 + XMM_SHIFT_LEFT xm1, (8 - r3), xm2, r4 %endif - jmp .reduce_64_to_32 + jmp .reduce_64_to_32 %endmacro INIT_XMM clmul CRC 0 CRC 1 + +INIT_ZMM avx512icl +CRC 0 +CRC 1 diff --git a/libavutil/x86/crc.h b/libavutil/x86/crc.h index ef98ed318d..443fe753ac 100644 --- a/libavutil/x86/crc.h +++ b/libavutil/x86/crc.h @@ -30,6 +30,14 @@ #include "libavutil/intreadwrite.h" #include "libavutil/x86/cpu.h" +enum { + CRC_C = 0, + CLMUL_BE, + CLMUL_LE, + CLMUL_BE_AVX512ICL, + CLMUL_LE_AVX512ICL, +}; + #if HAVE_CLMUL_EXTERNAL #include "libavutil/crc_internal.h" @@ -40,12 +48,6 @@ uint32_t ff_crc_le_clmul(const AVCRC *ctx, uint32_t crc, const uint8_t *buffer, size_t length); FF_VISIBILITY_POP_HIDDEN -enum { - CRC_C = 0, - CLMUL_BE, - CLMUL_LE, -}; - static const AVCRC crc_table_clmul[AV_CRC_MAX][17] = { [AV_CRC_8_ATM] = { CLMUL_BE, @@ -142,11 +144,135 @@ static inline void crc_init_x86(AVCRC *ctx, int le, int bits, uint32_t poly, int } #endif +#if HAVE_AVX512ICL_EXTERNAL +#include "libavutil/crc_internal.h" + +FF_VISIBILITY_PUSH_HIDDEN +uint32_t ff_crc_avx512icl(const AVCRC *ctx, uint32_t crc, + const uint8_t *buffer, size_t length); +uint32_t ff_crc_le_avx512icl(const AVCRC *ctx, uint32_t crc, + const uint8_t *buffer, size_t length); +FF_VISIBILITY_POP_HIDDEN + +static const AVCRC crc_table_avx512icl[AV_CRC_MAX][21] = { + [AV_CRC_8_ATM] = { + CLMUL_BE_AVX512ICL, + 0x32000000, 0x0, 0xbc000000, 0x0, + 0xc4000000, 0x0, 0x94000000, 0x0, + 0x62000000, 0x0, 0x79000000, 0x0, + 0x07156a16, 0x1, 0x07000000, 0x1, 
+ 0xdf000000, 0x0, 0xd9000000, 0x0, + }, + [AV_CRC_8_EBU] = { + CLMUL_BE_AVX512ICL, + 0xb5000000, 0x0, 0xf3000000, 0x0, + 0xfc000000, 0x0, 0x0d000000, 0x0, + 0x6a000000, 0x0, 0x65000000, 0x0, + 0x1c4b8192, 0x1, 0x1d000000, 0x1, + 0x46000000, 0x0, 0x16000000, 0x0, + }, + [AV_CRC_16_ANSI] = { + CLMUL_BE_AVX512ICL, + 0xf9e30000, 0x0, 0x807d0000, 0x0, + 0xf9130000, 0x0, 0xff830000, 0x0, + 0x807b0000, 0x0, 0x86630000, 0x0, + 0xfffbffe7, 0x1, 0x80050000, 0x1, + 0xfe630000, 0x0, 0x7f870000, 0x0, + }, + [AV_CRC_16_CCITT] = { + CLMUL_BE_AVX512ICL, + 0x60190000, 0x0, 0x59b00000, 0x0, + 0xd5f60000, 0x0, 0x45630000, 0x0, + 0xaa510000, 0x0, 0xeb230000, 0x0, + 0x11303471, 0x1, 0x10210000, 0x1, + 0xcacd0000, 0x0, 0x16270000, 0x0, + }, + [AV_CRC_24_IEEE] = { + CLMUL_BE_AVX512ICL, + 0x1f428700, 0x0, 0x467d2400, 0x0, + 0x2c8c9d00, 0x0, 0x64e4d700, 0x0, + 0xd9fe8c00, 0x0, 0xfd7e0c00, 0x0, + 0xf845fe24, 0x1, 0x864cfb00, 0x1, + 0x09e45400, 0x0, 0xa79dfd00, 0x0, + }, + [AV_CRC_32_IEEE] = { + CLMUL_BE_AVX512ICL, + 0x8833794c, 0x0, 0xe6228b11, 0x0, + 0xc5b9cd4c, 0x0, 0xe8a45605, 0x0, + 0x490d678d, 0x0, 0xf200aa66, 0x0, + 0x04d101df, 0x1, 0x04c11db7, 0x1, + 0xcbcf3bcb, 0x0, 0x88fe2237, 0x0, + }, + [AV_CRC_32_IEEE_LE] = { + CLMUL_LE_AVX512ICL, + 0xc6e41596, 0x1, 0x54442bd4, 0x1, + 0xccaa009e, 0x0, 0x751997d0, 0x1, + 0xccaa009e, 0x0, 0x63cd6124, 0x1, + 0xf7011640, 0x1, 0xdb710641, 0x1, + 0x322d1430, 0x1, 0x1542778a, 0x1, + }, + [AV_CRC_16_ANSI_LE] = { + CLMUL_LE_AVX512ICL, + 0x0000bffa, 0x0, 0x1b0c2, 0x0, + 0x00018cc2, 0x0, 0x1d0c2, 0x0, + 0x00018cc2, 0x0, 0x1bc02, 0x0, + 0xcfffbffe, 0x1, 0x14003, 0x0, + 0x0001d99e, 0x0, 0x1bcc2, 0x0, + }, +}; + +static inline void crc_init_x86_avx512icl(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size) +{ + uint64_t poly_; + if (le) { + // convert the reversed representation to regular form + poly = reverse(poly, bits) >> 1; + } + // convert to 32 degree polynomial + poly_ = ((uint64_t)poly) << (32 - bits); + + uint64_t div; + uint8_t *dst = 
(uint8_t*)(ctx + 1); + if (le) { + ctx[0] = CLMUL_LE_AVX512ICL; + AV_WN64(dst, xnmodp(4 * 128 - 32, poly_, 32, &div, le)); + AV_WN64(dst + 8, xnmodp(4 * 128 + 32, poly_, 32, &div, le)); + uint64_t tmp = xnmodp(128 - 32, poly_, 32, &div, le); + AV_WN64(dst + 16, tmp); + AV_WN64(dst + 24, xnmodp(128 + 32, poly_, 32, &div, le)); + AV_WN64(dst + 32, tmp); + AV_WN64(dst + 40, xnmodp(64, poly_, 32, &div, le)); + AV_WN64(dst + 48, div); + AV_WN64(dst + 56, reverse(poly_ | (1ULL << 32), 32)); + AV_WN64(dst + 64, xnmodp(4 * 512 - 32, poly_, 32, &div, le)); + AV_WN64(dst + 72, xnmodp(4 * 512 + 32, poly_, 32, &div, le)); + } else { + ctx[0] = CLMUL_BE_AVX512ICL; + AV_WN64(dst, xnmodp(4 * 128 + 64, poly_, 32, &div, le)); + AV_WN64(dst + 8, xnmodp(4 * 128, poly_, 32, &div, le)); + AV_WN64(dst + 16, xnmodp(128 + 64, poly_, 32, &div, le)); + AV_WN64(dst + 24, xnmodp(128, poly_, 32, &div, le)); + AV_WN64(dst + 32, xnmodp(64, poly_, 32, &div, le)); + AV_WN64(dst + 48, div); + AV_WN64(dst + 40, xnmodp(96, poly_, 32, &div, le)); + AV_WN64(dst + 56, poly_ | (1ULL << 32)); + AV_WN64(dst + 64, xnmodp(4 * 512 + 64, poly_, 32, &div, le)); + AV_WN64(dst + 72, xnmodp(4 * 512, poly_, 32, &div, le)); + } +} +#endif + static inline const AVCRC *ff_crc_get_table_x86(AVCRCId crc_id) { -#if HAVE_CLMUL_EXTERNAL int cpu_flags = av_get_cpu_flags(); +#if HAVE_AVX512ICL_EXTERNAL + if (EXTERNAL_AVX512ICL(cpu_flags)) { + return crc_table_avx512icl[crc_id]; + } +#endif + +#if HAVE_CLMUL_EXTERNAL if (EXTERNAL_CLMUL(cpu_flags)) { return crc_table_clmul[crc_id]; } @@ -156,9 +282,16 @@ static inline const AVCRC *ff_crc_get_table_x86(AVCRCId crc_id) static inline av_cold int ff_crc_init_x86(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size) { -#if HAVE_CLMUL_EXTERNAL int cpu_flags = av_get_cpu_flags(); +#if HAVE_AVX512ICL_EXTERNAL + if (EXTERNAL_AVX512ICL(cpu_flags)) { + crc_init_x86_avx512icl(ctx, le, bits, poly, ctx_size); + return 1; + } +#endif + +#if HAVE_CLMUL_EXTERNAL if 
(EXTERNAL_CLMUL(cpu_flags)) { crc_init_x86(ctx, le, bits, poly, ctx_size); return 1; @@ -175,7 +308,12 @@ static inline uint32_t ff_crc_x86(const AVCRC *ctx, uint32_t crc, case CLMUL_BE: return ff_crc_clmul(ctx, crc, buffer, length); case CLMUL_LE: return ff_crc_le_clmul(ctx, crc, buffer, length); #endif - default: av_unreachable("x86 CRC only uses CLMUL_BE and CLMUL_LE"); + +#if HAVE_AVX512ICL_EXTERNAL + case CLMUL_BE_AVX512ICL: return ff_crc_avx512icl(ctx, crc, buffer, length); + case CLMUL_LE_AVX512ICL: return ff_crc_le_avx512icl(ctx, crc, buffer, length); +#endif + default: av_unreachable("x86 CRC only uses CLMUL_BE, CLMUL_LE, CLMUL_BE_AVX512ICL and CLMUL_LE_AVX512ICL"); } return 0; } -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
