PR #21119 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21119
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21119.patch
This adds the clmul implementations from #20751 without the avpriv function pointer indirection. It also adds a checkasm test for this. I can also add the AVX512 implementation from said PR if people agree on this approach. >From def67a6565588d8554c1cecb0d2c0ac3d5ffa011 Mon Sep 17 00:00:00 2001 From: Shreesh Adiga <[email protected]> Date: Sun, 26 Oct 2025 10:45:55 +0530 Subject: [PATCH 1/5] avutil/cpu: add x86 CPU feature flag for clmul --- configure | 4 ++++ libavutil/cpu.c | 1 + libavutil/cpu.h | 1 + libavutil/tests/cpu.c | 1 + libavutil/x86/cpu.c | 2 ++ libavutil/x86/cpu.h | 2 ++ tests/checkasm/checkasm.c | 1 + 7 files changed, 12 insertions(+) diff --git a/configure b/configure index 04e086c32a..abdd129fa2 100755 --- a/configure +++ b/configure @@ -469,6 +469,7 @@ Optimization options (experts only): --disable-avx512 disable AVX-512 optimizations --disable-avx512icl disable AVX-512ICL optimizations --disable-aesni disable AESNI optimizations + --disable-clmul disable CLMUL optimizations --disable-armv5te disable armv5te optimizations --disable-armv6 disable armv6 optimizations --disable-armv6t2 disable armv6t2 optimizations @@ -2252,6 +2253,7 @@ ARCH_EXT_LIST_WASM=" ARCH_EXT_LIST_X86_SIMD=" aesni + clmul amd3dnow amd3dnowext avx @@ -2870,6 +2872,7 @@ ssse3_deps="sse3" sse4_deps="ssse3" sse42_deps="sse4" aesni_deps="sse42" +clmul_deps="sse42" avx_deps="sse42" xop_deps="avx" fma3_deps="avx" @@ -8191,6 +8194,7 @@ if enabled x86; then echo "SSE enabled ${sse-no}" echo "SSSE3 enabled ${ssse3-no}" echo "AESNI enabled ${aesni-no}" + echo "CLMUL enabled ${clmul-no}" echo "AVX enabled ${avx-no}" echo "AVX2 enabled ${avx2-no}" echo "AVX-512 enabled ${avx512-no}" diff --git a/libavutil/cpu.c b/libavutil/cpu.c index 8f9b785ebc..0ddbc50da5 100644 --- a/libavutil/cpu.c +++ b/libavutil/cpu.c @@ -149,6 +149,7 @@ int av_parse_cpu_caps(unsigned *flags, const char *s) { "3dnowext", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_3DNOWEXT }, .unit = "flags" }, { "cmov", NULL, 0, 
AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_CMOV }, .unit = "flags" }, { "aesni", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_AESNI }, .unit = "flags" }, + { "clmul", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_CLMUL }, .unit = "flags" }, { "avx512" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_AVX512 }, .unit = "flags" }, { "avx512icl", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_AVX512ICL }, .unit = "flags" }, { "slowgather", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_SLOW_GATHER }, .unit = "flags" }, diff --git a/libavutil/cpu.h b/libavutil/cpu.h index a06fc08e56..600754867a 100644 --- a/libavutil/cpu.h +++ b/libavutil/cpu.h @@ -47,6 +47,7 @@ #define AV_CPU_FLAG_SSE4 0x0100 ///< Penryn SSE4.1 functions #define AV_CPU_FLAG_SSE42 0x0200 ///< Nehalem SSE4.2 functions #define AV_CPU_FLAG_AESNI 0x80000 ///< Advanced Encryption Standard functions +#define AV_CPU_FLAG_CLMUL 0x400000 ///< Carry-less Multiplication instruction #define AV_CPU_FLAG_AVX 0x4000 ///< AVX functions: requires OS support even if YMM registers aren't used #define AV_CPU_FLAG_AVXSLOW 0x8000000 ///< AVX supported, but slow when using YMM registers (e.g. 
Bulldozer) #define AV_CPU_FLAG_XOP 0x0400 ///< Bulldozer XOP functions diff --git a/libavutil/tests/cpu.c b/libavutil/tests/cpu.c index fd2e32901d..6f8a0be2c3 100644 --- a/libavutil/tests/cpu.c +++ b/libavutil/tests/cpu.c @@ -88,6 +88,7 @@ static const struct { { AV_CPU_FLAG_BMI1, "bmi1" }, { AV_CPU_FLAG_BMI2, "bmi2" }, { AV_CPU_FLAG_AESNI, "aesni" }, + { AV_CPU_FLAG_CLMUL, "clmul" }, { AV_CPU_FLAG_AVX512, "avx512" }, { AV_CPU_FLAG_AVX512ICL, "avx512icl" }, { AV_CPU_FLAG_SLOW_GATHER, "slowgather" }, diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c index 1a592f3bf4..5563f6cc3b 100644 --- a/libavutil/x86/cpu.c +++ b/libavutil/x86/cpu.c @@ -121,6 +121,8 @@ int ff_get_cpu_flags_x86(void) rval |= AV_CPU_FLAG_SSE2; if (ecx & 1) rval |= AV_CPU_FLAG_SSE3; + if (ecx & 0x2) + rval |= AV_CPU_FLAG_CLMUL; if (ecx & 0x00000200 ) rval |= AV_CPU_FLAG_SSSE3; if (ecx & 0x00080000 ) diff --git a/libavutil/x86/cpu.h b/libavutil/x86/cpu.h index 00e82255b1..af081b2ed8 100644 --- a/libavutil/x86/cpu.h +++ b/libavutil/x86/cpu.h @@ -44,6 +44,7 @@ #define X86_FMA4(flags) CPUEXT(flags, FMA4) #define X86_AVX2(flags) CPUEXT(flags, AVX2) #define X86_AESNI(flags) CPUEXT(flags, AESNI) +#define X86_CLMUL(flags) CPUEXT(flags, CLMUL) #define X86_AVX512(flags) CPUEXT(flags, AVX512) #define EXTERNAL_MMX(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, MMX) @@ -72,6 +73,7 @@ #define EXTERNAL_AVX2_FAST(flags) CPUEXT_SUFFIX_FAST2(flags, _EXTERNAL, AVX2, AVX) #define EXTERNAL_AVX2_SLOW(flags) CPUEXT_SUFFIX_SLOW2(flags, _EXTERNAL, AVX2, AVX) #define EXTERNAL_AESNI(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AESNI) +#define EXTERNAL_CLMUL(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, CLMUL) #define EXTERNAL_AVX512(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX512) #define EXTERNAL_AVX512ICL(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX512ICL) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 7edc8e4e6e..80b08a2532 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -395,6 +395,7 @@ 
static const struct { { "SSE4.1", "sse4", AV_CPU_FLAG_SSE4 }, { "SSE4.2", "sse42", AV_CPU_FLAG_SSE42 }, { "AES-NI", "aesni", AV_CPU_FLAG_AESNI }, + { "CLMUL", "clmul", AV_CPU_FLAG_CLMUL }, { "AVX", "avx", AV_CPU_FLAG_AVX }, { "XOP", "xop", AV_CPU_FLAG_XOP }, { "FMA3", "fma3", AV_CPU_FLAG_FMA3 }, -- 2.49.1 >From cf85126b10fc5658f11af9b363919c5d55ec0054 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 7 Dec 2025 10:38:05 +0100 Subject: [PATCH 2/5] tests/checkasm: Add support for using opaques to decide whether to test This is in preparation for adding checkasm support for av_crc(), which will always call the same function, but uses different CRC tables to distinguish different implementations. This reuses checkasm_check_func() for this; one could also add a new function or use unions. This would allow to avoid casting const away in the crc test to be added. It would also allow to avoid converting function pointers to void* (which ISO C does not allow). Signed-off-by: Andreas Rheinhardt <[email protected]> --- tests/checkasm/checkasm.h | 44 +++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 9f4fb8b283..d1d524c730 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -199,6 +199,7 @@ extern uint64_t bench_runs; /* Decide whether or not the specified function needs to be tested */ #define check_func(func, ...) (checkasm_save_context(), func_ref = checkasm_check_func((func_new = func), __VA_ARGS__)) +#define check_opaque(opaque, ...) (checkasm_save_context(), checkasm_check_func(opaque, __VA_ARGS__)) /* Declare the function prototype. The first argument is the return value, the remaining * arguments are the function parameters. Naming parameters is optional. 
*/ @@ -214,10 +215,15 @@ extern uint64_t bench_runs; /* Call the reference function */ #define call_ref(...)\ + call_ref_ext((func_type *)func_ref, __VA_ARGS__) + +#define call_ref_ext(func, ...) \ (checkasm_set_signal_handler_state(1),\ - ((func_type *)func_ref)(__VA_ARGS__));\ + (func)(__VA_ARGS__));\ checkasm_set_signal_handler_state(0) +#define call_new(...) call_new_ext(((func_type *)func_new), __VA_ARGS__) + #if ARCH_X86 && HAVE_X86ASM /* Verifies that clobbered callee-saved registers are properly saved and restored * and that either no MMX registers are touched or emms is issued */ @@ -249,10 +255,10 @@ void checkasm_stack_clobber(uint64_t clobber, ...); ((cpu_flags) & av_get_cpu_flags()) ? (void *)checkasm_checked_call_emms : \ (void *)checkasm_checked_call; #define CLOB (UINT64_C(0xdeadbeefdeadbeef)) -#define call_new(...) (checkasm_set_signal_handler_state(1),\ +#define call_new_ext(func, ...) (checkasm_set_signal_handler_state(1),\ checkasm_stack_clobber(CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,\ CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB),\ - checked_call(func_new, 0, 0, 0, 0, 0, __VA_ARGS__));\ + checked_call((func), 0, 0, 0, 0, 0, __VA_ARGS__));\ checkasm_set_signal_handler_state(0) #elif ARCH_X86_32 #define declare_new(ret, ...) ret (*checked_call)(void *, __VA_ARGS__) = (void *)checkasm_checked_call; @@ -260,9 +266,9 @@ void checkasm_stack_clobber(uint64_t clobber, ...); #define declare_new_emms(cpu_flags, ret, ...) ret (*checked_call)(void *, __VA_ARGS__) = \ ((cpu_flags) & av_get_cpu_flags()) ? 
(void *)checkasm_checked_call_emms : \ (void *)checkasm_checked_call; -#define call_new(...)\ +#define call_new_ext(func, ...)\ (checkasm_set_signal_handler_state(1),\ - checked_call(func_new, __VA_ARGS__));\ + checked_call((func), __VA_ARGS__));\ checkasm_set_signal_handler_state(0) #endif #elif ARCH_ARM && HAVE_ARMV5TE_EXTERNAL @@ -275,9 +281,9 @@ extern void (*checkasm_checked_call)(void *func, int dummy, ...); #define declare_new(ret, ...) ret (*checked_call)(void *, int dummy, __VA_ARGS__, \ int, int, int, int, int, int, int, int, \ int, int, int, int, int, int, int) = (void *)checkasm_checked_call; -#define call_new(...) \ +#define call_new_ext(func, ...) \ (checkasm_set_signal_handler_state(1),\ - checked_call(func_new, 0, __VA_ARGS__, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0));\ + checked_call((func), 0, __VA_ARGS__, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0));\ checkasm_set_signal_handler_state(0) #elif ARCH_AARCH64 && !defined(__APPLE__) void checkasm_stack_clobber(uint64_t clobber, ...); @@ -287,10 +293,10 @@ void checkasm_checked_call(void *func, ...); int, int, int, int, int, int, int)\ = (void *)checkasm_checked_call; #define CLOB (UINT64_C(0xdeadbeefdeadbeef)) -#define call_new(...) (checkasm_set_signal_handler_state(1),\ +#define call_new_ext(func, ...) (checkasm_set_signal_handler_state(1),\ checkasm_stack_clobber(CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,\ CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB),\ - checked_call(func_new, 0, 0, 0, 0, 0, 0, 0, __VA_ARGS__,\ + checked_call((func), 0, 0, 0, 0, 0, 0, 0, __VA_ARGS__,\ 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0));\ checkasm_set_signal_handler_state(0) #elif ARCH_RISCV @@ -300,15 +306,15 @@ void *checkasm_get_wrapper(void); #if HAVE_RV && (__riscv_xlen == 64) && defined (__riscv_d) #define declare_new(ret, ...) \ ret (*checked_call)(__VA_ARGS__) = checkasm_get_wrapper(); -#define call_new(...) \ +#define call_new_ext(func, ...) 
\ (checkasm_set_signal_handler_state(1),\ - checkasm_set_function(func_new), checked_call(__VA_ARGS__));\ + checkasm_set_function(func), checked_call(__VA_ARGS__));\ checkasm_set_signal_handler_state(0) #else #define declare_new(ret, ...) -#define call_new(...)\ +#define call_new_ext(func, ...)\ (checkasm_set_signal_handler_state(1),\ - ((func_type *)func_new)(__VA_ARGS__));\ + (func)(__VA_ARGS__));\ checkasm_set_signal_handler_state(0) #endif #else @@ -316,9 +322,9 @@ void *checkasm_get_wrapper(void); #define declare_new_float(ret, ...) #define declare_new_emms(cpu_flags, ret, ...) /* Call the function */ -#define call_new(...)\ +#define call_new_ext(func, ...)\ (checkasm_set_signal_handler_state(1),\ - ((func_type *)func_new)(__VA_ARGS__));\ + (func)(__VA_ARGS__));\ checkasm_set_signal_handler_state(0) #endif @@ -373,12 +379,12 @@ typedef struct CheckasmPerf { } while (0) /* Benchmark the function */ -#define bench_new(...)\ +#define bench(func, ...)\ do {\ if (checkasm_bench_func()) {\ struct CheckasmPerf *perf = checkasm_get_perf_context();\ av_unused const int sysfd = perf->sysfd;\ - func_type *tfunc = func_new;\ + func_type *tfunc = func;\ uint64_t tsum = 0;\ uint64_t ti, tcount = 0;\ uint64_t t = 0; \ @@ -401,11 +407,13 @@ typedef struct CheckasmPerf { }\ } while (0) #else -#define bench_new(...) while(0) +#define bench(func, ...) while(0) #define PERF_START(t) while(0) #define PERF_STOP(t) while(0) #endif +#define bench_new(...) 
bench(func_new, __VA_ARGS__) + #define BUF_RECT(type, name, w, h) \ LOCAL_ALIGNED_32(type, name##_buf, [((h)+32)*(FFALIGN(w,64)+64) + 64]); \ av_unused ptrdiff_t name##_stride = sizeof(type)*(FFALIGN(w,64)+64); \ -- 2.49.1 >From 814b28d8adaf3e3db33464dd2f15fea1b494df92 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 7 Dec 2025 10:50:39 +0100 Subject: [PATCH 3/5] tests/checkasm: Add CRC test Signed-off-by: Andreas Rheinhardt <[email protected]> --- tests/checkasm/Makefile | 1 + tests/checkasm/checkasm.c | 1 + tests/checkasm/checkasm.h | 1 + tests/checkasm/crc.c | 68 +++++++++++++++++++++++++++++++++++++++ tests/fate/checkasm.mak | 1 + 5 files changed, 72 insertions(+) create mode 100644 tests/checkasm/crc.c diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 1c34619249..014fccc1df 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -91,6 +91,7 @@ CHECKASMOBJS-$(CONFIG_SWSCALE) += $(SWSCALEOBJS) # libavutil tests AVUTILOBJS += aes.o AVUTILOBJS += av_tx.o +AVUTILOBJS += crc.o AVUTILOBJS += fixed_dsp.o AVUTILOBJS += float_dsp.o AVUTILOBJS += lls.o diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 80b08a2532..1147d82903 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -337,6 +337,7 @@ static const struct { #endif #if CONFIG_AVUTIL { "aes", checkasm_check_aes }, + { "crc", checkasm_check_crc }, { "fixed_dsp", checkasm_check_fixed_dsp }, { "float_dsp", checkasm_check_float_dsp }, { "lls", checkasm_check_lls }, diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index d1d524c730..24ba15493b 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -93,6 +93,7 @@ void checkasm_check_bswapdsp(void); void checkasm_check_cavsdsp(void); void checkasm_check_colordetect(void); void checkasm_check_colorspace(void); +void checkasm_check_crc(void); void checkasm_check_dcadsp(void); void checkasm_check_diracdsp(void); void 
checkasm_check_exrdsp(void); diff --git a/tests/checkasm/crc.c b/tests/checkasm/crc.c new file mode 100644 index 0000000000..d5380ee4da --- /dev/null +++ b/tests/checkasm/crc.c @@ -0,0 +1,68 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <stddef.h> +#include <stdint.h> + + +#include "checkasm.h" +#include "libavutil/attributes.h" +// Undefine av_pure so that calls to av_crc are not optimized away. 
+#undef av_pure +#define av_pure +#include "libavutil/crc.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/mem_internal.h" + + +void checkasm_check_crc(void) +{ + declare_func(uint32_t, const AVCRC *ctx, uint32_t crc, + const uint8_t *buffer, size_t length); + + for (unsigned i = 0; i < AV_CRC_MAX; ++i) { + const AVCRC *table_new = av_crc_get_table(i); + const AVCRC *table_ref; + + if (table_ref = check_opaque((AVCRC*)table_new, "crc_%u", i)) { + DECLARE_ALIGNED(4, uint8_t, buf)[8192]; + size_t offset = rnd() & 31; + static size_t sizes[AV_CRC_MAX]; + static unsigned sizes_initialized = 0; + uint32_t prev_crc = rnd(); + + if (!(sizes_initialized & (1 << i))) { + sizes_initialized |= 1 << i; + sizes[i] = rnd() % (sizeof(buf) - 1 - offset); + } + + size_t size = sizes[i]; + + for (size_t j = 0; j < sizeof(buf); j += 4) + AV_WN32A(buf + j, rnd()); + + uint32_t crc_ref = call_ref_ext(av_crc, table_ref, prev_crc, buf + offset, size); + uint32_t crc_new = call_new_ext(av_crc, table_new, prev_crc, buf + offset, size); + + if (crc_ref != crc_new) + fail(); + + bench(av_crc, table_new, prev_crc, buf + offset, size); + } + } +} diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak index f26e534591..3e97cd43c2 100644 --- a/tests/fate/checkasm.mak +++ b/tests/fate/checkasm.mak @@ -10,6 +10,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp \ fate-checkasm-blockdsp \ fate-checkasm-bswapdsp \ fate-checkasm-cavsdsp \ + fate-checkasm-crc \ fate-checkasm-dcadsp \ fate-checkasm-diracdsp \ fate-checkasm-exrdsp \ -- 2.49.1 >From 42276817396f2a6c167d6d1e3a96e6d6921ecfc1 Mon Sep 17 00:00:00 2001 From: Shreesh Adiga <[email protected]> Date: Sun, 26 Oct 2025 16:07:17 +0530 Subject: [PATCH 4/5] avutil/crc: add x86 SSE4.2 clmul SIMD implementation for av_crc Implemented the algorithm described in the paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" by Intel. It is not used yet; the integration will be added in a separate commit. 
Observed near 10x speedup on AMD Zen4 7950x: av_crc_c: 22057.0 ( 1.00x) av_crc_clmul: 2202.8 (10.01x) --- libavutil/x86/crc.asm | 297 ++++++++++++++++++++++++++++++++++++++++++ libavutil/x86/crc.h | 180 +++++++++++++++++++++++++ 2 files changed, 477 insertions(+) create mode 100644 libavutil/x86/crc.asm create mode 100644 libavutil/x86/crc.h diff --git a/libavutil/x86/crc.asm b/libavutil/x86/crc.asm new file mode 100644 index 0000000000..fd06acfdd2 --- /dev/null +++ b/libavutil/x86/crc.asm @@ -0,0 +1,297 @@ +;***************************************************************************** +;* Copyright (c) 2025 Shreesh Adiga <[email protected]> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86util.asm" + +SECTION RODATA +reverse_shuffle: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +partial_bytes_shuf_tab: db 255, 254, 253, 252, 251, 250, 249, 248,\ + 247, 246, 245, 244, 243, 242, 241, 240,\ + 0, 1, 2, 3, 4, 5, 6, 7,\ + 8, 9, 10, 11, 12, 13, 14, 15 + +SECTION .text + +%macro FOLD_128_TO_64 4 +; %1 LE ; %2 128 bit fold reg ; %3 pre-computed constant reg ; %4 tmp reg +%if %1 == 1 + mova %4, %2 + pclmulqdq %2, %3, 0x00 + psrldq %4, 8 + pxor %2, %4 + mova %4, %2 + psllq %4, 32 + pclmulqdq %4, %3, 0x10 + pxor %2, %4 +%else + movq %4, %2 + pclmulqdq %2, %3, 0x11 + pslldq %4, 4 + pxor %4, %2 + mova %2, %4 + pclmulqdq %4, %3, 0x01 + pxor %2, %4 +%endif +%endmacro + +%macro FOLD_64_TO_32 4 +; %1 LE ; %2 128 bit fold reg ; %3 pre-computed constant reg ; %4 tmp reg +%if %1 == 1 + pxor %4, %4 + pblendw %4, %2, 0xfc + mova %2, %4 + pclmulqdq %4, %3, 0x00 + pxor %4, %2 + pclmulqdq %4, %3, 0x10 + pxor %2, %4 + pextrd eax, %2, 2 +%else + mova %4, %2 + pclmulqdq %2, %3, 0x00 + pclmulqdq %2, %3, 0x11 + pxor %2, %4 + movd eax, %2 + bswap eax +%endif +%endmacro + +%macro FOLD_SINGLE 4 +; %1 temp ; %2 fold reg ; %3 pre-computed constants ; %4 input data block + mova %1, %2 + pclmulqdq %1, %3, 0x01 + pxor %1, %4 + pclmulqdq %2, %3, 0x10 + pxor %2, %1 +%endmacro + +%macro XMM_SHIFT_LEFT 4 +; %1 xmm input reg ; %2 shift bytes amount ; %3 temp xmm register ; %4 temp gpr + lea %4, [partial_bytes_shuf_tab] + movu %3, [%4 + 16 - (%2)] + pshufb %1, %3 +%endmacro + +%macro MEMCPY_0_15 6 +; %1 dst ; %2 src ; %3 len ; %4, %5 temp gpr register; %6 done label + cmp %3, 8 + jae .between_8_15 + cmp %3, 4 + jae .between_4_7 + cmp %3, 1 + ja .between_2_3 + jb %6 + 
mov %4b, [%2] + mov [%1], %4b + jmp %6 + +.between_8_15: +%if ARCH_X86_64 + mov %4q, [%2] + mov %5q, [%2 + %3 - 8] + mov [%1], %4q + mov [%1 + %3 - 8], %5q + jmp %6 +%else + xor %5, %5 +.copy4b: + mov %4d, [%2 + %5] + mov [%1 + %5], %4d + add %5, 4 + lea %4, [%5 + 4] + cmp %4, %3 + jb .copy4b + + mov %4d, [%2 + %3 - 4] + mov [%1 + %3 - 4], %4d + jmp %6 +%endif +.between_4_7: + mov %4d, [%2] + mov %5d, [%2 + %3 - 4] + mov [%1], %4d + mov [%1 + %3 - 4], %5d + jmp %6 +.between_2_3: + mov %4w, [%2] + mov %5w, [%2 + %3 - 2] + mov [%1], %4w + mov [%1 + %3 - 2], %5w + ; fall through, %6 label is expected to be next instruction +%endmacro + +%macro CRC 1 +;----------------------------------------------------------------------------------------------- +; ff_crc[_le]_clmul(const uint8_t *ctx, uint32_t crc, const uint8_t *buffer, size_t length +;----------------------------------------------------------------------------------------------- +; %1 == 1 - LE format +%if %1 == 1 +cglobal crc_le, 4, 6, 7+4*ARCH_X86_64, 0x10 +%else +cglobal crc, 4, 6, 6+4*ARCH_X86_64, 0x10 +%endif + +%if ARCH_X86_32 + %define m10 m6 +%endif + +%if %1 == 0 + movu m10, [reverse_shuffle] +%endif + + movd m4, r1d +%if ARCH_X86_32 + ; skip 4x unrolled loop due to only 8 XMM reg being available in X86_32 + jmp .less_than_64bytes +%else + cmp r3, 64 + jb .less_than_64bytes + movu m1, [r2 + 0] + movu m3, [r2 + 16] + movu m2, [r2 + 32] + movu m0, [r2 + 48] + pxor m1, m4 +%if %1 == 0 + pshufb m0, m10 + pshufb m1, m10 + pshufb m2, m10 + pshufb m3, m10 +%endif + mov r4, 64 + cmp r3, 128 + jb .reduce_4x_to_1 + movu m4, [r0] + +.fold_4x_loop: + movu m6, [r2 + r4 + 0] + movu m7, [r2 + r4 + 16] + movu m8, [r2 + r4 + 32] + movu m9, [r2 + r4 + 48] +%if %1 == 0 + pshufb m6, m10 + pshufb m7, m10 + pshufb m8, m10 + pshufb m9, m10 +%endif + FOLD_SINGLE m5, m1, m4, m6 + FOLD_SINGLE m5, m3, m4, m7 + FOLD_SINGLE m5, m2, m4, m8 + FOLD_SINGLE m5, m0, m4, m9 + add r4, 64 + lea r5, [r4 + 64] + cmp r5, r3 + jbe .fold_4x_loop + 
+.reduce_4x_to_1: + movu m4, [r0 + 16] + FOLD_SINGLE m5, m1, m4, m3 + FOLD_SINGLE m5, m1, m4, m2 + FOLD_SINGLE m5, m1, m4, m0 +%endif + +.fold_1x_pre: + lea r5, [r4 + 16] + cmp r5, r3 + ja .partial_block + +.fold_1x_loop: + movu m2, [r2 + r4] +%if %1 == 0 + pshufb m2, m10 +%endif + FOLD_SINGLE m5, m1, m4, m2 + add r4, 16 + lea r5, [r4 + 16] + cmp r5, r3 + jbe .fold_1x_loop + +.partial_block: + cmp r4, r3 + jae .reduce_128_to_64 + movu m2, [r2 + r3 - 16] + and r3, 0xf + lea r4, [partial_bytes_shuf_tab] + movu m0, [r3 + r4] +%if %1 == 0 + pshufb m1, m10 +%endif + mova m3, m1 + pcmpeqd m5, m5 ; m5 = _mm_set1_epi8(0xff) + pxor m5, m0 + pshufb m3, m5 + pblendvb m2, m3, m0 + pshufb m1, m0 +%if %1 == 0 + pshufb m1, m10 + pshufb m2, m10 +%endif + FOLD_SINGLE m5, m1, m4, m2 + +.reduce_128_to_64: + movu m4, [r0 + 32] + FOLD_128_TO_64 %1, m1, m4, m5 +.reduce_64_to_32: + movu m4, [r0 + 48] + FOLD_64_TO_32 %1, m1, m4, m5 + RET + +.less_than_64bytes: + cmp r3, 16 + jb .less_than_16bytes + movu m1, [r2] + pxor m1, m4 +%if %1 == 0 + pshufb m1, m10 +%endif + mov r4, 16 + movu m4, [r0 + 16] + jmp .fold_1x_pre + +.less_than_16bytes: + pxor m1, m1 + movu [rsp], m1 + MEMCPY_0_15 rsp, r2, r3, r1, r4, .memcpy_done + +.memcpy_done: + movu m1, [rsp] + pxor m1, m4 + cmp r3, 5 + jb .less_than_5bytes + XMM_SHIFT_LEFT m1, (16 - r3), m2, r4 +%if %1 == 0 + pshufb m1, m10 +%endif + jmp .reduce_128_to_64 + +.less_than_5bytes: +%if %1 == 0 + XMM_SHIFT_LEFT m1, (4 - r3), m2, r4 + movq m10, [reverse_shuffle + 8] ; 0x0001020304050607 + pshufb m1, m10 +%else + XMM_SHIFT_LEFT m1, (8 - r3), m2, r4 +%endif + jmp .reduce_64_to_32 + +%endmacro + +INIT_XMM clmul +CRC 0 +CRC 1 diff --git a/libavutil/x86/crc.h b/libavutil/x86/crc.h new file mode 100644 index 0000000000..936ca54d37 --- /dev/null +++ b/libavutil/x86/crc.h @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2025 Shreesh Adiga <[email protected]> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_X86_CRC_H +#define AVUTIL_X86_CRC_H + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/attributes_internal.h" +#include "libavutil/cpu.h" +#include "libavutil/crc.h" +#include "libavutil/reverse.h" +#include "libavutil/x86/cpu.h" + +#if HAVE_CLMUL_EXTERNAL +FF_VISIBILITY_PUSH_HIDDEN +uint32_t ff_crc_clmul(const AVCRC *ctx, uint32_t crc, + const uint8_t *buffer, size_t length); +uint32_t ff_crc_le_clmul(const AVCRC *ctx, uint32_t crc, + const uint8_t *buffer, size_t length); +FF_VISIBILITY_POP_HIDDEN + +static const AVCRC crc_table_clmul[AV_CRC_MAX][16] = { + [AV_CRC_8_ATM] = { + 0x32000000, 0x0, 0xbc000000, 0x0, + 0xc4000000, 0x0, 0x94000000, 0x0, + 0x62000000, 0x0, 0x79000000, 0x0, + 0x07156a16, 0x1, 0x07000000, 0x1, + }, + [AV_CRC_8_EBU] = { + 0xb5000000, 0x0, 0xf3000000, 0x0, + 0xfc000000, 0x0, 0x0d000000, 0x0, + 0x6a000000, 0x0, 0x65000000, 0x0, + 0x1c4b8192, 0x1, 0x1d000000, 0x1, + }, + [AV_CRC_16_ANSI] = { + 0xf9e30000, 0x0, 0x807d0000, 0x0, + 0xf9130000, 0x0, 0xff830000, 0x0, + 0x807b0000, 0x0, 0x86630000, 0x0, + 0xfffbffe7, 0x1, 0x80050000, 0x1, + }, + [AV_CRC_16_CCITT] = { + 0x60190000, 0x0, 0x59b00000, 0x0, + 0xd5f60000, 0x0, 0x45630000, 0x0, + 0xaa510000, 0x0, 0xeb230000, 0x0, + 
0x11303471, 0x1, 0x10210000, 0x1, + }, + [AV_CRC_24_IEEE] = { + 0x1f428700, 0x0, 0x467d2400, 0x0, + 0x2c8c9d00, 0x0, 0x64e4d700, 0x0, + 0xd9fe8c00, 0x0, 0xfd7e0c00, 0x0, + 0xf845fe24, 0x1, 0x864cfb00, 0x1, + }, + [AV_CRC_32_IEEE] = { + 0x8833794c, 0x0, 0xe6228b11, 0x0, + 0xc5b9cd4c, 0x0, 0xe8a45605, 0x0, + 0x490d678d, 0x0, 0xf200aa66, 0x0, + 0x04d101df, 0x1, 0x04c11db7, 0x1, + }, + [AV_CRC_32_IEEE_LE] = { + 0xc6e41596, 0x1, 0x54442bd4, 0x1, + 0xccaa009e, 0x0, 0x751997d0, 0x1, + 0xccaa009e, 0x0, 0x63cd6124, 0x1, + 0xf7011640, 0x1, 0xdb710641, 0x1, + }, + [AV_CRC_16_ANSI_LE] = { + 0x0000bffa, 0x0, 0x1b0c2, 0x0, + 0x00018cc2, 0x0, 0x1d0c2, 0x0, + 0x00018cc2, 0x0, 0x1bc02, 0x0, + 0xcfffbffe, 0x1, 0x14003, 0x0, + }, +}; + +static uint64_t reverse(uint64_t p, unsigned int deg) { + uint64_t ret = 0; + int i; + for (i = 0; i < (deg / 8); i += 1) { + ret = (ret << 8) | (ff_reverse[p & 0xff]); + p >>= 8; + } + int rem = (deg + 1) - 8 * i; + ret = (ret << rem) | (ff_reverse[p & 0xff] >> (8 - rem)); + return ret; +} + +static uint64_t xnmodp(unsigned n, uint64_t poly, unsigned deg, uint64_t *div, int bitreverse) +{ + uint64_t mod, mask, high; + + if (n < deg) { + *div = 0; + return poly; + } + mask = ((uint64_t)1 << deg) - 1; + poly &= mask; + mod = poly; + *div = 1; + deg--; + while (--n > deg) { + high = (mod >> deg) & 1; + *div = (*div << 1) | high; + mod <<= 1; + if (high) + mod ^= poly; + } + uint64_t ret = mod & mask; + if (bitreverse) { + *div = reverse(*div, deg) << 1; + return reverse(ret, deg) << 1; + } + return ret; +} + +static inline void crc_init_x86(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size) +{ + uint64_t poly_; + if (le) { + // convert the reversed representation to regular form + poly = reverse(poly, bits) >> 1; + } + // convert to 32 degree polynomial + poly_ = ((uint64_t)poly) << (32 - bits); + + uint64_t x1, x2, x3, x4, x5, x6, x7, x8, div; + if (le) { + x1 = xnmodp(4 * 128 - 32, poly_, 32, &div, le); + x2 = xnmodp(4 * 128 + 32, poly_, 32, 
&div, le); + x3 = xnmodp(128 - 32, poly_, 32, &div, le); + x4 = xnmodp(128 + 32, poly_, 32, &div, le); + x5 = x3; + x6 = xnmodp(64, poly_, 32, &div, le); + x7 = div; + x8 = reverse(poly_ | (1ULL << 32), 32); + } else { + x1 = xnmodp(4 * 128 + 64, poly_, 32, &div, le); + x2 = xnmodp(4 * 128, poly_, 32, &div, le); + x3 = xnmodp(128 + 64, poly_, 32, &div, le); + x4 = xnmodp(128, poly_, 32, &div, le); + x5 = xnmodp(64, poly_, 32, &div, le); + x7 = div; + x6 = xnmodp(96, poly_, 32, &div, le); + x8 = poly_ | (1ULL << 32); + } + ctx[0] = (AVCRC)x1; + ctx[1] = (AVCRC)(x1 >> 32); + ctx[2] = (AVCRC)x2; + ctx[3] = (AVCRC)(x2 >> 32); + ctx[4] = (AVCRC)x3; + ctx[5] = (AVCRC)(x3 >> 32); + ctx[6] = (AVCRC)x4; + ctx[7] = (AVCRC)(x4 >> 32); + ctx[8] = (AVCRC)x5; + ctx[9] = (AVCRC)(x5 >> 32); + ctx[10] = (AVCRC)x6; + ctx[11] = (AVCRC)(x6 >> 32); + ctx[12] = (AVCRC)x7; + ctx[13] = (AVCRC)(x7 >> 32); + ctx[14] = (AVCRC)x8; + ctx[15] = (AVCRC)(x8 >> 32); +} +#endif + +#endif /* AVUTIL_X86_CRC_H */ -- 2.49.1 >From f773d44580f79c8dba0e09e77aa74e4b9307bb95 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 7 Dec 2025 11:46:41 +0100 Subject: [PATCH 5/5] avutil/crc: Use x86 clmul for CRC when available Observed near 10x speedup on AMD Zen4 7950x: av_crc_c: 22057.0 ( 1.00x) av_crc_clmul: 2202.8 (10.01x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavutil/crc.c | 24 ++++++++++++ libavutil/x86/Makefile | 1 + libavutil/x86/crc.asm | 11 +++--- libavutil/x86/crc.h | 88 ++++++++++++++++++++++++++++++++++-------- 4 files changed, 102 insertions(+), 22 deletions(-) diff --git a/libavutil/crc.c b/libavutil/crc.c index 703b56f4e0..be11516b84 100644 --- a/libavutil/crc.c +++ b/libavutil/crc.c @@ -25,6 +25,9 @@ #include "bswap.h" #include "crc.h" #include "error.h" +#if ARCH_X86 +#include "libavutil/x86/crc.h" +#endif #if CONFIG_HARDCODED_TABLES static const AVCRC av_crc_table[AV_CRC_MAX][257] = { @@ -348,6 +351,12 @@ int av_crc_init(AVCRC *ctx, int le, int 
bits, uint32_t poly, int ctx_size) if (ctx_size != sizeof(AVCRC) * 257 && ctx_size != sizeof(AVCRC) * 1024) return AVERROR(EINVAL); +#if ARCH_X86 + int done = ff_crc_init_x86(ctx, le, bits, poly, ctx_size); + if (done) + return 0; +#endif + for (i = 0; i < 256; i++) { if (le) { for (c = i, j = 0; j < 8; j++) @@ -373,6 +382,14 @@ int av_crc_init(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size) const AVCRC *av_crc_get_table(AVCRCId crc_id) { +// Check for arch-specific extensions first to avoid initializing +// ordinary CRC tables unnecessarily. +#if ARCH_X86 + const AVCRC *table = ff_crc_get_table_x86(crc_id); + if (table) + return table; +#endif + #if !CONFIG_HARDCODED_TABLES switch (crc_id) { case AV_CRC_8_ATM: CRC_INIT_TABLE_ONCE(AV_CRC_8_ATM); break; @@ -392,6 +409,13 @@ const AVCRC *av_crc_get_table(AVCRCId crc_id) uint32_t av_crc(const AVCRC *ctx, uint32_t crc, const uint8_t *buffer, size_t length) { + if (ctx[0]) { +#if ARCH_X86 + return ff_crc_x86(ctx, crc, buffer, length); +#endif + } + av_assert2(ctx[0] == 0); + const uint8_t *end = buffer + length; #if !CONFIG_SMALL diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile index 4e1b4b1176..901298b6cb 100644 --- a/libavutil/x86/Makefile +++ b/libavutil/x86/Makefile @@ -4,6 +4,7 @@ EMMS_OBJS_$(HAVE_MMX_INLINE)_$(HAVE_MMX_EXTERNAL)_$(HAVE_MM_EMPTY) = x86/emms.o X86ASM-OBJS += x86/aes.o x86/aes_init.o \ x86/cpuid.o \ + x86/crc.o \ $(EMMS_OBJS__yes_) \ x86/fixed_dsp.o x86/fixed_dsp_init.o \ x86/float_dsp.o x86/float_dsp_init.o \ diff --git a/libavutil/x86/crc.asm b/libavutil/x86/crc.asm index fd06acfdd2..8f9b54544d 100644 --- a/libavutil/x86/crc.asm +++ b/libavutil/x86/crc.asm @@ -138,6 +138,7 @@ SECTION .text %endmacro %macro CRC 1 +%define CTX r0+4 ;----------------------------------------------------------------------------------------------- ; ff_crc[_le]_clmul(const uint8_t *ctx, uint32_t crc, const uint8_t *buffer, size_t length 
;----------------------------------------------------------------------------------------------- @@ -177,7 +178,7 @@ cglobal crc, 4, 6, 6+4*ARCH_X86_64, 0x10 mov r4, 64 cmp r3, 128 jb .reduce_4x_to_1 - movu m4, [r0] + movu m4, [CTX] .fold_4x_loop: movu m6, [r2 + r4 + 0] @@ -200,7 +201,7 @@ cglobal crc, 4, 6, 6+4*ARCH_X86_64, 0x10 jbe .fold_4x_loop .reduce_4x_to_1: - movu m4, [r0 + 16] + movu m4, [CTX + 16] FOLD_SINGLE m5, m1, m4, m3 FOLD_SINGLE m5, m1, m4, m2 FOLD_SINGLE m5, m1, m4, m0 @@ -245,10 +246,10 @@ cglobal crc, 4, 6, 6+4*ARCH_X86_64, 0x10 FOLD_SINGLE m5, m1, m4, m2 .reduce_128_to_64: - movu m4, [r0 + 32] + movu m4, [CTX + 32] FOLD_128_TO_64 %1, m1, m4, m5 .reduce_64_to_32: - movu m4, [r0 + 48] + movu m4, [CTX + 48] FOLD_64_TO_32 %1, m1, m4, m5 RET @@ -261,7 +262,7 @@ cglobal crc, 4, 6, 6+4*ARCH_X86_64, 0x10 pshufb m1, m10 %endif mov r4, 16 - movu m4, [r0 + 16] + movu m4, [CTX + 16] jmp .fold_1x_pre .less_than_16bytes: diff --git a/libavutil/x86/crc.h b/libavutil/x86/crc.h index 936ca54d37..a5799c5409 100644 --- a/libavutil/x86/crc.h +++ b/libavutil/x86/crc.h @@ -24,6 +24,7 @@ #include "config.h" #include "libavutil/attributes.h" #include "libavutil/attributes_internal.h" +#include "libavutil/avassert.h" #include "libavutil/cpu.h" #include "libavutil/crc.h" #include "libavutil/reverse.h" @@ -37,50 +38,64 @@ uint32_t ff_crc_le_clmul(const AVCRC *ctx, uint32_t crc, const uint8_t *buffer, size_t length); FF_VISIBILITY_POP_HIDDEN -static const AVCRC crc_table_clmul[AV_CRC_MAX][16] = { +enum { + CRC_C = 0, + CLMUL_BE, + CLMUL_LE, +}; + +static const AVCRC crc_table_clmul[AV_CRC_MAX][17] = { [AV_CRC_8_ATM] = { + CLMUL_BE, 0x32000000, 0x0, 0xbc000000, 0x0, 0xc4000000, 0x0, 0x94000000, 0x0, 0x62000000, 0x0, 0x79000000, 0x0, 0x07156a16, 0x1, 0x07000000, 0x1, }, [AV_CRC_8_EBU] = { + CLMUL_BE, 0xb5000000, 0x0, 0xf3000000, 0x0, 0xfc000000, 0x0, 0x0d000000, 0x0, 0x6a000000, 0x0, 0x65000000, 0x0, 0x1c4b8192, 0x1, 0x1d000000, 0x1, }, [AV_CRC_16_ANSI] = { + CLMUL_BE, 
0xf9e30000, 0x0, 0x807d0000, 0x0, 0xf9130000, 0x0, 0xff830000, 0x0, 0x807b0000, 0x0, 0x86630000, 0x0, 0xfffbffe7, 0x1, 0x80050000, 0x1, }, [AV_CRC_16_CCITT] = { + CLMUL_BE, 0x60190000, 0x0, 0x59b00000, 0x0, 0xd5f60000, 0x0, 0x45630000, 0x0, 0xaa510000, 0x0, 0xeb230000, 0x0, 0x11303471, 0x1, 0x10210000, 0x1, }, [AV_CRC_24_IEEE] = { + CLMUL_BE, 0x1f428700, 0x0, 0x467d2400, 0x0, 0x2c8c9d00, 0x0, 0x64e4d700, 0x0, 0xd9fe8c00, 0x0, 0xfd7e0c00, 0x0, 0xf845fe24, 0x1, 0x864cfb00, 0x1, }, [AV_CRC_32_IEEE] = { + CLMUL_BE, 0x8833794c, 0x0, 0xe6228b11, 0x0, 0xc5b9cd4c, 0x0, 0xe8a45605, 0x0, 0x490d678d, 0x0, 0xf200aa66, 0x0, 0x04d101df, 0x1, 0x04c11db7, 0x1, }, [AV_CRC_32_IEEE_LE] = { + CLMUL_LE, 0xc6e41596, 0x1, 0x54442bd4, 0x1, 0xccaa009e, 0x0, 0x751997d0, 0x1, 0xccaa009e, 0x0, 0x63cd6124, 0x1, 0xf7011640, 0x1, 0xdb710641, 0x1, }, [AV_CRC_16_ANSI_LE] = { + CLMUL_LE, 0x0000bffa, 0x0, 0x1b0c2, 0x0, 0x00018cc2, 0x0, 0x1d0c2, 0x0, 0x00018cc2, 0x0, 0x1bc02, 0x0, @@ -158,23 +173,62 @@ static inline void crc_init_x86(AVCRC *ctx, int le, int bits, uint32_t poly, int x6 = xnmodp(96, poly_, 32, &div, le); x8 = poly_ | (1ULL << 32); } - ctx[0] = (AVCRC)x1; - ctx[1] = (AVCRC)(x1 >> 32); - ctx[2] = (AVCRC)x2; - ctx[3] = (AVCRC)(x2 >> 32); - ctx[4] = (AVCRC)x3; - ctx[5] = (AVCRC)(x3 >> 32); - ctx[6] = (AVCRC)x4; - ctx[7] = (AVCRC)(x4 >> 32); - ctx[8] = (AVCRC)x5; - ctx[9] = (AVCRC)(x5 >> 32); - ctx[10] = (AVCRC)x6; - ctx[11] = (AVCRC)(x6 >> 32); - ctx[12] = (AVCRC)x7; - ctx[13] = (AVCRC)(x7 >> 32); - ctx[14] = (AVCRC)x8; - ctx[15] = (AVCRC)(x8 >> 32); + ctx[0] = le ? 
CLMUL_LE : CLMUL_BE; + ctx[1] = (AVCRC)x1; + ctx[2] = (AVCRC)(x1 >> 32); + ctx[3] = (AVCRC)x2; + ctx[4] = (AVCRC)(x2 >> 32); + ctx[5] = (AVCRC)x3; + ctx[6] = (AVCRC)(x3 >> 32); + ctx[7] = (AVCRC)x4; + ctx[8] = (AVCRC)(x4 >> 32); + ctx[9] = (AVCRC)x5; + ctx[10] = (AVCRC)(x5 >> 32); + ctx[11] = (AVCRC)x6; + ctx[12] = (AVCRC)(x6 >> 32); + ctx[13] = (AVCRC)x7; + ctx[14] = (AVCRC)(x7 >> 32); + ctx[15] = (AVCRC)x8; + ctx[16] = (AVCRC)(x8 >> 32); } #endif +static inline const AVCRC *ff_crc_get_table_x86(AVCRCId crc_id) +{ +#if HAVE_CLMUL_EXTERNAL + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_CLMUL(cpu_flags)) { + return crc_table_clmul[crc_id]; + } +#endif + return NULL; +} + +static inline av_cold int ff_crc_init_x86(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size) +{ +#if HAVE_CLMUL_EXTERNAL + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_CLMUL(cpu_flags)) { + crc_init_x86(ctx, le, bits, poly, ctx_size); + return 1; + } +#endif + return 0; +} + +static inline uint32_t ff_crc_x86(const AVCRC *ctx, uint32_t crc, + const uint8_t *buffer, size_t length) +{ + switch (ctx[0]) { +#if HAVE_CLMUL_EXTERNAL + case CLMUL_BE: return ff_crc_clmul(ctx, crc, buffer, length); + case CLMUL_LE: return ff_crc_le_clmul(ctx, crc, buffer, length); +#endif + default: av_unreachable("x86 CRC only uses CLMUL_BE and CLMUL_LE"); + } + return 0; +} + #endif /* AVUTIL_X86_CRC_H */ -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
