[FFmpeg-devel] [PATCH] avutil/crc: add x86 SSE4.2 clmul (PR #21119)

mkver via ffmpeg-devel Sun, 07 Dec 2025 03:19:59 -0800

PR #21119 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21119
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21119.patch


This adds the clmul implementations from #20751 without the avpriv function 
pointer indirection. It also adds a checkasm test for this.

I can also add the AVX512 implementation from said PR if people agree on this 
approach.


>From def67a6565588d8554c1cecb0d2c0ac3d5ffa011 Mon Sep 17 00:00:00 2001
From: Shreesh Adiga <[email protected]>
Date: Sun, 26 Oct 2025 10:45:55 +0530
Subject: [PATCH 1/5] avutil/cpu: add x86 CPU feature flag for clmul

---
 configure                 | 4 ++++
 libavutil/cpu.c           | 1 +
 libavutil/cpu.h           | 1 +
 libavutil/tests/cpu.c     | 1 +
 libavutil/x86/cpu.c       | 2 ++
 libavutil/x86/cpu.h       | 2 ++
 tests/checkasm/checkasm.c | 1 +
 7 files changed, 12 insertions(+)

diff --git a/configure b/configure
index 04e086c32a..abdd129fa2 100755
--- a/configure
+++ b/configure
@@ -469,6 +469,7 @@ Optimization options (experts only):
   --disable-avx512         disable AVX-512 optimizations
   --disable-avx512icl      disable AVX-512ICL optimizations
   --disable-aesni          disable AESNI optimizations
+  --disable-clmul          disable CLMUL optimizations
   --disable-armv5te        disable armv5te optimizations
   --disable-armv6          disable armv6 optimizations
   --disable-armv6t2        disable armv6t2 optimizations
@@ -2252,6 +2253,7 @@ ARCH_EXT_LIST_WASM="
 
 ARCH_EXT_LIST_X86_SIMD="
     aesni
+    clmul
     amd3dnow
     amd3dnowext
     avx
@@ -2870,6 +2872,7 @@ ssse3_deps="sse3"
 sse4_deps="ssse3"
 sse42_deps="sse4"
 aesni_deps="sse42"
+clmul_deps="sse42"
 avx_deps="sse42"
 xop_deps="avx"
 fma3_deps="avx"
@@ -8191,6 +8194,7 @@ if enabled x86; then
     echo "SSE enabled               ${sse-no}"
     echo "SSSE3 enabled             ${ssse3-no}"
     echo "AESNI enabled             ${aesni-no}"
+    echo "CLMUL enabled             ${clmul-no}"
     echo "AVX enabled               ${avx-no}"
     echo "AVX2 enabled              ${avx2-no}"
     echo "AVX-512 enabled           ${avx512-no}"
diff --git a/libavutil/cpu.c b/libavutil/cpu.c
index 8f9b785ebc..0ddbc50da5 100644
--- a/libavutil/cpu.c
+++ b/libavutil/cpu.c
@@ -149,6 +149,7 @@ int av_parse_cpu_caps(unsigned *flags, const char *s)
         { "3dnowext", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 
AV_CPU_FLAG_3DNOWEXT },    .unit = "flags" },
         { "cmov",     NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_CMOV    
 },    .unit = "flags" },
         { "aesni",    NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_AESNI   
 },    .unit = "flags" },
+        { "clmul",    NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_CLMUL   
 },    .unit = "flags" },
         { "avx512"  , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_AVX512  
 },    .unit = "flags" },
         { "avx512icl",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 
AV_CPU_FLAG_AVX512ICL   }, .unit = "flags" },
         { "slowgather", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 
AV_CPU_FLAG_SLOW_GATHER }, .unit = "flags" },
diff --git a/libavutil/cpu.h b/libavutil/cpu.h
index a06fc08e56..600754867a 100644
--- a/libavutil/cpu.h
+++ b/libavutil/cpu.h
@@ -47,6 +47,7 @@
 #define AV_CPU_FLAG_SSE4         0x0100 ///< Penryn SSE4.1 functions
 #define AV_CPU_FLAG_SSE42        0x0200 ///< Nehalem SSE4.2 functions
 #define AV_CPU_FLAG_AESNI       0x80000 ///< Advanced Encryption Standard 
functions
+#define AV_CPU_FLAG_CLMUL      0x400000 ///< Carry-less Multiplication 
instruction
 #define AV_CPU_FLAG_AVX          0x4000 ///< AVX functions: requires OS 
support even if YMM registers aren't used
 #define AV_CPU_FLAG_AVXSLOW   0x8000000 ///< AVX supported, but slow when 
using YMM registers (e.g. Bulldozer)
 #define AV_CPU_FLAG_XOP          0x0400 ///< Bulldozer XOP functions
diff --git a/libavutil/tests/cpu.c b/libavutil/tests/cpu.c
index fd2e32901d..6f8a0be2c3 100644
--- a/libavutil/tests/cpu.c
+++ b/libavutil/tests/cpu.c
@@ -88,6 +88,7 @@ static const struct {
     { AV_CPU_FLAG_BMI1,      "bmi1"       },
     { AV_CPU_FLAG_BMI2,      "bmi2"       },
     { AV_CPU_FLAG_AESNI,     "aesni"      },
+    { AV_CPU_FLAG_CLMUL,     "clmul"      },
     { AV_CPU_FLAG_AVX512,    "avx512"     },
     { AV_CPU_FLAG_AVX512ICL, "avx512icl"  },
     { AV_CPU_FLAG_SLOW_GATHER, "slowgather" },
diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
index 1a592f3bf4..5563f6cc3b 100644
--- a/libavutil/x86/cpu.c
+++ b/libavutil/x86/cpu.c
@@ -121,6 +121,8 @@ int ff_get_cpu_flags_x86(void)
             rval |= AV_CPU_FLAG_SSE2;
         if (ecx & 1)
             rval |= AV_CPU_FLAG_SSE3;
+        if (ecx & 0x2)
+            rval |= AV_CPU_FLAG_CLMUL;
         if (ecx & 0x00000200 )
             rval |= AV_CPU_FLAG_SSSE3;
         if (ecx & 0x00080000 )
diff --git a/libavutil/x86/cpu.h b/libavutil/x86/cpu.h
index 00e82255b1..af081b2ed8 100644
--- a/libavutil/x86/cpu.h
+++ b/libavutil/x86/cpu.h
@@ -44,6 +44,7 @@
 #define X86_FMA4(flags)             CPUEXT(flags, FMA4)
 #define X86_AVX2(flags)             CPUEXT(flags, AVX2)
 #define X86_AESNI(flags)            CPUEXT(flags, AESNI)
+#define X86_CLMUL(flags)            CPUEXT(flags, CLMUL)
 #define X86_AVX512(flags)           CPUEXT(flags, AVX512)
 
 #define EXTERNAL_MMX(flags)         CPUEXT_SUFFIX(flags, _EXTERNAL, MMX)
@@ -72,6 +73,7 @@
 #define EXTERNAL_AVX2_FAST(flags)   CPUEXT_SUFFIX_FAST2(flags, _EXTERNAL, 
AVX2, AVX)
 #define EXTERNAL_AVX2_SLOW(flags)   CPUEXT_SUFFIX_SLOW2(flags, _EXTERNAL, 
AVX2, AVX)
 #define EXTERNAL_AESNI(flags)       CPUEXT_SUFFIX(flags, _EXTERNAL, AESNI)
+#define EXTERNAL_CLMUL(flags)       CPUEXT_SUFFIX(flags, _EXTERNAL, CLMUL)
 #define EXTERNAL_AVX512(flags)      CPUEXT_SUFFIX(flags, _EXTERNAL, AVX512)
 #define EXTERNAL_AVX512ICL(flags)   CPUEXT_SUFFIX(flags, _EXTERNAL, AVX512ICL)
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 7edc8e4e6e..80b08a2532 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -395,6 +395,7 @@ static const struct {
     { "SSE4.1",     "sse4",      AV_CPU_FLAG_SSE4 },
     { "SSE4.2",     "sse42",     AV_CPU_FLAG_SSE42 },
     { "AES-NI",     "aesni",     AV_CPU_FLAG_AESNI },
+    { "CLMUL",      "clmul",     AV_CPU_FLAG_CLMUL },
     { "AVX",        "avx",       AV_CPU_FLAG_AVX },
     { "XOP",        "xop",       AV_CPU_FLAG_XOP },
     { "FMA3",       "fma3",      AV_CPU_FLAG_FMA3 },
-- 
2.49.1


>From cf85126b10fc5658f11af9b363919c5d55ec0054 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 7 Dec 2025 10:38:05 +0100
Subject: [PATCH 2/5] tests/checkasm: Add support for using opaques to decide
 whether to test

This is in preparation for adding checkasm support for av_crc(),
which will always call the same function, but uses different CRC
tables to distinguish different implementations.
This reuses checkasm_check_func() for this; one could also add
a new function or use unions. This would allow to avoid casting
const away in the crc test to be added. It would also allow
to avoid converting function pointers to void* (which ISO C
does not allow).

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 tests/checkasm/checkasm.h | 44 +++++++++++++++++++++++----------------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 9f4fb8b283..d1d524c730 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -199,6 +199,7 @@ extern uint64_t bench_runs;
 
 /* Decide whether or not the specified function needs to be tested */
 #define check_func(func, ...) (checkasm_save_context(), func_ref = 
checkasm_check_func((func_new = func), __VA_ARGS__))
+#define check_opaque(opaque, ...) (checkasm_save_context(), 
checkasm_check_func(opaque, __VA_ARGS__))
 
 /* Declare the function prototype. The first argument is the return value, the 
remaining
  * arguments are the function parameters. Naming parameters is optional. */
@@ -214,10 +215,15 @@ extern uint64_t bench_runs;
 
 /* Call the reference function */
 #define call_ref(...)\
+    call_ref_ext((func_type *)func_ref, __VA_ARGS__)
+
+#define call_ref_ext(func, ...) \
     (checkasm_set_signal_handler_state(1),\
-     ((func_type *)func_ref)(__VA_ARGS__));\
+     (func)(__VA_ARGS__));\
     checkasm_set_signal_handler_state(0)
 
+#define call_new(...) call_new_ext(((func_type *)func_new), __VA_ARGS__)
+
 #if ARCH_X86 && HAVE_X86ASM
 /* Verifies that clobbered callee-saved registers are properly saved and 
restored
  * and that either no MMX registers are touched or emms is issued */
@@ -249,10 +255,10 @@ void checkasm_stack_clobber(uint64_t clobber, ...);
         ((cpu_flags) & av_get_cpu_flags()) ? (void 
*)checkasm_checked_call_emms : \
                                              (void *)checkasm_checked_call;
 #define CLOB (UINT64_C(0xdeadbeefdeadbeef))
-#define call_new(...) (checkasm_set_signal_handler_state(1),\
+#define call_new_ext(func, ...) (checkasm_set_signal_handler_state(1),\
                        
checkasm_stack_clobber(CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,\
                                               
CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB),\
-                       checked_call(func_new, 0, 0, 0, 0, 0, __VA_ARGS__));\
+                       checked_call((func), 0, 0, 0, 0, 0, __VA_ARGS__));\
                       checkasm_set_signal_handler_state(0)
 #elif ARCH_X86_32
 #define declare_new(ret, ...) ret (*checked_call)(void *, __VA_ARGS__) = (void 
*)checkasm_checked_call;
@@ -260,9 +266,9 @@ void checkasm_stack_clobber(uint64_t clobber, ...);
 #define declare_new_emms(cpu_flags, ret, ...) ret (*checked_call)(void *, 
__VA_ARGS__) = \
         ((cpu_flags) & av_get_cpu_flags()) ? (void 
*)checkasm_checked_call_emms :        \
                                              (void *)checkasm_checked_call;
-#define call_new(...)\
+#define call_new_ext(func, ...)\
     (checkasm_set_signal_handler_state(1),\
-     checked_call(func_new, __VA_ARGS__));\
+     checked_call((func), __VA_ARGS__));\
     checkasm_set_signal_handler_state(0)
 #endif
 #elif ARCH_ARM && HAVE_ARMV5TE_EXTERNAL
@@ -275,9 +281,9 @@ extern void (*checkasm_checked_call)(void *func, int dummy, 
...);
 #define declare_new(ret, ...) ret (*checked_call)(void *, int dummy, 
__VA_ARGS__, \
                                                   int, int, int, int, int, 
int, int, int, \
                                                   int, int, int, int, int, 
int, int) = (void *)checkasm_checked_call;
-#define call_new(...) \
+#define call_new_ext(func, ...) \
     (checkasm_set_signal_handler_state(1),\
-     checked_call(func_new, 0, __VA_ARGS__, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 
0, 0, 0, 0));\
+     checked_call((func), 0, __VA_ARGS__, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 
0, 0, 0, 0));\
     checkasm_set_signal_handler_state(0)
 #elif ARCH_AARCH64 && !defined(__APPLE__)
 void checkasm_stack_clobber(uint64_t clobber, ...);
@@ -287,10 +293,10 @@ void checkasm_checked_call(void *func, ...);
                                                   int, int, int, int, int, 
int, int)\
                               = (void *)checkasm_checked_call;
 #define CLOB (UINT64_C(0xdeadbeefdeadbeef))
-#define call_new(...) (checkasm_set_signal_handler_state(1),\
+#define call_new_ext(func, ...) (checkasm_set_signal_handler_state(1),\
                        
checkasm_stack_clobber(CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,\
                                               
CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB,CLOB),\
-                      checked_call(func_new, 0, 0, 0, 0, 0, 0, 0, __VA_ARGS__,\
+                      checked_call((func), 0, 0, 0, 0, 0, 0, 0, __VA_ARGS__,\
                                    7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 
0));\
                      checkasm_set_signal_handler_state(0)
 #elif ARCH_RISCV
@@ -300,15 +306,15 @@ void *checkasm_get_wrapper(void);
 #if HAVE_RV && (__riscv_xlen == 64) && defined (__riscv_d)
 #define declare_new(ret, ...) \
     ret (*checked_call)(__VA_ARGS__) = checkasm_get_wrapper();
-#define call_new(...) \
+#define call_new_ext(func, ...) \
     (checkasm_set_signal_handler_state(1),\
-     checkasm_set_function(func_new), checked_call(__VA_ARGS__));\
+     checkasm_set_function(func), checked_call(__VA_ARGS__));\
     checkasm_set_signal_handler_state(0)
 #else
 #define declare_new(ret, ...)
-#define call_new(...)\
+#define call_new_ext(func, ...)\
     (checkasm_set_signal_handler_state(1),\
-     ((func_type *)func_new)(__VA_ARGS__));\
+     (func)(__VA_ARGS__));\
     checkasm_set_signal_handler_state(0)
 #endif
 #else
@@ -316,9 +322,9 @@ void *checkasm_get_wrapper(void);
 #define declare_new_float(ret, ...)
 #define declare_new_emms(cpu_flags, ret, ...)
 /* Call the function */
-#define call_new(...)\
+#define call_new_ext(func, ...)\
     (checkasm_set_signal_handler_state(1),\
-     ((func_type *)func_new)(__VA_ARGS__));\
+     (func)(__VA_ARGS__));\
     checkasm_set_signal_handler_state(0)
 #endif
 
@@ -373,12 +379,12 @@ typedef struct CheckasmPerf {
     } while (0)
 
 /* Benchmark the function */
-#define bench_new(...)\
+#define bench(func, ...)\
     do {\
         if (checkasm_bench_func()) {\
             struct CheckasmPerf *perf = checkasm_get_perf_context();\
             av_unused const int sysfd = perf->sysfd;\
-            func_type *tfunc = func_new;\
+            func_type *tfunc = func;\
             uint64_t tsum = 0;\
             uint64_t ti, tcount = 0;\
             uint64_t t = 0; \
@@ -401,11 +407,13 @@ typedef struct CheckasmPerf {
         }\
     } while (0)
 #else
-#define bench_new(...) while(0)
+#define bench(func, ...) while(0)
 #define PERF_START(t)  while(0)
 #define PERF_STOP(t)   while(0)
 #endif
 
+#define bench_new(...) bench(func_new, __VA_ARGS__)
+
 #define BUF_RECT(type, name, w, h) \
     LOCAL_ALIGNED_32(type, name##_buf, [((h)+32)*(FFALIGN(w,64)+64) + 64]); \
     av_unused ptrdiff_t name##_stride = sizeof(type)*(FFALIGN(w,64)+64); \
-- 
2.49.1


>From 814b28d8adaf3e3db33464dd2f15fea1b494df92 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 7 Dec 2025 10:50:39 +0100
Subject: [PATCH 3/5] tests/checkasm: Add CRC test

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 tests/checkasm/Makefile   |  1 +
 tests/checkasm/checkasm.c |  1 +
 tests/checkasm/checkasm.h |  1 +
 tests/checkasm/crc.c      | 68 +++++++++++++++++++++++++++++++++++++++
 tests/fate/checkasm.mak   |  1 +
 5 files changed, 72 insertions(+)
 create mode 100644 tests/checkasm/crc.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 1c34619249..014fccc1df 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -91,6 +91,7 @@ CHECKASMOBJS-$(CONFIG_SWSCALE)  += $(SWSCALEOBJS)
 # libavutil tests
 AVUTILOBJS                              += aes.o
 AVUTILOBJS                              += av_tx.o
+AVUTILOBJS                              += crc.o
 AVUTILOBJS                              += fixed_dsp.o
 AVUTILOBJS                              += float_dsp.o
 AVUTILOBJS                              += lls.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 80b08a2532..1147d82903 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -337,6 +337,7 @@ static const struct {
 #endif
 #if CONFIG_AVUTIL
         { "aes",       checkasm_check_aes },
+        { "crc",       checkasm_check_crc },
         { "fixed_dsp", checkasm_check_fixed_dsp },
         { "float_dsp", checkasm_check_float_dsp },
         { "lls",       checkasm_check_lls },
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index d1d524c730..24ba15493b 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -93,6 +93,7 @@ void checkasm_check_bswapdsp(void);
 void checkasm_check_cavsdsp(void);
 void checkasm_check_colordetect(void);
 void checkasm_check_colorspace(void);
+void checkasm_check_crc(void);
 void checkasm_check_dcadsp(void);
 void checkasm_check_diracdsp(void);
 void checkasm_check_exrdsp(void);
diff --git a/tests/checkasm/crc.c b/tests/checkasm/crc.c
new file mode 100644
index 0000000000..d5380ee4da
--- /dev/null
+++ b/tests/checkasm/crc.c
@@ -0,0 +1,68 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+
+#include "checkasm.h"
+#include "libavutil/attributes.h"
+// Undefine av_pure so that calls to av_crc are not optimized away.
+#undef av_pure
+#define av_pure
+#include "libavutil/crc.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+
+void checkasm_check_crc(void)
+{
+    declare_func(uint32_t, const AVCRC *ctx, uint32_t crc,
+                 const uint8_t *buffer, size_t length);
+
+    for (unsigned i = 0; i < AV_CRC_MAX; ++i) {
+        const AVCRC *table_new = av_crc_get_table(i);
+        const AVCRC *table_ref;
+
+        if (table_ref = check_opaque((AVCRC*)table_new, "crc_%u", i)) {
+            DECLARE_ALIGNED(4, uint8_t, buf)[8192];
+            size_t offset = rnd() & 31;
+            static size_t sizes[AV_CRC_MAX];
+            static unsigned sizes_initialized = 0;
+            uint32_t prev_crc = rnd();
+
+            if (!(sizes_initialized & (1 << i))) {
+                sizes_initialized |= 1 << i;
+                sizes[i] = rnd() % (sizeof(buf) - 1 - offset);
+            }
+
+            size_t size = sizes[i];
+
+            for (size_t j = 0; j < sizeof(buf); j += 4)
+                AV_WN32A(buf + j, rnd());
+
+            uint32_t crc_ref = call_ref_ext(av_crc, table_ref, prev_crc, buf + 
offset, size);
+            uint32_t crc_new = call_new_ext(av_crc, table_new, prev_crc, buf + 
offset, size);
+
+            if (crc_ref != crc_new)
+                fail();
+
+            bench(av_crc, table_new, prev_crc, buf + offset, size);
+        }
+    }
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index f26e534591..3e97cd43c2 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -10,6 +10,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp                       
          \
                 fate-checkasm-blockdsp                                  \
                 fate-checkasm-bswapdsp                                  \
                 fate-checkasm-cavsdsp                                   \
+                fate-checkasm-crc                                       \
                 fate-checkasm-dcadsp                                    \
                 fate-checkasm-diracdsp                                  \
                 fate-checkasm-exrdsp                                    \
-- 
2.49.1


>From 42276817396f2a6c167d6d1e3a96e6d6921ecfc1 Mon Sep 17 00:00:00 2001
From: Shreesh Adiga <[email protected]>
Date: Sun, 26 Oct 2025 16:07:17 +0530
Subject: [PATCH 4/5] avutil/crc: add x86 SSE4.2 clmul SIMD implementation for
 av_crc

Implemented the algorithm described in the paper titled
"Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
by Intel.
It is not used yet; the integration will be added in a separate commit.

Observed near 10x speedup on AMD Zen4 7950x:
av_crc_c:                                            22057.0 ( 1.00x)
av_crc_clmul:                                         2202.8 (10.01x)
---
 libavutil/x86/crc.asm | 297 ++++++++++++++++++++++++++++++++++++++++++
 libavutil/x86/crc.h   | 180 +++++++++++++++++++++++++
 2 files changed, 477 insertions(+)
 create mode 100644 libavutil/x86/crc.asm
 create mode 100644 libavutil/x86/crc.h

diff --git a/libavutil/x86/crc.asm b/libavutil/x86/crc.asm
new file mode 100644
index 0000000000..fd06acfdd2
--- /dev/null
+++ b/libavutil/x86/crc.asm
@@ -0,0 +1,297 @@
+;*****************************************************************************
+;* Copyright (c) 2025 Shreesh Adiga <[email protected]>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86util.asm"
+
+SECTION RODATA
+reverse_shuffle: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+partial_bytes_shuf_tab: db 255, 254, 253, 252, 251, 250, 249, 248,\
+                           247, 246, 245, 244, 243, 242, 241, 240,\
+                             0,   1,   2,   3,   4,   5,   6,   7,\
+                             8,   9,  10,  11,  12,  13,  14,  15
+
+SECTION .text
+
+%macro FOLD_128_TO_64 4
+; %1 LE ; %2 128 bit fold reg ; %3 pre-computed constant reg ; %4 tmp reg
+%if %1 == 1
+    mova      %4, %2
+    pclmulqdq %2, %3, 0x00
+    psrldq    %4, 8
+    pxor      %2, %4
+    mova      %4, %2
+    psllq     %4, 32
+    pclmulqdq %4, %3, 0x10
+    pxor      %2, %4
+%else
+    movq      %4, %2
+    pclmulqdq %2, %3, 0x11
+    pslldq    %4, 4
+    pxor      %4, %2
+    mova      %2, %4
+    pclmulqdq %4, %3, 0x01
+    pxor      %2, %4
+%endif
+%endmacro
+
+%macro FOLD_64_TO_32 4
+; %1 LE ; %2 128 bit fold reg ; %3 pre-computed constant reg ; %4 tmp reg
+%if %1 == 1
+    pxor      %4, %4
+    pblendw   %4, %2, 0xfc
+    mova      %2, %4
+    pclmulqdq %4, %3, 0x00
+    pxor      %4, %2
+    pclmulqdq %4, %3, 0x10
+    pxor      %2, %4
+    pextrd   eax, %2, 2
+%else
+    mova      %4, %2
+    pclmulqdq %2, %3, 0x00
+    pclmulqdq %2, %3, 0x11
+    pxor      %2, %4
+    movd     eax, %2
+    bswap    eax
+%endif
+%endmacro
+
+%macro FOLD_SINGLE 4
+; %1 temp ; %2 fold reg ; %3 pre-computed constants ; %4 input data block
+    mova      %1, %2
+    pclmulqdq %1, %3, 0x01
+    pxor      %1, %4
+    pclmulqdq %2, %3, 0x10
+    pxor      %2, %1
+%endmacro
+
+%macro XMM_SHIFT_LEFT 4
+; %1 xmm input reg ; %2 shift bytes amount ; %3 temp xmm register ; %4 temp gpr
+    lea    %4, [partial_bytes_shuf_tab]
+    movu   %3, [%4 + 16 - (%2)]
+    pshufb %1, %3
+%endmacro
+
+%macro MEMCPY_0_15 6
+; %1 dst ; %2 src ; %3 len ; %4, %5 temp gpr register; %6 done label
+    cmp %3, 8
+    jae .between_8_15
+    cmp %3, 4
+    jae .between_4_7
+    cmp %3, 1
+    ja .between_2_3
+    jb %6
+    mov  %4b, [%2]
+    mov [%1], %4b
+    jmp %6
+
+.between_8_15:
+%if ARCH_X86_64
+    mov           %4q, [%2]
+    mov           %5q, [%2 + %3 - 8]
+    mov          [%1], %4q
+    mov [%1 + %3 - 8], %5q
+    jmp %6
+%else
+    xor            %5, %5
+.copy4b:
+        mov       %4d, [%2 + %5]
+        mov [%1 + %5], %4d
+        add        %5, 4
+        lea        %4, [%5 + 4]
+        cmp        %4, %3
+        jb        .copy4b
+
+    mov           %4d, [%2 + %3 - 4]
+    mov [%1 + %3 - 4], %4d
+    jmp %6
+%endif
+.between_4_7:
+    mov           %4d, [%2]
+    mov           %5d, [%2 + %3 - 4]
+    mov          [%1], %4d
+    mov [%1 + %3 - 4], %5d
+    jmp %6
+.between_2_3:
+    mov           %4w, [%2]
+    mov           %5w, [%2 + %3 - 2]
+    mov          [%1], %4w
+    mov [%1 + %3 - 2], %5w
+    ; fall through, %6 label is expected to be next instruction
+%endmacro
+
+%macro CRC 1
+;-----------------------------------------------------------------------------------------------
+; ff_crc[_le]_clmul(const uint8_t *ctx, uint32_t crc, const uint8_t *buffer, 
size_t length
+;-----------------------------------------------------------------------------------------------
+; %1 == 1 - LE format
+%if %1 == 1
+cglobal crc_le, 4, 6, 7+4*ARCH_X86_64, 0x10
+%else
+cglobal crc,    4, 6, 6+4*ARCH_X86_64, 0x10
+%endif
+
+%if ARCH_X86_32
+    %define m10 m6
+%endif
+
+%if %1 == 0
+    movu  m10, [reverse_shuffle]
+%endif
+
+    movd   m4, r1d
+%if ARCH_X86_32
+    ; skip 4x unrolled loop due to only 8 XMM reg being available in X86_32
+    jmp   .less_than_64bytes
+%else
+    cmp    r3, 64
+    jb    .less_than_64bytes
+    movu   m1, [r2 +  0]
+    movu   m3, [r2 + 16]
+    movu   m2, [r2 + 32]
+    movu   m0, [r2 + 48]
+    pxor   m1, m4
+%if %1 == 0
+    pshufb m0, m10
+    pshufb m1, m10
+    pshufb m2, m10
+    pshufb m3, m10
+%endif
+    mov    r4, 64
+    cmp    r3, 128
+    jb    .reduce_4x_to_1
+    movu   m4, [r0]
+
+.fold_4x_loop:
+        movu        m6, [r2 + r4 +  0]
+        movu        m7, [r2 + r4 + 16]
+        movu        m8, [r2 + r4 + 32]
+        movu        m9, [r2 + r4 + 48]
+%if %1 == 0
+        pshufb      m6, m10
+        pshufb      m7, m10
+        pshufb      m8, m10
+        pshufb      m9, m10
+%endif
+        FOLD_SINGLE m5, m1, m4, m6
+        FOLD_SINGLE m5, m3, m4, m7
+        FOLD_SINGLE m5, m2, m4, m8
+        FOLD_SINGLE m5, m0, m4, m9
+        add         r4, 64
+        lea         r5, [r4 + 64]
+        cmp         r5, r3
+        jbe        .fold_4x_loop
+
+.reduce_4x_to_1:
+    movu        m4, [r0 + 16]
+    FOLD_SINGLE m5, m1, m4, m3
+    FOLD_SINGLE m5, m1, m4, m2
+    FOLD_SINGLE m5, m1, m4, m0
+%endif
+
+.fold_1x_pre:
+    lea  r5, [r4 + 16]
+    cmp  r5, r3
+    ja  .partial_block
+
+.fold_1x_loop:
+        movu        m2, [r2 + r4]
+%if %1 == 0
+        pshufb      m2, m10
+%endif
+        FOLD_SINGLE m5, m1, m4, m2
+        add         r4, 16
+        lea         r5, [r4 + 16]
+        cmp         r5, r3
+        jbe        .fold_1x_loop
+
+.partial_block:
+    cmp         r4, r3
+    jae        .reduce_128_to_64
+    movu        m2, [r2 + r3 - 16]
+    and         r3, 0xf
+    lea         r4, [partial_bytes_shuf_tab]
+    movu        m0, [r3 + r4]
+%if %1 == 0
+    pshufb      m1, m10
+%endif
+    mova        m3, m1
+    pcmpeqd     m5, m5 ; m5 = _mm_set1_epi8(0xff)
+    pxor        m5, m0
+    pshufb      m3, m5
+    pblendvb    m2, m3, m0
+    pshufb      m1, m0
+%if %1 == 0
+    pshufb      m1, m10
+    pshufb      m2, m10
+%endif
+    FOLD_SINGLE m5, m1, m4, m2
+
+.reduce_128_to_64:
+    movu           m4, [r0 + 32]
+    FOLD_128_TO_64 %1, m1, m4, m5
+.reduce_64_to_32:
+    movu           m4, [r0 + 48]
+    FOLD_64_TO_32  %1, m1, m4, m5
+    RET
+
+.less_than_64bytes:
+    cmp    r3, 16
+    jb    .less_than_16bytes
+    movu   m1, [r2]
+    pxor   m1, m4
+%if %1 == 0
+    pshufb m1, m10
+%endif
+    mov    r4, 16
+    movu   m4, [r0 + 16]
+    jmp   .fold_1x_pre
+
+.less_than_16bytes:
+    pxor           m1, m1
+    movu        [rsp], m1
+    MEMCPY_0_15   rsp, r2, r3, r1, r4, .memcpy_done
+
+.memcpy_done:
+    movu           m1, [rsp]
+    pxor           m1, m4
+    cmp            r3, 5
+    jb            .less_than_5bytes
+    XMM_SHIFT_LEFT m1, (16 - r3), m2, r4
+%if %1 == 0
+    pshufb         m1, m10
+%endif
+    jmp           .reduce_128_to_64
+
+.less_than_5bytes:
+%if %1 == 0
+    XMM_SHIFT_LEFT m1, (4 - r3), m2, r4
+    movq          m10, [reverse_shuffle + 8] ; 0x0001020304050607
+    pshufb         m1, m10
+%else
+    XMM_SHIFT_LEFT m1, (8 - r3), m2, r4
+%endif
+    jmp .reduce_64_to_32
+
+%endmacro
+
+INIT_XMM clmul
+CRC 0
+CRC 1
diff --git a/libavutil/x86/crc.h b/libavutil/x86/crc.h
new file mode 100644
index 0000000000..936ca54d37
--- /dev/null
+++ b/libavutil/x86/crc.h
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2025 Shreesh Adiga <[email protected]>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_X86_CRC_H
+#define AVUTIL_X86_CRC_H
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/attributes_internal.h"
+#include "libavutil/cpu.h"
+#include "libavutil/crc.h"
+#include "libavutil/reverse.h"
+#include "libavutil/x86/cpu.h"
+
+#if HAVE_CLMUL_EXTERNAL
+FF_VISIBILITY_PUSH_HIDDEN
+uint32_t ff_crc_clmul(const AVCRC *ctx, uint32_t crc,
+                      const uint8_t *buffer, size_t length);
+uint32_t ff_crc_le_clmul(const AVCRC *ctx, uint32_t crc,
+                         const uint8_t *buffer, size_t length);
+FF_VISIBILITY_POP_HIDDEN
+
+static const AVCRC crc_table_clmul[AV_CRC_MAX][16] = {
+    [AV_CRC_8_ATM] = {
+        0x32000000, 0x0, 0xbc000000, 0x0,
+        0xc4000000, 0x0, 0x94000000, 0x0,
+        0x62000000, 0x0, 0x79000000, 0x0,
+        0x07156a16, 0x1, 0x07000000, 0x1,
+    },
+    [AV_CRC_8_EBU] = {
+        0xb5000000, 0x0, 0xf3000000, 0x0,
+        0xfc000000, 0x0, 0x0d000000, 0x0,
+        0x6a000000, 0x0, 0x65000000, 0x0,
+        0x1c4b8192, 0x1, 0x1d000000, 0x1,
+    },
+    [AV_CRC_16_ANSI] = {
+        0xf9e30000, 0x0, 0x807d0000, 0x0,
+        0xf9130000, 0x0, 0xff830000, 0x0,
+        0x807b0000, 0x0, 0x86630000, 0x0,
+        0xfffbffe7, 0x1, 0x80050000, 0x1,
+    },
+    [AV_CRC_16_CCITT] = {
+        0x60190000, 0x0, 0x59b00000, 0x0,
+        0xd5f60000, 0x0, 0x45630000, 0x0,
+        0xaa510000, 0x0, 0xeb230000, 0x0,
+        0x11303471, 0x1, 0x10210000, 0x1,
+    },
+    [AV_CRC_24_IEEE] = {
+        0x1f428700, 0x0, 0x467d2400, 0x0,
+        0x2c8c9d00, 0x0, 0x64e4d700, 0x0,
+        0xd9fe8c00, 0x0, 0xfd7e0c00, 0x0,
+        0xf845fe24, 0x1, 0x864cfb00, 0x1,
+    },
+    [AV_CRC_32_IEEE] = {
+        0x8833794c, 0x0, 0xe6228b11, 0x0,
+        0xc5b9cd4c, 0x0, 0xe8a45605, 0x0,
+        0x490d678d, 0x0, 0xf200aa66, 0x0,
+        0x04d101df, 0x1, 0x04c11db7, 0x1,
+    },
+    [AV_CRC_32_IEEE_LE] = {
+        0xc6e41596, 0x1, 0x54442bd4, 0x1,
+        0xccaa009e, 0x0, 0x751997d0, 0x1,
+        0xccaa009e, 0x0, 0x63cd6124, 0x1,
+        0xf7011640, 0x1, 0xdb710641, 0x1,
+    },
+    [AV_CRC_16_ANSI_LE] = {
+        0x0000bffa, 0x0, 0x1b0c2, 0x0,
+        0x00018cc2, 0x0, 0x1d0c2, 0x0,
+        0x00018cc2, 0x0, 0x1bc02, 0x0,
+        0xcfffbffe, 0x1, 0x14003, 0x0,
+    },
+};
+
+static uint64_t reverse(uint64_t p, unsigned int deg) {
+    uint64_t ret = 0;
+    int i;
+    for (i = 0; i < (deg / 8); i += 1) {
+        ret = (ret << 8) | (ff_reverse[p & 0xff]);
+        p >>= 8;
+    }
+    int rem = (deg + 1) - 8 * i;
+    ret = (ret << rem) | (ff_reverse[p & 0xff] >> (8 - rem));
+    return ret;
+}
+
+static uint64_t xnmodp(unsigned n, uint64_t poly, unsigned deg, uint64_t *div, 
int bitreverse)
+{
+    uint64_t mod, mask, high;
+
+    if (n < deg) {
+        *div = 0;
+        return poly;
+    }
+    mask = ((uint64_t)1 << deg) - 1;
+    poly &= mask;
+    mod = poly;
+    *div = 1;
+    deg--;
+    while (--n > deg) {
+        high = (mod >> deg) & 1;
+        *div = (*div << 1) | high;
+        mod <<= 1;
+        if (high)
+            mod ^= poly;
+    }
+    uint64_t ret = mod & mask;
+    if (bitreverse) {
+        *div = reverse(*div, deg) << 1;
+        return reverse(ret, deg) << 1;
+    }
+    return ret;
+}
+
+static inline void crc_init_x86(AVCRC *ctx, int le, int bits, uint32_t poly, 
int ctx_size)
+{
+    uint64_t poly_;
+    if (le) {
+        // convert the reversed representation to regular form
+        poly = reverse(poly, bits) >> 1;
+    }
+    // convert to 32 degree polynomial
+    poly_ = ((uint64_t)poly) << (32 - bits);
+
+    uint64_t x1, x2, x3, x4, x5, x6, x7, x8, div;
+    if (le) {
+        x1 = xnmodp(4 * 128 - 32, poly_, 32, &div, le);
+        x2 = xnmodp(4 * 128 + 32, poly_, 32, &div, le);
+        x3 = xnmodp(128 - 32, poly_, 32, &div, le);
+        x4 = xnmodp(128 + 32, poly_, 32, &div, le);
+        x5 = x3;
+        x6 = xnmodp(64, poly_, 32, &div, le);
+        x7 = div;
+        x8 = reverse(poly_ | (1ULL << 32), 32);
+    } else {
+        x1 = xnmodp(4 * 128 + 64, poly_, 32, &div, le);
+        x2 = xnmodp(4 * 128, poly_, 32, &div, le);
+        x3 = xnmodp(128 + 64, poly_, 32, &div, le);
+        x4 = xnmodp(128, poly_, 32, &div, le);
+        x5 = xnmodp(64, poly_, 32, &div, le);
+        x7 = div;
+        x6 = xnmodp(96, poly_, 32, &div, le);
+        x8 = poly_ | (1ULL << 32);
+    }
+    ctx[0]  = (AVCRC)x1;
+    ctx[1]  = (AVCRC)(x1 >> 32);
+    ctx[2]  = (AVCRC)x2;
+    ctx[3]  = (AVCRC)(x2 >> 32);
+    ctx[4]  = (AVCRC)x3;
+    ctx[5]  = (AVCRC)(x3 >> 32);
+    ctx[6]  = (AVCRC)x4;
+    ctx[7]  = (AVCRC)(x4 >> 32);
+    ctx[8]  = (AVCRC)x5;
+    ctx[9]  = (AVCRC)(x5 >> 32);
+    ctx[10] = (AVCRC)x6;
+    ctx[11] = (AVCRC)(x6 >> 32);
+    ctx[12] = (AVCRC)x7;
+    ctx[13] = (AVCRC)(x7 >> 32);
+    ctx[14] = (AVCRC)x8;
+    ctx[15] = (AVCRC)(x8 >> 32);
+}
+#endif
+
+#endif /* AVUTIL_X86_CRC_H */
-- 
2.49.1


>From f773d44580f79c8dba0e09e77aa74e4b9307bb95 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 7 Dec 2025 11:46:41 +0100
Subject: [PATCH 5/5] avutil/crc: Use x86 clmul for CRC when available

Observed near 10x speedup on AMD Zen4 7950x:
av_crc_c:                                            22057.0 ( 1.00x)
av_crc_clmul:                                         2202.8 (10.01x)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavutil/crc.c        | 24 ++++++++++++
 libavutil/x86/Makefile |  1 +
 libavutil/x86/crc.asm  | 11 +++---
 libavutil/x86/crc.h    | 88 ++++++++++++++++++++++++++++++++++--------
 4 files changed, 102 insertions(+), 22 deletions(-)

diff --git a/libavutil/crc.c b/libavutil/crc.c
index 703b56f4e0..be11516b84 100644
--- a/libavutil/crc.c
+++ b/libavutil/crc.c
@@ -25,6 +25,9 @@
 #include "bswap.h"
 #include "crc.h"
 #include "error.h"
+#if ARCH_X86
+#include "libavutil/x86/crc.h"
+#endif
 
 #if CONFIG_HARDCODED_TABLES
 static const AVCRC av_crc_table[AV_CRC_MAX][257] = {
@@ -348,6 +351,12 @@ int av_crc_init(AVCRC *ctx, int le, int bits, uint32_t 
poly, int ctx_size)
     if (ctx_size != sizeof(AVCRC) * 257 && ctx_size != sizeof(AVCRC) * 1024)
         return AVERROR(EINVAL);
 
+#if ARCH_X86
+    int done = ff_crc_init_x86(ctx, le, bits, poly, ctx_size);
+    if (done)
+        return 0;
+#endif
+
     for (i = 0; i < 256; i++) {
         if (le) {
             for (c = i, j = 0; j < 8; j++)
@@ -373,6 +382,14 @@ int av_crc_init(AVCRC *ctx, int le, int bits, uint32_t 
poly, int ctx_size)
 
 const AVCRC *av_crc_get_table(AVCRCId crc_id)
 {
+// Check for arch-specific extensions first to avoid initializing
+// ordinary CRC tables unnecessarily.
+#if ARCH_X86
+    const AVCRC *table = ff_crc_get_table_x86(crc_id);
+    if (table)
+        return table;
+#endif
+
 #if !CONFIG_HARDCODED_TABLES
     switch (crc_id) {
     case AV_CRC_8_ATM:      CRC_INIT_TABLE_ONCE(AV_CRC_8_ATM); break;
@@ -392,6 +409,13 @@ const AVCRC *av_crc_get_table(AVCRCId crc_id)
 uint32_t av_crc(const AVCRC *ctx, uint32_t crc,
                 const uint8_t *buffer, size_t length)
 {
+    if (ctx[0]) {
+#if ARCH_X86
+        return ff_crc_x86(ctx, crc, buffer, length);
+#endif
+    }
+    av_assert2(ctx[0] == 0);
+
     const uint8_t *end = buffer + length;
 
 #if !CONFIG_SMALL
diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile
index 4e1b4b1176..901298b6cb 100644
--- a/libavutil/x86/Makefile
+++ b/libavutil/x86/Makefile
@@ -4,6 +4,7 @@ 
EMMS_OBJS_$(HAVE_MMX_INLINE)_$(HAVE_MMX_EXTERNAL)_$(HAVE_MM_EMPTY) = x86/emms.o
 
 X86ASM-OBJS += x86/aes.o x86/aes_init.o                                 \
                x86/cpuid.o                                              \
+               x86/crc.o                                                \
                $(EMMS_OBJS__yes_)                                       \
                x86/fixed_dsp.o x86/fixed_dsp_init.o                     \
                x86/float_dsp.o x86/float_dsp_init.o                     \
diff --git a/libavutil/x86/crc.asm b/libavutil/x86/crc.asm
index fd06acfdd2..8f9b54544d 100644
--- a/libavutil/x86/crc.asm
+++ b/libavutil/x86/crc.asm
@@ -138,6 +138,7 @@ SECTION .text
 %endmacro
 
 %macro CRC 1
+%define CTX r0+4
 
;-----------------------------------------------------------------------------------------------
 ; ff_crc[_le]_clmul(const uint8_t *ctx, uint32_t crc, const uint8_t *buffer, 
size_t length
 
;-----------------------------------------------------------------------------------------------
@@ -177,7 +178,7 @@ cglobal crc,    4, 6, 6+4*ARCH_X86_64, 0x10
     mov    r4, 64
     cmp    r3, 128
     jb    .reduce_4x_to_1
-    movu   m4, [r0]
+    movu   m4, [CTX]
 
 .fold_4x_loop:
         movu        m6, [r2 + r4 +  0]
@@ -200,7 +201,7 @@ cglobal crc,    4, 6, 6+4*ARCH_X86_64, 0x10
         jbe        .fold_4x_loop
 
 .reduce_4x_to_1:
-    movu        m4, [r0 + 16]
+    movu        m4, [CTX + 16]
     FOLD_SINGLE m5, m1, m4, m3
     FOLD_SINGLE m5, m1, m4, m2
     FOLD_SINGLE m5, m1, m4, m0
@@ -245,10 +246,10 @@ cglobal crc,    4, 6, 6+4*ARCH_X86_64, 0x10
     FOLD_SINGLE m5, m1, m4, m2
 
 .reduce_128_to_64:
-    movu           m4, [r0 + 32]
+    movu           m4, [CTX + 32]
     FOLD_128_TO_64 %1, m1, m4, m5
 .reduce_64_to_32:
-    movu           m4, [r0 + 48]
+    movu           m4, [CTX + 48]
     FOLD_64_TO_32  %1, m1, m4, m5
     RET
 
@@ -261,7 +262,7 @@ cglobal crc,    4, 6, 6+4*ARCH_X86_64, 0x10
     pshufb m1, m10
 %endif
     mov    r4, 16
-    movu   m4, [r0 + 16]
+    movu   m4, [CTX + 16]
     jmp   .fold_1x_pre
 
 .less_than_16bytes:
diff --git a/libavutil/x86/crc.h b/libavutil/x86/crc.h
index 936ca54d37..a5799c5409 100644
--- a/libavutil/x86/crc.h
+++ b/libavutil/x86/crc.h
@@ -24,6 +24,7 @@
 #include "config.h"
 #include "libavutil/attributes.h"
 #include "libavutil/attributes_internal.h"
+#include "libavutil/avassert.h"
 #include "libavutil/cpu.h"
 #include "libavutil/crc.h"
 #include "libavutil/reverse.h"
@@ -37,50 +38,64 @@ uint32_t ff_crc_le_clmul(const AVCRC *ctx, uint32_t crc,
                          const uint8_t *buffer, size_t length);
 FF_VISIBILITY_POP_HIDDEN
 
-static const AVCRC crc_table_clmul[AV_CRC_MAX][16] = {
+enum {
+    CRC_C    = 0,
+    CLMUL_BE,
+    CLMUL_LE,
+};
+
+static const AVCRC crc_table_clmul[AV_CRC_MAX][17] = {
     [AV_CRC_8_ATM] = {
+        CLMUL_BE,
         0x32000000, 0x0, 0xbc000000, 0x0,
         0xc4000000, 0x0, 0x94000000, 0x0,
         0x62000000, 0x0, 0x79000000, 0x0,
         0x07156a16, 0x1, 0x07000000, 0x1,
     },
     [AV_CRC_8_EBU] = {
+        CLMUL_BE,
         0xb5000000, 0x0, 0xf3000000, 0x0,
         0xfc000000, 0x0, 0x0d000000, 0x0,
         0x6a000000, 0x0, 0x65000000, 0x0,
         0x1c4b8192, 0x1, 0x1d000000, 0x1,
     },
     [AV_CRC_16_ANSI] = {
+        CLMUL_BE,
         0xf9e30000, 0x0, 0x807d0000, 0x0,
         0xf9130000, 0x0, 0xff830000, 0x0,
         0x807b0000, 0x0, 0x86630000, 0x0,
         0xfffbffe7, 0x1, 0x80050000, 0x1,
     },
     [AV_CRC_16_CCITT] = {
+        CLMUL_BE,
         0x60190000, 0x0, 0x59b00000, 0x0,
         0xd5f60000, 0x0, 0x45630000, 0x0,
         0xaa510000, 0x0, 0xeb230000, 0x0,
         0x11303471, 0x1, 0x10210000, 0x1,
     },
     [AV_CRC_24_IEEE] = {
+        CLMUL_BE,
         0x1f428700, 0x0, 0x467d2400, 0x0,
         0x2c8c9d00, 0x0, 0x64e4d700, 0x0,
         0xd9fe8c00, 0x0, 0xfd7e0c00, 0x0,
         0xf845fe24, 0x1, 0x864cfb00, 0x1,
     },
     [AV_CRC_32_IEEE] = {
+        CLMUL_BE,
         0x8833794c, 0x0, 0xe6228b11, 0x0,
         0xc5b9cd4c, 0x0, 0xe8a45605, 0x0,
         0x490d678d, 0x0, 0xf200aa66, 0x0,
         0x04d101df, 0x1, 0x04c11db7, 0x1,
     },
     [AV_CRC_32_IEEE_LE] = {
+        CLMUL_LE,
         0xc6e41596, 0x1, 0x54442bd4, 0x1,
         0xccaa009e, 0x0, 0x751997d0, 0x1,
         0xccaa009e, 0x0, 0x63cd6124, 0x1,
         0xf7011640, 0x1, 0xdb710641, 0x1,
     },
     [AV_CRC_16_ANSI_LE] = {
+        CLMUL_LE,
         0x0000bffa, 0x0, 0x1b0c2, 0x0,
         0x00018cc2, 0x0, 0x1d0c2, 0x0,
         0x00018cc2, 0x0, 0x1bc02, 0x0,
@@ -158,23 +173,62 @@ static inline void crc_init_x86(AVCRC *ctx, int le, int 
bits, uint32_t poly, int
         x6 = xnmodp(96, poly_, 32, &div, le);
         x8 = poly_ | (1ULL << 32);
     }
-    ctx[0]  = (AVCRC)x1;
-    ctx[1]  = (AVCRC)(x1 >> 32);
-    ctx[2]  = (AVCRC)x2;
-    ctx[3]  = (AVCRC)(x2 >> 32);
-    ctx[4]  = (AVCRC)x3;
-    ctx[5]  = (AVCRC)(x3 >> 32);
-    ctx[6]  = (AVCRC)x4;
-    ctx[7]  = (AVCRC)(x4 >> 32);
-    ctx[8]  = (AVCRC)x5;
-    ctx[9]  = (AVCRC)(x5 >> 32);
-    ctx[10] = (AVCRC)x6;
-    ctx[11] = (AVCRC)(x6 >> 32);
-    ctx[12] = (AVCRC)x7;
-    ctx[13] = (AVCRC)(x7 >> 32);
-    ctx[14] = (AVCRC)x8;
-    ctx[15] = (AVCRC)(x8 >> 32);
+    ctx[0]  = le ? CLMUL_LE : CLMUL_BE;
+    ctx[1]  = (AVCRC)x1;
+    ctx[2]  = (AVCRC)(x1 >> 32);
+    ctx[3]  = (AVCRC)x2;
+    ctx[4]  = (AVCRC)(x2 >> 32);
+    ctx[5]  = (AVCRC)x3;
+    ctx[6]  = (AVCRC)(x3 >> 32);
+    ctx[7]  = (AVCRC)x4;
+    ctx[8]  = (AVCRC)(x4 >> 32);
+    ctx[9]  = (AVCRC)x5;
+    ctx[10] = (AVCRC)(x5 >> 32);
+    ctx[11] = (AVCRC)x6;
+    ctx[12] = (AVCRC)(x6 >> 32);
+    ctx[13] = (AVCRC)x7;
+    ctx[14] = (AVCRC)(x7 >> 32);
+    ctx[15] = (AVCRC)x8;
+    ctx[16] = (AVCRC)(x8 >> 32);
 }
 #endif
 
+static inline const AVCRC *ff_crc_get_table_x86(AVCRCId crc_id)
+{
+#if HAVE_CLMUL_EXTERNAL
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_CLMUL(cpu_flags)) {
+        return crc_table_clmul[crc_id];
+    }
+#endif
+    return NULL;
+}
+
+static inline av_cold int ff_crc_init_x86(AVCRC *ctx, int le, int bits, 
uint32_t poly, int ctx_size)
+{
+#if HAVE_CLMUL_EXTERNAL
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_CLMUL(cpu_flags)) {
+        crc_init_x86(ctx, le, bits, poly, ctx_size);
+        return 1;
+    }
+#endif
+    return 0;
+}
+
+static inline uint32_t ff_crc_x86(const AVCRC *ctx, uint32_t crc,
+                                  const uint8_t *buffer, size_t length)
+{
+    switch (ctx[0]) {
+#if HAVE_CLMUL_EXTERNAL
+    case CLMUL_BE: return ff_crc_clmul(ctx, crc, buffer, length);
+    case CLMUL_LE: return ff_crc_le_clmul(ctx, crc, buffer, length);
+#endif
+    default: av_unreachable("x86 CRC only uses CLMUL_BE and CLMUL_LE");
+    }
+    return 0;
+}
+
 #endif /* AVUTIL_X86_CRC_H */
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PATCH] avutil/crc: add x86 SSE4.2 clmul (PR #21119)

Reply via email to