[FFmpeg-devel] [PR] avutil/crc: add aarch64 NEON PMULL+EOR3 SIMD implementation for av_crc (PR #21651)

Shreesh Adiga via ffmpeg-devel Thu, 05 Feb 2026 05:38:28 -0800

PR #21651 opened by Shreesh Adiga (tantei3)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21651
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21651.patch


Implements #20751 for aarch64, mirroring the x86 CLMUL implementation, using
the PMULL (aes extension) and EOR3 (sha3 extension) instructions.
Developed and tested on Android (Termux) on a MediaTek Dimensity 9400 SoC,
where checkasm reports a speedup of roughly 30x for most CRC variants:
```
./tests/checkasm/checkasm --test=crc --bench --runs=12
benchmarking with native FFmpeg timers
nop: 0.2
checkasm: SVE 128 bits, using random seed 2502847808
checkasm: bench runs 4096 (1 << 12)
CRC:
 - crc.crc [OK]
PMULL:
 - crc.crc [OK]
checkasm: all 10 tests passed
crc_8_ATM_c:                                            26.0 ( 1.00x)
crc_8_ATM_pmull_eor3:                                    0.7 (37.17x)
crc_8_EBU_c:                                            46.4 ( 1.00x)
crc_8_EBU_pmull_eor3:                                    1.5 (31.47x)
crc_16_ANSI_c:                                          36.3 ( 1.00x)
crc_16_ANSI_pmull_eor3:                                  1.1 (31.70x)
crc_16_ANSI_LE_c:                                       90.9 ( 1.00x)
crc_16_ANSI_LE_pmull_eor3:                               2.8 (32.30x)
crc_16_CCITT_c:                                        118.0 ( 1.00x)
crc_16_CCITT_pmull_eor3:                                 3.7 (32.00x)
crc_24_IEEE_c:                                           1.6 ( 1.00x)
crc_24_IEEE_pmull_eor3:                                  0.1 (12.19x)
crc_32_IEEE_c:                                          45.2 ( 1.00x)
crc_32_IEEE_pmull_eor3:                                  1.4 (31.39x)
crc_32_IEEE_LE_c:                                       49.1 ( 1.00x)
crc_32_IEEE_LE_crc:                                      2.5 (19.51x)
crc_32_IEEE_LE_pmull_eor3:                               1.5 (32.84x)
crc_custom_polynomial_c:                                45.3 ( 1.00x)
crc_custom_polynomial_pmull_eor3:                        1.3 (35.16x)
```


>From 7d54829f88cf8f7ff173378f09b77d361de7b52d Mon Sep 17 00:00:00 2001
From: Shreesh Adiga <[email protected]>
Date: Thu, 5 Feb 2026 18:22:17 +0530
Subject: [PATCH 1/3] avutil/cpu: add aarch64 CPU feature flag for PMULL and
 EOR3

---
 configure                 | 14 +++++++++++++-
 libavutil/aarch64/asm.S   | 18 ++++++++++++++++++
 libavutil/aarch64/cpu.c   |  4 ++++
 libavutil/aarch64/cpu.h   |  8 +++++---
 libavutil/cpu.c           |  2 ++
 libavutil/cpu.h           |  2 ++
 tests/checkasm/checkasm.c |  1 +
 7 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/configure b/configure
index 31e1bf3600..51f179c973 100755
--- a/configure
+++ b/configure
@@ -481,6 +481,8 @@ Optimization options (experts only):
   --disable-arm-crc        disable ARM/AArch64 CRC optimizations
   --disable-dotprod        disable DOTPROD optimizations
   --disable-i8mm           disable I8MM optimizations
+  --disable-pmull          disable PMULL optimizations
+  --disable-eor3           disable EOR3 optimizations
   --disable-sve            disable SVE optimizations
   --disable-sve2           disable SVE2 optimizations
   --disable-sme            disable SME optimizations
@@ -2261,6 +2263,8 @@ ARCH_EXT_LIST_ARM="
     arm_crc
     dotprod
     i8mm
+    pmull
+    eor3
     neon
     vfp
     vfpv3
@@ -2535,6 +2539,8 @@ TOOLCHAIN_FEATURES="
     as_archext_crc_directive
     as_archext_dotprod_directive
     as_archext_i8mm_directive
+    as_archext_sha3_directive
+    as_archext_aes_directive
     as_archext_sve_directive
     as_archext_sve2_directive
     as_archext_sme_directive
@@ -2873,6 +2879,8 @@ setend_deps="arm"
 arm_crc_deps="aarch64"
 dotprod_deps="aarch64 neon"
 i8mm_deps="aarch64 neon"
+pmull_deps="aarch64 neon"
+eor3_deps="aarch64 neon"
 sve_deps="aarch64 neon"
 sve2_deps="aarch64 neon sve"
 sme_deps="aarch64 neon sve sve2"
@@ -6508,8 +6516,10 @@ if enabled aarch64; then
     # internal assembler in clang 3.3 does not support this instruction
     enabled neon && check_insn neon 'ext   v0.8B, v0.8B, v1.8B, #1'
 
-    archext_list="arm_crc dotprod i8mm sve sve2 sme"
+    archext_list="arm_crc dotprod i8mm pmull eor3 sve sve2 sme"
     enabled arm_crc && check_archext_name_insn arm_crc crc 'crc32x w0, w0, x0'
+    enabled pmull   && check_archext_name_insn pmull aes 'pmull v0.1q, v0.1d, 
v0.1d'
+    enabled eor3    && check_archext_name_insn eor3 sha3 'eor3 v0.16b, v1.16b, 
v2.16b, v3.16b'
     enabled dotprod && check_archext_insn dotprod 'udot v0.4s, v0.16b, v0.16b'
     enabled i8mm    && check_archext_insn i8mm    'usdot v0.4s, v0.16b, v0.16b'
     enabled sve     && check_archext_insn sve     'whilelt p0.s, x0, x1'
@@ -8319,6 +8329,8 @@ if enabled aarch64; then
     echo "NEON enabled              ${neon-no}"
     echo "DOTPROD enabled           ${dotprod-no}"
     echo "I8MM enabled              ${i8mm-no}"
+    echo "PMULL enabled             ${pmull-no}"
+    echo "EOR3 enabled              ${eor3-no}"
     echo "SVE enabled               ${sve-no}"
     echo "SVE2 enabled              ${sve2-no}"
     echo "SME enabled               ${sme-no}"
diff --git a/libavutil/aarch64/asm.S b/libavutil/aarch64/asm.S
index d1e118d7d1..7f54278064 100644
--- a/libavutil/aarch64/asm.S
+++ b/libavutil/aarch64/asm.S
@@ -64,6 +64,22 @@
 #define DISABLE_I8MM
 #endif
 
+#if HAVE_AS_ARCHEXT_AES_DIRECTIVE
+#define ENABLE_PMULL  .arch_extension aes
+#define DISABLE_PMULL .arch_extension noaes
+#else
+#define ENABLE_PMULL
+#define DISABLE_PMULL
+#endif
+
+#if HAVE_AS_ARCHEXT_SHA3_DIRECTIVE
+#define ENABLE_EOR3  .arch_extension sha3
+#define DISABLE_EOR3 .arch_extension nosha3
+#else
+#define ENABLE_EOR3
+#define DISABLE_EOR3
+#endif
+
 #if HAVE_AS_ARCHEXT_SVE_DIRECTIVE
 #define ENABLE_SVE  .arch_extension sve
 #define DISABLE_SVE .arch_extension nosve
@@ -89,6 +105,8 @@
 #endif
 
 DISABLE_ARM_CRC
+DISABLE_PMULL
+DISABLE_EOR3
 DISABLE_DOTPROD
 DISABLE_I8MM
 DISABLE_SVE
diff --git a/libavutil/aarch64/cpu.c b/libavutil/aarch64/cpu.c
index 6733b62123..0c4ea442c8 100644
--- a/libavutil/aarch64/cpu.c
+++ b/libavutil/aarch64/cpu.c
@@ -24,7 +24,9 @@
 #include <stdint.h>
 #include <sys/auxv.h>
 
+#define HWCAP_AARCH64_PMULL   (1 << 4)
 #define HWCAP_AARCH64_CRC32   (1 << 7)
+#define HWCAP_AARCH64_SHA3    (1 << 17)
 #define HWCAP_AARCH64_ASIMDDP (1 << 20)
 #define HWCAP_AARCH64_SVE     (1 << 22)
 #define HWCAP2_AARCH64_SVE2   (1 << 1)
@@ -38,6 +40,8 @@ static int detect_flags(void)
     unsigned long hwcap = ff_getauxval(AT_HWCAP);
     unsigned long hwcap2 = ff_getauxval(AT_HWCAP2);
 
+    /* PMULL and SHA3 (EOR3) are separate HWCAP bits: report each one only
+     * when its own bit is set, so EOR3 is never used on a PMULL-only core. */
+    if (hwcap & HWCAP_AARCH64_PMULL)
+        flags |= AV_CPU_FLAG_PMULL;
+    if (hwcap & HWCAP_AARCH64_SHA3)
+        flags |= AV_CPU_FLAG_EOR3;
     if (hwcap & HWCAP_AARCH64_CRC32)
         flags |= AV_CPU_FLAG_ARM_CRC;
     if (hwcap & HWCAP_AARCH64_ASIMDDP)
diff --git a/libavutil/aarch64/cpu.h b/libavutil/aarch64/cpu.h
index e1fc625e0f..37ff177836 100644
--- a/libavutil/aarch64/cpu.h
+++ b/libavutil/aarch64/cpu.h
@@ -22,10 +22,12 @@
 #include "libavutil/cpu.h"
 #include "libavutil/cpu_internal.h"
 
-#define have_armv8(flags) CPUEXT(flags, ARMV8)
-#define have_neon(flags) CPUEXT(flags, NEON)
-#define have_vfp(flags)  CPUEXT(flags, VFP)
+#define have_armv8(flags)   CPUEXT(flags, ARMV8)
+#define have_neon(flags)    CPUEXT(flags, NEON)
+#define have_vfp(flags)     CPUEXT(flags, VFP)
 #define have_arm_crc(flags) CPUEXT(flags, ARM_CRC)
+#define have_pmull(flags)   CPUEXT(flags, PMULL)
+#define have_eor3(flags)    CPUEXT(flags, EOR3)
 #define have_dotprod(flags) CPUEXT(flags, DOTPROD)
 #define have_i8mm(flags)    CPUEXT(flags, I8MM)
 #define have_sve(flags)     CPUEXT(flags, SVE)
diff --git a/libavutil/cpu.c b/libavutil/cpu.c
index 03e2720a7f..648e086252 100644
--- a/libavutil/cpu.c
+++ b/libavutil/cpu.c
@@ -189,6 +189,8 @@ int av_parse_cpu_caps(unsigned *flags, const char *s)
         { "sve2",     NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_SVE2    
 },    .unit = "flags" },
         { "sme",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_SME     
 },    .unit = "flags" },
         { "crc",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_ARM_CRC 
 },    .unit = "flags" },
+        { "pmull",    NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_PMULL   
 },    .unit = "flags" },
+        { "eor3",     NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_EOR3    
 },    .unit = "flags" },
 #elif ARCH_MIPS
         { "mmi",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_MMI     
 },    .unit = "flags" },
         { "msa",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_MSA     
 },    .unit = "flags" },
diff --git a/libavutil/cpu.h b/libavutil/cpu.h
index 58157ea208..4c245a2795 100644
--- a/libavutil/cpu.h
+++ b/libavutil/cpu.h
@@ -79,6 +79,8 @@
 #define AV_CPU_FLAG_SVE2         (1 <<11)
 #define AV_CPU_FLAG_SME          (1 <<12)
 #define AV_CPU_FLAG_ARM_CRC      (1 <<13)
+#define AV_CPU_FLAG_PMULL        (1 <<14)
+#define AV_CPU_FLAG_EOR3         (1 <<15)
 #define AV_CPU_FLAG_SETEND       (1 <<16)
 
 #define AV_CPU_FLAG_MMI          (1 << 0)
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index bdaaa8695d..489b33db3d 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -375,6 +375,7 @@ static const struct {
     { "SVE2",     "sve2",     AV_CPU_FLAG_SVE2 },
     { "SME",      "sme",      AV_CPU_FLAG_SME },
     { "CRC",      "crc",      AV_CPU_FLAG_ARM_CRC },
+    { "PMULL",    "pmull_eor3", AV_CPU_FLAG_PMULL|AV_CPU_FLAG_EOR3 },
 #elif ARCH_ARM
     { "ARMV5TE",  "armv5te",  AV_CPU_FLAG_ARMV5TE },
     { "ARMV6",    "armv6",    AV_CPU_FLAG_ARMV6 },
-- 
2.52.0


>From 9bb7108d7d59221f836c7ac4ae3e7c1481e0ff73 Mon Sep 17 00:00:00 2001
From: Shreesh Adiga <[email protected]>
Date: Thu, 5 Feb 2026 18:23:18 +0530
Subject: [PATCH 2/3] avutil/crc: refactor helper functions to separate header
 file

Move the reverse() and xnmodp() helpers into a separate header
so that they can be reused by the aarch64 implementation of av_crc.
---
 libavutil/crc_internal.h | 66 ++++++++++++++++++++++++++++++++++++++++
 libavutil/x86/crc.h      | 44 ++-------------------------
 2 files changed, 68 insertions(+), 42 deletions(-)
 create mode 100644 libavutil/crc_internal.h

diff --git a/libavutil/crc_internal.h b/libavutil/crc_internal.h
new file mode 100644
index 0000000000..8a856990a9
--- /dev/null
+++ b/libavutil/crc_internal.h
@@ -0,0 +1,66 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_CRC_INTERNAL_H
+#define AVUTIL_CRC_INTERNAL_H
+
+#include <stdint.h>
+#include "libavutil/reverse.h"
+
+/**
+ * Reverse the bit order of the low (deg + 1) bits of p.
+ *
+ * @param p   value to reverse; bits above bit deg do not reach the result
+ * @param deg index of the highest bit taking part in the reversal
+ * @return    p with bit k moved to bit (deg - k), for k = 0..deg
+ */
+static uint64_t reverse(uint64_t p, unsigned int deg)
+{
+    uint64_t ret = 0;
+    int i;
+    // whole bytes first: each low byte of p is looked up in ff_reverse and
+    // appended to the low end of ret, so the first byte ends up on top
+    for (i = 0; i < (deg / 8); i += 1) {
+        ret = (ret << 8) | (ff_reverse[p & 0xff]);
+        p >>= 8;
+    }
+    // 1..8 bits are left over; they are taken from the top of the next
+    // reversed byte
+    int rem = (deg + 1) - 8 * i;
+    ret = (ret << rem) | (ff_reverse[p & 0xff] >> (8 - rem));
+    return ret;
+}
+
+/**
+ * Divide x^n by a degree-deg polynomial P over GF(2).
+ *
+ * @param n          exponent of the monomial x^n
+ * @param poly       coefficients of P below x^deg; the x^deg term itself is
+ *                   implicit and any higher bits are masked off
+ * @param deg        degree of P
+ * @param div        receives the quotient; only its low 64 bits are kept
+ * @param bitreverse if non-zero, quotient and remainder are bit-reversed
+ *                   over deg bits and then shifted left by one
+ * @return           the remainder x^n mod P; if n < deg, poly is returned
+ *                   unmodified and *div is set to 0
+ */
+static uint64_t xnmodp(unsigned n, uint64_t poly, unsigned deg, uint64_t *div, 
int bitreverse)
+{
+    uint64_t mod, mask, high;
+
+    if (n < deg) {
+        *div = 0;
+        return poly;
+    }
+    mask = ((uint64_t)1 << deg) - 1;
+    poly &= mask;
+    // starting point: x^deg mod P is poly, with quotient 1
+    mod = poly;
+    *div = 1;
+    // from here on deg is the index of the top remainder bit
+    deg--;
+    // multiply by x once per iteration (n - deg_original times in total),
+    // subtracting P whenever a bit is shifted out of the remainder
+    while (--n > deg) {
+        high = (mod >> deg) & 1;
+        *div = (*div << 1) | high;
+        mod <<= 1;
+        if (high)
+            mod ^= poly;
+    }
+    // mod was not masked inside the loop; drop the bits shifted past the top
+    uint64_t ret = mod & mask;
+    if (bitreverse) {
+        *div = reverse(*div, deg) << 1;
+        return reverse(ret, deg) << 1;
+    }
+    return ret;
+}
+
+#endif /* AVUTIL_CRC_INTERNAL_H */
diff --git a/libavutil/x86/crc.h b/libavutil/x86/crc.h
index c836c090c6..ef98ed318d 100644
--- a/libavutil/x86/crc.h
+++ b/libavutil/x86/crc.h
@@ -28,10 +28,11 @@
 #include "libavutil/cpu.h"
 #include "libavutil/crc.h"
 #include "libavutil/intreadwrite.h"
-#include "libavutil/reverse.h"
 #include "libavutil/x86/cpu.h"
 
 #if HAVE_CLMUL_EXTERNAL
+#include "libavutil/crc_internal.h"
+
 FF_VISIBILITY_PUSH_HIDDEN
 uint32_t ff_crc_clmul(const AVCRC *ctx, uint32_t crc,
                       const uint8_t *buffer, size_t length);
@@ -104,47 +105,6 @@ static const AVCRC crc_table_clmul[AV_CRC_MAX][17] = {
     },
 };
 
-static uint64_t reverse(uint64_t p, unsigned int deg)
-{
-    uint64_t ret = 0;
-    int i;
-    for (i = 0; i < (deg / 8); i += 1) {
-        ret = (ret << 8) | (ff_reverse[p & 0xff]);
-        p >>= 8;
-    }
-    int rem = (deg + 1) - 8 * i;
-    ret = (ret << rem) | (ff_reverse[p & 0xff] >> (8 - rem));
-    return ret;
-}
-
-static uint64_t xnmodp(unsigned n, uint64_t poly, unsigned deg, uint64_t *div, 
int bitreverse)
-{
-    uint64_t mod, mask, high;
-
-    if (n < deg) {
-        *div = 0;
-        return poly;
-    }
-    mask = ((uint64_t)1 << deg) - 1;
-    poly &= mask;
-    mod = poly;
-    *div = 1;
-    deg--;
-    while (--n > deg) {
-        high = (mod >> deg) & 1;
-        *div = (*div << 1) | high;
-        mod <<= 1;
-        if (high)
-            mod ^= poly;
-    }
-    uint64_t ret = mod & mask;
-    if (bitreverse) {
-        *div = reverse(*div, deg) << 1;
-        return reverse(ret, deg) << 1;
-    }
-    return ret;
-}
-
 static inline void crc_init_x86(AVCRC *ctx, int le, int bits, uint32_t poly, 
int ctx_size)
 {
     uint64_t poly_;
-- 
2.52.0


>From dfd37fd7ed02557f4e721a44e194a5993f6f0f90 Mon Sep 17 00:00:00 2001
From: Shreesh Adiga <[email protected]>
Date: Thu, 5 Feb 2026 18:49:21 +0530
Subject: [PATCH 3/3] avutil/crc: add aarch64 NEON PMULL+EOR3 SIMD
 implementation for av_crc

Implement the clmul algorithm for aarch64 using the PMULL and EOR3 instructions.
The logic and structure are the same as in the x86 clmul implementation, with
the constants slightly rearranged to suit the PMULL and PMULL2 instructions.

Benchmarking in Android (Termux) on a MediaTek Dimensity 9400 SoC:

./tests/checkasm/checkasm --test=crc --bench --runs=12
benchmarking with native FFmpeg timers
nop: 0.2
checkasm: SVE 128 bits, using random seed 2502847808
checkasm: bench runs 4096 (1 << 12)
CRC:
 - crc.crc [OK]
PMULL:
 - crc.crc [OK]
checkasm: all 10 tests passed
crc_8_ATM_c:                                            26.0 ( 1.00x)
crc_8_ATM_pmull_eor3:                                    0.7 (37.17x)
crc_8_EBU_c:                                            46.4 ( 1.00x)
crc_8_EBU_pmull_eor3:                                    1.5 (31.47x)
crc_16_ANSI_c:                                          36.3 ( 1.00x)
crc_16_ANSI_pmull_eor3:                                  1.1 (31.70x)
crc_16_ANSI_LE_c:                                       90.9 ( 1.00x)
crc_16_ANSI_LE_pmull_eor3:                               2.8 (32.30x)
crc_16_CCITT_c:                                        118.0 ( 1.00x)
crc_16_CCITT_pmull_eor3:                                 3.7 (32.00x)
crc_24_IEEE_c:                                           1.6 ( 1.00x)
crc_24_IEEE_pmull_eor3:                                  0.1 (12.19x)
crc_32_IEEE_c:                                          45.2 ( 1.00x)
crc_32_IEEE_pmull_eor3:                                  1.4 (31.39x)
crc_32_IEEE_LE_c:                                       49.1 ( 1.00x)
crc_32_IEEE_LE_crc:                                      2.5 (19.51x)
crc_32_IEEE_LE_pmull_eor3:                               1.5 (32.84x)
crc_custom_polynomial_c:                                45.3 ( 1.00x)
crc_custom_polynomial_pmull_eor3:                        1.3 (35.16x)
---
 libavutil/aarch64/crc.S | 265 ++++++++++++++++++++++++++++++++++++++++
 libavutil/aarch64/crc.h | 152 +++++++++++++++++++++--
 libavutil/crc.c         |   4 +
 3 files changed, 414 insertions(+), 7 deletions(-)

diff --git a/libavutil/aarch64/crc.S b/libavutil/aarch64/crc.S
index 892c7ff229..b9af2053c4 100644
--- a/libavutil/aarch64/crc.S
+++ b/libavutil/aarch64/crc.S
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2025 Zhao Zhili <[email protected]>
+ * Copyright (c) 2026 Shreesh Adiga <[email protected]>
  *
  * This file is part of FFmpeg.
  *
@@ -67,3 +68,267 @@ endfunc
 
 DISABLE_ARM_CRC
 #endif
+
+#if HAVE_PMULL && HAVE_EOR3
+ENABLE_PMULL
+ENABLE_EOR3
+
+const reverse_shuffle, align=4
+        .byte  15, 14, 13, 12
+        .byte  11, 10,  9,  8
+        .byte   7,  6,  5,  4
+        .byte   3,  2,  1,  0
+endconst
+
+const partial_bytes_shuf_tab, align=4
+        .byte  16, 17, 18, 19
+        .byte  20, 21, 22, 23
+        .byte  24, 25, 26, 27
+        .byte  28, 29, 30, 31
+        .byte   0,  1,  2,  3
+        .byte   4,  5,  6,  7
+        .byte   8,  9, 10, 11
+        .byte  12, 13, 14, 15
+endconst
+
+// performs Vfold = pmull(Vfold, Vconst) xor pmull2(Vfold, Vconst) xor Vdata
+.macro FOLD_SINGLE Vfold, Vconst, Vdata, Vtemp
+        pmull           \Vtemp\().1q, \Vconst\().1d, \Vfold\().1d
+        pmull2          \Vfold\().1q, \Vconst\().2d, \Vfold\().2d
+        eor3            \Vfold\().16b, \Vfold\().16b, \Vdata\().16b, 
\Vtemp\().16b
+.endm
+
+// assume Vfold is v16 and v0 is filled with 0
+// uses v17 as temp
+.macro FOLD_128_TO_64 le Vconst
+.if ! \le
+        fmov            d17, d16
+        pmull2          v16.1q, v16.2d, \Vconst\().2d
+        ext             v17.16b, v0.16b, v17.16b, #12
+        eor             v16.16b, v17.16b, v16.16b
+        ext             \Vconst\().16b, \Vconst\().16b, \Vconst\().16b, #8
+        pmull2          v17.1q, \Vconst\().2d, v16.2d
+        eor             v16.16b, v16.16b, v17.16b
+.else
+        ext             v17.16b, v16.16b, v0.16b, #8
+        pmull           v16.1q, v16.1d, \Vconst\().1d
+        eor             v16.16b, v16.16b, v17.16b
+        ext             v17.16b, v0.16b, v16.16b, #12
+        ext             \Vconst\().16b, \Vconst\().16b, \Vconst\().16b, #8
+        pmull           v17.1q, \Vconst\().1d, v17.1d
+        eor             v16.16b, v16.16b, v17.16b
+.endif
+.endm
+
+// assume Vfold is v16 and v0 is filled with 0
+// uses v17 as temp
+.macro FOLD_64_TO_32 le Vconst
+.if ! \le
+        pmull           v17.1q, v16.1d, \Vconst\().1d
+        pmull2          v17.1q, v17.2d, \Vconst\().2d
+        eor             v16.16b, v16.16b, v17.16b
+        fmov            w0, s16
+        rev             w0, w0
+.else
+        mov             v16.s[0], wzr
+        pmull           v17.1q, v16.1d, \Vconst\().1d
+        eor             v17.16b, v17.16b, v16.16b
+        ext             \Vconst\().16b, \Vconst\().16b, \Vconst\().16b, #8
+        pmull           v17.1q, v17.1d, \Vconst\().1d
+        eor             v16.16b, v16.16b, v17.16b
+        mov             w0, v16.s[2]
+.endif
+.endm
+
+#define CTX_OFFSET 4
+
+// ff_crc[_le]_neon_pmull(const AVCRC *ctx, uint32_t crc, const uint8_t 
*buffer, size_t length)
+// x0 - ctx; pre-computed fold constants, see crc_init_aarch64 for details on 
the structure.
+// x1 - crc initial value
+// x2 - buffer pointer
+// x3 - length of buffer
+//
+// Additional registers used:
+// x10 - buffer + length; points to end of buffer
+// x9, x11-x15 - temp registers; x1 is reused as a temp once the crc is in v1
+//
+// v0 - zero register to perform vector shift during reduction using EXT
+// v1 - crc input from x1
+// v2 - holds the reverse shuffle values 15, 14 ... 0 to reverse vector using 
TBL
+// v3 - holds the constant values used in PMULL/PMULL2 loaded via x0 ctx
+// v4 - used as temp for PMULL/PMULL2 output
+// v16 - holds the primary fold register later reduced to 32 bits
+// Registers v16-v19 are used in parallel 4x fold loop
+// Registers v20-23 are used to load the 4 next data chunks
+// Registers v5-v6 and v17-v22 are used as temp elsewhere
+.macro crc_fn_template le
+.if \le
+function ff_crc_le_neon_pmull, export=1
+.else
+function ff_crc_neon_pmull, export=1
+        movrel          x9, reverse_shuffle
+        ldr             q2, [x9]
+.endif
+        add             x10, x2, x3
+        movi            v0.2d, #0
+        mov             v1.16b, v0.16b
+        mov             v1.s[0], w1
+        cmp             x3, #64
+        b.lo            8f // less than 64 bytes
+        ld1             {v16.16b-v19.16b}, [x2], #64
+        eor             v16.16b, v16.16b, v1.16b
+.if ! \le
+        tbl             v16.16b, { v16.16b }, v2.16b
+        tbl             v17.16b, { v17.16b }, v2.16b
+        tbl             v18.16b, { v18.16b }, v2.16b
+        tbl             v19.16b, { v19.16b }, v2.16b
+.endif
+        sub             x11, x10, x2
+        cmp             x11, #64
+        b.lo            2f // reduce 4x to 1x
+        ldr             q3, [x0, #(CTX_OFFSET + 0)]
+
+1: // fold 4x loop
+        ld1             {v20.16b-v23.16b}, [x2], #64
+.if ! \le
+        tbl             v20.16b, { v20.16b }, v2.16b
+        tbl             v21.16b, { v21.16b }, v2.16b
+        tbl             v22.16b, { v22.16b }, v2.16b
+        tbl             v23.16b, { v23.16b }, v2.16b
+.endif
+        FOLD_SINGLE     v16, v3, v20, v4
+        FOLD_SINGLE     v17, v3, v21, v4
+        FOLD_SINGLE     v18, v3, v22, v4
+        FOLD_SINGLE     v19, v3, v23, v4
+        sub             x11, x10, x2
+        cmp             x11, #64
+        b.hs            1b // fold 4x loop
+
+2: // reduce 4x to 1x
+        ldr             q3, [x0, #(CTX_OFFSET + 16)]
+        FOLD_SINGLE     v16, v3, v17, v4
+        FOLD_SINGLE     v16, v3, v18, v4
+        FOLD_SINGLE     v16, v3, v19, v4
+
+3: // fold 1x pre
+        sub             x11, x10, x2
+        cmp             x11, #16
+        b.lo            5f
+
+4: // fold 1x loop
+        ldr             q17, [x2], #16
+.if ! \le
+        tbl             v17.16b, { v17.16b }, v2.16b
+.endif
+        FOLD_SINGLE     v16, v3, v17, v4
+        sub             x11, x10, x2
+        cmp             x11, #16
+        b.hs            4b // fold 1x loop
+
+5: // partial_block
+        cmp             x10, x2
+        b.eq            6f // reduce 128 to 64
+        ldr             q17, [x10, #-16]
+        movrel          x1, partial_bytes_shuf_tab
+        ldr             q18, [x1, x11]
+        mov             w12, #16
+        sub             w11, w12, w11
+        dup             v5.16b, w11
+        movi            v6.16b, #16
+        cmhi            v19.16b, v6.16b, v18.16b
+        add             v20.16b, v18.16b, v5.16b
+        bsl             v19.16b, v20.16b, v18.16b
+.if ! \le
+        tbl             v16.16b, { v16.16b }, v2.16b
+.endif
+        tbl             v21.16b, { v16.16b }, v18.16b
+.if ! \le
+        tbl             v21.16b, { v21.16b }, v2.16b
+.endif
+        mov             v18.16b, v16.16b
+        tbl             v22.16b, { v17.16b, v18.16b }, v19.16b
+.if ! \le
+        tbl             v22.16b, { v22.16b }, v2.16b
+.endif
+        FOLD_SINGLE     v21, v3, v22, v4
+        mov             v16.16b, v21.16b
+
+6: // reduce 128 to 64
+        ldr             q3, [x0, #(CTX_OFFSET + 32)]
+        FOLD_128_TO_64  \le, v3
+
+7: // reduce 64 to 32
+        ldr             q3, [x0, #(CTX_OFFSET + 48)]
+        FOLD_64_TO_32   \le, v3
+        ret
+
+8: // less than 64 bytes
+        cmp             x3, #16
+        b.lo            9f // less than 16 bytes
+        ldr             q16, [x2], #16
+        eor             v16.16b, v16.16b, v1.16b
+.if ! \le
+        tbl             v16.16b, { v16.16b }, v2.16b
+.endif
+        ldr             q3, [x0, #(CTX_OFFSET + 16)]
+        b               3b // fold 1x pre
+
+9: // less than 16 bytes
+        movrel          x15, partial_bytes_shuf_tab
+        add             x15, x15, x3
+        str             q0, [sp, #-16]!
+        add             x9, sp, x3
+       // memcpy 0 to 15 bytes from src to stack and load it to v16 with zero 
extension
+        tbz             x3, 3, 10f
+        ldr             x11, [x2]
+        ldr             x12, [x10, #-8]
+        str             x11, [sp]
+        str             x12, [x9, #-8]
+        b               12f
+10:
+        tbz             x3, 2, 11f
+        ldr             w11, [x2]
+        ldr             w12, [x10, #-4]
+        str             w11, [sp]
+        str             w12, [x9, #-4]
+        b               12f
+11:
+        cbz             x3, 12f
+        lsr             x11, x3, 1
+        ldrb            w12, [x2]
+        ldrb            w14, [x10, #-1]
+        ldrb            w13, [x2, x11]
+        strb            w12, [sp]
+        strb            w13, [sp, x11]
+        strb            w14, [x9, #-1]
+12:
+        ldr             q16, [sp], #16
+        eor             v16.16b, v16.16b, v1.16b
+        cmp             x3, #5
+        b.lo            13f // less than 5 bytes
+        ldr             q17, [x15]
+        tbl             v16.16b, { v16.16b }, v17.16b
+.if ! \le
+        tbl             v16.16b, { v16.16b }, v2.16b
+.endif
+        b               6b // reduce 128 to 64
+
+13: // less than 5 bytes
+.if ! \le
+        ldr             q17, [x15, #12]
+        tbl             v16.16b, { v16.16b }, v17.16b
+        rev64           v16.16b, v16.16b
+.else
+        ldr             q17, [x15, #8]
+        tbl             v16.16b, { v16.16b }, v17.16b
+.endif
+        b               7b // reduce 64 to 32
+endfunc
+.endm
+
+crc_fn_template 0
+crc_fn_template 1
+
+DISABLE_PMULL
+DISABLE_EOR3
+#endif
diff --git a/libavutil/aarch64/crc.h b/libavutil/aarch64/crc.h
index 08efdd28f3..8d4a232a38 100644
--- a/libavutil/aarch64/crc.h
+++ b/libavutil/aarch64/crc.h
@@ -29,26 +29,165 @@
 #include "libavutil/avassert.h"
 #include "libavutil/cpu.h"
 #include "libavutil/crc.h"
+#include "libavutil/intreadwrite.h"
 
+#if HAVE_ARM_CRC
 FF_VISIBILITY_PUSH_HIDDEN
 uint32_t ff_crc32_aarch64(const AVCRC *ctx, uint32_t crc, const uint8_t 
*buffer,
                           size_t length);
 FF_VISIBILITY_POP_HIDDEN
+#endif
+
+#if HAVE_PMULL && HAVE_EOR3
+#include "libavutil/crc_internal.h"
+
+FF_VISIBILITY_PUSH_HIDDEN
+uint32_t ff_crc_neon_pmull(const AVCRC *ctx, uint32_t crc, const uint8_t 
*buffer,
+                           size_t length);
+uint32_t ff_crc_le_neon_pmull(const AVCRC *ctx, uint32_t crc, const uint8_t 
*buffer,
+                              size_t length);
+FF_VISIBILITY_POP_HIDDEN
+
+enum {
+    CRC_C    = 0,
+    PMULL_BE,
+    PMULL_LE,
+};
+
+static const AVCRC crc_table_pmull[AV_CRC_MAX][17] = {
+    [AV_CRC_8_ATM] = {
+        PMULL_BE,
+        0xbc000000, 0x0, 0x32000000, 0x0,
+        0x94000000, 0x0, 0xc4000000, 0x0,
+        0x62000000, 0x0, 0x79000000, 0x0,
+        0x07156a16, 0x1, 0x07000000, 0x1,
+    },
+    [AV_CRC_8_EBU] = {
+        PMULL_BE,
+        0xf3000000, 0x0, 0xb5000000, 0x0,
+        0x0d000000, 0x0, 0xfc000000, 0x0,
+        0x6a000000, 0x0, 0x65000000, 0x0,
+        0x1c4b8192, 0x1, 0x1d000000, 0x1,
+    },
+    [AV_CRC_16_ANSI] = {
+        PMULL_BE,
+        0x807d0000, 0x0, 0xf9e30000, 0x0,
+        0xff830000, 0x0, 0xf9130000, 0x0,
+        0x807b0000, 0x0, 0x86630000, 0x0,
+        0xfffbffe7, 0x1, 0x80050000, 0x1,
+    },
+    [AV_CRC_16_CCITT] = {
+        PMULL_BE,
+        0x59b00000, 0x0, 0x60190000, 0x0,
+        0x45630000, 0x0, 0xd5f60000, 0x0,
+        0xaa510000, 0x0, 0xeb230000, 0x0,
+        0x11303471, 0x1, 0x10210000, 0x1,
+    },
+    [AV_CRC_24_IEEE] = {
+        PMULL_BE,
+        0x467d2400, 0x0, 0x1f428700, 0x0,
+        0x64e4d700, 0x0, 0x2c8c9d00, 0x0,
+        0xd9fe8c00, 0x0, 0xfd7e0c00, 0x0,
+        0xf845fe24, 0x1, 0x864cfb00, 0x1,
+    },
+    [AV_CRC_32_IEEE] = {
+        PMULL_BE,
+        0xe6228b11, 0x0, 0x8833794c, 0x0,
+        0xe8a45605, 0x0, 0xc5b9cd4c, 0x0,
+        0x490d678d, 0x0, 0xf200aa66, 0x0,
+        0x04d101df, 0x1, 0x04c11db7, 0x1,
+    },
+    [AV_CRC_32_IEEE_LE] = {
+        PMULL_LE,
+        0x54442bd4, 0x1, 0xc6e41596, 0x1,
+        0x751997d0, 0x1, 0xccaa009e, 0x0,
+        0xccaa009e, 0x0, 0x63cd6124, 0x1,
+        0xf7011640, 0x1, 0xdb710641, 0x1,
+    },
+    [AV_CRC_16_ANSI_LE] = {
+        PMULL_LE,
+        0x1b0c2, 0x0, 0x0000bffa, 0x0,
+        0x1d0c2, 0x0, 0x00018cc2, 0x0,
+        0x00018cc2, 0x0, 0x1bc02, 0x0,
+        0xcfffbffe, 0x1, 0x14003, 0x0,
+    },
+};
+
+
+/**
+ * Fill a CRC context with the constants used by the PMULL code.
+ *
+ * ctx[0] receives the PMULL_LE or PMULL_BE selector; eight 64-bit constants
+ * are then stored at byte offsets 0..56 counted from ctx + 1.
+ *
+ * @param ctx      context to fill
+ * @param le       non-zero if poly is given in bit-reversed form
+ * @param bits     degree of the polynomial
+ * @param poly     generator polynomial
+ * @param ctx_size not used by this function
+ */
+static inline void crc_init_aarch64(AVCRC *ctx, int le, int bits, uint32_t 
poly, int ctx_size)
+{
+    uint64_t poly_;
+    if (le) {
+        // convert the reversed representation to regular form
+        poly = reverse(poly, bits) >> 1;
+    }
+    // convert to 32 degree polynomial
+    poly_ = ((uint64_t)poly) << (32 - bits);
+
+    uint64_t div;
+    // the constants live right after the selector word in ctx[0]
+    uint8_t *dst = (uint8_t*)(ctx + 1);
+    if (le) {
+        ctx[0] = PMULL_LE;
+        AV_WN64(dst +  0, xnmodp(4 * 128 + 32, poly_, 32, &div, le));
+        AV_WN64(dst +  8, xnmodp(4 * 128 - 32, poly_, 32, &div, le));
+        // the same constant is stored twice, at offsets 24 and 32
+        uint64_t tmp = xnmodp(128 - 32, poly_, 32, &div, le);
+        AV_WN64(dst + 16, xnmodp(128 + 32, poly_, 32, &div, le));
+        AV_WN64(dst + 24, tmp);
+        AV_WN64(dst + 32, tmp);
+        AV_WN64(dst + 40, xnmodp(64, poly_, 32, &div, le));
+        // div is the quotient left behind by the xnmodp(64, ...) call above
+        AV_WN64(dst + 48, div);
+        AV_WN64(dst + 56, reverse(poly_ | (1ULL << 32), 32));
+    } else {
+        ctx[0] = PMULL_BE;
+        AV_WN64(dst +  0, xnmodp(4 * 128, poly_, 32, &div, le));
+        AV_WN64(dst +  8, xnmodp(4 * 128 + 64, poly_, 32, &div, le));
+        AV_WN64(dst + 16, xnmodp(128, poly_, 32, &div, le));
+        AV_WN64(dst + 24, xnmodp(128 + 64, poly_, 32, &div, le));
+        AV_WN64(dst + 32, xnmodp(64, poly_, 32, &div, le));
+        // offset 48 is written before offset 40 on purpose: it needs the
+        // quotient of the xnmodp(64, ...) call, which the next call overwrites
+        AV_WN64(dst + 48, div);
+        AV_WN64(dst + 40, xnmodp(96, poly_, 32, &div, le));
+        AV_WN64(dst + 56, poly_ | (1ULL << 32));
+    }
+}
+#endif
+
+/**
+ * Try to initialise ctx for the PMULL+EOR3 CRC implementation.
+ *
+ * @return 1 if the build has PMULL and EOR3 support, the running CPU reports
+ *         both flags and ctx was filled by crc_init_aarch64(); 0 otherwise,
+ *         in which case ctx is left untouched
+ */
+static inline av_cold int ff_crc_init_aarch64(AVCRC *ctx, int le, int bits, 
uint32_t poly, int ctx_size)
+{
+#if HAVE_PMULL && HAVE_EOR3
+    int cpu_flags = av_get_cpu_flags();
+
+    // both extensions are required: the assembly uses PMULL and EOR3 together
+    if (have_pmull(cpu_flags) && have_eor3(cpu_flags)) {
+        crc_init_aarch64(ctx, le, bits, poly, ctx_size);
+        return 1;
+    }
+#endif
+    return 0;
+}
 
 static inline uint32_t ff_crc_aarch64(const AVCRC *ctx, uint32_t crc,
                                       const uint8_t *buffer, size_t length)
 {
-#if HAVE_ARM_CRC
-    av_assert2(ctx[0] == AV_CRC_32_IEEE_LE + 1);
-    return ff_crc32_aarch64(ctx, crc, buffer, length);
-#else
-    av_unreachable("AARCH64 has only AV_CRC_32_IEEE_LE arch-specific CRC 
code");
-    return 0;
+    switch (ctx[0]) {
+#if HAVE_PMULL && HAVE_EOR3
+    case PMULL_BE: return ff_crc_neon_pmull(ctx, crc, buffer, length);
+    case PMULL_LE: return ff_crc_le_neon_pmull(ctx, crc, buffer, length);
 #endif
+#if HAVE_ARM_CRC
+    case (AV_CRC_32_IEEE_LE + 1): return ff_crc32_aarch64(ctx, crc, buffer, 
length);
+#endif
+    default: av_unreachable("AARCH64 has PMULL_LE, PMULL_LE and 
AV_CRC_32_IEEE_LE arch-specific CRC code");
+    }
+    return 0;
 }
 
 static inline const AVCRC *ff_crc_get_table_aarch64(AVCRCId crc_id)
 {
+    int cpu_flags = av_get_cpu_flags();
+#if HAVE_PMULL && HAVE_EOR3
+    if (have_pmull(cpu_flags) && have_eor3(cpu_flags)) {
+        return crc_table_pmull[crc_id];
+    }
+#endif
 #if HAVE_ARM_CRC
     static const AVCRC crc32_ieee_le_ctx[] = {
         AV_CRC_32_IEEE_LE + 1
@@ -57,7 +196,6 @@ static inline const AVCRC *ff_crc_get_table_aarch64(AVCRCId 
crc_id)
     if (crc_id != AV_CRC_32_IEEE_LE)
         return NULL;
 
-    int cpu_flags = av_get_cpu_flags();
     if (have_arm_crc(cpu_flags)) {
         return crc32_ieee_le_ctx;
     }
diff --git a/libavutil/crc.c b/libavutil/crc.c
index c8ccaf8162..335448f8db 100644
--- a/libavutil/crc.c
+++ b/libavutil/crc.c
@@ -357,6 +357,10 @@ int av_crc_init(AVCRC *ctx, int le, int bits, uint32_t 
poly, int ctx_size)
     int done = ff_crc_init_x86(ctx, le, bits, poly, ctx_size);
     if (done)
         return 0;
+#elif ARCH_AARCH64
+    int done = ff_crc_init_aarch64(ctx, le, bits, poly, ctx_size);
+    if (done)
+        return 0;
 #endif
 
     for (i = 0; i < 256; i++) {
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PR] avutil/crc: add aarch64 NEON PMULL+EOR3 SIMD implementation for av_crc (PR #21651)

Reply via email to