crc: add aarch64 hybrid crc32 NEON PMULL+EOR SIMD implementation

Shreesh Adiga via ffmpeg-cvslog Thu, 02 Jul 2026 02:05:50 -0700

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit 915bac7bdc89317b85dc19e9cf6e7aed99be2e68
Author:     Shreesh Adiga <[email protected]>
AuthorDate: Mon Mar 23 17:25:00 2026 +0530
Commit:     Martin Storsjö <[email protected]>
CommitDate: Thu Jul 2 09:03:25 2026 +0000

    avutil/crc: add aarch64 hybrid crc32 NEON PMULL+EOR SIMD implementation
    
    Adding crc32 specialization for aarch64 which uses both PMULL and crc32
    instructions to perform 192 bytes fold in one iteration, performing
    9x PMULL and 6 crc32 in one loop iteration, obtaining higher performance for
    large inputs >8kB. This approach is based on zlib-ng implementation which
    is also described at https://github.com/corsix/fast-crc32.
    
    For smaller buffers it was observed to be slightly slower, so this logic
    is only used for input sizes of at least 8192 bytes; smaller inputs use
    the 4x PMULL folding method, with scalar crc32 instructions processing
    the remaining bytes.
    
    On a MediaTek Dimensity 9400 Android device in a termux environment,
    with the normal checkasm seed 0, which picks a random buffer size with a
    max buffer size of 16kB, the data observed on Cortex X925, A720 and X4:
    X925 Before:
      crc_32_IEEE_LE_c:                  12762.0
      crc_32_IEEE_LE_crc:                  667.5 (19.11x)
      crc_32_IEEE_LE_pmull_eor3:           346.9 (26.30x)
    X925 After:
      crc_32_IEEE_LE_c:                  12707.6
      crc_32_IEEE_LE_crc:                  665.2 (19.10x)
      crc_32_IEEE_LE_pmull_eor3:           292.8 (41.90x)
    
    A720 Before:
      crc_32_IEEE_LE_c:                  23059.1
      crc_32_IEEE_LE_crc:                 1220.7 (18.89x)
      crc_32_IEEE_LE_pmull_eor3:          1198.9 (19.23x)
    A720 After:
      crc_32_IEEE_LE_c:                  23293.3
      crc_32_IEEE_LE_crc:                 1209.1 (19.26x)
      crc_32_IEEE_LE_pmull_eor3:          1150.4 (20.24x)
    
    X4 Before:
      crc_32_IEEE_LE_c:                  12405.5
      crc_32_IEEE_LE_crc:                  664.5 (18.67x)
      crc_32_IEEE_LE_pmull_eor3:           498.1 (24.90x)
    X4 After:
      crc_32_IEEE_LE_c:                  12457.2
      crc_32_IEEE_LE_crc:                  665.5 (18.72x)
      crc_32_IEEE_LE_pmull_eor3:           468.8 (26.57x)
    
    So it seems to work well on a high performance core like the X925, where
    it results in about 20% better performance, while having tiny gains on
    other cores.
    
    Testing with an input size of 160 kB, after modifying the checkasm crc test
    to increase the buffer size to 160 kB and always use the full capacity
    instead of a random size, results in the observations below:
    X925 Before:
      crc_32_IEEE_LE_c:                 210177.1
      crc_32_IEEE_LE_crc:                10313.7 (20.35x)
      crc_32_IEEE_LE_pmull_eor3:          6580.9 (31.83x)
    X925 After:
      crc_32_IEEE_LE_c:                 210869.3
      crc_32_IEEE_LE_crc:                10304.8 (20.36x)
      crc_32_IEEE_LE_pmull_eor3:          3098.5 (68.05x)
    
    A720 Before:
      crc_32_IEEE_LE_c:                 387502.5
      crc_32_IEEE_LE_crc:                19196.7 (19.54x)
      crc_32_IEEE_LE_pmull_eor3:         18717.1 (20.63x)
    A720 After:
      crc_32_IEEE_LE_c:                 392090.8
      crc_32_IEEE_LE_crc:                19795.1 (18.68x)
      crc_32_IEEE_LE_pmull_eor3:         14971.4 (24.97x)
    
    X4 Before:
      crc_32_IEEE_LE_c:                 196232.0
      crc_32_IEEE_LE_crc:                10378.7 (18.68x)
      crc_32_IEEE_LE_pmull_eor3:          7742.0 (25.29x)
    X4 After:
      crc_32_IEEE_LE_c:                 199632.9
      crc_32_IEEE_LE_crc:                10495.8 (18.32x)
      crc_32_IEEE_LE_pmull_eor3:          5448.9 (24.69x)
    
    This seems to result in about 2x gains on X925, 25% on A720 and 40% on X4.
    In general the performance gains depend on the CPU core and input size,
    and this optimization benefits large input sizes, especially on high
    performance cores like the X925 and the Apple M series.
---
 libavutil/aarch64/crc.S | 288 +++++++++++++++++++++++++++++++++++++++++++++++-
 libavutil/aarch64/crc.h |  29 ++++-
 2 files changed, 311 insertions(+), 6 deletions(-)

diff --git a/libavutil/aarch64/crc.S b/libavutil/aarch64/crc.S
index 6ff109aa71..1dc02c7d86 100644
--- a/libavutil/aarch64/crc.S
+++ b/libavutil/aarch64/crc.S
@@ -122,13 +122,13 @@ endconst
 
 // assume Vfold is v16 and v0 is filled with 0
 // uses v17 as temp
-.macro FOLD_64_TO_32 le, Vconst
+.macro FOLD_64_TO_32 le, Vconst, output_reg
 .if ! \le
         pmull           v17.1q, v16.1d, \Vconst\().1d
         pmull2          v17.1q, v17.2d, \Vconst\().2d
         eor             v16.16b, v16.16b, v17.16b
-        fmov            w0, s16
-        rev             w0, w0
+        fmov            \output_reg, s16
+        rev             \output_reg, \output_reg
 .else
         mov             v16.s[0], wzr
         pmull           v17.1q, v16.1d, \Vconst\().1d
@@ -136,7 +136,7 @@ endconst
         ext             \Vconst\().16b, \Vconst\().16b, \Vconst\().16b, #8
         pmull           v17.1q, v17.1d, \Vconst\().1d
         eor             v16.16b, v16.16b, v17.16b
-        mov             w0, v16.s[2]
+        mov             \output_reg, v16.s[2]
 .endif
 .endm
 
@@ -259,7 +259,7 @@ function ff_crc_neon_pmull, export=1
 
 7: // reduce 64 to 32
         ldr             q3, [x0, #(CTX_OFFSET + 48)]
-        FOLD_64_TO_32   \le, v3
+        FOLD_64_TO_32   \le, v3, w0
         ret
 
 8: // less than 64 bytes
@@ -329,6 +329,284 @@ endfunc
 crc_fn_template 0
 crc_fn_template 1
 
+#if HAVE_ARM_CRC
+ENABLE_ARM_CRC
+// uses x7, x6, x4 and v31 as temporary registers.
+.macro CRC_SHIFT crc_reg, nbits_reg, output_neon_reg
+        mov             x7, #-2
+1:
+        and             x6, x\nbits_reg, #1
+        lsr             x\nbits_reg, x\nbits_reg, #1
+        sub             x\nbits_reg, x\nbits_reg, #16
+        add             x7, x6, x7, lsl #1
+        cmp             x\nbits_reg, #191
+        b.hi            1b
+        mvn             x6, x7
+        mov             w7, #-2147483648
+        lsr             w7, w7, w\nbits_reg
+        lsr             x\nbits_reg, x\nbits_reg, #5
+2:
+        subs            x\nbits_reg, x\nbits_reg, #1
+        crc32w          w7, w7, wzr
+        b.ne            2b
+        lsr             x4, x6, #1
+        cbz             x4, 4f
+        and             w\nbits_reg, w6, #1
+3:
+        fmov            s31, w7
+        pmull           v31.8h, v31.8b, v31.8b
+        fmov            x7, d31
+        lsl             x7, x7, x\nbits_reg
+        and             w\nbits_reg, w4, #1
+        crc32x          w7, wzr, x7
+        lsr             x4, x4, #1
+        cbnz            x4, 3b
+4:
+        fmov            s\output_neon_reg, w\crc_reg
+        fmov            s31, w7
+        pmull           v\output_neon_reg\().1q, v\output_neon_reg\().1d, 
v31.1d
+.endm
+
+// This routine is based on zlib-ng's implementation based on
+// https://github.com/zlib-ng/zlib-ng/commit/b5638a82e726c9941bd3a1e7a23182d038eb831f
+// https://github.com/corsix/fast-crc32
+function ff_crc32_pmull_eor3_aarch64, export=1
+        neg             x8, x2
+        tst             x8, #0xf
+        b.eq            4f // buf 16b aligned
+        cbz             x3, 11f
+        tbz             w8, #0, 1f
+        ldrb            w9, [x2], #1
+        sub             x3, x3, #1
+        crc32b          w1, w1, w9
+1:
+        tbz             w8, #1, 2f
+        subs            x9, x3, #2
+        b.lo            9f
+        mov             x3, x9
+        ldrh            w10, [x2], #2
+        crc32h          w1, w1, w10
+2:
+        tbz             w8, #2, 3f
+        subs            x9, x3, #4
+        b.lo            9f
+        mov             x3, x9
+        ldr             w10, [x2], #4
+        crc32w          w1, w1, w10
+3:
+        tbz             w8, #3, 4f
+        subs            x9, x3, #8
+        b.lo            9f
+        mov             x3, x9
+        ldr             x10, [x2], #8
+        crc32x          w1, w1, x10
+
+4: // buf 16b aligned
+        cmp             x3, #2, lsl #12 // 8192
+        b.lo            12f // 4x fold
+
+        mov             x8, #-6148914691236517206
+        ldur            q17, [x0, #(CTX_OFFSET + 0)]
+        movk            x8, #43691
+        mov             w10, wzr
+        umulh           x8, x3, x8
+        ldur            q18, [x0, #(CTX_OFFSET + 16)]
+        mov             w11, wzr
+        lsr             x9, x8, #7
+        add             x8, x9, x9, lsl #1
+        lsl             x12, x9, #4
+        lsl             x13, x9, #5
+        lsl             x8, x8, #4
+        add             x16, x2, x12
+        sub             x15, x16, #32
+        sub             x17, x3, x8
+        add             x8, x2, x8
+        ldp             q6, q16, [x8]
+        ldr             q0, [x8, #128]
+        ldp             q4, q7, [x8, #32]
+        sub             x3, x17, #144
+        ldp             q3, q5, [x8, #64]
+        ldp             q2, q1, [x8, #96]
+        add             x8, x8, #144
+
+5: // 192b hybrid fold
+        pmull           v19.1q, v6.1d, v17.1d
+        ldp             q26, q28, [x8]
+        pmull2          v6.1q, v6.2d, v17.2d
+        ldp             x16, x6, [x2]
+        pmull           v20.1q, v16.1d, v17.1d
+        add             x17, x2, x12
+        add             x7, x2, x13
+        pmull2          v16.1q, v16.2d, v17.2d
+        add             x2, x2, #16
+        sub             x3, x3, #144
+        pmull           v21.1q, v4.1d, v17.1d
+        ldp             x4, x17, [x17]
+        pmull2          v4.1q, v4.2d, v17.2d
+        eor3            v6.16b, v6.16b, v19.16b, v26.16b
+        crc32x          w10, w10, x4
+        pmull           v22.1q, v7.1d, v17.1d
+        ldp             x5, x7, [x7]
+        pmull2          v7.1q, v7.2d, v17.2d
+        eor3            v16.16b, v16.16b, v20.16b, v28.16b
+        crc32x          w11, w11, x5
+        ldp             q26, q20, [x8, #32]
+        cmp             x2, x15
+        pmull           v23.1q, v3.1d, v17.1d
+        crc32x          w16, w1, x16
+        crc32x          w1, w16, x6
+        pmull2          v3.1q, v3.2d, v17.2d
+        crc32x          w10, w10, x17
+        crc32x          w11, w11, x7
+        pmull           v24.1q, v5.1d, v17.1d
+        eor3            v4.16b, v4.16b, v21.16b, v26.16b
+        eor3            v7.16b, v7.16b, v22.16b, v20.16b
+        pmull2          v5.1q, v5.2d, v17.2d
+        ldp             q21, q26, [x8, #64]
+        pmull           v25.1q, v2.1d, v17.1d
+        ldp             q28, q20, [x8, #96]
+        pmull2          v2.1q, v2.2d, v17.2d
+        ldr             q22, [x8, #128]
+        add             x8, x8, #144
+        pmull           v27.1q, v1.1d, v17.1d
+        eor3            v3.16b, v3.16b, v23.16b, v21.16b
+        pmull2          v1.1q, v1.2d, v17.2d
+        eor3            v5.16b, v5.16b, v24.16b, v26.16b
+        pmull           v19.1q, v0.1d, v17.1d
+        pmull2          v0.1q, v0.2d, v17.2d
+        eor3            v2.16b, v2.16b, v25.16b, v28.16b
+        eor3            v1.16b, v1.16b, v27.16b, v20.16b
+        eor3            v0.16b, v0.16b, v19.16b, v22.16b
+        b.ls            5b // 192b hybrid fold
+
+        add             x17, x2, x13
+        mov             x13, #-33
+        add             x12, x2, x12
+        ldur            q22, [x0, #(CTX_OFFSET + 32)]
+        ldp             x16, x15, [x2]
+        ldur            q23, [x0, #(CTX_OFFSET + 48)]
+        ldp             x14, x0, [x12]
+        crc32x          w16, w1, x16
+        crc32x          w10, w10, x14
+        crc32x          w12, w16, x15
+        ldp             x7, x17, [x17]
+        crc32x          w14, w11, x7
+        crc32x          w11, w10, x0
+        crc32x          w10, w14, x17
+        mov             w14, #1408
+        madd            x14, x9, x14, x13
+
+        pmull           v20.1q, v6.1d, v18.1d
+        pmull2          v6.1q, v6.2d, v18.2d
+        pmull           v21.1q, v5.1d, v18.1d
+        pmull2          v5.1q, v5.2d, v18.2d
+        eor3            v6.16b, v6.16b, v20.16b, v16.16b
+        pmull           v20.1q, v7.1d, v18.1d
+        pmull2          v7.1q, v7.2d, v18.2d
+        eor3            v2.16b, v5.16b, v21.16b, v2.16b
+        pmull           v16.1q, v6.1d, v18.1d
+        pmull2          v6.1q, v6.2d, v18.2d
+        pmull           v5.1q, v2.1d, v22.1d
+        eor3            v3.16b, v7.16b, v20.16b, v3.16b
+        pmull2          v2.1q, v2.2d, v22.2d
+
+        CRC_SHIFT       12, 14, 29
+        mov             w14, #1280
+        eor3            v4.16b, v6.16b, v16.16b, v4.16b
+        pmull           v16.1q, v1.1d, v18.1d
+        pmull2          v1.1q, v1.2d, v18.2d
+        pmull           v6.1q, v4.1d, v22.1d
+        pmull2          v4.1q, v4.2d, v22.2d
+        madd            x14, x9, x14, x13
+        CRC_SHIFT       11, 14, 28
+        eor3            v3.16b, v4.16b, v6.16b, v3.16b
+        eor3            v4.16b, v1.16b, v16.16b, v0.16b
+        pmull           v0.1q, v3.1d, v23.1d
+        eor3            v2.16b, v2.16b, v5.16b, v4.16b
+        pmull2          v1.1q, v3.2d, v23.2d
+        mov             w12, #1152
+        madd            x9, x9, x12, x13
+        eor3            v0.16b, v1.16b, v0.16b, v2.16b
+        CRC_SHIFT       10, 9, 27
+        fmov            x9, d0
+        crc32x          w9, wzr, x9
+        eor3            v1.16b, v27.16b, v28.16b, v29.16b
+        dup             v2.2d, v0.d[1]
+        eor             v1.16b, v1.16b, v2.16b
+        mov             x2, x8
+        fmov            x10, d1
+        crc32x          w1, w9, x10
+
+6: // process tail (<192 bytes)
+        bic             x5, x3, #15
+        and             x3, x3, #0xf
+        cbz             x5, 8f
+7:
+        ldp             x6, x7, [x2], #16
+        subs            x5, x5, #16
+        crc32x          w1, w1, x6
+        crc32x          w1, w1, x7
+        b.ne            7b
+8:
+        tbz             x3, #3, 9f
+        ldr             x10, [x2], #8
+        sub             x3, x3, #8
+        crc32x          w1, w1, x10
+9:
+        cbz             x3, 11f
+10:
+        ldrb            w10, [x2], #1
+        subs            x3, x3, #1
+        crc32b          w1, w1, w10
+        b.ne            10b
+11:
+        mov             w0, w1
+        ret
+
+12: // 4x fold
+        cmp             x3, #192
+        b.lo            6b // process tail (<192 bytes)
+
+        ldur            q3, [x0, #(CTX_OFFSET + 64)]
+        movi            v0.2d, #0
+        fmov            s1, w1
+        ld1             {v16.16b-v19.16b}, [x2], #64
+        sub             x3, x3, #64
+        eor             v16.16b, v16.16b, v1.16b
+        ldur            q25, [x0, #(CTX_OFFSET + 80)]
+
+        bic             x5, x3, #63
+        and             x3, x3, #0x3f
+
+13: // fold 4x loop
+        ld1             {v20.16b-v23.16b}, [x2], #64
+        pmull           v4.1q, v16.1d, v3.1d
+        pmull           v5.1q, v17.1d, v3.1d
+        pmull           v6.1q, v18.1d, v3.1d
+        pmull           v7.1q, v19.1d, v3.1d
+        pmull2          v16.1q, v16.2d, v3.2d
+        pmull2          v17.1q, v17.2d, v3.2d
+        pmull2          v18.1q, v18.2d, v3.2d
+        pmull2          v19.1q, v19.2d, v3.2d
+        subs            x5, x5, #64
+        eor3            v16.16b, v16.16b, v4.16b, v20.16b
+        eor3            v17.16b, v17.16b, v5.16b, v21.16b
+        eor3            v18.16b, v18.16b, v6.16b, v22.16b
+        eor3            v19.16b, v19.16b, v7.16b, v23.16b
+        b.ne            13b // fold 4x loop
+
+        FOLD_SINGLE     v16, v25, v17, v4
+        ldur            q26, [x0, #(CTX_OFFSET + 96)]
+        FOLD_SINGLE     v16, v25, v18, v4
+        ldur            q27, [x0, #(CTX_OFFSET + 112)]
+        FOLD_SINGLE     v16, v25, v19, v4
+        FOLD_128_TO_64  1, v26
+        FOLD_64_TO_32   1, v27, w1
+        b               6b // process tail (<192 bytes)
+endfunc
+DISABLE_ARM_CRC
+#endif
+
 DISABLE_PMULL
 DISABLE_EOR3
 #endif
diff --git a/libavutil/aarch64/crc.h b/libavutil/aarch64/crc.h
index e31625606a..90c7a834d0 100644
--- a/libavutil/aarch64/crc.h
+++ b/libavutil/aarch64/crc.h
@@ -52,6 +52,7 @@ enum {
     CRC_C    = 0,
     PMULL_BE,
     PMULL_LE,
+    CRC32_PMULL_LE,
 };
 
 static const AVCRC crc_table_pmull[AV_CRC_MAX][17] = {
@@ -149,6 +150,24 @@ static inline void crc_init_aarch64(AVCRC *ctx, int le, 
int bits, uint32_t poly,
         AV_WN64(dst + 56, poly_ | (1ULL << 32));
     }
 }
+
+#if HAVE_ARM_CRC
+FF_VISIBILITY_PUSH_HIDDEN
+uint32_t ff_crc32_pmull_eor3_aarch64(const AVCRC *ctx, uint32_t crc, const 
uint8_t *buffer,
+                                     size_t length);
+FF_VISIBILITY_POP_HIDDEN
+static const AVCRC crc_table_crc32_pmull[] = {
+        CRC32_PMULL_LE,
+        0x26b70c3d, 0x0, 0x3f41287a, 0x0,
+        0xae689191, 0x0, 0xccaa009e, 0x0,
+        0xf1da05aa, 0x0, 0x81256527, 0x0,
+        0x8f352d95, 0x0, 0x1d9513d7, 0x0,
+        0x54442bd4, 0x1, 0xc6e41596, 0x1,
+        0x751997d0, 0x1, 0xccaa009e, 0x0,
+        0xccaa009e, 0x0, 0x63cd6124, 0x1,
+        0xf7011640, 0x1, 0xdb710641, 0x1,
+};
+#endif
 #endif
 
 static inline av_cold int ff_crc_init_aarch64(AVCRC *ctx, int le, int bits, 
uint32_t poly, int ctx_size)
@@ -169,13 +188,16 @@ static inline uint32_t ff_crc_aarch64(const AVCRC *ctx, 
uint32_t crc,
 {
     switch (ctx[0]) {
 #if HAVE_PMULL && HAVE_EOR3
+#if HAVE_ARM_CRC
+    case CRC32_PMULL_LE: return ff_crc32_pmull_eor3_aarch64(ctx, crc, buffer, 
length);
+#endif
     case PMULL_BE: return ff_crc_neon_pmull(ctx, crc, buffer, length);
     case PMULL_LE: return ff_crc_le_neon_pmull(ctx, crc, buffer, length);
 #endif
 #if HAVE_ARM_CRC
     case (AV_CRC_32_IEEE_LE + 1): return ff_crc32_aarch64(ctx, crc, buffer, 
length);
 #endif
-    default: av_unreachable("AARCH64 has PMULL_LE, PMULL_BE and 
AV_CRC_32_IEEE_LE arch-specific CRC code");
+    default: av_unreachable("AARCH64 has PMULL_LE, PMULL_BE, CRC32_PMULL_LE, 
and AV_CRC_32_IEEE_LE arch-specific CRC code");
     }
     return 0;
 }
@@ -185,6 +207,11 @@ static inline const AVCRC 
*ff_crc_get_table_aarch64(AVCRCId crc_id)
     int cpu_flags = av_get_cpu_flags();
 #if HAVE_PMULL && HAVE_EOR3
     if (have_pmull(cpu_flags) && have_eor3(cpu_flags)) {
+#if HAVE_ARM_CRC
+        if (crc_id == AV_CRC_32_IEEE_LE && have_arm_crc(cpu_flags)) {
+            return crc_table_crc32_pmull;
+        }
+#endif
         return crc_table_pmull[crc_id];
     }
 #endif

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 01/02: avutil/crc: add aarch64 hybrid crc32 NEON PMULL+EOR SIMD implementation

Reply via email to