This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 915bac7bdc89317b85dc19e9cf6e7aed99be2e68 Author: Shreesh Adiga <[email protected]> AuthorDate: Mon Mar 23 17:25:00 2026 +0530 Commit: Martin Storsjö <[email protected]> CommitDate: Thu Jul 2 09:03:25 2026 +0000 avutil/crc: add aarch64 hybrid crc32 NEON PMULL+EOR SIMD implementation Add a crc32 specialization for aarch64 that uses both PMULL and crc32 instructions to fold 192 bytes per loop iteration (9x PMULL and 6 crc32 per iteration), obtaining higher performance for large inputs >8kB. This approach is based on the zlib-ng implementation, which is also described at https://github.com/corsix/fast-crc32. For smaller buffer sizes it was observed to be slightly slower, so this logic is only used for input sizes >8192; for smaller sizes the 4x PMULL folding method is used, along with scalar crc32 instructions for processing the remaining input. On a MediaTek Dimensity 9400 Android device in a termux environment, with the normal checkasm seed 0, which picks a random buffer size with a max buffer size of 16kB, the data observed on Cortex X925, A720 and X4: X925 Before: crc_32_IEEE_LE_c: 12762.0 crc_32_IEEE_LE_crc: 667.5 (19.11x) crc_32_IEEE_LE_pmull_eor3: 346.9 (26.30x) X925 After: crc_32_IEEE_LE_c: 12707.6 crc_32_IEEE_LE_crc: 665.2 (19.10x) crc_32_IEEE_LE_pmull_eor3: 292.8 (41.90x) A720 Before: crc_32_IEEE_LE_c: 23059.1 crc_32_IEEE_LE_crc: 1220.7 (18.89x) crc_32_IEEE_LE_pmull_eor3: 1198.9 (19.23x) A720 After: crc_32_IEEE_LE_c: 23293.3 crc_32_IEEE_LE_crc: 1209.1 (19.26x) crc_32_IEEE_LE_pmull_eor3: 1150.4 (20.24x) X4 Before: crc_32_IEEE_LE_c: 12405.5 crc_32_IEEE_LE_crc: 664.5 (18.67x) crc_32_IEEE_LE_pmull_eor3: 498.1 (24.90x) X4 After: crc_32_IEEE_LE_c: 12457.2 crc_32_IEEE_LE_crc: 665.5 (18.72x) crc_32_IEEE_LE_pmull_eor3: 468.8 (26.57x) So it seems to work well on high performance cores like X925, resulting in about 20% better performance, while having tiny gains on other cores. 
Testing with an input size of 160 kB, after modifying the checkasm crc test to increase the buffer size to 160kB and always use the full capacity instead of a random size, results in the observations below: X925 Before: crc_32_IEEE_LE_c: 210177.1 crc_32_IEEE_LE_crc: 10313.7 (20.35x) crc_32_IEEE_LE_pmull_eor3: 6580.9 (31.83x) X925 After: crc_32_IEEE_LE_c: 210869.3 crc_32_IEEE_LE_crc: 10304.8 (20.36x) crc_32_IEEE_LE_pmull_eor3: 3098.5 (68.05x) A720 Before: crc_32_IEEE_LE_c: 387502.5 crc_32_IEEE_LE_crc: 19196.7 (19.54x) crc_32_IEEE_LE_pmull_eor3: 18717.1 (20.63x) A720 After: crc_32_IEEE_LE_c: 392090.8 crc_32_IEEE_LE_crc: 19795.1 (18.68x) crc_32_IEEE_LE_pmull_eor3: 14971.4 (24.97x) X4 Before: crc_32_IEEE_LE_c: 196232.0 crc_32_IEEE_LE_crc: 10378.7 (18.68x) crc_32_IEEE_LE_pmull_eor3: 7742.0 (25.29x) X4 After: crc_32_IEEE_LE_c: 199632.9 crc_32_IEEE_LE_crc: 10495.8 (18.32x) crc_32_IEEE_LE_pmull_eor3: 5448.9 (24.69x) This seems to result in about 2x gains on X925, 25% on A720 and 40% on X4. In general the performance gains depend on the CPU core and input size, and this optimization benefits large input sizes, especially on high performance cores like X925 and Apple M series. --- libavutil/aarch64/crc.S | 288 +++++++++++++++++++++++++++++++++++++++++++++++- libavutil/aarch64/crc.h | 29 ++++- 2 files changed, 311 insertions(+), 6 deletions(-) diff --git a/libavutil/aarch64/crc.S b/libavutil/aarch64/crc.S index 6ff109aa71..1dc02c7d86 100644 --- a/libavutil/aarch64/crc.S +++ b/libavutil/aarch64/crc.S @@ -122,13 +122,13 @@ endconst // assume Vfold is v16 and v0 is filled with 0 // uses v17 as temp -.macro FOLD_64_TO_32 le, Vconst +.macro FOLD_64_TO_32 le, Vconst, output_reg .if ! 
\le pmull v17.1q, v16.1d, \Vconst\().1d pmull2 v17.1q, v17.2d, \Vconst\().2d eor v16.16b, v16.16b, v17.16b - fmov w0, s16 - rev w0, w0 + fmov \output_reg, s16 + rev \output_reg, \output_reg .else mov v16.s[0], wzr pmull v17.1q, v16.1d, \Vconst\().1d @@ -136,7 +136,7 @@ endconst ext \Vconst\().16b, \Vconst\().16b, \Vconst\().16b, #8 pmull v17.1q, v17.1d, \Vconst\().1d eor v16.16b, v16.16b, v17.16b - mov w0, v16.s[2] + mov \output_reg, v16.s[2] .endif .endm @@ -259,7 +259,7 @@ function ff_crc_neon_pmull, export=1 7: // reduce 64 to 32 ldr q3, [x0, #(CTX_OFFSET + 48)] - FOLD_64_TO_32 \le, v3 + FOLD_64_TO_32 \le, v3, w0 ret 8: // less than 64 bytes @@ -329,6 +329,284 @@ endfunc crc_fn_template 0 crc_fn_template 1 +#if HAVE_ARM_CRC +ENABLE_ARM_CRC +// uses x7, x6, x4 and v31 as temporary registers. +.macro CRC_SHIFT crc_reg, nbits_reg, output_neon_reg + mov x7, #-2 +1: + and x6, x\nbits_reg, #1 + lsr x\nbits_reg, x\nbits_reg, #1 + sub x\nbits_reg, x\nbits_reg, #16 + add x7, x6, x7, lsl #1 + cmp x\nbits_reg, #191 + b.hi 1b + mvn x6, x7 + mov w7, #-2147483648 + lsr w7, w7, w\nbits_reg + lsr x\nbits_reg, x\nbits_reg, #5 +2: + subs x\nbits_reg, x\nbits_reg, #1 + crc32w w7, w7, wzr + b.ne 2b + lsr x4, x6, #1 + cbz x4, 4f + and w\nbits_reg, w6, #1 +3: + fmov s31, w7 + pmull v31.8h, v31.8b, v31.8b + fmov x7, d31 + lsl x7, x7, x\nbits_reg + and w\nbits_reg, w4, #1 + crc32x w7, wzr, x7 + lsr x4, x4, #1 + cbnz x4, 3b +4: + fmov s\output_neon_reg, w\crc_reg + fmov s31, w7 + pmull v\output_neon_reg\().1q, v\output_neon_reg\().1d, v31.1d +.endm + +// This routine is based on zlib-ng's implementation based on +// https://github.com/zlib-ng/zlib-ng/commit/b5638a82e726c9941bd3a1e7a23182d038eb831f +// https://github.com/corsix/fast-crc32 +function ff_crc32_pmull_eor3_aarch64, export=1 + neg x8, x2 + tst x8, #0xf + b.eq 4f // buf 16b aligned + cbz x3, 11f + tbz w8, #0, 1f + ldrb w9, [x2], #1 + sub x3, x3, #1 + crc32b w1, w1, w9 +1: + tbz w8, #1, 2f + subs x9, x3, #2 + b.lo 9f + mov x3, 
x9 + ldrh w10, [x2], #2 + crc32h w1, w1, w10 +2: + tbz w8, #2, 3f + subs x9, x3, #4 + b.lo 9f + mov x3, x9 + ldr w10, [x2], #4 + crc32w w1, w1, w10 +3: + tbz w8, #3, 4f + subs x9, x3, #8 + b.lo 9f + mov x3, x9 + ldr x10, [x2], #8 + crc32x w1, w1, x10 + +4: // buf 16b aligned + cmp x3, #2, lsl #12 // 8192 + b.lo 12f // 4x fold + + mov x8, #-6148914691236517206 + ldur q17, [x0, #(CTX_OFFSET + 0)] + movk x8, #43691 + mov w10, wzr + umulh x8, x3, x8 + ldur q18, [x0, #(CTX_OFFSET + 16)] + mov w11, wzr + lsr x9, x8, #7 + add x8, x9, x9, lsl #1 + lsl x12, x9, #4 + lsl x13, x9, #5 + lsl x8, x8, #4 + add x16, x2, x12 + sub x15, x16, #32 + sub x17, x3, x8 + add x8, x2, x8 + ldp q6, q16, [x8] + ldr q0, [x8, #128] + ldp q4, q7, [x8, #32] + sub x3, x17, #144 + ldp q3, q5, [x8, #64] + ldp q2, q1, [x8, #96] + add x8, x8, #144 + +5: // 192b hybrid fold + pmull v19.1q, v6.1d, v17.1d + ldp q26, q28, [x8] + pmull2 v6.1q, v6.2d, v17.2d + ldp x16, x6, [x2] + pmull v20.1q, v16.1d, v17.1d + add x17, x2, x12 + add x7, x2, x13 + pmull2 v16.1q, v16.2d, v17.2d + add x2, x2, #16 + sub x3, x3, #144 + pmull v21.1q, v4.1d, v17.1d + ldp x4, x17, [x17] + pmull2 v4.1q, v4.2d, v17.2d + eor3 v6.16b, v6.16b, v19.16b, v26.16b + crc32x w10, w10, x4 + pmull v22.1q, v7.1d, v17.1d + ldp x5, x7, [x7] + pmull2 v7.1q, v7.2d, v17.2d + eor3 v16.16b, v16.16b, v20.16b, v28.16b + crc32x w11, w11, x5 + ldp q26, q20, [x8, #32] + cmp x2, x15 + pmull v23.1q, v3.1d, v17.1d + crc32x w16, w1, x16 + crc32x w1, w16, x6 + pmull2 v3.1q, v3.2d, v17.2d + crc32x w10, w10, x17 + crc32x w11, w11, x7 + pmull v24.1q, v5.1d, v17.1d + eor3 v4.16b, v4.16b, v21.16b, v26.16b + eor3 v7.16b, v7.16b, v22.16b, v20.16b + pmull2 v5.1q, v5.2d, v17.2d + ldp q21, q26, [x8, #64] + pmull v25.1q, v2.1d, v17.1d + ldp q28, q20, [x8, #96] + pmull2 v2.1q, v2.2d, v17.2d + ldr q22, [x8, #128] + add x8, x8, #144 + pmull v27.1q, v1.1d, v17.1d + eor3 v3.16b, v3.16b, v23.16b, v21.16b + pmull2 v1.1q, v1.2d, v17.2d + eor3 v5.16b, v5.16b, v24.16b, v26.16b + 
pmull v19.1q, v0.1d, v17.1d + pmull2 v0.1q, v0.2d, v17.2d + eor3 v2.16b, v2.16b, v25.16b, v28.16b + eor3 v1.16b, v1.16b, v27.16b, v20.16b + eor3 v0.16b, v0.16b, v19.16b, v22.16b + b.ls 5b // 192b hybrid fold + + add x17, x2, x13 + mov x13, #-33 + add x12, x2, x12 + ldur q22, [x0, #(CTX_OFFSET + 32)] + ldp x16, x15, [x2] + ldur q23, [x0, #(CTX_OFFSET + 48)] + ldp x14, x0, [x12] + crc32x w16, w1, x16 + crc32x w10, w10, x14 + crc32x w12, w16, x15 + ldp x7, x17, [x17] + crc32x w14, w11, x7 + crc32x w11, w10, x0 + crc32x w10, w14, x17 + mov w14, #1408 + madd x14, x9, x14, x13 + + pmull v20.1q, v6.1d, v18.1d + pmull2 v6.1q, v6.2d, v18.2d + pmull v21.1q, v5.1d, v18.1d + pmull2 v5.1q, v5.2d, v18.2d + eor3 v6.16b, v6.16b, v20.16b, v16.16b + pmull v20.1q, v7.1d, v18.1d + pmull2 v7.1q, v7.2d, v18.2d + eor3 v2.16b, v5.16b, v21.16b, v2.16b + pmull v16.1q, v6.1d, v18.1d + pmull2 v6.1q, v6.2d, v18.2d + pmull v5.1q, v2.1d, v22.1d + eor3 v3.16b, v7.16b, v20.16b, v3.16b + pmull2 v2.1q, v2.2d, v22.2d + + CRC_SHIFT 12, 14, 29 + mov w14, #1280 + eor3 v4.16b, v6.16b, v16.16b, v4.16b + pmull v16.1q, v1.1d, v18.1d + pmull2 v1.1q, v1.2d, v18.2d + pmull v6.1q, v4.1d, v22.1d + pmull2 v4.1q, v4.2d, v22.2d + madd x14, x9, x14, x13 + CRC_SHIFT 11, 14, 28 + eor3 v3.16b, v4.16b, v6.16b, v3.16b + eor3 v4.16b, v1.16b, v16.16b, v0.16b + pmull v0.1q, v3.1d, v23.1d + eor3 v2.16b, v2.16b, v5.16b, v4.16b + pmull2 v1.1q, v3.2d, v23.2d + mov w12, #1152 + madd x9, x9, x12, x13 + eor3 v0.16b, v1.16b, v0.16b, v2.16b + CRC_SHIFT 10, 9, 27 + fmov x9, d0 + crc32x w9, wzr, x9 + eor3 v1.16b, v27.16b, v28.16b, v29.16b + dup v2.2d, v0.d[1] + eor v1.16b, v1.16b, v2.16b + mov x2, x8 + fmov x10, d1 + crc32x w1, w9, x10 + +6: // process tail (<192 bytes) + bic x5, x3, #15 + and x3, x3, #0xf + cbz x5, 8f +7: + ldp x6, x7, [x2], #16 + subs x5, x5, #16 + crc32x w1, w1, x6 + crc32x w1, w1, x7 + b.ne 7b +8: + tbz x3, #3, 9f + ldr x10, [x2], #8 + sub x3, x3, #8 + crc32x w1, w1, x10 +9: + cbz x3, 11f +10: + ldrb w10, [x2], #1 
+ subs x3, x3, #1 + crc32b w1, w1, w10 + b.ne 10b +11: + mov w0, w1 + ret + +12: // 4x fold + cmp x3, #192 + b.lo 6b // process tail (<192 bytes) + + ldur q3, [x0, #(CTX_OFFSET + 64)] + movi v0.2d, #0 + fmov s1, w1 + ld1 {v16.16b-v19.16b}, [x2], #64 + sub x3, x3, #64 + eor v16.16b, v16.16b, v1.16b + ldur q25, [x0, #(CTX_OFFSET + 80)] + + bic x5, x3, #63 + and x3, x3, #0x3f + +13: // fold 4x loop + ld1 {v20.16b-v23.16b}, [x2], #64 + pmull v4.1q, v16.1d, v3.1d + pmull v5.1q, v17.1d, v3.1d + pmull v6.1q, v18.1d, v3.1d + pmull v7.1q, v19.1d, v3.1d + pmull2 v16.1q, v16.2d, v3.2d + pmull2 v17.1q, v17.2d, v3.2d + pmull2 v18.1q, v18.2d, v3.2d + pmull2 v19.1q, v19.2d, v3.2d + subs x5, x5, #64 + eor3 v16.16b, v16.16b, v4.16b, v20.16b + eor3 v17.16b, v17.16b, v5.16b, v21.16b + eor3 v18.16b, v18.16b, v6.16b, v22.16b + eor3 v19.16b, v19.16b, v7.16b, v23.16b + b.ne 13b // fold 4x loop + + FOLD_SINGLE v16, v25, v17, v4 + ldur q26, [x0, #(CTX_OFFSET + 96)] + FOLD_SINGLE v16, v25, v18, v4 + ldur q27, [x0, #(CTX_OFFSET + 112)] + FOLD_SINGLE v16, v25, v19, v4 + FOLD_128_TO_64 1, v26 + FOLD_64_TO_32 1, v27, w1 + b 6b // process tail (<192 bytes) +endfunc +DISABLE_ARM_CRC +#endif + DISABLE_PMULL DISABLE_EOR3 #endif diff --git a/libavutil/aarch64/crc.h b/libavutil/aarch64/crc.h index e31625606a..90c7a834d0 100644 --- a/libavutil/aarch64/crc.h +++ b/libavutil/aarch64/crc.h @@ -52,6 +52,7 @@ enum { CRC_C = 0, PMULL_BE, PMULL_LE, + CRC32_PMULL_LE, }; static const AVCRC crc_table_pmull[AV_CRC_MAX][17] = { @@ -149,6 +150,24 @@ static inline void crc_init_aarch64(AVCRC *ctx, int le, int bits, uint32_t poly, AV_WN64(dst + 56, poly_ | (1ULL << 32)); } } + +#if HAVE_ARM_CRC +FF_VISIBILITY_PUSH_HIDDEN +uint32_t ff_crc32_pmull_eor3_aarch64(const AVCRC *ctx, uint32_t crc, const uint8_t *buffer, + size_t length); +FF_VISIBILITY_POP_HIDDEN +static const AVCRC crc_table_crc32_pmull[] = { + CRC32_PMULL_LE, + 0x26b70c3d, 0x0, 0x3f41287a, 0x0, + 0xae689191, 0x0, 0xccaa009e, 0x0, + 0xf1da05aa, 0x0, 
0x81256527, 0x0, + 0x8f352d95, 0x0, 0x1d9513d7, 0x0, + 0x54442bd4, 0x1, 0xc6e41596, 0x1, + 0x751997d0, 0x1, 0xccaa009e, 0x0, + 0xccaa009e, 0x0, 0x63cd6124, 0x1, + 0xf7011640, 0x1, 0xdb710641, 0x1, +}; +#endif #endif static inline av_cold int ff_crc_init_aarch64(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size) @@ -169,13 +188,16 @@ static inline uint32_t ff_crc_aarch64(const AVCRC *ctx, uint32_t crc, { switch (ctx[0]) { #if HAVE_PMULL && HAVE_EOR3 +#if HAVE_ARM_CRC + case CRC32_PMULL_LE: return ff_crc32_pmull_eor3_aarch64(ctx, crc, buffer, length); +#endif case PMULL_BE: return ff_crc_neon_pmull(ctx, crc, buffer, length); case PMULL_LE: return ff_crc_le_neon_pmull(ctx, crc, buffer, length); #endif #if HAVE_ARM_CRC case (AV_CRC_32_IEEE_LE + 1): return ff_crc32_aarch64(ctx, crc, buffer, length); #endif - default: av_unreachable("AARCH64 has PMULL_LE, PMULL_BE and AV_CRC_32_IEEE_LE arch-specific CRC code"); + default: av_unreachable("AARCH64 has PMULL_LE, PMULL_BE, CRC32_PMULL_LE, and AV_CRC_32_IEEE_LE arch-specific CRC code"); } return 0; } @@ -185,6 +207,11 @@ static inline const AVCRC *ff_crc_get_table_aarch64(AVCRCId crc_id) int cpu_flags = av_get_cpu_flags(); #if HAVE_PMULL && HAVE_EOR3 if (have_pmull(cpu_flags) && have_eor3(cpu_flags)) { +#if HAVE_ARM_CRC + if (crc_id == AV_CRC_32_IEEE_LE && have_arm_crc(cpu_flags)) { + return crc_table_crc32_pmull; + } +#endif return crc_table_pmull[crc_id]; } #endif _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
