This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit cfa3ceac7a8cb4f0c836b938a25d1579da154ed5
Author:     Jun Zhao <[email protected]>
AuthorDate: Wed May 13 22:56:00 2026 +0800
Commit:     Jun Zhao <[email protected]>
CommitDate: Sun Jun 7 23:29:33 2026 +0000

    lavc/hevc: add aarch64 NEON for angular modes 10 and 26

    Add NEON-optimized implementations for HEVC angular intra prediction
    modes 10 (pure horizontal) and 26 (pure vertical) at 8-bit depth.

    Mode 10 (Horizontal):
    - Broadcasts left[y] to fill each row using ld2r/ld4r for efficiency
    - Applies edge smoothing for luma blocks smaller than 32x32

    Mode 26 (Vertical):
    - Copies top reference row to all output rows
    - Applies edge smoothing for luma blocks smaller than 32x32

    Edge smoothing uses uhsub+usqadd to compute the filtered result
    directly in 8-bit, avoiding widening to 16-bit intermediates.

    The C pred_angular wrappers are made non-static with ff_ prefix to
    allow the NEON dispatch to fall back to C for modes not yet optimized.
    This will be reverted once all angular modes are implemented.

    Note: since pred_angular[] is a per-size function pointer (not
    per-mode), checkasm benchmarks will show '_neon' for all 33 modes even
    though only modes 10/26 are truly accelerated; unoptimized modes show
    ~1.0x speedup as they pass through the NEON wrapper to the C fallback
    with negligible overhead.

Speedup over C on Apple M4 (checkasm --bench, 15-run average): Mode 10 (Horizontal): 4x4: 4.66x 8x8: 5.80x 16x16: 16.86x 32x32: 24.89x Mode 26 (Vertical): 4x4: 1.16x 8x8: 1.83x 16x16: 2.45x 32x32: 4.50x Signed-off-by: Jun Zhao <[email protected]> --- libavcodec/aarch64/hevcpred_init_aarch64.c | 35 ++++ libavcodec/aarch64/hevcpred_neon.S | 273 +++++++++++++++++++++++++++++ libavcodec/hevc/pred.c | 8 +- libavcodec/hevc/pred.h | 22 +++ libavcodec/hevc/pred_template.c | 24 +-- 5 files changed, 346 insertions(+), 16 deletions(-) diff --git a/libavcodec/aarch64/hevcpred_init_aarch64.c b/libavcodec/aarch64/hevcpred_init_aarch64.c index fbc27c24a4..03fc5c490e 100644 --- a/libavcodec/aarch64/hevcpred_init_aarch64.c +++ b/libavcodec/aarch64/hevcpred_init_aarch64.c @@ -67,6 +67,14 @@ void ff_hevc_ref_filter_3tap_32x32_8_neon(uint8_t *filtered_left, void ff_hevc_ref_filter_strong_8_neon(uint8_t *filtered_top, uint8_t *left, const uint8_t *top); +// Mode 10 and 26 +void ff_hevc_pred_angular_mode_10_8_neon(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride, + int c_idx, int log2_size); +void ff_hevc_pred_angular_mode_26_8_neon(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride, + int c_idx, int log2_size); + static void pred_dc_neon(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int log2_size, int c_idx) @@ -89,6 +97,28 @@ static void pred_dc_neon(uint8_t *src, const uint8_t *top, } } +#define PRED_ANGULAR_NEON(IDX, LOG2) \ +static void pred_angular_##IDX##_neon(uint8_t *src, const uint8_t *top, \ + const uint8_t *left, ptrdiff_t stride, \ + int c_idx, int mode) \ +{ \ + if (mode == 10) \ + ff_hevc_pred_angular_mode_10_8_neon(src, top, left, stride, \ + c_idx, LOG2); \ + else if (mode == 26) \ + ff_hevc_pred_angular_mode_26_8_neon(src, top, left, stride, \ + c_idx, LOG2); \ + else \ + ff_hevc_pred_angular_##IDX##_8(src, top, left, stride, c_idx, mode); \ +} + +PRED_ANGULAR_NEON(0, 2) +PRED_ANGULAR_NEON(1, 3) 
+PRED_ANGULAR_NEON(2, 4) +PRED_ANGULAR_NEON(3, 5) + +#undef PRED_ANGULAR_NEON + av_cold void ff_hevc_pred_init_aarch64(HEVCPredContext *hpc, int bit_depth) { int cpu_flags = av_get_cpu_flags(); @@ -107,5 +137,10 @@ av_cold void ff_hevc_pred_init_aarch64(HEVCPredContext *hpc, int bit_depth) hpc->ref_filter_3tap[1] = ff_hevc_ref_filter_3tap_16x16_8_neon; hpc->ref_filter_3tap[2] = ff_hevc_ref_filter_3tap_32x32_8_neon; hpc->ref_filter_strong = ff_hevc_ref_filter_strong_8_neon; + + hpc->pred_angular[0] = pred_angular_0_neon; + hpc->pred_angular[1] = pred_angular_1_neon; + hpc->pred_angular[2] = pred_angular_2_neon; + hpc->pred_angular[3] = pred_angular_3_neon; } } diff --git a/libavcodec/aarch64/hevcpred_neon.S b/libavcodec/aarch64/hevcpred_neon.S index 7566275921..f21492318c 100644 --- a/libavcodec/aarch64/hevcpred_neon.S +++ b/libavcodec/aarch64/hevcpred_neon.S @@ -1068,3 +1068,276 @@ function ff_hevc_ref_filter_strong_8_neon, export=1 endfunc .purgem strong_smooth + +// ============================================================================= +// Angular Prediction +// ============================================================================= + +// ----------------------------------------------------------------------------- +// pred_angular_mode_10_8: Horizontal prediction (mode 10) +// Caller must ensure top[-1] and left[-1] are valid (used for edge smoothing +// when c_idx == 0 and size < 32). 
+// Arguments: +// x0: src +// x1: top (only used for edge smoothing) +// x2: left +// x3: stride +// w4: c_idx +// w5: log2_size +// ----------------------------------------------------------------------------- +function ff_hevc_pred_angular_mode_10_8_neon, export=1 + cmp w5, #3 + b.lt .Lmode10_4x4 + b.eq .Lmode10_8x8 + cmp w5, #4 + b.eq .Lmode10_16x16 + + // --- size 32: 2 rows per iteration using ld2r --- + mov w7, #32 + add x8, x0, x3 // x8 = row 1 pointer + lsl x9, x3, #1 // x9 = stride * 2 +.Lmode10_32x32_row: + ld2r {v0.16b, v1.16b}, [x2], #2 + subs w7, w7, #2 + stp q0, q0, [x0] + stp q1, q1, [x8] + add x0, x0, x9 + add x8, x8, x9 + b.gt .Lmode10_32x32_row + // size 32 never does edge smoothing + ret + + // --- size 16: 2 rows per iteration using ld2r + dual pointer --- +.Lmode10_16x16: + mov x6, x0 // save src base + mov x7, x2 // save left base for edge smooth + add x8, x0, x3 // x8 = odd-row pointer + lsl x9, x3, #1 // x9 = stride * 2 + mov w10, #16 +.Lmode10_16x16_row: + ld2r {v0.16b, v1.16b}, [x2], #2 + subs w10, w10, #2 + st1 {v0.16b}, [x0], x9 + st1 {v1.16b}, [x8], x9 + b.gt .Lmode10_16x16_row + mov x2, x7 // restore left base + b .Lmode10_edge_smooth + + // --- size 8: ld4r to load 4 rows at once --- +.Lmode10_8x8: + mov x6, x0 // save src base + mov x7, x2 // save left base for edge smooth + add x8, x0, x3 // x8 = odd-row pointer + lsl x9, x3, #1 // x9 = stride * 2 + ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], #4 + st1 {v0.8b}, [x0], x9 + st1 {v1.8b}, [x8], x9 + ld4r {v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #4 + st1 {v2.8b}, [x0], x9 + st1 {v3.8b}, [x8], x9 + st1 {v4.8b}, [x0], x9 + st1 {v5.8b}, [x8], x9 + st1 {v6.8b}, [x0] + st1 {v7.8b}, [x8] + mov x2, x7 // restore left base + b .Lmode10_edge_smooth + + // --- size 4: ld4r to load all 4 rows at once --- +.Lmode10_4x4: + mov x6, x0 // save src base + ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2] + str s0, [x0] + str s1, [x0, x3] + add x0, x0, x3, lsl #1 + str s2, [x0] + str s3, [x0, x3] + +.Lmode10_edge_smooth: 
+ cbnz w4, .Lmode10_ret + + sub x7, x1, #1 // top - 1 (hoisted early) + mov x0, x6 // restore src base + + ld1r {v5.16b}, [x2] // left[0] broadcast + ld1r {v1.16b}, [x7] // top[-1] broadcast + + cmp w5, #3 + b.lt .Lmode10_smooth_4 + b.eq .Lmode10_smooth_8 + + // size 16 edge smoothing: out[x] = clip8(left[0] + (top[x] - top[-1]) / 2) + ldr q2, [x1] // top[0..15] + uhsub v2.16b, v2.16b, v1.16b // signed half-difference + usqadd v5.16b, v2.16b // sat_u8(left[0] + signed_delta) + st1 {v5.16b}, [x0] + ret + +.Lmode10_smooth_4: + ldr s2, [x1] // top[0..3] + uhsub v2.8b, v2.8b, v1.8b + usqadd v5.8b, v2.8b + st1 {v5.s}[0], [x0] + ret + +.Lmode10_smooth_8: + ldr d2, [x1] // top[0..7] + uhsub v2.8b, v2.8b, v1.8b + usqadd v5.8b, v2.8b + st1 {v5.8b}, [x0] + +.Lmode10_ret: + ret +endfunc + +// ----------------------------------------------------------------------------- +// pred_angular_mode_26_8: Vertical prediction (mode 26) +// Caller must ensure top[-1] and left[-1] are valid (used for edge smoothing +// when c_idx == 0 and size < 32). 
+// Arguments: +// x0: src +// x1: top +// x2: left (only used for edge smoothing) +// x3: stride +// w4: c_idx +// w5: log2_size +// ----------------------------------------------------------------------------- +function ff_hevc_pred_angular_mode_26_8_neon, export=1 + mov x7, x0 // x7 = write pointer (preserve x0) + + cmp w5, #3 + b.lt .Lmode26_4x4 + b.eq .Lmode26_8x8 + cmp w5, #4 + b.eq .Lmode26_16x16 + // fall-through to 32x32 + + // --- size 32 --- + ldp q0, q1, [x1] // Load top[0..31] once + mov w9, #32 +.Lmode26_32x32_row: + subs w9, w9, #1 + st1 {v0.16b, v1.16b}, [x7], x3 + b.gt .Lmode26_32x32_row + b .Lmode26_edge_smooth + + // --- size 16 --- +.Lmode26_16x16: + ldr q0, [x1] // Load top[0..15] once + mov w9, #16 +.Lmode26_16x16_row: + subs w9, w9, #1 + st1 {v0.16b}, [x7], x3 + b.gt .Lmode26_16x16_row + b .Lmode26_edge_smooth + + // --- size 8 --- +.Lmode26_8x8: + ldr d0, [x1] // Load top[0..7] once + mov w9, #8 +.Lmode26_8x8_row: + subs w9, w9, #1 + st1 {v0.8b}, [x7], x3 + b.gt .Lmode26_8x8_row + b .Lmode26_edge_smooth + + // --- size 4 --- +.Lmode26_4x4: + ldr s0, [x1] // Load top[0..3] once + mov w9, #4 +.Lmode26_4x4_row: + subs w9, w9, #1 + str s0, [x7] + add x7, x7, x3 + b.gt .Lmode26_4x4_row + +.Lmode26_edge_smooth: + cbnz w4, .Lmode26_ret + cmp w5, #5 + b.ge .Lmode26_ret + + // Edge smoothing: out[y] = clip8(top[0] + (left[y] - left[-1]) / 2) + ld1r {v5.16b}, [x1] // top[0] broadcast + sub x8, x2, #1 + ld1r {v1.16b}, [x8] // left[-1] broadcast + + cmp w5, #3 + b.lt .Lmode26_smooth_4 + b.eq .Lmode26_smooth_8 + + // size 16 + ldr q2, [x2] // left[0..15] + uhsub v2.16b, v2.16b, v1.16b // signed half-difference + usqadd v5.16b, v2.16b // sat_u8(top[0] + signed_delta) + + // Store smoothed column[0] for 16 rows using precomputed addresses + // Reordered to avoid direct dependency chains + add x10, x0, x3, lsl #1 // x10 = row 2 + add x9, x0, x3 // x9 = row 1 + add x11, x10, x3 // x11 = row 3 + st1 {v5.b}[0], [x0] + add x0, x10, x3, lsl #1 // x0 = row 4 
(after last use of old x0) + st1 {v5.b}[1], [x9] + st1 {v5.b}[2], [x10] + st1 {v5.b}[3], [x11] + add x10, x0, x3, lsl #1 // x10 = row 6 + add x9, x0, x3 // x9 = row 5 + add x11, x10, x3 // x11 = row 7 + st1 {v5.b}[4], [x0] + add x0, x10, x3, lsl #1 // x0 = row 8 + st1 {v5.b}[5], [x9] + st1 {v5.b}[6], [x10] + st1 {v5.b}[7], [x11] + add x10, x0, x3, lsl #1 // x10 = row 10 + add x9, x0, x3 // x9 = row 9 + add x11, x10, x3 // x11 = row 11 + st1 {v5.b}[8], [x0] + add x0, x10, x3, lsl #1 // x0 = row 12 + st1 {v5.b}[9], [x9] + st1 {v5.b}[10], [x10] + st1 {v5.b}[11], [x11] + add x10, x0, x3, lsl #1 // x10 = row 14 + add x9, x0, x3 // x9 = row 13 + add x11, x10, x3 // x11 = row 15 + st1 {v5.b}[12], [x0] + st1 {v5.b}[13], [x9] + st1 {v5.b}[14], [x10] + st1 {v5.b}[15], [x11] + b .Lmode26_ret + +.Lmode26_smooth_4: + ldr s2, [x2] // left[0..3] + uhsub v2.8b, v2.8b, v1.8b + usqadd v5.8b, v2.8b + add x10, x0, x3, lsl #1 + add x9, x0, x3 + add x11, x10, x3 + st1 {v5.b}[0], [x0] + st1 {v5.b}[1], [x9] + st1 {v5.b}[2], [x10] + st1 {v5.b}[3], [x11] + b .Lmode26_ret + +.Lmode26_smooth_8: + ldr d2, [x2] // left[0..7] + uhsub v2.8b, v2.8b, v1.8b + usqadd v5.8b, v2.8b + add x10, x0, x3, lsl #1 // x10 = row 2 + add x9, x0, x3 // x9 = row 1 + add x11, x10, x3 // x11 = row 3 + st1 {v5.b}[0], [x0] + add x0, x10, x3, lsl #1 // x0 = row 4 + st1 {v5.b}[1], [x9] + st1 {v5.b}[2], [x10] + st1 {v5.b}[3], [x11] + add x10, x0, x3, lsl #1 + add x9, x0, x3 + add x11, x10, x3 + st1 {v5.b}[4], [x0] + st1 {v5.b}[5], [x9] + st1 {v5.b}[6], [x10] + st1 {v5.b}[7], [x11] + b .Lmode26_ret + +.Lmode26_ret: + ret +endfunc diff --git a/libavcodec/hevc/pred.c b/libavcodec/hevc/pred.c index 480b1154e6..f8131b1e8c 100644 --- a/libavcodec/hevc/pred.c +++ b/libavcodec/hevc/pred.c @@ -55,10 +55,10 @@ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth) hpc->pred_planar[2] = FUNC(pred_planar_2, depth); \ hpc->pred_planar[3] = FUNC(pred_planar_3, depth); \ hpc->pred_dc = FUNC(pred_dc, depth); \ - 
hpc->pred_angular[0] = FUNC(pred_angular_0, depth); \ - hpc->pred_angular[1] = FUNC(pred_angular_1, depth); \ - hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \ - hpc->pred_angular[3] = FUNC(pred_angular_3, depth); \ + hpc->pred_angular[0] = FUNC(ff_hevc_pred_angular_0, depth); \ + hpc->pred_angular[1] = FUNC(ff_hevc_pred_angular_1, depth); \ + hpc->pred_angular[2] = FUNC(ff_hevc_pred_angular_2, depth); \ + hpc->pred_angular[3] = FUNC(ff_hevc_pred_angular_3, depth); \ hpc->ref_filter_3tap[0] = FUNC(ref_filter_3tap, depth); \ hpc->ref_filter_3tap[1] = FUNC(ref_filter_3tap, depth); \ hpc->ref_filter_3tap[2] = FUNC(ref_filter_3tap, depth); \ diff --git a/libavcodec/hevc/pred.h b/libavcodec/hevc/pred.h index 69e2d84d2b..849806fefb 100644 --- a/libavcodec/hevc/pred.h +++ b/libavcodec/hevc/pred.h @@ -52,4 +52,26 @@ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth); void ff_hevc_pred_init_mips(HEVCPredContext *hpc, int bit_depth); void ff_hevc_pred_init_aarch64(HEVCPredContext *hpc, int bit_depth); +/* C angular prediction fallbacks (non-static for arch-specific partial override) */ +#define HEVC_PRED_ANGULAR_DECL(depth) \ +void ff_hevc_pred_angular_0_ ## depth(uint8_t *src, const uint8_t *top, \ + const uint8_t *left, ptrdiff_t stride, \ + int c_idx, int mode); \ +void ff_hevc_pred_angular_1_ ## depth(uint8_t *src, const uint8_t *top, \ + const uint8_t *left, ptrdiff_t stride, \ + int c_idx, int mode); \ +void ff_hevc_pred_angular_2_ ## depth(uint8_t *src, const uint8_t *top, \ + const uint8_t *left, ptrdiff_t stride, \ + int c_idx, int mode); \ +void ff_hevc_pred_angular_3_ ## depth(uint8_t *src, const uint8_t *top, \ + const uint8_t *left, ptrdiff_t stride, \ + int c_idx, int mode); + +HEVC_PRED_ANGULAR_DECL(8) +HEVC_PRED_ANGULAR_DECL(9) +HEVC_PRED_ANGULAR_DECL(10) +HEVC_PRED_ANGULAR_DECL(12) + +#undef HEVC_PRED_ANGULAR_DECL + #endif /* AVCODEC_HEVC_PRED_H */ diff --git a/libavcodec/hevc/pred_template.c b/libavcodec/hevc/pred_template.c index 
e6069fd267..6f2d934a7b 100644 --- a/libavcodec/hevc/pred_template.c +++ b/libavcodec/hevc/pred_template.c @@ -542,30 +542,30 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, } } -static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top, - const uint8_t *left, - ptrdiff_t stride, int c_idx, int mode) +void FUNC(ff_hevc_pred_angular_0)(uint8_t *src, const uint8_t *top, + const uint8_t *left, + ptrdiff_t stride, int c_idx, int mode) { FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 2); } -static void FUNC(pred_angular_1)(uint8_t *src, const uint8_t *top, - const uint8_t *left, - ptrdiff_t stride, int c_idx, int mode) +void FUNC(ff_hevc_pred_angular_1)(uint8_t *src, const uint8_t *top, + const uint8_t *left, + ptrdiff_t stride, int c_idx, int mode) { FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 3); } -static void FUNC(pred_angular_2)(uint8_t *src, const uint8_t *top, - const uint8_t *left, - ptrdiff_t stride, int c_idx, int mode) +void FUNC(ff_hevc_pred_angular_2)(uint8_t *src, const uint8_t *top, + const uint8_t *left, + ptrdiff_t stride, int c_idx, int mode) { FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 4); } -static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top, - const uint8_t *left, - ptrdiff_t stride, int c_idx, int mode) +void FUNC(ff_hevc_pred_angular_3)(uint8_t *src, const uint8_t *top, + const uint8_t *left, + ptrdiff_t stride, int c_idx, int mode) { FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 5); } _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
