This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit d9e2239f3c5e17d908db0b79f68ff06ee1867828
Author:     DROOdotFOO <[email protected]>
AuthorDate: Sat May 30 00:38:39 2026 +0200
Commit:     Ramiro Polla <[email protected]>
CommitDate: Sat Jun 6 19:38:40 2026 +0200

    swscale/aarch64/yuv2rgb_neon: 2 lines at a time, yuva420p

    Alpha is full resolution, so each row loads its own 16 alpha bytes via
    process_row's \rsrcA arg.

    Test Name                 A55-gcc             M1-clang         A76-gcc
    ----------------------------------------------------------------------------------------
    yuva420p_to_argb_neon     22607.6 (1.16x)     39.2 (1.24x)     13631.6 (1.12x)
    yuva420p_to_rgba_neon     22608.2 (1.16x)     38.3 (1.21x)     13912.8 (1.12x)
    yuva420p_to_abgr_neon     23074.6 (1.16x)     38.8 (1.22x)     14492.1 (1.08x)
    yuva420p_to_bgra_neon     23079.7 (1.16x)     39.9 (1.19x)     14472.6 (1.08x)

    Co-authored-by: Ramiro Polla <[email protected]>
    Signed-off-by: DROOdotFOO <[email protected]>
---
 libswscale/aarch64/yuv2rgb_neon.S | 128 ++++++++++++++++++++++++++++++++++----
 1 file changed, 115 insertions(+), 13 deletions(-)

diff --git a/libswscale/aarch64/yuv2rgb_neon.S b/libswscale/aarch64/yuv2rgb_neon.S
index 22cbeb8404..19e0f1d6a3 100644
--- a/libswscale/aarch64/yuv2rgb_neon.S
+++ b/libswscale/aarch64/yuv2rgb_neon.S
@@ -102,6 +102,10 @@
 // AAPCS callee-saved range and the 2-line gbrp prologue spills them.
 #define l2_dst1 x19
 #define l2_dst2 x20
+// yuva420p 2-line carries a per-row alpha pointer (alpha is full
+// resolution -- each output row reads its own 16 bytes). x14 is free
+// for the yuva packed variants (no planar gbrp dst there).
+#define l2_srcA x14

 // --------------------------------------------------------------------
 // Source-side argument unpacking.
@@ -204,6 +208,27 @@ sub srcPaddingV, srcPaddingV, widthx, lsr #1 .endm +.macro src_load_args_yuva420p_2l + ldp srcPaddingYw, srcPaddingUw, [x6] + ldr srcPaddingVw, [x6, #8] + ldr srcPaddingAw, [x6, #12] // srcStride[3] + ldp srcY, srcU, [x5] + ldr srcV, [x5, #16] + ldr srcA, [x5, #24] // src[3] + sxtw srcPaddingY, srcPaddingYw + sxtw srcPaddingU, srcPaddingUw + sxtw srcPaddingV, srcPaddingVw + sxtw srcPaddingA, srcPaddingAw + add l2_srcY, srcY, srcPaddingY // l2_srcY = srcY + linesizeY + add l2_srcA, srcA, srcPaddingA // l2_srcA = srcA + linesizeA + lsl srcPaddingY, srcPaddingY, #1 + lsl srcPaddingA, srcPaddingA, #1 + sub srcPaddingY, srcPaddingY, widthx // = 2*linesizeY - width + sub srcPaddingU, srcPaddingU, widthx, lsr #1 + sub srcPaddingV, srcPaddingV, widthx, lsr #1 + sub srcPaddingA, srcPaddingA, widthx // = 2*linesizeA - width +.endm + // -------------------------------------------------------------------- // Destination-side argument unpacking. @@ -451,6 +476,15 @@ add srcV, srcV, srcPaddingV .endm +.macro src_increment_yuva420p_2l + add srcY, srcY, srcPaddingY + add l2_srcY, l2_srcY, srcPaddingY + add srcU, srcU, srcPaddingU + add srcV, srcV, srcPaddingV + add srcA, srcA, srcPaddingA + add l2_srcA, l2_srcA, srcPaddingA +.endm + .macro dst_increment_packed_2l add dst0, dst0, dstPadding0 add l2_dst0, l2_dst0, dstPadding0 @@ -526,37 +560,60 @@ sqdmulh v27.8h, v27.8h, v0.8h // (Y2 * y_coeff) >> 15 .endm -// Process one output row for the 2-lines path: load 16 luma px from \rsrcY, -// combine with the shared chroma offsets in v20-v25, and store 16 px in -// format \ofmt. Packed callers pass the same dst three times; rsrcA, rdst1, -// rdst2 are reserved for the gbrp/yuva extensions added in later commits. +// Process one output row: load 16 luma px from \rsrcY, combine with the +// shared chroma offsets in v20-v25, and store 16 px in format \ofmt. +// Packed callers pass the same dst three times. 
.macro process_row ifmt, ofmt, rsrcY, rsrcA, rdst0, rdst1, rdst2 + set_rgb16_predicates \ofmt load_luma \rsrcY +.ifc \ifmt,yuva420p + ld1 {v28.8b, v29.8b}, [\rsrcA], #16 // 16 alpha bytes +.endif .ifc \ofmt,argb // a r g b compute_rgb v5, v6, v7, v17, v18, v19 + .ifc \ifmt,yuva420p + mov v4.8b, v28.8b + mov v16.8b, v29.8b + .else mov v4.8b, v30.8b mov v16.8b, v30.8b + .endif st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [\rdst0], #32 st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [\rdst0], #32 .endif .ifc \ofmt,rgba // r g b a compute_rgb v4, v5, v6, v16, v17, v18 + .ifc \ifmt,yuva420p + mov v7.8b, v28.8b + mov v19.8b, v29.8b + .else mov v7.8b, v30.8b mov v19.8b, v30.8b + .endif st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [\rdst0], #32 st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [\rdst0], #32 .endif .ifc \ofmt,abgr // a b g r compute_rgb v7, v6, v5, v19, v18, v17 + .ifc \ifmt,yuva420p + mov v4.8b, v28.8b + mov v16.8b, v29.8b + .else mov v4.8b, v30.8b mov v16.8b, v30.8b + .endif st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [\rdst0], #32 st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [\rdst0], #32 .endif .ifc \ofmt,bgra // b g r a compute_rgb v6, v5, v4, v18, v17, v16 + .ifc \ifmt,yuva420p + mov v7.8b, v28.8b + mov v19.8b, v29.8b + .else mov v7.8b, v30.8b mov v19.8b, v30.8b + .endif st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [\rdst0], #32 st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [\rdst0], #32 .endif @@ -630,13 +687,15 @@ // 16bpp packing uses v8/v9 as the accumulator. AAPCS-64 requires d8/d9 // callee-saved (low 64 bits of v8/v9); other ofmts don't touch v8-v15, // so the spill is gated on rgb16. -.macro save_d8_d9_if_16bpp +.macro save_d8_d9_if_16bpp ofmt + set_rgb16_predicates \ofmt .if rgb16 stp d8, d9, [sp, #-0x10]! 
.endif .endm -.macro restore_d8_d9_if_16bpp +.macro restore_d8_d9_if_16bpp ofmt + set_rgb16_predicates \ofmt .if rgb16 ldp d8, d9, [sp], #0x10 .endif @@ -678,14 +737,13 @@ .macro declare_func ifmt ofmt function ff_\ifmt\()_to_\ofmt\()_neon, export=1 - set_rgb16_predicates \ofmt uxtw widthx, width // ensure upper 32 bits of widthx are zero dup v3.8h, y_offset // broadcast y_offset before w2 is reused dup v0.8h, y_coeff // broadcast y_coeff before w3 is reused ld1 {v1.1d}, [table_ptr] // load yuv2rgb_table before x4 is reused src_load_args_\ifmt dst_load_args_\ofmt - save_d8_d9_if_16bpp + save_d8_d9_if_16bpp \ofmt movi v31.8h, #4, lsl #8 // 128 * (1<<3) (loop-invariant) movi v30.8b, #255 // alpha = 255 (loop-invariant) @@ -797,7 +855,7 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1 subs height, height, #1 // height -= 1 b.gt 1b mov w0, orig_height // return orig_height - restore_d8_d9_if_16bpp + restore_d8_d9_if_16bpp \ofmt ret endfunc .endm @@ -824,14 +882,13 @@ endfunc // subsampled sources (chrSrcVSubSample > 0). .macro declare_2l_packed ifmt ofmt function ff_\ifmt\()_to_\ofmt\()_neon, export=1 - set_rgb16_predicates \ofmt uxtw widthx, width dup v3.8h, y_offset dup v0.8h, y_coeff ld1 {v1.1d}, [table_ptr] src_load_args_\ifmt\()_2l dst_load_args_\ofmt\()_2l - save_d8_d9_if_16bpp + save_d8_d9_if_16bpp \ofmt movi v31.8h, #4, lsl #8 // 128 * (1<<3) (loop-invariant) movi v30.8b, #255 // alpha = 255 (loop-invariant) @@ -850,7 +907,7 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1 subs height, height, #2 b.gt 1b mov w0, orig_height - restore_d8_d9_if_16bpp + restore_d8_d9_if_16bpp \ofmt ret endfunc .endm @@ -940,4 +997,49 @@ declare_rgb16_funcs yuv422p declare_func \ifmt, bgra .endm -declare_yuva_funcs yuva420p +// 2-lines-at-a-time path for yuva420p -> {argb,rgba,abgr,bgra}. 
Chroma +// is vertically subsampled and shared between the two output rows; the +// alpha plane is full resolution, so each row loads its own 16 alpha +// bytes via process_row's \rsrcA arg (srcA / l2_srcA). The constant +// alpha (v30) is never read in this path, so its prologue movi is +// omitted. +.macro declare_2l_yuva ofmt +.ifc \ofmt,gbrp + .error "yuva420p->gbrp is dispatched through the yuv420p path (gbrp has no alpha channel)" +.endif +function ff_yuva420p_to_\ofmt\()_neon, export=1 + uxtw widthx, width + dup v3.8h, y_offset + dup v0.8h, y_coeff + ld1 {v1.1d}, [table_ptr] + src_load_args_yuva420p_2l + dst_load_args_\ofmt\()_2l + + movi v31.8h, #4, lsl #8 // 128 * (1<<3) (loop-invariant) + mov orig_height, height +1: + mov cur_width, width +2: + load_chroma_yuv420p + chroma_to_rgb_offsets + process_row yuva420p, \ofmt, srcY, srcA, dst0, dst0, dst0 + process_row yuva420p, \ofmt, l2_srcY, l2_srcA, l2_dst0, l2_dst0, l2_dst0 + subs cur_width, cur_width, #16 + b.gt 2b + dst_increment_packed_2l + src_increment_yuva420p_2l + subs height, height, #2 + b.gt 1b + mov w0, orig_height + ret +endfunc +.endm + +.macro declare_yuva_funcs_2l + declare_2l_yuva argb + declare_2l_yuva rgba + declare_2l_yuva abgr + declare_2l_yuva bgra +.endm + +declare_yuva_funcs_2l _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
