PR #23272 opened by DROO AMOR (DROOdotFOO) URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23272 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23272.patch
# swscale/aarch64/yuv2rgb_neon: 2 lines at a time This is the 2-lines-at-a-time follow-up discussed on #23152. For vertically-subsampled inputs (nv12, nv21, yuv420p, yuva420p) two output rows share a chroma row, so the chroma->RGB offsets are computed once and applied to both rows instead of being recomputed per row. yuv422p is unchanged (full-height chroma, nothing to reuse). The mechanism is Ramiro Polla's, from his unsent follow-up to 2e142e52ae. **Result:** ~14-24% fewer NEON cycles at width=1920 on Apple M1 across the converted formats (yuv422p within +-2%). ### Testing - `checkasm --test=sw_yuv2rgb`: 110/110 - full `checkasm`: 7657/7657 - `make fate`: clean - `tools/patcheck`: no new warnings CPU: Apple M1 (`sysctl -n machdep.cpu.brand_string`). Co-authored-by: Ramiro Polla <[email protected]> From cd986c2e37b87dfdfcee60c22db24620a9261674 Mon Sep 17 00:00:00 2001 From: DROOdotFOO <[email protected]> Date: Fri, 29 May 2026 18:43:17 +0200 Subject: [PATCH] swscale/aarch64/yuv2rgb_neon: 2 lines at a time For the vertically-subsampled inputs (nv12, nv21, yuv420p, yuva420p) two output rows share one chroma row. Process both rows in the inner loop and derive the chroma->RGB offsets once, instead of the single-row path that rewinds the chroma pointer to recompute them for every row. yuv422p keeps the single-row path; its chroma is full height, so there is nothing to reuse. yuva420p loads its full-resolution alpha once per row. The mechanism is from Ramiro Polla's unsent follow-up to 2e142e52ae. NEON cycles, Apple M1, width=1920 (checkasm --bench), single-row -> two-row: | input | argb | rgb24 | gbrp | rgb565le | |---------|------------|------------|------------|------------| | yuv420p | 43.6->35.2 | 38.2->30.2 | 36.0->28.3 | 49.3->41.8 | | nv12 | 45.3->36.7 | 38.7->30.6 | 38.0->29.8 | 52.8->42.8 | | nv21 | 45.4->36.3 | 40.1->30.6 | 38.4->30.1 | 50.7->43.2 | yuva420p packed output falls in the same range (argb 44.0->35.7); its rgb24/gbrp/16bpp conversions reuse the yuv420p path. 
~14-24% fewer cycles across the converted formats; yuv422p (unchanged) stays within +-2%. Verified with checkasm --test=sw_yuv2rgb (110/110) and the full checkasm regression (7657/7657) on Apple M1. Co-authored-by: Ramiro Polla <[email protected]> Signed-off-by: DROOdotFOO <[email protected]> --- libswscale/aarch64/yuv2rgb_neon.S | 580 ++++++++++++++++++++++-------- 1 file changed, 434 insertions(+), 146 deletions(-) diff --git a/libswscale/aarch64/yuv2rgb_neon.S b/libswscale/aarch64/yuv2rgb_neon.S index 484d630998..7602cf428d 100644 --- a/libswscale/aarch64/yuv2rgb_neon.S +++ b/libswscale/aarch64/yuv2rgb_neon.S @@ -45,88 +45,6 @@ sub w16, w16, w0 // w16 = linesize2 - width (padding2) .endm -.macro load_args_nv12 ofmt - ldr x8, [sp] // table - load_yoff_ycoeff 8, 16 // y_offset, y_coeff - ld1 {v1.1d}, [x8] - dup v0.8h, w10 - dup v3.8h, w9 -.ifc \ofmt,gbrp - load_dst1_dst2 24, 32, 40, 48 - sub w3, w3, w0 // w3 = linesize - width (padding) -.else - .ifc \ofmt,rgb24 - add w17, w0, w0, lsl #1 - sub w3, w3, w17 // w3 = linesize - width * 3 (padding) - .else - .ifc \ofmt,bgr24 - add w17, w0, w0, lsl #1 - sub w3, w3, w17 // w3 = linesize - width * 3 (padding) - .else - .if rgb16 - sub w3, w3, w0, lsl #1 // w3 = linesize - width * 2 (padding) - .else - sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding) - .endif - .endif - .endif -.endif - sub w5, w5, w0 // w5 = linesizeY - width (paddingY) - sub w7, w7, w0 // w7 = linesizeC - width (paddingC) - neg w11, w0 -.endm - -.macro load_args_nv21 ofmt - load_args_nv12 \ofmt -.endm - -.macro load_args_yuv420p ofmt - ldr x13, [sp] // srcV - ldr w14, [sp, #8] // linesizeV - ldr x8, [sp, #16] // table - load_yoff_ycoeff 24, 32 // y_offset, y_coeff - ld1 {v1.1d}, [x8] - dup v0.8h, w10 - dup v3.8h, w9 -.ifc \ofmt,gbrp - load_dst1_dst2 40, 48, 56, 64 - sub w3, w3, w0 // w3 = linesize - width (padding) -.else - .ifc \ofmt,rgb24 - add w17, w0, w0, lsl #1 - sub w3, w3, w17 // w3 = linesize - width * 3 (padding) - .else - .ifc 
\ofmt,bgr24 - add w17, w0, w0, lsl #1 - sub w3, w3, w17 // w3 = linesize - width * 3 (padding) - .else - .if rgb16 - sub w3, w3, w0, lsl #1 // w3 = linesize - width * 2 (padding) - .else - sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding) - .endif - .endif - .endif -.endif - sub w5, w5, w0 // w5 = linesizeY - width (paddingY) - sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU) - sub w14, w14, w0, lsr #1 // w14 = linesizeV - width / 2 (paddingV) - lsr w11, w0, #1 - neg w11, w11 -.endm - -.macro load_args_yuva420p ofmt - load_args_yuv420p \ofmt -#if defined(__APPLE__) - ldr x15, [sp, #32] // srcA - ldr w16, [sp, #40] // linesizeA -#else - ldr x15, [sp, #40] // srcA - ldr w16, [sp, #48] // linesizeA -#endif - sub w16, w16, w0 // w16 = linesizeA - width (paddingA) -.endm - .macro load_args_yuv422p ofmt ldr x13, [sp] // srcV ldr w14, [sp, #8] // linesizeV @@ -179,37 +97,10 @@ ushll v19.8h, v17.8b, #3 .endm -.macro load_chroma_yuva420p - load_chroma_yuv420p -.endm - .macro load_chroma_yuv422p load_chroma_yuv420p .endm -.macro increment_nv12 - ands w17, w1, #1 - csel w17, w7, w11, ne // incC = (h & 1) ? paddincC : -width - add x6, x6, w17, sxtw // srcC += incC -.endm - -.macro increment_nv21 - increment_nv12 -.endm - -.macro increment_yuv420p - ands w17, w1, #1 - csel w17, w7, w11, ne // incU = (h & 1) ? paddincU : -width/2 - add x6, x6, w17, sxtw // srcU += incU - csel w17, w14, w11, ne // incV = (h & 1) ? 
paddincV : -width/2 - add x13, x13, w17, sxtw // srcV += incV -.endm - -.macro increment_yuva420p - increment_yuv420p - add x15, x15, w16, sxtw // srcA += paddingA (every row) -.endm - .macro increment_yuv422p add x6, x6, w7, sxtw // srcU += incU add x13, x13, w14, sxtw // srcV += incV @@ -236,10 +127,144 @@ mov \a2, v30.8b .endm -.macro compute_rgba_alpha r1 g1 b1 a1 r2 g2 b2 a2 - compute_rgb \r1, \g1, \b1, \r2, \g2, \b2 - mov \a1, v28.8b // real alpha (first 8 pixels) - mov \a2, v29.8b // real alpha (next 8 pixels) +// Chroma-preserving variant of compute_rgb for the 2-lines-at-a-time +// path: the per-luma sums are written into the destination registers +// instead of v20-v25, so the chroma contributions in v20-v25 survive to +// be reused for the second luma row. Args are bare reg names (e.g. v5). +.macro compute_rgb_2l r1 g1 b1 r2 g2 b2 + add \r1\().8h, v26.8h, v20.8h // Y1 + R1 + add \r2\().8h, v27.8h, v21.8h // Y2 + R2 + add \g1\().8h, v26.8h, v22.8h // Y1 + G1 + add \g2\().8h, v27.8h, v23.8h // Y2 + G2 + add \b1\().8h, v26.8h, v24.8h // Y1 + B1 + add \b2\().8h, v27.8h, v25.8h // Y2 + B2 + sqrshrun \r1\().8b, \r1\().8h, #1 // clip_u8((Y1 + R1) >> 1) + sqrshrun \r2\().8b, \r2\().8h, #1 // clip_u8((Y2 + R2) >> 1) + sqrshrun \g1\().8b, \g1\().8h, #1 // clip_u8((Y1 + G1) >> 1) + sqrshrun \g2\().8b, \g2\().8h, #1 // clip_u8((Y2 + G2) >> 1) + sqrshrun \b1\().8b, \b1\().8h, #1 // clip_u8((Y1 + B1) >> 1) + sqrshrun \b2\().8b, \b2\().8h, #1 // clip_u8((Y2 + B2) >> 1) +.endm + +// Shared chroma -> RGB offsets for the 2-lines path. Consumes the widened +// chroma in v18/v19 (set by load_chroma_<ifmt>) and produces the per-channel +// chroma contributions in v20-v25 (R1,R2,G1,G2,B1,B2). Computed once per +// pixel column and reused by both luma rows via compute_rgb_2l. 
+.macro chroma_to_rgb_offsets + sub v18.8h, v18.8h, v31.8h // U*(1<<3) - 128*(1<<3) + sub v19.8h, v19.8h, v31.8h // V*(1<<3) - 128*(1<<3) + sqdmulh v20.8h, v19.8h, v1.h[0] // V * v2r (R) + sqdmulh v22.8h, v18.8h, v1.h[1] // U * u2g + sqdmulh v19.8h, v19.8h, v1.h[2] // V * v2g + sqdmulh v24.8h, v18.8h, v1.h[3] // U * u2b (B) + add v22.8h, v22.8h, v19.8h // U * u2g + V * v2g (G) + zip2 v21.8h, v20.8h, v20.8h // R2 + zip1 v20.8h, v20.8h, v20.8h // R1 + zip2 v23.8h, v22.8h, v22.8h // G2 + zip1 v22.8h, v22.8h, v22.8h // G1 + zip2 v25.8h, v24.8h, v24.8h // B2 + zip1 v24.8h, v24.8h, v24.8h // B1 +.endm + +// Load and scale 16 luma samples from \rsrcY into v26 (Y1) / v27 (Y2), +// ready for compute_rgb_2l. v0 = y_coeff, v3 = y_offset (loop-invariant). +.macro load_luma rsrcY + ld1 {v2.16b}, [\rsrcY], #16 // load luma + ushll v26.8h, v2.8b, #3 // Y1*(1<<3) + ushll2 v27.8h, v2.16b, #3 // Y2*(1<<3) + sub v26.8h, v26.8h, v3.8h // Y1*(1<<3) - y_offset + sub v27.8h, v27.8h, v3.8h // Y2*(1<<3) - y_offset + sqdmulh v26.8h, v26.8h, v0.8h // (Y1 * y_coeff) >> 15 + sqdmulh v27.8h, v27.8h, v0.8h // (Y2 * y_coeff) >> 15 +.endm + +// Process one output row for the 2-lines path: load 16 luma px from \rsrcY, +// combine with the shared chroma offsets (v20-v25), and store 16 px in +// format \ofmt. Packed/16bpp use \rdst0 only; gbrp uses \rdst0/1/2; yuva420p +// reads per-row alpha from \rsrcA. v20-v25 are preserved for the next row. +// The .if rgb16 / r_first / gshift / hshift branch below depends on the +// rgb16 predicates -- the caller MUST run set_rgb16_predicates \ofmt before +// invoking this macro (every declare_2l_* path does so on entry). 
+.macro process_row ifmt, ofmt, rsrcY, rsrcA, rdst0, rdst1, rdst2 + load_luma \rsrcY +.ifc \ifmt,yuva420p + ld1 {v28.8b, v29.8b}, [\rsrcA], #16 // 16 alpha bytes +.endif +.ifc \ofmt,argb // a r g b + compute_rgb_2l v5, v6, v7, v17, v18, v19 + .ifc \ifmt,yuva420p + mov v4.8b, v28.8b + mov v16.8b, v29.8b + .else + mov v4.8b, v30.8b + mov v16.8b, v30.8b + .endif + st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [\rdst0], #32 + st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [\rdst0], #32 +.endif +.ifc \ofmt,rgba // r g b a + compute_rgb_2l v4, v5, v6, v16, v17, v18 + .ifc \ifmt,yuva420p + mov v7.8b, v28.8b + mov v19.8b, v29.8b + .else + mov v7.8b, v30.8b + mov v19.8b, v30.8b + .endif + st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [\rdst0], #32 + st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [\rdst0], #32 +.endif +.ifc \ofmt,abgr // a b g r + compute_rgb_2l v7, v6, v5, v19, v18, v17 + .ifc \ifmt,yuva420p + mov v4.8b, v28.8b + mov v16.8b, v29.8b + .else + mov v4.8b, v30.8b + mov v16.8b, v30.8b + .endif + st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [\rdst0], #32 + st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [\rdst0], #32 +.endif +.ifc \ofmt,bgra // b g r a + compute_rgb_2l v6, v5, v4, v18, v17, v16 + .ifc \ifmt,yuva420p + mov v7.8b, v28.8b + mov v19.8b, v29.8b + .else + mov v7.8b, v30.8b + mov v19.8b, v30.8b + .endif + st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [\rdst0], #32 + st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [\rdst0], #32 +.endif +.ifc \ofmt,rgb24 + compute_rgb_2l v4, v5, v6, v16, v17, v18 + st3 { v4.8b, v5.8b, v6.8b}, [\rdst0], #24 + st3 {v16.8b,v17.8b,v18.8b}, [\rdst0], #24 +.endif +.ifc \ofmt,bgr24 + compute_rgb_2l v6, v5, v4, v18, v17, v16 + st3 { v4.8b, v5.8b, v6.8b}, [\rdst0], #24 + st3 {v16.8b,v17.8b,v18.8b}, [\rdst0], #24 +.endif +.ifc \ofmt,gbrp + compute_rgb_2l v18, v4, v6, v19, v5, v7 + st1 { v4.8b, v5.8b }, [\rdst0], #16 + st1 { v6.8b, v7.8b }, [\rdst1], #16 + st1 { v18.8b, v19.8b }, [\rdst2], #16 +.endif +.if rgb16 + compute_rgb_2l v4, v5, v6, v16, v17, v18 + .if r_first + pack_rgb16_2l v8, v6, v5, v4, gshift, hshift + 
pack_rgb16_2l v9, v18, v17, v16, gshift, hshift + .else + pack_rgb16_2l v8, v4, v5, v6, gshift, hshift + pack_rgb16_2l v9, v16, v17, v18, gshift, hshift + .endif + st1 { v8.8h, v9.8h}, [\rdst0], #32 +.endif .endm // Map ofmt to .set predicates: rgb16=1 for the four 16bpp LE ofmts @@ -309,6 +334,23 @@ sli \dst\().8h, v23.8h, #\high_shl .endm +// As pack_rgb16 but uses v26-v29 as scratch (luma temps, dead after +// compute_rgb_2l) instead of v20-v23, so the shared chroma contributions +// in v20-v25 survive for the second luma row. Clobbers v26-v29. +// NB: v28/v29 also hold the per-row alpha in the yuva420p path, so this is +// only safe because yuva420p never reaches a 16bpp output (its 16bpp targets +// are dispatched through the yuv420p path); do not call this for yuva inputs. +.macro pack_rgb16_2l dst, low_ch, mid_ch, high_ch, g_shr, high_shl + ushr v26.8b, \high_ch\().8b, #3 + ushr v27.8b, \mid_ch\().8b, #\g_shr + ushr v28.8b, \low_ch\().8b, #3 + uxtl \dst\().8h, v28.8b + uxtl v29.8h, v27.8b + sli \dst\().8h, v29.8h, #5 + uxtl v29.8h, v26.8b + sli \dst\().8h, v29.8h, #\high_shl +.endm + .macro declare_func ifmt ofmt function ff_\ifmt\()_to_\ofmt\()_neon, export=1 set_rgb16_predicates \ofmt @@ -327,9 +369,6 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1 sqdmulh v20.8h, v19.8h, v1.h[0] // V * v2r (R) sqdmulh v22.8h, v18.8h, v1.h[1] // U * u2g ld1 {v2.16b}, [x4], #16 // load luma (interleaved) -.ifc \ifmt,yuva420p - ld1 {v28.8b, v29.8b}, [x15], #16 // load 16 alpha bytes -.endif sqdmulh v19.8h, v19.8h, v1.h[2] // V * v2g sqdmulh v24.8h, v18.8h, v1.h[3] // U * u2b (B) ushll v26.8h, v2.8b, #3 // Y1*(1<<3) @@ -347,35 +386,19 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1 zip1 v24.8h, v24.8h, v24.8h // B1 .ifc \ofmt,argb // 1 2 3 0 - .ifc \ifmt,yuva420p - compute_rgba_alpha v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b - .else compute_rgba v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b - .endif .endif .ifc \ofmt,rgba // 0 1 2 3 - .ifc 
\ifmt,yuva420p - compute_rgba_alpha v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b - .else compute_rgba v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b - .endif .endif .ifc \ofmt,abgr // 3 2 1 0 - .ifc \ifmt,yuva420p - compute_rgba_alpha v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b - .else compute_rgba v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b - .endif .endif .ifc \ofmt,bgra // 2 1 0 3 - .ifc \ifmt,yuva420p - compute_rgba_alpha v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b - .else compute_rgba v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b - .endif .endif .ifc \ofmt,rgb24 @@ -440,9 +463,274 @@ endfunc declare_func \ifmt, bgr24 .endm -declare_rgb_funcs nv12 -declare_rgb_funcs nv21 -declare_rgb_funcs yuv420p +// 2-lines-at-a-time path for the single-dst-pointer output formats +// (argb/rgba/abgr/bgra, rgb24/bgr24, and the four 16bpp LE forms) with +// vertically-subsampled inputs (nv12/nv21/yuv420p). Two consecutive output +// rows share one chroma row, so the chroma -> RGB offsets (v20-v25) are +// computed once and applied to both luma rows, halving the chroma work +// relative to the single-row declare_func (which rewinds the chroma pointer +// to re-derive it per row). ABI is identical to the function it replaces. +// Co-authored with Ramiro Polla's unsent series (PR #23152, item #5). +// Uses caller-saved scratch only (x9=srcY2, x12=dst2); 16bpp additionally +// spills d8/d9 (callee-saved) as the legacy path does. +// Precondition (shared by all declare_2l_* paths): the slice height is even. +// These converters only run for vertically-subsampled (4:2:0) sources, where +// an even slice height is a libswscale contract -- a chroma row is shared by +// two luma rows, so a slice cannot end mid-pair. The dispatch also gates on +// !(src_h & 1). The single-row declare_func relies on the same invariant for +// its chroma pairing; the 2-line loop simply consumes it two rows at a time +// (height -= 2) with no odd-row tail. 
+.macro declare_2l_packed ifmt ofmt +function ff_\ifmt\()_to_\ofmt\()_neon, export=1 + set_rgb16_predicates \ofmt +.ifc \ifmt,yuv420p + ldr x13, [sp] // srcV + ldr w14, [sp, #8] // linesizeV + ldr x8, [sp, #16] // table + load_yoff_ycoeff 24, 32 // y_offset, y_coeff +.else + ldr x8, [sp] // table + load_yoff_ycoeff 8, 16 // y_offset, y_coeff +.endif + ld1 {v1.1d}, [x8] + dup v0.8h, w10 // y_coeff + dup v3.8h, w9 // y_offset + save_d8_d9_if_16bpp + add x9, x4, w5, sxtw // srcY2 = srcY + linesizeY + add x12, x2, w3, sxtw // dst2 = dst + linesize + lsl w17, w5, #1 + sub w5, w17, w0 // srcY pair stride = 2*linesizeY - width + lsl w17, w3, #1 +.if rgb16 + sub w3, w17, w0, lsl #1 // dst pair stride = 2*linesize - width*2 +.else + .ifc \ofmt,rgb24 + sub w3, w17, w0 + sub w3, w3, w0, lsl #1 // dst pair stride = 2*linesize - width*3 + .else + .ifc \ofmt,bgr24 + sub w3, w17, w0 + sub w3, w3, w0, lsl #1 // dst pair stride = 2*linesize - width*3 + .else + sub w3, w17, w0, lsl #2 // dst pair stride = 2*linesize - width*4 + .endif + .endif +.endif +.ifc \ifmt,yuv420p + sub w7, w7, w0, lsr #1 // paddingU = linesizeU - width/2 + sub w14, w14, w0, lsr #1 // paddingV = linesizeV - width/2 +.else + sub w7, w7, w0 // paddingC = linesizeC - width +.endif + movi v31.8h, #4, lsl #8 // 128 * (1<<3) (loop-invariant) + movi v30.8b, #255 // alpha = 255 (loop-invariant) + mov w15, w1 // save original height (return value) +1: + mov w8, w0 // w8 = width +2: + load_chroma_\ifmt + chroma_to_rgb_offsets + process_row \ifmt, \ofmt, x4, x4, x2, x2, x2 // line 1 + process_row \ifmt, \ofmt, x9, x9, x12, x12, x12 // line 2 + subs w8, w8, #16 // width -= 16 + b.gt 2b + add x2, x2, w3, sxtw // dst += pair stride + add x12, x12, w3, sxtw // dst2 += pair stride + add x4, x4, w5, sxtw // srcY += pair stride + add x9, x9, w5, sxtw // srcY2 += pair stride +.ifc \ifmt,yuv420p + add x6, x6, w7, sxtw // srcU += paddingU + add x13, x13, w14, sxtw // srcV += paddingV +.else + add x6, x6, w7, sxtw // srcC += 
paddingC +.endif + subs w1, w1, #2 // height -= 2 + b.gt 1b + mov w0, w15 + restore_d8_d9_if_16bpp + ret +endfunc +.endm + +.macro declare_rgb_funcs_2l_packed ifmt + declare_2l_packed \ifmt, argb + declare_2l_packed \ifmt, rgba + declare_2l_packed \ifmt, abgr + declare_2l_packed \ifmt, bgra + declare_2l_packed \ifmt, rgb24 + declare_2l_packed \ifmt, bgr24 +.endm + +// 2-lines-at-a-time path for yuva420p -> {argb,rgba,abgr,bgra}. Chroma is +// vertically subsampled and shared between the two output rows, but the +// alpha plane is full resolution, so each row loads its own alpha (x15 line +// 1, x11 line 2). Caller-saved scratch only (x9=srcY2, x11=srcA2, x12=dst2). +.macro declare_2l_yuva ofmt +function ff_yuva420p_to_\ofmt\()_neon, export=1 + set_rgb16_predicates \ofmt // rgb16=0 for yuva packed ofmts + ldr x13, [sp] // srcV + ldr w14, [sp, #8] // linesizeV + ldr x8, [sp, #16] // table + load_yoff_ycoeff 24, 32 // y_offset, y_coeff +#if defined(__APPLE__) + ldr x15, [sp, #32] // srcA + ldr w16, [sp, #40] // linesizeA +#else + ldr x15, [sp, #40] // srcA + ldr w16, [sp, #48] // linesizeA +#endif + ld1 {v1.1d}, [x8] + dup v0.8h, w10 // y_coeff + dup v3.8h, w9 // y_offset + mov w10, w1 // save original height (return value) + add x9, x4, w5, sxtw // srcY2 = srcY + linesizeY + add x12, x2, w3, sxtw // dst2 = dst + linesize + add x11, x15, w16, sxtw // srcA2 = srcA + linesizeA + lsl w17, w5, #1 + sub w5, w17, w0 // srcY pair stride = 2*linesizeY - width + lsl w17, w3, #1 + sub w3, w17, w0, lsl #2 // dst pair stride = 2*linesize - width*4 + lsl w16, w16, #1 + sub w16, w16, w0 // srcA pair stride = 2*linesizeA - width + sub w7, w7, w0, lsr #1 // paddingU = linesizeU - width/2 + sub w14, w14, w0, lsr #1 // paddingV = linesizeV - width/2 + movi v31.8h, #4, lsl #8 // 128 * (1<<3) (loop-invariant) + movi v30.8b, #255 // alpha = 255 (unused for yuva) +1: + mov w8, w0 // w8 = width +2: + load_chroma_yuv420p + chroma_to_rgb_offsets + process_row yuva420p, \ofmt, x4, x15, x2, x2, 
x2 // line 1 + process_row yuva420p, \ofmt, x9, x11, x12, x12, x12 // line 2 + subs w8, w8, #16 // width -= 16 + b.gt 2b + add x2, x2, w3, sxtw // dst += pair stride + add x12, x12, w3, sxtw // dst2 += pair stride + add x4, x4, w5, sxtw // srcY += pair stride + add x9, x9, w5, sxtw // srcY2 += pair stride + add x15, x15, w16, sxtw // srcA += pair stride + add x11, x11, w16, sxtw // srcA2 += pair stride + add x6, x6, w7, sxtw // srcU += paddingU + add x13, x13, w14, sxtw // srcV += paddingV + subs w1, w1, #2 // height -= 2 + b.gt 1b + mov w0, w10 + ret +endfunc +.endm + +.macro declare_yuva_funcs_2l + declare_2l_yuva argb + declare_2l_yuva rgba + declare_2l_yuva abgr + declare_2l_yuva bgra +.endm + +// 2-lines-at-a-time path for {nv12,nv21,yuv420p} -> gbrp (three output +// planes). Like declare_2l_packed but with three dst pointers per line, so +// the second-line plane pointers exhaust the caller-saved registers; x19/x20 +// are spilled (AAPCS callee-saved). All stack args are read before the spill +// so the standard arg offsets apply. Plane line-2 ptrs: x11/x17/x19, srcY2 x20. +.macro declare_2l_gbrp ifmt +function ff_\ifmt\()_to_gbrp_neon, export=1 + set_rgb16_predicates gbrp // rgb16=0 for gbrp +// y_coeff/y_offset must be consumed into v0/v3 before x10 is reloaded with +// dst1 (load_yoff_ycoeff leaves y_coeff in w10). All stack args are read +// before x19/x20 are spilled so the standard arg offsets apply. 
+.ifc \ifmt,yuv420p + ldr x13, [sp] // srcV + ldr w14, [sp, #8] // linesizeV + ldr x8, [sp, #16] // table + load_yoff_ycoeff 24, 32 // y_offset, y_coeff + ld1 {v1.1d}, [x8] + dup v0.8h, w10 // y_coeff + dup v3.8h, w9 // y_offset +#if defined(__APPLE__) + ldr x10, [sp, #32] // dst1 + ldr w12, [sp, #40] // linesize1 + ldr x15, [sp, #48] // dst2 + ldr w16, [sp, #56] // linesize2 +#else + ldr x10, [sp, #40] + ldr w12, [sp, #48] + ldr x15, [sp, #56] + ldr w16, [sp, #64] +#endif +.else + ldr x8, [sp] // table + load_yoff_ycoeff 8, 16 // y_offset, y_coeff + ld1 {v1.1d}, [x8] + dup v0.8h, w10 // y_coeff + dup v3.8h, w9 // y_offset +#if defined(__APPLE__) + ldr x10, [sp, #16] // dst1 + ldr w12, [sp, #24] // linesize1 + ldr x15, [sp, #32] // dst2 + ldr w16, [sp, #40] // linesize2 +#else + ldr x10, [sp, #24] + ldr w12, [sp, #32] + ldr x15, [sp, #40] + ldr w16, [sp, #48] +#endif +.endif + stp x19, x20, [sp, #-0x10]! // callee-saved (line2 planar ptrs) + mov w9, w1 // save original height (return value) + add x20, x4, w5, sxtw // srcY2 = srcY + linesizeY + lsl w8, w5, #1 + sub w5, w8, w0 // srcY pair stride = 2*linesizeY - width + add x11, x2, w3, sxtw // dst0_2 = dst0 + linesize0 + lsl w8, w3, #1 + sub w3, w8, w0 // dst0 pair stride = 2*linesize0 - width + add x17, x10, w12, sxtw // dst1_2 = dst1 + linesize1 + lsl w8, w12, #1 + sub w12, w8, w0 // dst1 pair stride = 2*linesize1 - width + add x19, x15, w16, sxtw // dst2_2 = dst2 + linesize2 + lsl w8, w16, #1 + sub w16, w8, w0 // dst2 pair stride = 2*linesize2 - width +.ifc \ifmt,yuv420p + sub w7, w7, w0, lsr #1 // paddingU = linesizeU - width/2 + sub w14, w14, w0, lsr #1 // paddingV = linesizeV - width/2 +.else + sub w7, w7, w0 // paddingC = linesizeC - width +.endif + movi v31.8h, #4, lsl #8 // 128 * (1<<3) (loop-invariant) +1: + mov w8, w0 // w8 = width +2: + load_chroma_\ifmt + chroma_to_rgb_offsets + process_row \ifmt, gbrp, x4, x4, x2, x10, x15 // line 1 + process_row \ifmt, gbrp, x20, x20, x11, x17, x19 // line 2 + subs 
w8, w8, #16 // width -= 16 + b.gt 2b + add x2, x2, w3, sxtw // dst0 += pair stride + add x10, x10, w12, sxtw // dst1 += pair stride + add x15, x15, w16, sxtw // dst2 += pair stride + add x11, x11, w3, sxtw // dst0_2 += pair stride + add x17, x17, w12, sxtw // dst1_2 += pair stride + add x19, x19, w16, sxtw // dst2_2 += pair stride + add x4, x4, w5, sxtw // srcY += pair stride + add x20, x20, w5, sxtw // srcY2 += pair stride + add x6, x6, w7, sxtw // srcU/srcC += padding +.ifc \ifmt,yuv420p + add x13, x13, w14, sxtw // srcV += paddingV +.endif + subs w1, w1, #2 // height -= 2 + b.gt 1b + mov w0, w9 + ldp x19, x20, [sp], #0x10 // restore callee-saved + ret +endfunc +.endm + +declare_rgb_funcs_2l_packed nv12 + declare_2l_gbrp nv12 +declare_rgb_funcs_2l_packed nv21 + declare_2l_gbrp nv21 +declare_rgb_funcs_2l_packed yuv420p + declare_2l_gbrp yuv420p declare_rgb_funcs yuv422p .macro declare_rgb16_funcs ifmt @@ -452,16 +740,16 @@ declare_rgb_funcs yuv422p declare_func \ifmt, bgr555le .endm -declare_rgb16_funcs nv12 -declare_rgb16_funcs nv21 -declare_rgb16_funcs yuv420p -declare_rgb16_funcs yuv422p - -.macro declare_yuva_funcs ifmt - declare_func \ifmt, argb - declare_func \ifmt, rgba - declare_func \ifmt, abgr - declare_func \ifmt, bgra +.macro declare_rgb16_funcs_2l ifmt + declare_2l_packed \ifmt, rgb565le + declare_2l_packed \ifmt, bgr565le + declare_2l_packed \ifmt, rgb555le + declare_2l_packed \ifmt, bgr555le .endm -declare_yuva_funcs yuva420p +declare_rgb16_funcs_2l nv12 +declare_rgb16_funcs_2l nv21 +declare_rgb16_funcs_2l yuv420p +declare_rgb16_funcs yuv422p + +declare_yuva_funcs_2l -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
