Add ARM64 NEON-accelerated unscaled YUV-to-RGB conversion for planar YUV input formats. This extends the existing NV12/NV21 NEON paths with YUV420P, YUV422P, and YUVA420P support for all packed RGB output formats (ARGB, RGBA, ABGR, BGRA, RGB24, BGR24) and planar GBRP.
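For reference, the new converters are reached through the ordinary
libswscale entry points; nothing changes for callers. A minimal sketch of
a conversion that can hit the new unscaled fast path (illustrative only,
not part of this patch; the helper name is hypothetical, and the fast
path additionally requires even height, a width that is a multiple of 16,
and no SWS_ACCURATE_RND):

    #include <libswscale/swscale.h>

    /* Hypothetical helper: same-size yuv420p -> rgb24. On aarch64 this
     * can now be served by the NEON unscaled converter added below. */
    static int yuv420p_to_rgb24(const uint8_t *const src[4],
                                const int src_stride[4],
                                uint8_t *rgb, int rgb_stride, int w, int h)
    {
        struct SwsContext *sws = sws_getContext(w, h, AV_PIX_FMT_YUV420P,
                                                w, h, AV_PIX_FMT_RGB24,
                                                SWS_BILINEAR, NULL, NULL, NULL);
        uint8_t *dst[4]        = { rgb, NULL, NULL, NULL };
        int      dst_stride[4] = { rgb_stride, 0, 0, 0 };

        if (!sws)
            return -1;
        sws_scale(sws, src, src_stride, 0, h, dst, dst_stride);
        sws_freeContext(sws);
        return 0;
    }

The same call shape with an AV_PIX_FMT_YUVA420P or AV_PIX_FMT_YUV422P
source exercises the other new paths.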
Register with ff_yuv2rgb_init_aarch64() to also cover the scaled path.

checkasm: all 42 sw_yuv2rgb tests pass.

Speedup vs C at 1920px width (Apple M3 Max, avg of 20 runs):

  yuv420p->rgb24:  4.3x
  yuv420p->argb:   3.1x
  yuv422p->rgb24:  5.5x
  yuv422p->argb:   4.1x
  yuva420p->argb:  3.5x
  yuva420p->rgba:  3.5x

Signed-off-by: David Christle <[email protected]>
---
 libswscale/aarch64/swscale_unscaled.c |  90 ++++++++++++++++++
 libswscale/aarch64/yuv2rgb_neon.S     | 130 +++++++++++++++++++++++---
 libswscale/swscale_internal.h         |   1 +
 libswscale/yuv2rgb.c                  |   2 +
 4 files changed, 208 insertions(+), 15 deletions(-)

diff --git a/libswscale/aarch64/swscale_unscaled.c b/libswscale/aarch64/swscale_unscaled.c
index fdecafd94b..ba24775210 100644
--- a/libswscale/aarch64/swscale_unscaled.c
+++ b/libswscale/aarch64/swscale_unscaled.c
@@ -89,10 +89,45 @@ DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgba)
 DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, abgr)  \
 DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgra)  \
 DECLARE_FF_YUVX_TO_GBRP_FUNCS(yuvx, gbrp)  \
+DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgb24) \
+DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgr24) \
 
 DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv420p)
 DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv422p)
 
+#define DECLARE_FF_YUVA420P_TO_RGBX_FUNCS(ofmt) \
+int ff_yuva420p_to_##ofmt##_neon(int w, int h, \
+                                 uint8_t *dst, int linesize, \
+                                 const uint8_t *srcY, int linesizeY, \
+                                 const uint8_t *srcU, int linesizeU, \
+                                 const uint8_t *srcV, int linesizeV, \
+                                 const int16_t *table, \
+                                 int y_offset, int y_coeff, \
+                                 const uint8_t *srcA, int linesizeA); \
+                                                                      \
+static int yuva420p_to_##ofmt##_neon_wrapper(SwsInternal *c, \
+                                             const uint8_t *const src[], \
+                                             const int srcStride[], int srcSliceY, \
+                                             int srcSliceH, uint8_t *const dst[], \
+                                             const int dstStride[]) { \
+    const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE }; \
+                                                          \
+    return ff_yuva420p_to_##ofmt##_neon(c->opts.src_w, srcSliceH, \
+                                        dst[0] + srcSliceY * dstStride[0], dstStride[0], \
+                                        src[0], srcStride[0], \
+                                        src[1], srcStride[1], \
+                                        src[2], srcStride[2], \
+                                        yuv2rgb_table, \
+                                        c->yuv2rgb_y_offset >> 6, \
+                                        c->yuv2rgb_y_coeff, \
+                                        src[3], srcStride[3]); \
+}
+
+DECLARE_FF_YUVA420P_TO_RGBX_FUNCS(argb)
+DECLARE_FF_YUVA420P_TO_RGBX_FUNCS(rgba)
+DECLARE_FF_YUVA420P_TO_RGBX_FUNCS(abgr)
+DECLARE_FF_YUVA420P_TO_RGBX_FUNCS(bgra)
+
 #define DECLARE_FF_NVX_TO_RGBX_FUNCS(ifmt, ofmt) \
 int ff_##ifmt##_to_##ofmt##_neon(int w, int h, \
                                  uint8_t *dst, int linesize, \
@@ -176,6 +211,8 @@ DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgba)
 DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, abgr)  \
 DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgra)  \
 DECLARE_FF_NVX_TO_GBRP_FUNCS(nvx, gbrp)  \
+DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgb24) \
+DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgr24) \
 
 DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv12)
 DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv21)
@@ -199,6 +236,8 @@ DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv21)
     SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, abgr, ABGR, accurate_rnd);   \
     SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgra, BGRA, accurate_rnd);   \
     SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, gbrp, GBRP, accurate_rnd);   \
+    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, rgb24, RGB24, accurate_rnd); \
+    SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgr24, BGR24, accurate_rnd); \
 } while (0)
 
 static void get_unscaled_swscale_neon(SwsInternal *c) {
@@ -208,6 +247,13 @@ static void get_unscaled_swscale_neon(SwsInternal *c) {
     SET_FF_NVX_TO_ALL_RGBX_FUNC(nv21, NV21, accurate_rnd);
     SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv420p, YUV420P, accurate_rnd);
     SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv422p, YUV422P, accurate_rnd);
+    SET_FF_NVX_TO_RGBX_FUNC(yuva420p, YUVA420P, argb, ARGB, accurate_rnd);
+    SET_FF_NVX_TO_RGBX_FUNC(yuva420p, YUVA420P, rgba, RGBA, accurate_rnd);
+    SET_FF_NVX_TO_RGBX_FUNC(yuva420p, YUVA420P, abgr, ABGR, accurate_rnd);
+    SET_FF_NVX_TO_RGBX_FUNC(yuva420p, YUVA420P, bgra, BGRA, accurate_rnd);
+    SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, rgb24, RGB24, accurate_rnd);
+    SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, bgr24, BGR24, accurate_rnd);
+    SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, gbrp, GBRP, accurate_rnd);
     if (c->opts.dst_format == AV_PIX_FMT_YUV420P &&
         (c->opts.src_format == AV_PIX_FMT_NV24 ||
          c->opts.src_format == AV_PIX_FMT_NV42) &&
@@ -221,3 +267,47 @@ void ff_get_unscaled_swscale_aarch64(SwsInternal *c)
     if (have_neon(cpu_flags))
         get_unscaled_swscale_neon(c);
 }
+
+av_cold SwsFunc ff_yuv2rgb_init_aarch64(SwsInternal *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+    if (!have_neon(cpu_flags) ||
+        (c->opts.src_h & 1) || (c->opts.src_w & 15) ||
+        (c->opts.flags & SWS_ACCURATE_RND))
+        return NULL;
+
+    if (c->opts.src_format == AV_PIX_FMT_YUV420P) {
+        switch (c->opts.dst_format) {
+        case AV_PIX_FMT_ARGB:  return yuv420p_to_argb_neon_wrapper;
+        case AV_PIX_FMT_RGBA:  return yuv420p_to_rgba_neon_wrapper;
+        case AV_PIX_FMT_ABGR:  return yuv420p_to_abgr_neon_wrapper;
+        case AV_PIX_FMT_BGRA:  return yuv420p_to_bgra_neon_wrapper;
+        case AV_PIX_FMT_RGB24: return yuv420p_to_rgb24_neon_wrapper;
+        case AV_PIX_FMT_BGR24: return yuv420p_to_bgr24_neon_wrapper;
+        case AV_PIX_FMT_GBRP:  return yuv420p_to_gbrp_neon_wrapper;
+        }
+    } else if (c->opts.src_format == AV_PIX_FMT_YUVA420P) {
+        switch (c->opts.dst_format) {
+#if CONFIG_SWSCALE_ALPHA
+        case AV_PIX_FMT_ARGB:  return yuva420p_to_argb_neon_wrapper;
+        case AV_PIX_FMT_RGBA:  return yuva420p_to_rgba_neon_wrapper;
+        case AV_PIX_FMT_ABGR:  return yuva420p_to_abgr_neon_wrapper;
+        case AV_PIX_FMT_BGRA:  return yuva420p_to_bgra_neon_wrapper;
+#endif
+        case AV_PIX_FMT_RGB24: return yuv420p_to_rgb24_neon_wrapper;
+        case AV_PIX_FMT_BGR24: return yuv420p_to_bgr24_neon_wrapper;
+        case AV_PIX_FMT_GBRP:  return yuv420p_to_gbrp_neon_wrapper;
+        }
+    } else if (c->opts.src_format == AV_PIX_FMT_YUV422P) {
+        switch (c->opts.dst_format) {
+        case AV_PIX_FMT_ARGB:  return yuv422p_to_argb_neon_wrapper;
+        case AV_PIX_FMT_RGBA:  return yuv422p_to_rgba_neon_wrapper;
+        case AV_PIX_FMT_ABGR:  return yuv422p_to_abgr_neon_wrapper;
+        case AV_PIX_FMT_BGRA:  return yuv422p_to_bgra_neon_wrapper;
+        case AV_PIX_FMT_RGB24: return yuv422p_to_rgb24_neon_wrapper;
+        case AV_PIX_FMT_BGR24: return yuv422p_to_bgr24_neon_wrapper;
+        case AV_PIX_FMT_GBRP:  return yuv422p_to_gbrp_neon_wrapper;
+        }
+    }
+    return NULL;
+}
diff --git a/libswscale/aarch64/yuv2rgb_neon.S b/libswscale/aarch64/yuv2rgb_neon.S
index 0797a6d5e0..19f750545f 100644
--- a/libswscale/aarch64/yuv2rgb_neon.S
+++ b/libswscale/aarch64/yuv2rgb_neon.S
@@ -55,7 +55,17 @@
     load_dst1_dst2 24, 32, 40, 48
     sub w3, w3, w0          // w3 = linesize - width (padding)
 .else
+    .ifc \ofmt,rgb24
+    add w17, w0, w0, lsl #1
+    sub w3, w3, w17         // w3 = linesize - width * 3 (padding)
+    .else
+    .ifc \ofmt,bgr24
+    add w17, w0, w0, lsl #1
+    sub w3, w3, w17         // w3 = linesize - width * 3 (padding)
+    .else
     sub w3, w3, w0, lsl #2  // w3 = linesize - width * 4 (padding)
+    .endif
+    .endif
 .endif
     sub w5, w5, w0          // w5 = linesizeY - width (paddingY)
     sub w7, w7, w0          // w7 = linesizeC - width (paddingC)
@@ -78,7 +88,17 @@
     load_dst1_dst2 40, 48, 56, 64
     sub w3, w3, w0          // w3 = linesize - width (padding)
 .else
+    .ifc \ofmt,rgb24
+    add w17, w0, w0, lsl #1
+    sub w3, w3, w17         // w3 = linesize - width * 3 (padding)
+    .else
+    .ifc \ofmt,bgr24
+    add w17, w0, w0, lsl #1
+    sub w3, w3, w17         // w3 = linesize - width * 3 (padding)
+    .else
     sub w3, w3, w0, lsl #2  // w3 = linesize - width * 4 (padding)
+    .endif
+    .endif
 .endif
     sub w5, w5, w0          // w5 = linesizeY - width (paddingY)
     sub w7, w7, w0, lsr #1  // w7 = linesizeU - width / 2 (paddingU)
@@ -87,6 +107,18 @@
     neg w11, w11
 .endm
 
+.macro load_args_yuva420p ofmt
+    load_args_yuv420p \ofmt
+#if defined(__APPLE__)
+    ldr x15, [sp, #32]      // srcA
+    ldr w16, [sp, #40]      // linesizeA
+#else
+    ldr x15, [sp, #40]      // srcA
+    ldr w16, [sp, #48]      // linesizeA
+#endif
+    sub w16, w16, w0        // w16 = linesizeA - width (paddingA)
+.endm
+
 .macro load_args_yuv422p ofmt
     ldr x13, [sp]           // srcV
     ldr w14, [sp, #8]       // linesizeV
@@ -99,7 +131,17 @@
     load_dst1_dst2 40, 48, 56, 64
     sub w3, w3, w0          // w3 = linesize - width (padding)
 .else
+    .ifc \ofmt,rgb24
+    add w17, w0, w0, lsl #1
+    sub w3, w3, w17         // w3 = linesize - width * 3 (padding)
+    .else
+    .ifc \ofmt,bgr24
+    add w17, w0, w0, lsl #1
+    sub w3, w3, w17         // w3 = linesize - width * 3 (padding)
+    .else
     sub w3, w3, w0, lsl #2  // w3 = linesize - width * 4 (padding)
+    .endif
+    .endif
 .endif
     sub w5, w5, w0          // w5 = linesizeY - width (paddingY)
     sub w7, w7, w0, lsr #1  // w7 = linesizeU - width / 2 (paddingU)
@@ -125,6 +167,10 @@
     ushll v19.8h, v17.8b, #3
 .endm
 
+.macro load_chroma_yuva420p
+    load_chroma_yuv420p
+.endm
+
 .macro load_chroma_yuv422p
     load_chroma_yuv420p
 .endm
@@ -147,6 +193,11 @@
     add x13, x13, w17, sxtw // srcV += incV
 .endm
 
+.macro increment_yuva420p
+    increment_yuv420p
+    add x15, x15, w16, sxtw // srcA += paddingA (every row)
+.endm
+
 .macro increment_yuv422p
     add x6, x6, w7, sxtw    // srcU += incU
     add x13, x13, w14, sxtw // srcV += incV
@@ -169,65 +220,103 @@
 .macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
     compute_rgb \r1, \g1, \b1, \r2, \g2, \b2
-    movi \a1, #255
-    movi \a2, #255
+    mov \a1, v30.8b
+    mov \a2, v30.8b
+.endm
+
+.macro compute_rgba_alpha r1 g1 b1 a1 r2 g2 b2 a2
+    compute_rgb \r1, \g1, \b1, \r2, \g2, \b2
+    mov \a1, v28.8b                     // real alpha (first 8 pixels)
+    mov \a2, v29.8b                     // real alpha (next 8 pixels)
 .endm
 
 .macro declare_func ifmt ofmt
 function ff_\ifmt\()_to_\ofmt\()_neon, export=1
     load_args_\ifmt \ofmt
+    movi v31.8h, #4, lsl #8             // 128 * (1<<3) (loop-invariant)
+    movi v30.8b, #255                   // alpha = 255 (loop-invariant)
     mov w9, w1
 1:
     mov w8, w0                          // w8 = width
 2:
-    movi v5.8h, #4, lsl #8              // 128 * (1<<3)
     load_chroma_\ifmt
-    sub v18.8h, v18.8h, v5.8h           // U*(1<<3) - 128*(1<<3)
-    sub v19.8h, v19.8h, v5.8h           // V*(1<<3) - 128*(1<<3)
+    sub v18.8h, v18.8h, v31.8h          // U*(1<<3) - 128*(1<<3)
+    sub v19.8h, v19.8h, v31.8h          // V*(1<<3) - 128*(1<<3)
     sqdmulh v20.8h, v19.8h, v1.h[0]     // V * v2r (R)
     sqdmulh v22.8h, v18.8h, v1.h[1]     // U * u2g
+    ld1 {v2.16b}, [x4], #16             // load luma (interleaved)
+.ifc \ifmt,yuva420p
+    ld1 {v28.8b, v29.8b}, [x15], #16    // load 16 alpha bytes
+.endif
     sqdmulh v19.8h, v19.8h, v1.h[2]     // V * v2g
-    add v22.8h, v22.8h, v19.8h          // U * u2g + V * v2g (G)
     sqdmulh v24.8h, v18.8h, v1.h[3]     // U * u2b (B)
-    zip2 v21.8h, v20.8h, v20.8h         // R2
-    zip1 v20.8h, v20.8h, v20.8h         // R1
-    zip2 v23.8h, v22.8h, v22.8h         // G2
-    zip1 v22.8h, v22.8h, v22.8h         // G1
-    zip2 v25.8h, v24.8h, v24.8h         // B2
-    zip1 v24.8h, v24.8h, v24.8h         // B1
-    ld1 {v2.16b}, [x4], #16             // load luma
     ushll v26.8h, v2.8b, #3             // Y1*(1<<3)
     ushll2 v27.8h, v2.16b, #3           // Y2*(1<<3)
+    add v22.8h, v22.8h, v19.8h          // U * u2g + V * v2g (G)
     sub v26.8h, v26.8h, v3.8h           // Y1*(1<<3) - y_offset
     sub v27.8h, v27.8h, v3.8h           // Y2*(1<<3) - y_offset
+    zip2 v21.8h, v20.8h, v20.8h         // R2
+    zip1 v20.8h, v20.8h, v20.8h         // R1
     sqdmulh v26.8h, v26.8h, v0.8h       // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15
     sqdmulh v27.8h, v27.8h, v0.8h       // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15
+    zip2 v23.8h, v22.8h, v22.8h         // G2
+    zip1 v22.8h, v22.8h, v22.8h         // G1
+    zip2 v25.8h, v24.8h, v24.8h         // B2
+    zip1 v24.8h, v24.8h, v24.8h         // B1
 .ifc \ofmt,argb // 1 2 3 0
+    .ifc \ifmt,yuva420p
+    compute_rgba_alpha v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b
+    .else
     compute_rgba v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b
+    .endif
 .endif
 .ifc \ofmt,rgba // 0 1 2 3
+    .ifc \ifmt,yuva420p
+    compute_rgba_alpha v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b
+    .else
     compute_rgba v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b
+    .endif
 .endif
 .ifc \ofmt,abgr // 3 2 1 0
+    .ifc \ifmt,yuva420p
+    compute_rgba_alpha v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b
+    .else
     compute_rgba v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b
+    .endif
 .endif
 .ifc \ofmt,bgra // 2 1 0 3
+    .ifc \ifmt,yuva420p
+    compute_rgba_alpha v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b
+    .else
     compute_rgba v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b
+    .endif
 .endif
-.ifc \ofmt,gbrp
+.ifc \ofmt,rgb24
+    compute_rgb v4.8b,v5.8b,v6.8b, v16.8b,v17.8b,v18.8b
+    st3 { v4.8b, v5.8b, v6.8b}, [x2], #24
+    st3 {v16.8b,v17.8b,v18.8b}, [x2], #24
+.else
+    .ifc \ofmt,bgr24
+    compute_rgb v6.8b,v5.8b,v4.8b, v18.8b,v17.8b,v16.8b
+    st3 { v4.8b, v5.8b, v6.8b}, [x2], #24
+    st3 {v16.8b,v17.8b,v18.8b}, [x2], #24
+    .else
+    .ifc \ofmt,gbrp
     compute_rgb v18.8b,v4.8b,v6.8b, v19.8b,v5.8b,v7.8b
     st1 { v4.8b, v5.8b }, [x2], #16
     st1 { v6.8b, v7.8b }, [x10], #16
     st1 { v18.8b, v19.8b }, [x15], #16
-.else
+    .else
     st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #32
     st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32
+    .endif
+    .endif
 .endif
     subs w8, w8, #16                    // width -= 16
     b.gt 2b
@@ -251,9 +340,20 @@ endfunc
     declare_func \ifmt, abgr
     declare_func \ifmt, bgra
     declare_func \ifmt, gbrp
+    declare_func \ifmt, rgb24
+    declare_func \ifmt, bgr24
 .endm
 
 declare_rgb_funcs nv12
 declare_rgb_funcs nv21
 declare_rgb_funcs yuv420p
 declare_rgb_funcs yuv422p
+
+.macro declare_yuva_funcs ifmt
+    declare_func \ifmt, argb
+    declare_func \ifmt, rgba
+    declare_func \ifmt, abgr
+    declare_func \ifmt, bgra
+.endm
+
+declare_yuva_funcs yuva420p
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 5c58272664..c671f1c7cd 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -739,6 +739,7 @@ av_cold int ff_sws_fill_xyztables(SwsInternal *c);
 SwsFunc ff_yuv2rgb_init_x86(SwsInternal *c);
 SwsFunc ff_yuv2rgb_init_ppc(SwsInternal *c);
 SwsFunc ff_yuv2rgb_init_loongarch(SwsInternal *c);
+SwsFunc ff_yuv2rgb_init_aarch64(SwsInternal *c);
 
 static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
 {
diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c
index 48089760f5..c62201856d 100644
--- a/libswscale/yuv2rgb.c
+++ b/libswscale/yuv2rgb.c
@@ -568,6 +568,8 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsInternal *c)
         t = ff_yuv2rgb_init_x86(c);
 #elif ARCH_LOONGARCH64
         t = ff_yuv2rgb_init_loongarch(c);
+#elif ARCH_AARCH64
+        t = ff_yuv2rgb_init_aarch64(c);
 #endif
 
     if (t)
-- 
2.52.0
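
To reproduce the checkasm results and timings, the standard harness
should suffice (sketch; these are the stock checkasm targets and
options, nothing added by this patch):

    make tests/checkasm/checkasm
    ./tests/checkasm/checkasm --test=sw_yuv2rgb          # correctness
    ./tests/checkasm/checkasm --test=sw_yuv2rgb --bench  # timings

--bench prints cycle counts for the C reference and the NEON version
side by side, which is one way to derive per-format speedups like those
quoted in the commit message.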
