PR #21026 opened by arpadp-arm
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21026
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21026.patch
This series prepares and optimizes the `xyz12Torgb48` path in swscale.

- Patch 1 refactors the XYZ/RGB state into a `ColorXform` struct and adds a
  per-context `xyz12Torgb48` hook, with no functional changes.
- Patch 2 adds `checkasm` coverage for the `xyz12Torgb48` little-endian path.
- Patch 3 introduces an AArch64 Neon implementation of the little-endian
  XYZ12 -> RGB48 conversion.

Relative runtime of microbenchmarks after this series on some Cortex and
Neoverse CPU cores:

```
xyz12le_rgb48le    X1     X3     X4     X925   V2
  16x4_neon:       2.39x  4.04x  2.84x  3.27x  3.02x
  32x4_neon:       2.42x  3.34x  2.98x  3.34x  2.97x
  64x4_neon:       2.36x  3.12x  2.99x  3.32x  2.95x
 128x4_neon:       2.36x  3.08x  3.01x  3.34x  2.95x
 256x4_neon:       2.33x  3.08x  3.08x  3.41x  2.95x
 512x4_neon:       2.30x  3.04x  3.00x  3.54x  2.88x
1024x4_neon:       2.28x  3.01x  2.88x  3.55x  3.07x
1920x4_neon:       2.27x  2.94x  2.79x  3.53x  2.86x

xyz12le_rgb48le    A76    A78    A715   A720   A725
  16x4_neon:       2.36x  2.20x  2.32x  2.99x  2.98x
  32x4_neon:       2.40x  2.25x  2.37x  2.99x  3.02x
  64x4_neon:       2.37x  2.22x  2.34x  2.97x  3.03x
 128x4_neon:       2.35x  2.23x  2.33x  2.93x  3.00x
 256x4_neon:       2.39x  2.23x  2.35x  2.88x  2.92x
 512x4_neon:       2.39x  2.21x  2.32x  2.81x  2.89x
1024x4_neon:       2.37x  2.18x  2.31x  2.79x  2.89x
1920x4_neon:       2.37x  2.17x  2.30x  2.77x  2.86x

xyz12le_rgb48le    A55    A510   A520
  16x4_neon:       1.98x  1.96x  2.23x
  32x4_neon:       2.03x  1.96x  2.20x
  64x4_neon:       2.01x  1.95x  2.24x
 128x4_neon:       1.99x  1.91x  2.22x
 256x4_neon:       1.92x  1.86x  2.22x
 512x4_neon:       1.89x  1.80x  2.19x
1024x4_neon:       1.90x  1.80x  2.19x
1920x4_neon:       1.91x  1.79x  2.20x
```

From 5a4b5dddabb7d163d5062dc312e1dbccd02a132d Mon Sep 17 00:00:00 2001
From: Arpad Panyik <[email protected]>
Date: Wed, 26 Nov 2025 16:35:11 +0000
Subject: [PATCH 1/3] swscale: Refactor XYZ+RGB state and add xyz12Torgb48 hook

Prepare for xyz12Torgb48 architecture-specific optimizations in subsequent
patches by:

- Grouping the XYZ+RGB gamma LUTs and 3x3 matrices into ColorXform
  (ctx->xyz2rgb / ctx->rgb2xyz), replacing the scattered fields.
- Dropping the unused last matrix column, giving the same or smaller
  SwsInternal size.
- Renaming ff_xyz12Torgb48 to xyz12Torgb48_c and routing calls via the new
  per-context function pointer (ctx->xyz12Torgb48) in graph.c and swscale.c.
- Adding ff_sws_init_xyz2rgb and invoking it in the swscale init paths
  (normal and unscaled).

These modifications do not introduce any functional changes.
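For quick reference, here is a condensed sketch of the grouped state and the
dispatch hook this patch introduces, pieced together from the
swscale_internal.h and swscale.c hunks below; it is a reading aid only, not a
standalone compilable header.

```c
#include <stdint.h>

/* Gamma LUT pair used by one conversion direction. */
typedef struct GammaLuts {
    uint16_t *xyz;              /* LUT applied to X/Y/Z samples */
    uint16_t *rgb;              /* LUT applied to R/G/B samples */
} GammaLuts;

/* One XYZ<->RGB transform: the LUT pair plus a 3x3 fixed-point matrix
 * (products are shifted right by 12); the unused fourth column of the
 * old int16_t[3][4] layout is dropped. */
typedef struct ColorXform {
    GammaLuts gamma;
    int16_t   matrix[3][3];
} ColorXform;

/* New fields inside SwsInternal: the per-context hook plus the two grouped
 * transforms.  ff_sws_init_xyz2rgb() points the hook at the C reference
 * (xyz12Torgb48_c); graph.c and swscale.c now call
 * c->xyz12Torgb48(c, dst, dst_stride, src, src_stride, w, h). */
struct SwsInternal {
    /* ... existing fields ... */
    void (*xyz12Torgb48)(const struct SwsInternal *c,
                         uint8_t *dst, int dst_stride,
                         const uint8_t *src, int src_stride, int w, int h);
    ColorXform xyz2rgb;         /* XYZ12 -> RGB48 direction */
    ColorXform rgb2xyz;         /* RGB48 -> XYZ12 direction */
    /* ... */
};
```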
Signed-off-by: Arpad Panyik <[email protected]> --- libswscale/graph.c | 3 +- libswscale/swscale.c | 85 +++++++++++++++++++---------------- libswscale/swscale_internal.h | 25 +++++++---- libswscale/swscale_unscaled.c | 2 + libswscale/utils.c | 33 +++++++------- 5 files changed, 83 insertions(+), 65 deletions(-) diff --git a/libswscale/graph.c b/libswscale/graph.c index 0a79b17f89..60ead6e8bb 100644 --- a/libswscale/graph.c +++ b/libswscale/graph.c @@ -142,7 +142,8 @@ static void run_rgb0(const SwsImg *out, const SwsImg *in, int y, int h, static void run_xyz2rgb(const SwsImg *out, const SwsImg *in, int y, int h, const SwsPass *pass) { - ff_xyz12Torgb48(pass->priv, out->data[0] + y * out->linesize[0], out->linesize[0], + const SwsInternal *c = pass->priv; + c->xyz12Torgb48(c, out->data[0] + y * out->linesize[0], out->linesize[0], in->data[0] + y * in->linesize[0], in->linesize[0], pass->width, h); } diff --git a/libswscale/swscale.c b/libswscale/swscale.c index f4c7eccac4..c795427a83 100644 --- a/libswscale/swscale.c +++ b/libswscale/swscale.c @@ -660,6 +660,8 @@ static av_cold void sws_init_swscale(SwsInternal *c) { enum AVPixelFormat srcFormat = c->opts.src_format; + ff_sws_init_xyz2rgb(c); + ff_sws_init_output_funcs(c, &c->yuv2plane1, &c->yuv2planeX, &c->yuv2nv12cX, &c->yuv2packed1, &c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX); @@ -737,8 +739,8 @@ static int check_image_pointers(const uint8_t * const data[4], enum AVPixelForma return 1; } -void ff_xyz12Torgb48(const SwsInternal *c, uint8_t *dst, int dst_stride, - const uint8_t *src, int src_stride, int w, int h) +static void xyz12Torgb48_c(const SwsInternal *c, uint8_t *dst, int dst_stride, + const uint8_t *src, int src_stride, int w, int h) { const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->opts.src_format); @@ -759,20 +761,20 @@ void ff_xyz12Torgb48(const SwsInternal *c, uint8_t *dst, int dst_stride, z = AV_RL16(src16 + xp + 2); } - x = c->xyzgamma[x >> 4]; - y = c->xyzgamma[y >> 4]; - z = c->xyzgamma[z >> 4]; + x = c->xyz2rgb.gamma.xyz[x >> 4]; + y = c->xyz2rgb.gamma.xyz[y >> 4]; + z = c->xyz2rgb.gamma.xyz[z >> 4]; // convert from XYZlinear to sRGBlinear - r = c->xyz2rgb_matrix[0][0] * x + - c->xyz2rgb_matrix[0][1] * y + - c->xyz2rgb_matrix[0][2] * z >> 12; - g = c->xyz2rgb_matrix[1][0] * x + - c->xyz2rgb_matrix[1][1] * y + - c->xyz2rgb_matrix[1][2] * z >> 12; - b = c->xyz2rgb_matrix[2][0] * x + - c->xyz2rgb_matrix[2][1] * y + - c->xyz2rgb_matrix[2][2] * z >> 12; + r = c->xyz2rgb.matrix[0][0] * x + + c->xyz2rgb.matrix[0][1] * y + + c->xyz2rgb.matrix[0][2] * z >> 12; + g = c->xyz2rgb.matrix[1][0] * x + + c->xyz2rgb.matrix[1][1] * y + + c->xyz2rgb.matrix[1][2] * z >> 12; + b = c->xyz2rgb.matrix[2][0] * x + + c->xyz2rgb.matrix[2][1] * y + + c->xyz2rgb.matrix[2][2] * z >> 12; // limit values to 16-bit depth r = av_clip_uint16(r); @@ -781,13 +783,13 @@ void ff_xyz12Torgb48(const SwsInternal *c, uint8_t *dst, int dst_stride, // convert from sRGBlinear to RGB and scale from 12bit to 16bit if (desc->flags & AV_PIX_FMT_FLAG_BE) { - AV_WB16(dst16 + xp + 0, c->rgbgamma[r] << 4); - AV_WB16(dst16 + xp + 1, c->rgbgamma[g] << 4); - AV_WB16(dst16 + xp + 2, c->rgbgamma[b] << 4); + AV_WB16(dst16 + xp + 0, c->xyz2rgb.gamma.rgb[r] << 4); + AV_WB16(dst16 + xp + 1, c->xyz2rgb.gamma.rgb[g] << 4); + AV_WB16(dst16 + xp + 2, c->xyz2rgb.gamma.rgb[b] << 4); } else { - AV_WL16(dst16 + xp + 0, c->rgbgamma[r] << 4); - AV_WL16(dst16 + xp + 1, c->rgbgamma[g] << 4); - AV_WL16(dst16 + xp + 2, c->rgbgamma[b] << 4); + AV_WL16(dst16 + xp + 0, 
c->xyz2rgb.gamma.rgb[r] << 4); + AV_WL16(dst16 + xp + 1, c->xyz2rgb.gamma.rgb[g] << 4); + AV_WL16(dst16 + xp + 2, c->xyz2rgb.gamma.rgb[b] << 4); } } @@ -818,20 +820,20 @@ void ff_rgb48Toxyz12(const SwsInternal *c, uint8_t *dst, int dst_stride, b = AV_RL16(src16 + xp + 2); } - r = c->rgbgammainv[r>>4]; - g = c->rgbgammainv[g>>4]; - b = c->rgbgammainv[b>>4]; + r = c->rgb2xyz.gamma.rgb[r >> 4]; + g = c->rgb2xyz.gamma.rgb[g >> 4]; + b = c->rgb2xyz.gamma.rgb[b >> 4]; // convert from sRGBlinear to XYZlinear - x = c->rgb2xyz_matrix[0][0] * r + - c->rgb2xyz_matrix[0][1] * g + - c->rgb2xyz_matrix[0][2] * b >> 12; - y = c->rgb2xyz_matrix[1][0] * r + - c->rgb2xyz_matrix[1][1] * g + - c->rgb2xyz_matrix[1][2] * b >> 12; - z = c->rgb2xyz_matrix[2][0] * r + - c->rgb2xyz_matrix[2][1] * g + - c->rgb2xyz_matrix[2][2] * b >> 12; + x = c->rgb2xyz.matrix[0][0] * r + + c->rgb2xyz.matrix[0][1] * g + + c->rgb2xyz.matrix[0][2] * b >> 12; + y = c->rgb2xyz.matrix[1][0] * r + + c->rgb2xyz.matrix[1][1] * g + + c->rgb2xyz.matrix[1][2] * b >> 12; + z = c->rgb2xyz.matrix[2][0] * r + + c->rgb2xyz.matrix[2][1] * g + + c->rgb2xyz.matrix[2][2] * b >> 12; // limit values to 16-bit depth x = av_clip_uint16(x); @@ -840,13 +842,13 @@ void ff_rgb48Toxyz12(const SwsInternal *c, uint8_t *dst, int dst_stride, // convert from XYZlinear to X'Y'Z' and scale from 12bit to 16bit if (desc->flags & AV_PIX_FMT_FLAG_BE) { - AV_WB16(dst16 + xp + 0, c->xyzgammainv[x] << 4); - AV_WB16(dst16 + xp + 1, c->xyzgammainv[y] << 4); - AV_WB16(dst16 + xp + 2, c->xyzgammainv[z] << 4); + AV_WB16(dst16 + xp + 0, c->rgb2xyz.gamma.xyz[x] << 4); + AV_WB16(dst16 + xp + 1, c->rgb2xyz.gamma.xyz[y] << 4); + AV_WB16(dst16 + xp + 2, c->rgb2xyz.gamma.xyz[z] << 4); } else { - AV_WL16(dst16 + xp + 0, c->xyzgammainv[x] << 4); - AV_WL16(dst16 + xp + 1, c->xyzgammainv[y] << 4); - AV_WL16(dst16 + xp + 2, c->xyzgammainv[z] << 4); + AV_WL16(dst16 + xp + 0, c->rgb2xyz.gamma.xyz[x] << 4); + AV_WL16(dst16 + xp + 1, c->rgb2xyz.gamma.xyz[y] << 4); + AV_WL16(dst16 + xp + 2, c->rgb2xyz.gamma.xyz[z] << 4); } } @@ -855,6 +857,11 @@ void ff_rgb48Toxyz12(const SwsInternal *c, uint8_t *dst, int dst_stride, } } +av_cold void ff_sws_init_xyz2rgb(SwsInternal *c) +{ + c->xyz12Torgb48 = xyz12Torgb48_c; +} + void ff_update_palette(SwsInternal *c, const uint32_t *pal) { for (int i = 0; i < 256; i++) { @@ -1110,7 +1117,7 @@ static int scale_internal(SwsContext *sws, base = srcStride[0] < 0 ? c->xyz_scratch - srcStride[0] * (srcSliceH-1) : c->xyz_scratch; - ff_xyz12Torgb48(c, base, srcStride[0], src2[0], srcStride[0], sws->src_w, srcSliceH); + c->xyz12Torgb48(c, base, srcStride[0], src2[0], srcStride[0], sws->src_w, srcSliceH); src2[0] = base; } diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h index 5dd65a8d71..107671feb2 100644 --- a/libswscale/swscale_internal.h +++ b/libswscale/swscale_internal.h @@ -93,6 +93,16 @@ typedef int (*SwsFunc)(SwsInternal *c, const uint8_t *const src[], const int srcStride[], int srcSliceY, int srcSliceH, uint8_t *const dst[], const int dstStride[]); +typedef struct GammaLuts { + uint16_t *xyz; + uint16_t *rgb; +} GammaLuts; + +typedef struct ColorXform { + GammaLuts gamma; + int16_t matrix[3][3]; +} ColorXform; + /** * Write one line of horizontally scaled data to planar output * without any additional vertical scaling (or point-scaling). 
@@ -547,12 +557,10 @@ struct SwsInternal { /* pre defined color-spaces gamma */ #define XYZ_GAMMA (2.6) #define RGB_GAMMA (2.2) - uint16_t *xyzgamma; - uint16_t *rgbgamma; - uint16_t *xyzgammainv; - uint16_t *rgbgammainv; - int16_t xyz2rgb_matrix[3][4]; - int16_t rgb2xyz_matrix[3][4]; + void (*xyz12Torgb48)(const SwsInternal *c, uint8_t *dst, int dst_stride, + const uint8_t *src, int src_stride, int w, int h); + ColorXform xyz2rgb; + ColorXform rgb2xyz; /* function pointers for swscale() */ yuv2planar1_fn yuv2plane1; @@ -720,6 +728,8 @@ av_cold void ff_sws_init_range_convert_loongarch(SwsInternal *c); av_cold void ff_sws_init_range_convert_riscv(SwsInternal *c); av_cold void ff_sws_init_range_convert_x86(SwsInternal *c); +av_cold void ff_sws_init_xyz2rgb(SwsInternal *c); + SwsFunc ff_yuv2rgb_init_x86(SwsInternal *c); SwsFunc ff_yuv2rgb_init_ppc(SwsInternal *c); SwsFunc ff_yuv2rgb_init_loongarch(SwsInternal *c); @@ -1043,9 +1053,6 @@ void ff_copyPlane(const uint8_t *src, int srcStride, int srcSliceY, int srcSliceH, int width, uint8_t *dst, int dstStride); -void ff_xyz12Torgb48(const SwsInternal *c, uint8_t *dst, int dst_stride, - const uint8_t *src, int src_stride, int w, int h); - void ff_rgb48Toxyz12(const SwsInternal *c, uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int w, int h); diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c index 2c791e89fe..7be0690882 100644 --- a/libswscale/swscale_unscaled.c +++ b/libswscale/swscale_unscaled.c @@ -2685,6 +2685,8 @@ void ff_get_unscaled_swscale(SwsInternal *c) } } + ff_sws_init_xyz2rgb(c); + #if ARCH_PPC ff_get_unscaled_swscale_ppc(c); #elif ARCH_ARM diff --git a/libswscale/utils.c b/libswscale/utils.c index a13d8df7e8..79de0ea9c9 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -721,34 +721,35 @@ static av_cold void init_xyz_tables(void) static int fill_xyztables(SwsInternal *c) { - static const int16_t xyz2rgb_matrix[3][4] = { + static const int16_t xyz2rgb_matrix[3][3] = { {13270, -6295, -2041}, {-3969, 7682, 170}, { 228, -835, 4329} }; - static const int16_t rgb2xyz_matrix[3][4] = { + static const int16_t rgb2xyz_matrix[3][3] = { {1689, 1464, 739}, { 871, 2929, 296}, { 79, 488, 3891} }; - if (c->xyzgamma) + if (c->xyz2rgb.gamma.xyz) return 0; - memcpy(c->xyz2rgb_matrix, xyz2rgb_matrix, sizeof(c->xyz2rgb_matrix)); - memcpy(c->rgb2xyz_matrix, rgb2xyz_matrix, sizeof(c->rgb2xyz_matrix)); + memcpy(c->xyz2rgb.matrix, xyz2rgb_matrix, sizeof(c->xyz2rgb.matrix)); + memcpy(c->rgb2xyz.matrix, rgb2xyz_matrix, sizeof(c->rgb2xyz.matrix)); #if CONFIG_SMALL - c->xyzgamma = av_malloc(sizeof(uint16_t) * 2 * (4096 + 65536)); - if (!c->xyzgamma) + c->xyz2rgb.gamma.xyz = av_malloc(sizeof(uint16_t) * 2 * (4096 + 65536)); + if (!c->xyz2rgb.gamma.xyz) return AVERROR(ENOMEM); - c->rgbgammainv = c->xyzgamma + 4096; - c->rgbgamma = c->rgbgammainv + 4096; - c->xyzgammainv = c->rgbgamma + 65536; - init_xyz_tables(c->xyzgamma, c->xyzgammainv, c->rgbgamma, c->rgbgammainv); + c->rgb2xyz.gamma.rgb = c->xyz2rgb.gamma.xyz + 4096; + c->xyz2rgb.gamma.rgb = c->rgb2xyz.gamma.rgb + 4096; + c->rgb2xyz.gamma.xyz = c->xyz2rgb.gamma.rgb + 65536; + init_xyz_tables(c->xyz2rgb.gamma.xyz, c->rgb2xyz.gamma.xyz, + c->xyz2rgb.gamma.rgb, c->rgb2xyz.gamma.rgb); #else - c->xyzgamma = xyzgamma_tab; - c->rgbgamma = rgbgamma_tab; - c->xyzgammainv = xyzgammainv_tab; - c->rgbgammainv = rgbgammainv_tab; + c->xyz2rgb.gamma.xyz = xyzgamma_tab; + c->xyz2rgb.gamma.rgb = rgbgamma_tab; + c->rgb2xyz.gamma.xyz = xyzgammainv_tab; + c->rgb2xyz.gamma.rgb = 
rgbgammainv_tab; static AVOnce xyz_init_static_once = AV_ONCE_INIT; ff_thread_once(&xyz_init_static_once, init_xyz_tables); @@ -2312,7 +2313,7 @@ void sws_freeContext(SwsContext *sws) av_freep(&c->gamma); av_freep(&c->inv_gamma); #if CONFIG_SMALL - av_freep(&c->xyzgamma); + av_freep(&c->xyz2rgb.gamma.xyz); #endif av_freep(&c->rgb0_scratch); -- 2.49.1 >From 9a58d4467f600b67fbf1b75e32b8732d2f0aa7f5 Mon Sep 17 00:00:00 2001 From: Arpad Panyik <[email protected]> Date: Wed, 26 Nov 2025 16:36:13 +0000 Subject: [PATCH 2/3] checkasm: Add xyz12Torgb48le test Add checkasm coverage for the XYZ12LE to RGB48LE path via the ctx->xyz12Torgb48 hook. Integrate the test into the build and runner, exercise a variety of widths/heights, compare against the C reference, and benchmark when width is multiple of 4. This improves test coverage for the new function pointer in preparation for architecture-specific implementations in subsequent commits. Signed-off-by: Arpad Panyik <[email protected]> --- tests/checkasm/Makefile | 1 + tests/checkasm/checkasm.c | 1 + tests/checkasm/checkasm.h | 1 + tests/checkasm/sw_xyz2rgb.c | 112 ++++++++++++++++++++++++++++++++++++ 4 files changed, 115 insertions(+) create mode 100644 tests/checkasm/sw_xyz2rgb.c diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 6636bc7774..6d2e2b6e22 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -80,6 +80,7 @@ SWSCALEOBJS += sw_gbrp.o \ sw_range_convert.o \ sw_rgb.o \ sw_scale.o \ + sw_xyz2rgb.o \ sw_yuv2rgb.o \ sw_yuv2yuv.o diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 20d8f19757..e9251bfb35 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -324,6 +324,7 @@ static const struct { { "sw_range_convert", checkasm_check_sw_range_convert }, { "sw_rgb", checkasm_check_sw_rgb }, { "sw_scale", checkasm_check_sw_scale }, + { "sw_xyz2rgb", checkasm_check_sw_xyz2rgb }, { "sw_yuv2rgb", checkasm_check_sw_yuv2rgb }, { "sw_yuv2yuv", checkasm_check_sw_yuv2yuv }, { "sw_ops", checkasm_check_sw_ops }, diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 45cd23cac4..9935b37d7d 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -138,6 +138,7 @@ void checkasm_check_sw_gbrp(void); void checkasm_check_sw_range_convert(void); void checkasm_check_sw_rgb(void); void checkasm_check_sw_scale(void); +void checkasm_check_sw_xyz2rgb(void); void checkasm_check_sw_yuv2rgb(void); void checkasm_check_sw_yuv2yuv(void); void checkasm_check_sw_ops(void); diff --git a/tests/checkasm/sw_xyz2rgb.c b/tests/checkasm/sw_xyz2rgb.c new file mode 100644 index 0000000000..82e6f94983 --- /dev/null +++ b/tests/checkasm/sw_xyz2rgb.c @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2025 Arpad Panyik <[email protected]> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include <string.h> + +#include "libavutil/intreadwrite.h" +#include "libavutil/mem_internal.h" +#include "libavutil/pixdesc.h" +#include "libavutil/pixfmt.h" + +#include "libswscale/swscale.h" +#include "libswscale/swscale_internal.h" + +#include "checkasm.h" + +#define NUM_LINES 4 +#define MAX_LINE_SIZE 1920 + +#define randomize_buffers(buf, size) \ + do { \ + for (int j = 0; j < size; j += 4) \ + AV_WN32(buf + j, rnd()); \ + } while (0) + +static void check_xyz12Torgb48le(void) +{ + const int src_pix_fmt = AV_PIX_FMT_XYZ12LE; + const int dst_pix_fmt = AV_PIX_FMT_RGB48LE; + const AVPixFmtDescriptor *dst_desc = av_pix_fmt_desc_get(dst_pix_fmt); + const AVPixFmtDescriptor *src_desc = av_pix_fmt_desc_get(src_pix_fmt); + + static const int input_sizes[] = {1, 2, 3, 4, 5, 6, 7, 8, 16, 17, 21, 31, 32, + 64, 128, 256, 512, 1024, MAX_LINE_SIZE}; + + declare_func(void, const SwsContext *, uint8_t *, int, const uint8_t *, + int, int, int); + + LOCAL_ALIGNED_8(uint8_t, src, [6 * MAX_LINE_SIZE * NUM_LINES]); + LOCAL_ALIGNED_8(uint8_t, dst_ref, [6 * MAX_LINE_SIZE * NUM_LINES]); + LOCAL_ALIGNED_8(uint8_t, dst_new, [6 * MAX_LINE_SIZE * NUM_LINES]); + + randomize_buffers(src, MAX_LINE_SIZE * NUM_LINES); + + for (int height = 1; height <= NUM_LINES; height++) { + for (int isi = 0; isi < FF_ARRAY_ELEMS(input_sizes); isi++) { + SwsContext *sws; + SwsInternal *c; + int log_level; + int width = input_sizes[isi]; + const int srcStride = 6 * MAX_LINE_SIZE; + const int dstStride = 6 * MAX_LINE_SIZE; + + // Override log level to prevent spamming of the message: + // "No accelerated colorspace conversion found from %s to %s" + log_level = av_log_get_level(); + av_log_set_level(AV_LOG_ERROR); + sws = sws_getContext(width, height, src_pix_fmt, + width, height, dst_pix_fmt, + 0, NULL, NULL, NULL); + av_log_set_level(log_level); + if (!sws) + fail(); + + c = sws_internal(sws); + if (check_func(c->xyz12Torgb48, "%s_%s_%dx%d", src_desc->name, + dst_desc->name, width, height)) { + memset(dst_ref, 0xFF, MAX_LINE_SIZE * NUM_LINES); + memset(dst_new, 0xFF, MAX_LINE_SIZE * NUM_LINES); + + call_ref((const SwsContext*)c, dst_ref, dstStride, src, + srcStride, width, height); + call_new((const SwsContext*)c, dst_new, dstStride, src, + srcStride, width, height); + + if (memcmp(dst_ref, dst_new, MAX_LINE_SIZE * NUM_LINES)) + fail(); + + if (!(width & 3) && height == NUM_LINES) { + bench_new((const SwsContext*)c, dst_new, dstStride, + src, srcStride, width, height); + } + } + sws_freeContext(sws); + } + } +} + +#undef NUM_LINES +#undef MAX_LINE_SIZE + +void checkasm_check_sw_xyz2rgb(void) +{ + check_xyz12Torgb48le(); + report("xyz12Torgb48le"); +} -- 2.49.1 >From 1a34a8fb38a2c54a46e8c49c006b9a1e026b4c76 Mon Sep 17 00:00:00 2001 From: Arpad Panyik <[email protected]> Date: Wed, 26 Nov 2025 16:38:16 +0000 Subject: [PATCH 3/3] swscale: Add AArch64 Neon path for xyz12Torgb48 LE Add optimized Neon code path for the little endian case of the xyz12Torgb48 function. The innermost loop processes the data in 4x2 pixel blocks using software gathers with the matrix multiplication and clipping done by Neon. 
Relative runtime of micro benchmarks after this patch on some Cortex and Neoverse CPU cores: xyz12le_rgb48le X1 X3 X4 X925 V2 16x4_neon: 2.39x 4.04x 2.84x 3.27x 3.02x 32x4_neon: 2.42x 3.34x 2.98x 3.34x 2.97x 64x4_neon: 2.36x 3.12x 2.99x 3.32x 2.95x 128x4_neon: 2.36x 3.08x 3.01x 3.34x 2.95x 256x4_neon: 2.33x 3.08x 3.08x 3.41x 2.95x 512x4_neon: 2.30x 3.04x 3.00x 3.54x 2.88x 1024x4_neon: 2.28x 3.01x 2.88x 3.55x 3.07x 1920x4_neon: 2.27x 2.94x 2.79x 3.53x 2.86x xyz12le_rgb48le A76 A78 A715 A720 A725 16x4_neon: 2.36x 2.20x 2.32x 2.99x 2.98x 32x4_neon: 2.40x 2.25x 2.37x 2.99x 3.02x 64x4_neon: 2.37x 2.22x 2.34x 2.97x 3.03x 128x4_neon: 2.35x 2.23x 2.33x 2.93x 3.00x 256x4_neon: 2.39x 2.23x 2.35x 2.88x 2.92x 512x4_neon: 2.39x 2.21x 2.32x 2.81x 2.89x 1024x4_neon: 2.37x 2.18x 2.31x 2.79x 2.89x 1920x4_neon: 2.37x 2.17x 2.30x 2.77x 2.86x xyz12le_rgb48le A55 A510 A520 16x4_neon: 1.98x 1.96x 2.23x 32x4_neon: 2.03x 1.96x 2.20x 64x4_neon: 2.01x 1.95x 2.24x 128x4_neon: 1.99x 1.91x 2.22x 256x4_neon: 1.92x 1.86x 2.22x 512x4_neon: 1.89x 1.80x 2.19x 1024x4_neon: 1.90x 1.80x 2.19x 1920x4_neon: 1.91x 1.79x 2.20x Signed-off-by: Arpad Panyik <[email protected]> --- libswscale/aarch64/Makefile | 1 + libswscale/aarch64/swscale.c | 23 + libswscale/aarch64/xyz2rgb_neon.S | 709 ++++++++++++++++++++++++++++++ libswscale/swscale.c | 4 + libswscale/swscale_internal.h | 1 + 5 files changed, 738 insertions(+) create mode 100644 libswscale/aarch64/xyz2rgb_neon.S diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile index 1de8c9c0d6..1c82e34e28 100644 --- a/libswscale/aarch64/Makefile +++ b/libswscale/aarch64/Makefile @@ -8,4 +8,5 @@ NEON-OBJS += aarch64/hscale.o \ aarch64/range_convert_neon.o \ aarch64/rgb2rgb_neon.o \ aarch64/swscale_unscaled_neon.o \ + aarch64/xyz2rgb_neon.o \ aarch64/yuv2rgb_neon.o \ diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c index 55fff03a5a..80a89f7504 100644 --- a/libswscale/aarch64/swscale.c +++ b/libswscale/aarch64/swscale.c @@ -22,6 +22,18 @@ #include "libswscale/swscale_internal.h" #include "libavutil/aarch64/cpu.h" +void ff_xyz12Torgb48le_neon_asm(const ColorXform *c, uint8_t *dst, + int dst_stride, const uint8_t *src, + int src_stride, int w, int h); + +static void xyz12Torgb48le_neon(const SwsInternal *c, uint8_t *dst, + int dst_stride, const uint8_t *src, + int src_stride, int w, int h) +{ + return ff_xyz12Torgb48le_neon_asm(&c->xyz2rgb, dst, dst_stride, + src, src_stride, w, h); +} + void ff_hscale16to15_4_neon_asm(int shift, int16_t *_dst, int dstW, const uint8_t *_src, const int16_t *filter, const int32_t *filterPos, int filterSize); @@ -307,6 +319,17 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c) } } +av_cold void ff_sws_init_xyz2rgb_aarch64(SwsInternal *c) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { + if (!(av_pix_fmt_desc_get(c->opts.src_format)->flags & AV_PIX_FMT_FLAG_BE)) { + c->xyz12Torgb48 = xyz12Torgb48le_neon; + } + } +} + av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c) { int cpu_flags = av_get_cpu_flags(); diff --git a/libswscale/aarch64/xyz2rgb_neon.S b/libswscale/aarch64/xyz2rgb_neon.S new file mode 100644 index 0000000000..b23903c9eb --- /dev/null +++ b/libswscale/aarch64/xyz2rgb_neon.S @@ -0,0 +1,709 @@ +/* + * Copyright (c) 2025 Arpad Panyik <[email protected]> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +#ifndef JUMP_ALIGN +#define JUMP_ALIGN 2 +#endif +#ifndef LOOP_ALIGN +#define LOOP_ALIGN 2 +#endif + +#define GAMMA_XYZ 0 +#define GAMMA_RGB 8 +#define MATRIX_00 16 +#define MATRIX_22 32 + +function ff_xyz12Torgb48le_neon_asm, export=1 +// x0 const ColorXform *c +// x1 uint8_t *dst +// w2 int dst_stride +// x3 const uint8_t *src +// w4 int src_stride +// w5 int w +// w6 int h + + ldp x7, x8, [x0, #(GAMMA_XYZ)] // gamma.xyz, gamma.rgb + ldr q6, [x0, #(MATRIX_00)] // matrix[0][0]..[2][1] + ldr h7, [x0, #(MATRIX_22)] // matrix[2][2]; > 0 + add w9, w5, w5, lsl #1 // w * 3 + add x17, x3, w4, sxtw // sr2 = src + src_stride + add x16, x1, w2, sxtw // ds2 = dst + dst_stride + sub w4, w4, w9 // src_stride - w * 3 + sub w2, w2, w9 // dst_stride - w * 3 + abs v6.8h, v6.8h // abs(matrix[0][0]..[2][1]) + sbfiz x4, x4, #1, #32 // src_stride * 2 - w * 6 + sbfiz x2, x2, #1, #32 // dst_stride * 2 - w * 6 + + subs w6, w6, #2 + b.lt 6f // h < 2 + + stp x19, x20, [sp, #-64]! + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + str x25, [sp, #48] + + .align LOOP_ALIGN +1: // yp loop for 2x4 pixels + subs w0, w5, #4 + b.lt 3f // w < 4 + + .align LOOP_ALIGN +2: // xp loop for 2x4 pixels: XYZ0[0..3], XYZ1[0..3] + ldp x9, x10, [x3] // x9 = X0[0] Y0[0] Z0[0] X0[1], x10 = Y0[1] Z0[1] X0[2] Y0[2] + ldr x11, [x3, #16] // x11 = Z0[2] X0[3] Y0[3] Z0[3] + add x3, x3, #24 + ubfx x12, x9, #4, #12 // X0[0] >> 4 + lsr x13, x9, #52 // X0[1] >> 4 + ubfx x14, x10, #36, #12 // X0[2] >> 4 + ubfx x15, x11, #20, #12 // X0[3] >> 4 + + ldp x19, x20, [x17] // x19 = X1[0] Y1[0] Z1[0] X1[1], x20 = Y1[1] Z1[1] X1[2] Y1[2] + ldr x21, [x17, #16] // x21 = Z1[2] X1[3] Y1[3] Z1[3] + add x17, x17, #24 + ubfx x22, x19, #4, #12 // X1[0] >> 4 + lsr x23, x19, #52 // X1[1] >> 4 + ubfx x24, x20, #36, #12 // X1[2] >> 4 + ubfx x25, x21, #20, #12 // X1[3] >> 4 + + ldr h0, [x7, x12, lsl #1] // gamma.xyz[X0[0] >> 4] + ubfx x12, x9, #20, #12 // Y0[0] >> 4 + ldr h16, [x7, x13, lsl #1] // gamma.xyz[X0[1] >> 4] + ubfx x13, x10, #4, #12 // Y0[1] >> 4 + ldr h17, [x7, x14, lsl #1] // gamma.xyz[X0[2] >> 4] + lsr x14, x10, #52 // Y0[2] >> 4 + ldr h18, [x7, x15, lsl #1] // gamma.xyz[X0[3] >> 4] + ubfx x15, x11, #36, #12 // Y0[3] >> 4 + + ldr h20, [x7, x22, lsl #1] // gamma.xyz[X1[0] >> 4] + ubfx x22, x19, #20, #12 // Y1[0] >> 4 + ldr h26, [x7, x23, lsl #1] // gamma.xyz[X1[1] >> 4] + ubfx x23, x20, #4, #12 // Y1[1] >> 4 + ldr h27, [x7, x24, lsl #1] // gamma.xyz[X1[2] >> 4] + lsr x24, x20, #52 // Y1[2] >> 4 + ldr h28, [x7, x25, lsl #1] // gamma.xyz[X1[3] >> 4] + ubfx x25, x21, #36, #12 // Y1[3] >> 4 + + mov v0.h[1], v16.h[0] // v0.4h = gamma.xyz[X0[0..1] >> 4] + mov v17.h[1], v18.h[0] // v17.4h = gamma.xyz[X0[2..3] >> 4] + mov v0.s[1], v17.s[0] // v0.4h = gamma.xyz[X0[0..3] >> 4] + ldr h1, [x7, 
x12, lsl #1] // gamma.xyz[Y0[0] >> 4] + umull v3.4s, v0.4h, v6.h[0] // R0[0..3] = gamma.xyz[X0[0..3] >> 4] * matrix[0][0] + umull v5.4s, v0.4h, v6.h[6] // B0[0..3] = gamma.xyz[X0[0..3] >> 4] * matrix[2][0] + ubfx x12, x9, #36, #12 // Z0[0] >> 4 + ldr h16, [x7, x13, lsl #1] // gamma.xyz[Y0[1] >> 4] + + mov v20.h[1], v26.h[0] // v20.4h = gamma.xyz[X1[0..1] >> 4] + mov v27.h[1], v28.h[0] // v27.4h = gamma.xyz[X1[2..3] >> 4] + mov v20.s[1], v27.s[0] // v20.4h = gamma.xyz[X1[0..3] >> 4] + ldr h21, [x7, x22, lsl #1] // gamma.xyz[Y1[0] >> 4] + umull v23.4s, v20.4h, v6.h[0] // R1[0..3] = gamma.xyz[X1[0..3] >> 4] * matrix[0][0] + umull v25.4s, v20.4h, v6.h[6] // B1[0..3] = gamma.xyz[X1[0..3] >> 4] * matrix[2][0] + ubfx x22, x19, #36, #12 // Z1[0] >> 4 + ldr h26, [x7, x23, lsl #1] // gamma.xyz[Y1[1] >> 4] + + ubfx x13, x10, #20, #12 // Z0[1] >> 4 + ldr h17, [x7, x14, lsl #1] // gamma.xyz[Y0[2] >> 4] + ubfx x14, x11, #4, #12 // Z0[2] >> 4 + ldr h18, [x7, x15, lsl #1] // gamma.xyz[Y0[3] >> 4] + lsr x15, x11, #52 // Z0[3] >> 4 + mov v1.h[1], v16.h[0] // v1.4h = gamma.xyz[Y0[0..1] >> 4] + mov v17.h[1], v18.h[0] // v17.4h = gamma.xyz[Y0[2..3] >> 4] + mov v1.s[1], v17.s[0] // v1.4h = gamma.xyz[Y0[0..3] >> 4] + + ubfx x23, x20, #20, #12 // Z1[1] >> 4 + ldr h27, [x7, x24, lsl #1] // gamma.xyz[Y1[2] >> 4] + ubfx x24, x21, #4, #12 // Z1[2] >> 4 + ldr h28, [x7, x25, lsl #1] // gamma.xyz[Y1[3] >> 4] + umull v4.4s, v1.4h, v6.h[4] // G0[0..3] = gamma.xyz[Y0[0..3] >> 4] * matrix[1][1] + umlsl v3.4s, v1.4h, v6.h[1] // R0[0..3] -= gamma.xyz[Y0[0..3] >> 4] * matrix[0][1] + + lsr x25, x21, #52 // Z1[3] >> 4 + mov v21.h[1], v26.h[0] // v21.4h = gamma.xyz[Y1[0..1] >> 4] + mov v27.h[1], v28.h[0] // v27.4h = gamma.xyz[Y1[2..3] >> 4] + mov v21.s[1], v27.s[0] // v21.4h = gamma.xyz[Y1[0..3] >> 4] + umlsl v4.4s, v0.4h, v6.h[3] // G0[0..3] -= gamma.xyz[X0[0..3] >> 4] * matrix[1][0] + umlsl v5.4s, v1.4h, v6.h[7] // B0[0..3] -= gamma.xyz[Y0[0..3] >> 4] * matrix[2][1] + + ldr h2, [x7, x12, lsl #1] // gamma.xyz[Z0[0] >> 4] + ldr h16, [x7, x13, lsl #1] // gamma.xyz[Z0[1] >> 4] + ldr h17, [x7, x14, lsl #1] // gamma.xyz[Z0[2] >> 4] + ldr h18, [x7, x15, lsl #1] // gamma.xyz[Z0[3] >> 4] + umull v24.4s, v21.4h, v6.h[4] // G1[0..3] = gamma.xyz[Y1[0..3] >> 4] * matrix[1][1] + umlsl v23.4s, v21.4h, v6.h[1] // R1[0..3] -= gamma.xyz[Y1[0..3] >> 4] * matrix[0][1] + + mov v2.h[1], v16.h[0] // v2.4h = gamma.xyz[Z0[0..1] >> 4] + mov v17.h[1], v18.h[0] // v17.4h = gamma.xyz[Z0[2..3] >> 4] + mov v2.s[1], v17.s[0] // v2.4h = gamma.xyz[Z0[0..3] >> 4] + umlsl v24.4s, v20.4h, v6.h[3] // G1[0..3] -= gamma.xyz[X1[0..3] >> 4] * matrix[1][0] + umlsl v25.4s, v21.4h, v6.h[7] // B1[0..3] -= gamma.xyz[Y1[0..3] >> 4] * matrix[2][1] + + ldr h22, [x7, x22, lsl #1] // gamma.xyz[Z1[0] >> 4] + ldr h26, [x7, x23, lsl #1] // gamma.xyz[Z1[1] >> 4] + ldr h27, [x7, x24, lsl #1] // gamma.xyz[Z1[2] >> 4] + ldr h28, [x7, x25, lsl #1] // gamma.xyz[Z1[3] >> 4] + mov v22.h[1], v26.h[0] // v22.4h = gamma.xyz[Z1[0..1] >> 4] + mov v27.h[1], v28.h[0] // v27.4h = gamma.xyz[Z1[2..3] >> 4] + mov v22.s[1], v27.s[0] // v22.4h = gamma.xyz[Z1[0..3] >> 4] + + umlsl v3.4s, v2.4h, v6.h[2] // R0[0..3] -= gamma.xyz[Z0[0..3] >> 4] * matrix[0][2] + sqshrun v3.4h, v3.4s, #12 // clip(R0[0..3] >> 12) + umlal v4.4s, v2.4h, v6.h[5] // G0[0..3] += gamma.xyz[Z0[0..3] >> 4] * matrix[1][2] + sqshrun v4.4h, v4.4s, #12 // clip(G0[0..3] >> 12) + umov w9, v3.h[0] // clip(R0[0] >> 12) + umov w10, v4.h[1] // clip(G0[1] >> 12) + umlal v5.4s, v2.4h, v7.h[0] // B0[0..3] += gamma.xyz[Z0[0..3] >> 4] * 
matrix[2][2] + sqshrun v5.4h, v5.4s, #12 // clip(B0[0..3] >> 12) + + umlsl v23.4s, v22.4h, v6.h[2] // R1[0..3] -= gamma.xyz[Z1[0..3] >> 4] * matrix[0][2] + sqshrun v23.4h, v23.4s, #12 // clip(R1[0..3] >> 12) + umlal v24.4s, v22.4h, v6.h[5] // G1[0..3] += gamma.xyz[Z1[0..3] >> 4] * matrix[1][2] + sqshrun v24.4h, v24.4s, #12 // clip(G1[0..3] >> 12) + umov w19, v23.h[0] // clip(R1[0] >> 12) + umov w20, v24.h[1] // clip(G1[1] >> 12) + umlal v25.4s, v22.4h, v7.h[0] // B1[0..3] += gamma.xyz[Z1[0..3] >> 4] * matrix[2][2] + sqshrun v25.4h, v25.4s, #12 // clip(B1[0..3] >> 12) + + umov w11, v5.h[2] // clip(B0[2] >> 12) + umov w12, v4.h[0] // clip(G0[0] >> 12) + ldrh w9, [x8, x9, lsl #1] // R0[0] = gamma.rgb[clip(R0[0] >> 12)] + lsl x9, x9, #4 // R0[0] << 4 + umov w13, v5.h[1] // clip(B0[1] >> 12) + ldrh w10, [x8, x10, lsl #1] // G0[1] = gamma.rgb[clip(G0[1] >> 12)] + lsl x10, x10, #4 // G0[1] << 4 + + umov w21, v25.h[2] // clip(B1[2] >> 12) + umov w22, v24.h[0] // clip(G1[0] >> 12) + ldrh w19, [x8, x19, lsl #1] // R1[0] = gamma.rgb[clip(R1[0] >> 12)] + lsl x19, x19, #4 // R1[0] << 4 + umov w23, v25.h[1] // clip(B1[1] >> 12) + ldrh w20, [x8, x20, lsl #1] // G1[1] = gamma.rgb[clip(G1[1] >> 12)] + lsl x20, x20, #4 // G1[1] << 4 + + umov w14, v3.h[3] // clip(R0[3] >> 12) + ldrh w11, [x8, x11, lsl #1] // B0[2] = gamma.rgb[clip(B0[2] >> 12)] + lsl x11, x11, #4 // B0[2] << 4 + umov w15, v5.h[0] // clip(B0[0] >> 12) + ldrh w12, [x8, x12, lsl #1] // G0[0] = gamma.rgb[clip(G0[0] >> 12)] + orr x9, x9, x12, lsl #20 // R0[0] << 4, G0[0] << 4 + umov w12, v3.h[2] // clip(R0[2] >> 12) + ldrh w13, [x8, x13, lsl #1] // B0[1] = gamma.rgb[clip(B0[1] >> 12)] + + umov w24, v23.h[3] // clip(R1[3] >> 12) + ldrh w21, [x8, x21, lsl #1] // B1[2] = gamma.rgb[clip(B1[2] >> 12)] + lsl x21, x21, #4 // B1[2] << 4 + umov w25, v25.h[0] // clip(B1[0] >> 12) + ldrh w22, [x8, x22, lsl #1] // G1[0] = gamma.rgb[clip(G1[0] >> 12)] + orr x19, x19, x22, lsl #20 // R1[0] << 4, G1[0] << 4 + umov w22, v23.h[2] // clip(R1[2] >> 12) + ldrh w23, [x8, x23, lsl #1] // B1[1] = gamma.rgb[clip(B1[1] >> 12)] + + orr x10, x10, x13, lsl #20 // G0[1] << 4, B0[1] << 4 + umov w13, v4.h[3] // clip(G0[3] >> 12) + ldrh w14, [x8, x14, lsl #1] // R0[3] = gamma.rgb[clip(R0[3] >> 12)] + orr x11, x11, x14, lsl #20 // B0[2] << 4, R0[3] << 4 + umov w14, v3.h[1] // clip(R0[1] >> 12) + ldrh w15, [x8, x15, lsl #1] // B0[0] = gamma.rgb[clip(B0[0] >> 12)] + orr x9, x9, x15, lsl #36 // R0[0] << 4, G0[0] << 4, B0[0] << 4 + umov w15, v4.h[2] // clip(G0[2] >> 12) + + orr x20, x20, x23, lsl #20 // G1[1] << 4, B1[1] << 4 + umov w23, v24.h[3] // clip(G1[3] >> 12) + ldrh w24, [x8, x24, lsl #1] // R1[3] = gamma.rgb[clip(R1[3] >> 12)] + orr x21, x21, x24, lsl #20 // B1[2] << 4, R1[3] << 4 + umov w24, v23.h[1] // clip(R1[1] >> 12) + ldrh w25, [x8, x25, lsl #1] // B1[0] = gamma.rgb[clip(B1[0] >> 12)] + orr x19, x19, x25, lsl #36 // R1[0] << 4, G1[0] << 4, B1[0] << 4 + umov w25, v24.h[2] // clip(G1[2] >> 12) + + ldrh w12, [x8, x12, lsl #1] // R0[2] = gamma.rgb[clip(R0[2] >> 12)] + orr x10, x10, x12, lsl #36 // G0[1] << 4, B0[1] << 4, R0[2] << 4 + umov w12, v5.h[3] // clip(B0[3] >> 12) + ldrh w13, [x8, x13, lsl #1] // G0[3] = gamma.rgb[clip(G0[3] >> 12)] + orr x11, x11, x13, lsl #36 // B0[2] << 4, R0[3] << 4, G0[3] << 4 + ldrh w14, [x8, x14, lsl #1] // R0[1] = gamma.rgb[clip(R0[1] >> 12)] + orr x9, x9, x14, lsl #52 // x9 = R0[0] << 4, G0[0] << 4, B0[0] << 4, R0[1] << 4 + ldrh w15, [x8, x15, lsl #1] // G0[2] = gamma.rgb[clip(G0[2] >> 12)] + orr x10, x10, x15, lsl #52 // x10 = G0[1] << 
4, B0[1] << 4, R0[2] << 4, G0[2] << 4 + ldrh w12, [x8, x12, lsl #1] // B0[3] = gamma.rgb[clip(B0[3] >> 12)] + orr x11, x11, x12, lsl #52 // x11 = B0[2] << 4, R0[3] << 4, G0[3] << 4, B0[3] << 4 + stp x9, x10, [x1] + str x11, [x1, #16] + + ldrh w22, [x8, x22, lsl #1] // R1[2] = gamma.rgb[clip(R1[2] >> 12)] + orr x20, x20, x22, lsl #36 // G1[1] << 4, B1[1] << 4, R1[2] << 4 + umov w22, v25.h[3] // clip(B1[3] >> 12) + ldrh w23, [x8, x23, lsl #1] // G1[3] = gamma.rgb[clip(G1[3] >> 12)] + orr x21, x21, x23, lsl #36 // B1[2] << 4, R1[3] << 4, G1[3] << 4 + ldrh w24, [x8, x24, lsl #1] // R1[1] = gamma.rgb[clip(R1[1] >> 12)] + orr x19, x19, x24, lsl #52 // x19 = R1[0] << 4, G1[0] << 4, B1[0] << 4, R1[1] << 4 + ldrh w25, [x8, x25, lsl #1] // G1[2] = gamma.rgb[clip(G1[2] >> 12)] + orr x20, x20, x25, lsl #52 // x20 = G1[1] << 4, B1[1] << 4, R1[2] << 4, G1[2] << 4 + ldrh w22, [x8, x22, lsl #1] // B1[3] = gamma.rgb[clip(B1[3] >> 12)] + orr x21, x21, x22, lsl #52 // x21 = B1[2] << 4, R1[3] << 4, G1[3] << 4, B1[3] << 4 + stp x19, x20, [x16] + str x21, [x16, #16] + + add x1, x1, #24 + add x16, x16, #24 + + subs w0, w0, #4 + b.ge 2b + + .align JUMP_ALIGN +3: + tst w5, #3 + b.eq 5f // no residual pixels; (w & 3) == 0 + + ldr w10, [x3] // w10 = X0[0] Y0[0] + ldrh w11, [x3, #4] // w11 = Z0[0] + add x3, x3, #6 + ldr w20, [x17] // w20 = X1[0] Y1[0] + ldrh w21, [x17, #4] // w21 = Z1[0] + add x17, x17, #6 + ubfx w9, w10, #4, #12 // X0[0] >> 4 + ubfx w10, w10, #20, #12 // Y0[0] >> 4 + lsr w11, w11, #4 // Z0[0] >> 4 + ldr h0, [x7, x9, lsl #1] // v0.4h = gamma.xyz[X0[0] >> 4] + ldr h1, [x7, x10, lsl #1] // v1.4h = gamma.xyz[Y0[0] >> 4] + ldr h2, [x7, x11, lsl #1] // v2.4h = gamma.xyz[Z0[0] >> 4] + ubfx w19, w20, #4, #12 // X1[0] >> 4 + ubfx w20, w20, #20, #12 // Y1[0] >> 4 + lsr w21, w21, #4 // Z1[0] >> 4 + ldr h20, [x7, x19, lsl #1] // v20.4h = gamma.xyz[X1[0] >> 4] + ldr h21, [x7, x20, lsl #1] // v21.4h = gamma.xyz[Y1[0] >> 4] + ldr h22, [x7, x21, lsl #1] // v22.4h = gamma.xyz[Z1[0] >> 4] + + cmp w0, #-2 + b.lt 4f // (w & 3) == 1 + + ldr w10, [x3] // w10 = X0[1] Y0[1] + ldrh w11, [x3, #4] // w11 = Z0[1] + add x3, x3, #6 + ldr w20, [x17] // w20 = X1[1] Y1[1] + ldrh w21, [x17, #4] // w21 = Z1[1] + add x17, x17, #6 + ubfx w9, w10, #4, #12 // X0[1] >> 4 + ubfx w10, w10, #20, #12 // Y0[1] >> 4 + lsr w11, w11, #4 // Z0[1] >> 4 + ldr h16, [x7, x9, lsl #1] // gamma.xyz[X0[1] >> 4] + ldr h17, [x7, x10, lsl #1] // gamma.xyz[Y0[1] >> 4] + ldr h18, [x7, x11, lsl #1] // gamma.xyz[Z0[1] >> 4] + ubfx w19, w20, #4, #12 // X1[1] >> 4 + ubfx w20, w20, #20, #12 // Y1[1] >> 4 + lsr w21, w21, #4 // Z1[1] >> 4 + ldr h23, [x7, x19, lsl #1] // gamma.xyz[X1[1] >> 4] + ldr h24, [x7, x20, lsl #1] // gamma.xyz[Y1[1] >> 4] + ldr h25, [x7, x21, lsl #1] // gamma.xyz[Z1[1] >> 4] + mov v0.h[1], v16.h[0] // v0.4h = gamma.xyz[X0[0..1] >> 4] + mov v1.h[1], v17.h[0] // v1.4h = gamma.xyz[Y0[0..1] >> 4] + mov v2.h[1], v18.h[0] // v2.4h = gamma.xyz[Z0[0..1] >> 4] + mov v20.h[1], v23.h[0] // v20.4h = gamma.xyz[X1[0..1] >> 4] + mov v21.h[1], v24.h[0] // v21.4h = gamma.xyz[Y1[0..1] >> 4] + mov v22.h[1], v25.h[0] // v22.4h = gamma.xyz[Z1[0..1] >> 4] + + b.le 4f // (w & 3) == 2 + + ldr w10, [x3] // w10 = X0[2] Y0[2] + ldrh w11, [x3, #4] // w11 = Z0[2] + add x3, x3, #6 + ldr w20, [x17] // w20 = X1[2] Y1[2] + ldrh w21, [x17, #4] // w21 = Z1[2] + add x17, x17, #6 + ubfx w9, w10, #4, #12 // X0[2] >> 4 + ubfx w10, w10, #20, #12 // Y0[2] >> 4 + lsr w11, w11, #4 // Z0[2] >> 4 + ldr h16, [x7, x9, lsl #1] // gamma.xyz[X0[2] >> 4] + ldr h17, [x7, x10, lsl #1] // 
gamma.xyz[Y0[2] >> 4] + ldr h18, [x7, x11, lsl #1] // gamma.xyz[Z0[2] >> 4] + ubfx w19, w20, #4, #12 // X1[2] >> 4 + ubfx w20, w20, #20, #12 // Y1[2] >> 4 + lsr w21, w21, #4 // Z1[2] >> 4 + ldr h23, [x7, x19, lsl #1] // gamma.xyz[X1[2] >> 4] + ldr h24, [x7, x20, lsl #1] // gamma.xyz[Y1[2] >> 4] + ldr h25, [x7, x21, lsl #1] // gamma.xyz[Z1[2] >> 4] + mov v0.h[2], v16.h[0] // v0.4h = gamma.xyz[X0[0..2] >> 4] + mov v1.h[2], v17.h[0] // v1.4h = gamma.xyz[Y0[0..2] >> 4] + mov v2.h[2], v18.h[0] // v2.4h = gamma.xyz[Z0[0..2] >> 4] + mov v20.h[2], v23.h[0] // v20.4h = gamma.xyz[X1[0..2] >> 4] + mov v21.h[2], v24.h[0] // v21.4h = gamma.xyz[Y1[0..2] >> 4] + mov v22.h[2], v25.h[0] // v22.4h = gamma.xyz[Z1[0..2] >> 4] + + .align JUMP_ALIGN +4: + umull v3.4s, v0.4h, v6.h[0] // R0[0..2] = gamma.xyz[X0[0..2] >> 4] * matrix[0][0] + umull v5.4s, v0.4h, v6.h[6] // B0[0..2] = gamma.xyz[X0[0..2] >> 4] * matrix[2][0] + + umull v23.4s, v20.4h, v6.h[0] // R1[0..2] = gamma.xyz[X1[0..2] >> 4] * matrix[0][0] + umull v25.4s, v20.4h, v6.h[6] // B1[0..2] = gamma.xyz[X1[0..2] >> 4] * matrix[2][0] + + umull v4.4s, v1.4h, v6.h[4] // G0[0..2] = gamma.xyz[Y0[0..2] >> 4] * matrix[1][1] + umlsl v3.4s, v1.4h, v6.h[1] // R0[0..2] -= gamma.xyz[Y0[0..2] >> 4] * matrix[0][1] + umlsl v4.4s, v0.4h, v6.h[3] // G0[0..2] -= gamma.xyz[X0[0..2] >> 4] * matrix[1][0] + umlsl v5.4s, v1.4h, v6.h[7] // B0[0..2] -= gamma.xyz[Y0[0..2] >> 4] * matrix[2][1] + + umull v24.4s, v21.4h, v6.h[4] // G1[0..2] = gamma.xyz[Y1[0..2] >> 4] * matrix[1][1] + umlsl v23.4s, v21.4h, v6.h[1] // R1[0..2] -= gamma.xyz[Y1[0..2] >> 4] * matrix[0][1] + umlsl v24.4s, v20.4h, v6.h[3] // G1[0..2] -= gamma.xyz[X1[0..2] >> 4] * matrix[1][0] + umlsl v25.4s, v21.4h, v6.h[7] // B1[0..2] -= gamma.xyz[Y1[0..2] >> 4] * matrix[2][1] + + umlsl v3.4s, v2.4h, v6.h[2] // R0[0..2] -= gamma.xyz[Z0[0..2] >> 4] * matrix[0][2] + sqshrun v3.4h, v3.4s, #12 // clip(R0[0..2] >> 12) + umlal v4.4s, v2.4h, v6.h[5] // G0[0..2] += gamma.xyz[Z0[0..2] >> 4] * matrix[1][2] + sqshrun v4.4h, v4.4s, #12 // clip(G0[0..2] >> 12) + umlal v5.4s, v2.4h, v7.h[0] // B0[0..2] += gamma.xyz[Z0[0..2] >> 4] * matrix[2][2] + sqshrun v5.4h, v5.4s, #12 // clip(B0[0..2] >> 12) + + umlsl v23.4s, v22.4h, v6.h[2] // R1[0..2] -= gamma.xyz[Z1[0..2] >> 4] * matrix[0][2] + sqshrun v23.4h, v23.4s, #12 // clip(R1[0..2] >> 12) + umlal v24.4s, v22.4h, v6.h[5] // G1[0..2] += gamma.xyz[Z1[0..2] >> 4] * matrix[1][2] + sqshrun v24.4h, v24.4s, #12 // clip(G1[0..2] >> 12) + umlal v25.4s, v22.4h, v7.h[0] // B1[0..2] += gamma.xyz[Z1[0..2] >> 4] * matrix[2][2] + sqshrun v25.4h, v25.4s, #12 // clip(B1[0..2] >> 12) + + umov w9, v3.h[0] // clip(R0[0] >> 12) + umov w10, v4.h[0] // clip(G0[0] >> 12) + umov w11, v5.h[0] // clip(B0[0] >> 12) + ldrh w9, [x8, x9, lsl #1] // R0[0] = gamma.rgb[clip(R0[0] >> 12)] + ldrh w10, [x8, x10, lsl #1] // G0[0] = gamma.rgb[clip(G0[0] >> 12)] + ldrh w11, [x8, x11, lsl #1] // B0[0] = gamma.rgb[clip(B0[0] >> 12)] + umov w19, v23.h[0] // clip(R1[0] >> 12) + umov w20, v24.h[0] // clip(G1[0] >> 12) + umov w21, v25.h[0] // clip(B1[0] >> 12) + ldrh w19, [x8, x19, lsl #1] // R1[0] = gamma.rgb[clip(R1[0] >> 12)] + ldrh w20, [x8, x20, lsl #1] // G1[0] = gamma.rgb[clip(G1[0] >> 12)] + ldrh w21, [x8, x21, lsl #1] // B1[0] = gamma.rgb[clip(B1[0] >> 12)] + lsl w9, w9, #4 // w9 = R0[0] << 4 + lsl w10, w10, #4 // w10 = G0[0] << 4 + lsl w11, w11, #4 // w11 = B0[0] << 4 + strh w9, [x1] + strh w10, [x1, #2] + strh w11, [x1, #4] + lsl w19, w19, #4 // w19 = R1[0] << 4 + lsl w20, w20, #4 // w20 = G1[0] << 4 + lsl w21, w21, #4 // 
w21 = B1[0] << 4 + strh w19, [x16] + strh w20, [x16, #2] + strh w21, [x16, #4] + add x1, x1, #6 + add x16, x16, #6 + + cmp w0, #-2 + b.lt 5f // (w & 3) == 1 + + umov w9, v3.h[1] // clip(R0[1] >> 12) + umov w10, v4.h[1] // clip(G0[1] >> 12) + umov w11, v5.h[1] // clip(B0[1] >> 12) + ldrh w9, [x8, x9, lsl #1] // R0[1] = gamma.rgb[clip(R0[1] >> 12)] + ldrh w10, [x8, x10, lsl #1] // G0[1] = gamma.rgb[clip(G0[1] >> 12)] + ldrh w11, [x8, x11, lsl #1] // B0[1] = gamma.rgb[clip(B0[1] >> 12)] + umov w19, v23.h[1] // clip(R1[1] >> 12) + umov w20, v24.h[1] // clip(G1[1] >> 12) + umov w21, v25.h[1] // clip(B1[1] >> 12) + ldrh w19, [x8, x19, lsl #1] // R1[1] = gamma.rgb[clip(R1[1] >> 12)] + ldrh w20, [x8, x20, lsl #1] // G1[1] = gamma.rgb[clip(G1[1] >> 12)] + ldrh w21, [x8, x21, lsl #1] // B1[1] = gamma.rgb[clip(B1[1] >> 12)] + lsl w9, w9, #4 // w9 = R0[1] << 4 + lsl w10, w10, #4 // w10 = G0[1] << 4 + lsl w11, w11, #4 // w11 = B0[1] << 4 + strh w9, [x1] + strh w10, [x1, #2] + strh w11, [x1, #4] + lsl w19, w19, #4 // w19 = R1[1] << 4 + lsl w20, w20, #4 // w20 = G1[1] << 4 + lsl w21, w21, #4 // w21 = B1[1] << 4 + strh w19, [x16] + strh w20, [x16, #2] + strh w21, [x16, #4] + add x1, x1, #6 + add x16, x16, #6 + + b.le 5f // (w & 3) == 2 + + umov w9, v3.h[2] // clip(R0[2] >> 12) + umov w10, v4.h[2] // clip(G0[2] >> 12) + umov w11, v5.h[2] // clip(B0[2] >> 12) + ldrh w9, [x8, x9, lsl #1] // R0[2] = gamma.rgb[clip(R0[2] >> 12)] + ldrh w10, [x8, x10, lsl #1] // G0[2] = gamma.rgb[clip(G0[2] >> 12)] + ldrh w11, [x8, x11, lsl #1] // B0[2] = gamma.rgb[clip(B0[2] >> 12)] + umov w19, v23.h[2] // clip(R1[2] >> 12) + umov w20, v24.h[2] // clip(G1[2] >> 12) + umov w21, v25.h[2] // clip(B1[2] >> 12) + ldrh w19, [x8, x19, lsl #1] // R1[2] = gamma.rgb[clip(R1[2] >> 12)] + ldrh w20, [x8, x20, lsl #1] // G1[2] = gamma.rgb[clip(G1[2] >> 12)] + ldrh w21, [x8, x21, lsl #1] // B1[2] = gamma.rgb[clip(B1[2] >> 12)] + lsl w9, w9, #4 // w9 = R0[2] << 4 + lsl w10, w10, #4 // w10 = G0[2] << 4 + lsl w11, w11, #4 // w11 = B0[2] << 4 + strh w9, [x1] + strh w10, [x1, #2] + strh w11, [x1, #4] + lsl w19, w19, #4 // w19 = R1[2] << 4 + lsl w20, w20, #4 // w20 = G1[2] << 4 + lsl w21, w21, #4 // w21 = B1[2] << 4 + strh w19, [x16] + strh w20, [x16, #2] + strh w21, [x16, #4] + add x1, x1, #6 + add x16, x16, #6 + + .align JUMP_ALIGN +5: + add x3, x3, x4 + add x17, x17, x4 + add x1, x1, x2 + add x16, x16, x2 + + subs w6, w6, #2 + b.ge 1b + + ldp x21, x22, [sp, #16] + ldp x23, x24, [sp, #32] + ldr x25, [sp, #48] + ldp x19, x20, [sp], #64 + + .align JUMP_ALIGN +6: + tbz w6, #0, 10f // even number of lines; (h & 1) == 0 + + subs w0, w5, #4 + b.lt 8f // w < 4 + + .align LOOP_ALIGN +7: // loop for last odd line by 4 pixels: XYZ[0..3] + ldp x9, x10, [x3] // x9 = X[0] Y[0] Z[0] X[1], x10 = Y[1] Z[1] X[2] Y[2] + ldr x11, [x3, #16] // x11 = Z[2] X[3] Y[3] Z[3] + add x3, x3, #24 + + ubfx x12, x9, #4, #12 // X[0] >> 4 + lsr x13, x9, #52 // X[1] >> 4 + ubfx x14, x10, #36, #12 // X[2] >> 4 + ubfx x15, x11, #20, #12 // X[3] >> 4 + + ldr h0, [x7, x12, lsl #1] // gamma.xyz[X[0] >> 4] + ubfx x12, x9, #20, #12 // Y[0] >> 4 + ldr h16, [x7, x13, lsl #1] // gamma.xyz[X[1] >> 4] + ubfx x13, x10, #4, #12 // Y[1] >> 4 + ldr h17, [x7, x14, lsl #1] // gamma.xyz[X[2] >> 4] + lsr x14, x10, #52 // Y[2] >> 4 + ldr h18, [x7, x15, lsl #1] // gamma.xyz[X[3] >> 4] + ubfx x15, x11, #36, #12 // Y[3] >> 4 + mov v0.h[1], v16.h[0] // v0.4h = gamma.xyz[X[0..1] >> 4] + mov v17.h[1], v18.h[0] // v17.4h = gamma.xyz[X[2..3] >> 4] + mov v0.s[1], v17.s[0] // v0.4h = gamma.xyz[X[0..3] >> 4] + 
+ umull v3.4s, v0.4h, v6.h[0] // R[0..3] = gamma.xyz[X[0..3] >> 4] * matrix[0][0] + umull v5.4s, v0.4h, v6.h[6] // B[0..3] = gamma.xyz[X[0..3] >> 4] * matrix[2][0] + + ldr h1, [x7, x12, lsl #1] // gamma.xyz[Y[0] >> 4] + ubfx x12, x9, #36, #12 // Z[0] >> 4 + ldr h16, [x7, x13, lsl #1] // gamma.xyz[Y[1] >> 4] + ubfx x13, x10, #20, #12 // Z[1] >> 4 + ldr h17, [x7, x14, lsl #1] // gamma.xyz[Y[2] >> 4] + ubfx x14, x11, #4, #12 // Z[2] >> 4 + ldr h18, [x7, x15, lsl #1] // gamma.xyz[Y[3] >> 4] + lsr x15, x11, #52 // Z[3] >> 4 + mov v1.h[1], v16.h[0] // v1.4h = gamma.xyz[Y[0..1] >> 4] + mov v17.h[1], v18.h[0] // v17.4h = gamma.xyz[Y[2..3] >> 4] + mov v1.s[1], v17.s[0] // v1.4h = gamma.xyz[Y[0..3] >> 4] + + umull v4.4s, v1.4h, v6.h[4] // G[0..3] = gamma.xyz[Y[0..3] >> 4] * matrix[1][1] + umlsl v3.4s, v1.4h, v6.h[1] // R[0..3] -= gamma.xyz[Y[0..3] >> 4] * matrix[0][1] + umlsl v4.4s, v0.4h, v6.h[3] // G[0..3] -= gamma.xyz[X[0..3] >> 4] * matrix[1][0] + umlsl v5.4s, v1.4h, v6.h[7] // B[0..3] -= gamma.xyz[Y[0..3] >> 4] * matrix[2][1] + + ldr h2, [x7, x12, lsl #1] // gamma.xyz[Z[0] >> 4] + ldr h16, [x7, x13, lsl #1] // gamma.xyz[Z[1] >> 4] + ldr h17, [x7, x14, lsl #1] // gamma.xyz[Z[2] >> 4] + ldr h18, [x7, x15, lsl #1] // gamma.xyz[Z[3] >> 4] + mov v2.h[1], v16.h[0] // v2.4h = gamma.xyz[Z[0..1] >> 4] + mov v17.h[1], v18.h[0] // v17.4h = gamma.xyz[Z[2..3] >> 4] + mov v2.s[1], v17.s[0] // v2.4h = gamma.xyz[Z[0..3] >> 4] + + umlsl v3.4s, v2.4h, v6.h[2] // R[0..3] -= gamma.xyz[Z[0..3] >> 4] * matrix[0][2] + sqshrun v3.4h, v3.4s, #12 // clip(R[0..3] >> 12) + umlal v4.4s, v2.4h, v6.h[5] // G[0..3] += gamma.xyz[Z[0..3] >> 4] * matrix[1][2] + sqshrun v4.4h, v4.4s, #12 // clip(G[0..3] >> 12) + umlal v5.4s, v2.4h, v7.h[0] // B[0..3] += gamma.xyz[Z[0..3] >> 4] * matrix[2][2] + sqshrun v5.4h, v5.4s, #12 // clip(B[0..3] >> 12) + + umov w9, v3.h[0] // clip(R[0] >> 12) + umov w10, v4.h[1] // clip(G[1] >> 12) + umov w11, v5.h[2] // clip(B[2] >> 12) + + umov w12, v4.h[0] // clip(G[0] >> 12) + ldrh w9, [x8, x9, lsl #1] // R[0] = gamma.rgb[clip(R[0] >> 12)] + lsl x9, x9, #4 // R[0] << 4 + umov w13, v5.h[1] // clip(B[1] >> 12) + ldrh w10, [x8, x10, lsl #1] // G[1] = gamma.rgb[clip(G[1] >> 12)] + lsl x10, x10, #4 // G[1] << 4 + umov w14, v3.h[3] // clip(R[3] >> 12) + ldrh w11, [x8, x11, lsl #1] // B[2] = gamma.rgb[clip(B[2] >> 12)] + lsl x11, x11, #4 // B[2] << 4 + + umov w15, v5.h[0] // clip(B[0] >> 12) + ldrh w12, [x8, x12, lsl #1] // G[0] = gamma.rgb[clip(G[0] >> 12)] + orr x9, x9, x12, lsl #20 // R[0] << 4, G[0] << 4 + umov w12, v3.h[2] // clip(R[2] >> 12) + ldrh w13, [x8, x13, lsl #1] // B[1] = gamma.rgb[clip(B[1] >> 12)] + orr x10, x10, x13, lsl #20 // G[1] << 4, B[1] << 4 + umov w13, v4.h[3] // clip(G[3] >> 12) + ldrh w14, [x8, x14, lsl #1] // R[3] = gamma.rgb[clip(R[3] >> 12)] + orr x11, x11, x14, lsl #20 // B[2] << 4, R[3] << 4 + + umov w14, v3.h[1] // clip(R[1] >> 12) + ldrh w15, [x8, x15, lsl #1] // B[0] = gamma.rgb[clip(B[0] >> 12)] + orr x9, x9, x15, lsl #36 // R[0] << 4, G[0] << 4, B[0] << 4 + umov w15, v4.h[2] // clip(G[2] >> 12) + ldrh w12, [x8, x12, lsl #1] // R[2] = gamma.rgb[clip(R[2] >> 12)] + orr x10, x10, x12, lsl #36 // G[1] << 4, B[1] << 4, R[2] << 4 + umov w12, v5.h[3] // clip(B[3] >> 12) + ldrh w13, [x8, x13, lsl #1] // G[3] = gamma.rgb[clip(G[3] >> 12)] + orr x11, x11, x13, lsl #36 // B[2] << 4, R[3] << 4, G[3] << 4 + + ldrh w14, [x8, x14, lsl #1] // R[1] = gamma.rgb[clip(R[1] >> 12)] + orr x9, x9, x14, lsl #52 // x9 = R[0] << 4, G[0] << 4, B[0] << 4, R[1] << 4 + ldrh w15, [x8, x15, lsl #1] // 
G[2] = gamma.rgb[clip(G[2] >> 12)] + orr x10, x10, x15, lsl #52 // x10 = G[1] << 4, B[1] << 4, R[2] << 4, G[2] << 4 + ldrh w12, [x8, x12, lsl #1] // B[3] = gamma.rgb[clip(B[3] >> 12)] + orr x11, x11, x12, lsl #52 // x11 = B[2] << 4, R[3] << 4, G[3] << 4, B[3] << 4 + + stp x9, x10, [x1] + str x11, [x1, #16] + add x1, x1, #24 + + subs w0, w0, #4 + b.ge 7b + + .align JUMP_ALIGN +8: + tst w5, #3 + b.eq 10f // no residual pixels; (w & 3) == 0 + + ldr w10, [x3] // w10 = X[0] Y[0] + ldrh w11, [x3, #4] // w11 = Z[0] + add x3, x3, #6 + ubfx w9, w10, #4, #12 // X[0] >> 4 + ubfx w10, w10, #20, #12 // Y[0] >> 4 + lsr w11, w11, #4 // Z[0] >> 4 + ldr h0, [x7, x9, lsl #1] // v0.4h = gamma.xyz[X[0] >> 4] + ldr h1, [x7, x10, lsl #1] // v1.4h = gamma.xyz[Y[0] >> 4] + ldr h2, [x7, x11, lsl #1] // v2.4h = gamma.xyz[Z[0] >> 4] + + cmp w0, #-2 + b.lt 9f // (w & 3) == 1 + + ldr w10, [x3] // w10 = X[1] Y[1] + ldrh w11, [x3, #4] // w11 = Z[1] + add x3, x3, #6 + ubfx w9, w10, #4, #12 // X[1] >> 4 + ubfx w10, w10, #20, #12 // Y[1] >> 4 + lsr w11, w11, #4 // Z[1] >> 4 + ldr h16, [x7, x9, lsl #1] // gamma.xyz[X[1] >> 4] + ldr h17, [x7, x10, lsl #1] // gamma.xyz[Y[1] >> 4] + ldr h18, [x7, x11, lsl #1] // gamma.xyz[Z[1] >> 4] + mov v0.h[1], v16.h[0] // v0.4h = gamma.xyz[X[0..1] >> 4] + mov v1.h[1], v17.h[0] // v1.4h = gamma.xyz[Y[0..1] >> 4] + mov v2.h[1], v18.h[0] // v2.4h = gamma.xyz[Z[0..1] >> 4] + + b.le 9f // (w & 3) == 2 + + ldr w10, [x3] // w10 = X[2] Y[2] + ldrh w11, [x3, #4] // w11 = Z[2] + add x3, x3, #6 + ubfx w9, w10, #4, #12 // X[2] >> 4 + ubfx w10, w10, #20, #12 // Y[2] >> 4 + lsr w11, w11, #4 // Z[2] >> 4 + ldr h16, [x7, x9, lsl #1] // gamma.xyz[X[2] >> 4] + ldr h17, [x7, x10, lsl #1] // gamma.xyz[Y[2] >> 4] + ldr h18, [x7, x11, lsl #1] // gamma.xyz[Z[2] >> 4] + mov v0.h[2], v16.h[0] // v0.4h = gamma.xyz[X[0..2] >> 4] + mov v1.h[2], v17.h[0] // v1.4h = gamma.xyz[Y[0..2] >> 4] + mov v2.h[2], v18.h[0] // v2.4h = gamma.xyz[Z[0..2] >> 4] + + .align JUMP_ALIGN +9: + umull v3.4s, v0.4h, v6.h[0] // R[0..2] = gamma.xyz[X[0..2] >> 4] * matrix[0][0] + umull v5.4s, v0.4h, v6.h[6] // B[0..2] = gamma.xyz[X[0..2] >> 4] * matrix[2][0] + + umull v4.4s, v1.4h, v6.h[4] // G[0..2] = gamma.xyz[Y[0..2] >> 4] * matrix[1][1] + umlsl v3.4s, v1.4h, v6.h[1] // R[0..2] -= gamma.xyz[Y[0..2] >> 4] * matrix[0][1] + umlsl v4.4s, v0.4h, v6.h[3] // G[0..2] -= gamma.xyz[X[0..2] >> 4] * matrix[1][0] + umlsl v5.4s, v1.4h, v6.h[7] // B[0..2] -= gamma.xyz[Y[0..2] >> 4] * matrix[2][1] + + umlsl v3.4s, v2.4h, v6.h[2] // R[0..2] -= gamma.xyz[Z[0..2] >> 4] * matrix[0][2] + sqshrun v3.4h, v3.4s, #12 // clip(R[0..2] >> 12) + umlal v4.4s, v2.4h, v6.h[5] // G[0..2] += gamma.xyz[Z[0..2] >> 4] * matrix[1][2] + sqshrun v4.4h, v4.4s, #12 // clip(G[0..2] >> 12) + umlal v5.4s, v2.4h, v7.h[0] // B[0..2] += gamma.xyz[Z[0..2] >> 4] * matrix[2][2] + sqshrun v5.4h, v5.4s, #12 // clip(B[0..2] >> 12) + + umov w9, v3.h[0] // clip(R[0] >> 12) + umov w10, v4.h[0] // clip(G[0] >> 12) + umov w11, v5.h[0] // clip(B[0] >> 12) + ldrh w9, [x8, x9, lsl #1] // R[0] = gamma.rgb[clip(R[0] >> 12)] + ldrh w10, [x8, x10, lsl #1] // G[0] = gamma.rgb[clip(G[0] >> 12)] + ldrh w11, [x8, x11, lsl #1] // B[0] = gamma.rgb[clip(B[0] >> 12)] + lsl w9, w9, #4 // w9 = R[0] << 4 + lsl w10, w10, #4 // w10 = G[0] << 4 + lsl w11, w11, #4 // w11 = B[0] << 4 + strh w9, [x1] + strh w10, [x1, #2] + strh w11, [x1, #4] + add x1, x1, #6 + + cmp w0, #-2 + b.lt 10f // (w & 3) == 1 + + umov w9, v3.h[1] // clip(R[1] >> 12) + umov w10, v4.h[1] // clip(G[1] >> 12) + umov w11, v5.h[1] // clip(B[1] >> 12) + 
ldrh w9, [x8, x9, lsl #1] // R[1] = gamma.rgb[clip(R[1] >> 12)] + ldrh w10, [x8, x10, lsl #1] // G[1] = gamma.rgb[clip(G[1] >> 12)] + ldrh w11, [x8, x11, lsl #1] // B[1] = gamma.rgb[clip(B[1] >> 12)] + lsl w9, w9, #4 // w9 = R[1] << 4 + lsl w10, w10, #4 // w10 = G[1] << 4 + lsl w11, w11, #4 // w11 = B[1] << 4 + strh w9, [x1] + strh w10, [x1, #2] + strh w11, [x1, #4] + add x1, x1, #6 + + b.le 10f // (w & 3) == 2 + + umov w9, v3.h[2] // clip(R[2] >> 12) + umov w10, v4.h[2] // clip(G[2] >> 12) + umov w11, v5.h[2] // clip(B[2] >> 12) + ldrh w9, [x8, x9, lsl #1] // R[2] = gamma.rgb[clip(R[2] >> 12)] + ldrh w10, [x8, x10, lsl #1] // G[2] = gamma.rgb[clip(G[2] >> 12)] + ldrh w11, [x8, x11, lsl #1] // B[2] = gamma.rgb[clip(B[2] >> 12)] + lsl w9, w9, #4 // w9 = R[2] << 4 + lsl w10, w10, #4 // w10 = G[2] << 4 + lsl w11, w11, #4 // w11 = B[2] << 4 + strh w9, [x1] + strh w10, [x1, #2] + strh w11, [x1, #4] + add x1, x1, #6 + + .align JUMP_ALIGN +10: + ret +endfunc diff --git a/libswscale/swscale.c b/libswscale/swscale.c index c795427a83..fc4f1f6d0c 100644 --- a/libswscale/swscale.c +++ b/libswscale/swscale.c @@ -860,6 +860,10 @@ void ff_rgb48Toxyz12(const SwsInternal *c, uint8_t *dst, int dst_stride, av_cold void ff_sws_init_xyz2rgb(SwsInternal *c) { c->xyz12Torgb48 = xyz12Torgb48_c; + +#if ARCH_AARCH64 + ff_sws_init_xyz2rgb_aarch64(c); +#endif } void ff_update_palette(SwsInternal *c, const uint32_t *pal) diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h index 107671feb2..d1aa15af36 100644 --- a/libswscale/swscale_internal.h +++ b/libswscale/swscale_internal.h @@ -729,6 +729,7 @@ av_cold void ff_sws_init_range_convert_riscv(SwsInternal *c); av_cold void ff_sws_init_range_convert_x86(SwsInternal *c); av_cold void ff_sws_init_xyz2rgb(SwsInternal *c); +av_cold void ff_sws_init_xyz2rgb_aarch64(SwsInternal *c); SwsFunc ff_yuv2rgb_init_x86(SwsInternal *c); SwsFunc ff_yuv2rgb_init_ppc(SwsInternal *c); -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
