[FFmpeg-devel] [PATCH 3/3] swscale: Add AArch64 Neon path for xyz12Torgb48 LE

Arpad Panyik via ffmpeg-devel Wed, 26 Nov 2025 07:28:38 -0800

Add an optimized Neon code path for the little-endian case of the
xyz12Torgb48 function. The innermost loop processes the data in 4x2
pixel blocks: the gamma table lookups are done as software gathers,
while the matrix multiplication and clipping are done with Neon.


Speed-up of the micro-benchmarks with this patch (runtime relative to
the code path used before it) on some Cortex and Neoverse CPU cores:

 xyz12le_rgb48le    X1      X3      X4    X925      V2
 16x4_neon:       2.39x   4.04x   2.84x   3.27x   3.02x
 32x4_neon:       2.42x   3.34x   2.98x   3.34x   2.97x
 64x4_neon:       2.36x   3.12x   2.99x   3.32x   2.95x
 128x4_neon:      2.36x   3.08x   3.01x   3.34x   2.95x
 256x4_neon:      2.33x   3.08x   3.08x   3.41x   2.95x
 512x4_neon:      2.30x   3.04x   3.00x   3.54x   2.88x
 1024x4_neon:     2.28x   3.01x   2.88x   3.55x   3.07x
 1920x4_neon:     2.27x   2.94x   2.79x   3.53x   2.86x

 xyz12le_rgb48le   A76     A78    A715    A720    A725
 16x4_neon:       2.36x   2.20x   2.32x   2.99x   2.98x
 32x4_neon:       2.40x   2.25x   2.37x   2.99x   3.02x
 64x4_neon:       2.37x   2.22x   2.34x   2.97x   3.03x
 128x4_neon:      2.35x   2.23x   2.33x   2.93x   3.00x
 256x4_neon:      2.39x   2.23x   2.35x   2.88x   2.92x
 512x4_neon:      2.39x   2.21x   2.32x   2.81x   2.89x
 1024x4_neon:     2.37x   2.18x   2.31x   2.79x   2.89x
 1920x4_neon:     2.37x   2.17x   2.30x   2.77x   2.86x

 xyz12le_rgb48le   A55    A510    A520
 16x4_neon:       1.98x   1.96x   2.23x
 32x4_neon:       2.03x   1.96x   2.20x
 64x4_neon:       2.01x   1.95x   2.24x
 128x4_neon:      1.99x   1.91x   2.22x
 256x4_neon:      1.92x   1.86x   2.22x
 512x4_neon:      1.89x   1.80x   2.19x
 1024x4_neon:     1.90x   1.80x   2.19x
 1920x4_neon:     1.91x   1.79x   2.20x

Signed-off-by: Arpad Panyik <[email protected]>
---
 libswscale/aarch64/Makefile       |   1 +
 libswscale/aarch64/swscale.c      |  23 +
 libswscale/aarch64/xyz2rgb_neon.S | 709 ++++++++++++++++++++++++++++++
 libswscale/swscale.c              |   4 +
 libswscale/swscale_internal.h     |   1 +
 5 files changed, 738 insertions(+)
 create mode 100644 libswscale/aarch64/xyz2rgb_neon.S

diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
index 1de8c9c0d6..1c82e34e28 100644
--- a/libswscale/aarch64/Makefile
+++ b/libswscale/aarch64/Makefile
@@ -8,4 +8,5 @@ NEON-OBJS   += aarch64/hscale.o                 \
                aarch64/range_convert_neon.o     \
                aarch64/rgb2rgb_neon.o           \
                aarch64/swscale_unscaled_neon.o  \
+               aarch64/xyz2rgb_neon.o           \
                aarch64/yuv2rgb_neon.o           \
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index 55fff03a5a..80a89f7504 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -22,6 +22,27 @@
 #include "libswscale/swscale_internal.h"
 #include "libavutil/aarch64/cpu.h"
 
+/*
+ * Neon kernel for little-endian xyz12 -> rgb48, defined in
+ * xyz2rgb_neon.S. Takes the ColorXform directly.
+ */
+void ff_xyz12Torgb48le_neon_asm(const ColorXform *c, uint8_t *dst,
+                                int dst_stride, const uint8_t *src,
+                                int src_stride, int w, int h);
+
+/*
+ * Adapter with a SwsInternal-based signature: passes &c->xyz2rgb and
+ * all other arguments through unchanged to the Neon kernel.
+ */
+static void xyz12Torgb48le_neon(const SwsInternal *c, uint8_t *dst,
+                                int dst_stride, const uint8_t *src,
+                                int src_stride, int w, int h)
+{
+    /* Plain call statement: a void function must not return a value. */
+    ff_xyz12Torgb48le_neon_asm(&c->xyz2rgb, dst, dst_stride,
+                               src, src_stride, w, h);
+}
+
 void ff_hscale16to15_4_neon_asm(int shift, int16_t *_dst, int dstW,
                       const uint8_t *_src, const int16_t *filter,
                       const int32_t *filterPos, int filterSize);
@@ -307,6 +319,25 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)
     }
 }
 
+/*
+ * Install the Neon xyz12Torgb48 implementation in c when the CPU
+ * reports Neon and the source pixel format is not big-endian.
+ */
+av_cold void ff_sws_init_xyz2rgb_aarch64(SwsInternal *c)
+{
+    const int cpu_flags = av_get_cpu_flags();
+
+    /* Without Neon there is nothing to install. */
+    if (!have_neon(cpu_flags))
+        return;
+
+    /* The Neon routine handles little-endian input only. */
+    if (av_pix_fmt_desc_get(c->opts.src_format)->flags & AV_PIX_FMT_FLAG_BE)
+        return;
+
+    c->xyz12Torgb48 = xyz12Torgb48le_neon;
+}
+
 av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
 {
     int cpu_flags = av_get_cpu_flags();
diff --git a/libswscale/aarch64/xyz2rgb_neon.S b/libswscale/aarch64/xyz2rgb_neon.S
new file mode 100644
index 0000000000..b23903c9eb
--- /dev/null
+++ b/libswscale/aarch64/xyz2rgb_neon.S
@@ -0,0 +1,709 @@
+/*
+ * Copyright (c) 2025 Arpad Panyik <[email protected]>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+#ifndef JUMP_ALIGN
+#define JUMP_ALIGN 2    /* .align operand used before forward branch targets */
+#endif
+#ifndef LOOP_ALIGN
+#define LOOP_ALIGN 2    /* .align operand used before loop entry labels */
+#endif
+
+#define GAMMA_XYZ 0     /* byte offset in ColorXform: gamma.xyz table pointer */
+#define GAMMA_RGB 8     /* byte offset in ColorXform: gamma.rgb table pointer */
+#define MATRIX_00 16    /* byte offset in ColorXform: matrix[0][0] */
+#define MATRIX_22 32    /* byte offset in ColorXform: matrix[2][2] */
+
+function ff_xyz12Torgb48le_neon_asm, export=1
+// x0  const ColorXform *c
+// x1  uint8_t *dst
+// w2  int dst_stride
+// x3  const uint8_t *src
+// w4  int src_stride
+// w5  int w
+// w6  int h
+
+        ldp             x7,  x8, [x0, #(GAMMA_XYZ)] // gamma.xyz, gamma.rgb
+        ldr             q6,  [x0, #(MATRIX_00)]     // matrix[0][0]..[2][1]
+        ldr             h7,  [x0, #(MATRIX_22)]     // matrix[2][2]; > 0
+        add             w9,  w5,  w5, lsl #1        // w * 3
+        add             x17, x3,  w4, sxtw          // sr2 = src + src_stride
+        add             x16, x1,  w2, sxtw          // ds2 = dst + dst_stride
+        sub             w4,  w4,  w9                // src_stride - w * 3
+        sub             w2,  w2,  w9                // dst_stride - w * 3
+        abs             v6.8h,  v6.8h               // 
abs(matrix[0][0]..[2][1])
+        sbfiz           x4,  x4,  #1, #32           // src_stride * 2 - w * 6
+        sbfiz           x2,  x2,  #1, #32           // dst_stride * 2 - w * 6
+
+        subs            w6,  w6,  #2
+        b.lt            6f                          // h < 2
+
+        stp             x19, x20, [sp, #-64]!
+        stp             x21, x22, [sp, #16]
+        stp             x23, x24, [sp, #32]
+        str             x25, [sp, #48]
+
+        .align LOOP_ALIGN
+1:      // yp loop for 2x4 pixels
+        subs            w0,  w5,  #4
+        b.lt            3f                          // w < 4
+
+        .align LOOP_ALIGN
+2:      // xp loop for 2x4 pixels: XYZ0[0..3], XYZ1[0..3]
+        ldp             x9,  x10, [x3]              // x9  = X0[0] Y0[0] Z0[0] 
X0[1], x10 = Y0[1] Z0[1] X0[2] Y0[2]
+        ldr             x11, [x3, #16]              // x11 = Z0[2] X0[3] Y0[3] 
Z0[3]
+        add             x3,  x3,  #24
+        ubfx            x12, x9,  #4,  #12          // X0[0] >> 4
+        lsr             x13, x9,  #52               // X0[1] >> 4
+        ubfx            x14, x10, #36, #12          // X0[2] >> 4
+        ubfx            x15, x11, #20, #12          // X0[3] >> 4
+
+        ldp             x19, x20, [x17]             // x19 = X1[0] Y1[0] Z1[0] 
X1[1], x20 = Y1[1] Z1[1] X1[2] Y1[2]
+        ldr             x21, [x17, #16]             // x21 = Z1[2] X1[3] Y1[3] 
Z1[3]
+        add             x17, x17, #24
+        ubfx            x22, x19, #4, #12           // X1[0] >> 4
+        lsr             x23, x19, #52               // X1[1] >> 4
+        ubfx            x24, x20, #36, #12          // X1[2] >> 4
+        ubfx            x25, x21, #20, #12          // X1[3] >> 4
+
+        ldr             h0,  [x7, x12, lsl #1]      // gamma.xyz[X0[0] >> 4]
+        ubfx            x12, x9,  #20, #12          // Y0[0] >> 4
+        ldr             h16, [x7, x13, lsl #1]      // gamma.xyz[X0[1] >> 4]
+        ubfx            x13, x10, #4, #12           // Y0[1] >> 4
+        ldr             h17, [x7, x14, lsl #1]      // gamma.xyz[X0[2] >> 4]
+        lsr             x14, x10, #52               // Y0[2] >> 4
+        ldr             h18, [x7, x15, lsl #1]      // gamma.xyz[X0[3] >> 4]
+        ubfx            x15, x11, #36, #12          // Y0[3] >> 4
+
+        ldr             h20, [x7, x22, lsl #1]      // gamma.xyz[X1[0] >> 4]
+        ubfx            x22, x19, #20, #12          // Y1[0] >> 4
+        ldr             h26, [x7, x23, lsl #1]      // gamma.xyz[X1[1] >> 4]
+        ubfx            x23, x20, #4,  #12          // Y1[1] >> 4
+        ldr             h27, [x7, x24, lsl #1]      // gamma.xyz[X1[2] >> 4]
+        lsr             x24, x20, #52               // Y1[2] >> 4
+        ldr             h28, [x7, x25, lsl #1]      // gamma.xyz[X1[3] >> 4]
+        ubfx            x25, x21, #36, #12          // Y1[3] >> 4
+
+        mov             v0.h[1],  v16.h[0]          // v0.4h  = 
gamma.xyz[X0[0..1] >> 4]
+        mov             v17.h[1], v18.h[0]          // v17.4h = 
gamma.xyz[X0[2..3] >> 4]
+        mov             v0.s[1],  v17.s[0]          // v0.4h  = 
gamma.xyz[X0[0..3] >> 4]
+        ldr             h1,  [x7, x12, lsl #1]      // gamma.xyz[Y0[0] >> 4]
+        umull           v3.4s, v0.4h, v6.h[0]       // R0[0..3] = 
gamma.xyz[X0[0..3] >> 4] * matrix[0][0]
+        umull           v5.4s, v0.4h, v6.h[6]       // B0[0..3] = 
gamma.xyz[X0[0..3] >> 4] * matrix[2][0]
+        ubfx            x12, x9,  #36, #12          // Z0[0] >> 4
+        ldr             h16, [x7, x13, lsl #1]      // gamma.xyz[Y0[1] >> 4]
+
+        mov             v20.h[1], v26.h[0]          // v20.4h = 
gamma.xyz[X1[0..1] >> 4]
+        mov             v27.h[1], v28.h[0]          // v27.4h = 
gamma.xyz[X1[2..3] >> 4]
+        mov             v20.s[1], v27.s[0]          // v20.4h = 
gamma.xyz[X1[0..3] >> 4]
+        ldr             h21, [x7, x22, lsl #1]      // gamma.xyz[Y1[0] >> 4]
+        umull           v23.4s, v20.4h, v6.h[0]     // R1[0..3] = 
gamma.xyz[X1[0..3] >> 4] * matrix[0][0]
+        umull           v25.4s, v20.4h, v6.h[6]     // B1[0..3] = 
gamma.xyz[X1[0..3] >> 4] * matrix[2][0]
+        ubfx            x22, x19, #36, #12          // Z1[0] >> 4
+        ldr             h26, [x7, x23, lsl #1]      // gamma.xyz[Y1[1] >> 4]
+
+        ubfx            x13, x10, #20, #12          // Z0[1] >> 4
+        ldr             h17, [x7, x14, lsl #1]      // gamma.xyz[Y0[2] >> 4]
+        ubfx            x14, x11, #4,  #12          // Z0[2] >> 4
+        ldr             h18, [x7, x15, lsl #1]      // gamma.xyz[Y0[3] >> 4]
+        lsr             x15, x11, #52               // Z0[3] >> 4
+        mov             v1.h[1],  v16.h[0]          // v1.4h  = 
gamma.xyz[Y0[0..1] >> 4]
+        mov             v17.h[1], v18.h[0]          // v17.4h = 
gamma.xyz[Y0[2..3] >> 4]
+        mov             v1.s[1],  v17.s[0]          // v1.4h  = 
gamma.xyz[Y0[0..3] >> 4]
+
+        ubfx            x23, x20, #20, #12          // Z1[1] >> 4
+        ldr             h27, [x7, x24, lsl #1]      // gamma.xyz[Y1[2] >> 4]
+        ubfx            x24, x21, #4,  #12          // Z1[2] >> 4
+        ldr             h28, [x7, x25, lsl #1]      // gamma.xyz[Y1[3] >> 4]
+        umull           v4.4s,  v1.4h,  v6.h[4]     // G0[0..3]  = 
gamma.xyz[Y0[0..3] >> 4] * matrix[1][1]
+        umlsl           v3.4s,  v1.4h,  v6.h[1]     // R0[0..3] -= 
gamma.xyz[Y0[0..3] >> 4] * matrix[0][1]
+
+        lsr             x25, x21, #52               // Z1[3] >> 4
+        mov             v21.h[1], v26.h[0]          // v21.4h = 
gamma.xyz[Y1[0..1] >> 4]
+        mov             v27.h[1], v28.h[0]          // v27.4h = 
gamma.xyz[Y1[2..3] >> 4]
+        mov             v21.s[1], v27.s[0]          // v21.4h = 
gamma.xyz[Y1[0..3] >> 4]
+        umlsl           v4.4s,  v0.4h,  v6.h[3]     // G0[0..3] -= 
gamma.xyz[X0[0..3] >> 4] * matrix[1][0]
+        umlsl           v5.4s,  v1.4h,  v6.h[7]     // B0[0..3] -= 
gamma.xyz[Y0[0..3] >> 4] * matrix[2][1]
+
+        ldr             h2,  [x7, x12, lsl #1]      // gamma.xyz[Z0[0] >> 4]
+        ldr             h16, [x7, x13, lsl #1]      // gamma.xyz[Z0[1] >> 4]
+        ldr             h17, [x7, x14, lsl #1]      // gamma.xyz[Z0[2] >> 4]
+        ldr             h18, [x7, x15, lsl #1]      // gamma.xyz[Z0[3] >> 4]
+        umull           v24.4s, v21.4h, v6.h[4]     // G1[0..3]  = 
gamma.xyz[Y1[0..3] >> 4] * matrix[1][1]
+        umlsl           v23.4s, v21.4h, v6.h[1]     // R1[0..3] -= 
gamma.xyz[Y1[0..3] >> 4] * matrix[0][1]
+
+        mov             v2.h[1],  v16.h[0]          // v2.4h  = 
gamma.xyz[Z0[0..1] >> 4]
+        mov             v17.h[1], v18.h[0]          // v17.4h = 
gamma.xyz[Z0[2..3] >> 4]
+        mov             v2.s[1],  v17.s[0]          // v2.4h  = 
gamma.xyz[Z0[0..3] >> 4]
+        umlsl           v24.4s, v20.4h, v6.h[3]     // G1[0..3] -= 
gamma.xyz[X1[0..3] >> 4] * matrix[1][0]
+        umlsl           v25.4s, v21.4h, v6.h[7]     // B1[0..3] -= 
gamma.xyz[Y1[0..3] >> 4] * matrix[2][1]
+
+        ldr             h22, [x7, x22, lsl #1]      // gamma.xyz[Z1[0] >> 4]
+        ldr             h26, [x7, x23, lsl #1]      // gamma.xyz[Z1[1] >> 4]
+        ldr             h27, [x7, x24, lsl #1]      // gamma.xyz[Z1[2] >> 4]
+        ldr             h28, [x7, x25, lsl #1]      // gamma.xyz[Z1[3] >> 4]
+        mov             v22.h[1], v26.h[0]          // v22.4h = 
gamma.xyz[Z1[0..1] >> 4]
+        mov             v27.h[1], v28.h[0]          // v27.4h = 
gamma.xyz[Z1[2..3] >> 4]
+        mov             v22.s[1], v27.s[0]          // v22.4h = 
gamma.xyz[Z1[0..3] >> 4]
+
+        umlsl           v3.4s,  v2.4h,  v6.h[2]     // R0[0..3] -= 
gamma.xyz[Z0[0..3] >> 4] * matrix[0][2]
+        sqshrun         v3.4h,  v3.4s,  #12         // clip(R0[0..3] >> 12)
+        umlal           v4.4s,  v2.4h,  v6.h[5]     // G0[0..3] += 
gamma.xyz[Z0[0..3] >> 4] * matrix[1][2]
+        sqshrun         v4.4h,  v4.4s,  #12         // clip(G0[0..3] >> 12)
+        umov            w9,  v3.h[0]                // clip(R0[0] >> 12)
+        umov            w10, v4.h[1]                // clip(G0[1] >> 12)
+        umlal           v5.4s,  v2.4h,  v7.h[0]     // B0[0..3] += 
gamma.xyz[Z0[0..3] >> 4] * matrix[2][2]
+        sqshrun         v5.4h,  v5.4s,  #12         // clip(B0[0..3] >> 12)
+
+        umlsl           v23.4s, v22.4h, v6.h[2]     // R1[0..3] -= 
gamma.xyz[Z1[0..3] >> 4] * matrix[0][2]
+        sqshrun         v23.4h, v23.4s, #12         // clip(R1[0..3] >> 12)
+        umlal           v24.4s, v22.4h, v6.h[5]     // G1[0..3] += 
gamma.xyz[Z1[0..3] >> 4] * matrix[1][2]
+        sqshrun         v24.4h, v24.4s, #12         // clip(G1[0..3] >> 12)
+        umov            w19, v23.h[0]               // clip(R1[0] >> 12)
+        umov            w20, v24.h[1]               // clip(G1[1] >> 12)
+        umlal           v25.4s, v22.4h, v7.h[0]     // B1[0..3] += 
gamma.xyz[Z1[0..3] >> 4] * matrix[2][2]
+        sqshrun         v25.4h, v25.4s, #12         // clip(B1[0..3] >> 12)
+
+        umov            w11, v5.h[2]                // clip(B0[2] >> 12)
+        umov            w12, v4.h[0]                // clip(G0[0] >> 12)
+        ldrh            w9,  [x8, x9,  lsl #1]      // R0[0] = 
gamma.rgb[clip(R0[0] >> 12)]
+        lsl             x9,  x9,  #4                // R0[0] << 4
+        umov            w13, v5.h[1]                // clip(B0[1] >> 12)
+        ldrh            w10, [x8, x10, lsl #1]      // G0[1] = 
gamma.rgb[clip(G0[1] >> 12)]
+        lsl             x10, x10, #4                // G0[1] << 4
+
+        umov            w21, v25.h[2]               // clip(B1[2] >> 12)
+        umov            w22, v24.h[0]               // clip(G1[0] >> 12)
+        ldrh            w19, [x8, x19, lsl #1]      // R1[0] = 
gamma.rgb[clip(R1[0] >> 12)]
+        lsl             x19, x19, #4                // R1[0] << 4
+        umov            w23, v25.h[1]               // clip(B1[1] >> 12)
+        ldrh            w20, [x8, x20, lsl #1]      // G1[1] = 
gamma.rgb[clip(G1[1] >> 12)]
+        lsl             x20, x20, #4                // G1[1] << 4
+
+        umov            w14, v3.h[3]                // clip(R0[3] >> 12)
+        ldrh            w11, [x8, x11, lsl #1]      // B0[2] = 
gamma.rgb[clip(B0[2] >> 12)]
+        lsl             x11, x11, #4                // B0[2] << 4
+        umov            w15, v5.h[0]                // clip(B0[0] >> 12)
+        ldrh            w12, [x8, x12, lsl #1]      // G0[0] = 
gamma.rgb[clip(G0[0] >> 12)]
+        orr             x9,  x9,  x12, lsl #20      // R0[0] << 4, G0[0] << 4
+        umov            w12, v3.h[2]                // clip(R0[2] >> 12)
+        ldrh            w13, [x8, x13, lsl #1]      // B0[1] = 
gamma.rgb[clip(B0[1] >> 12)]
+
+        umov            w24, v23.h[3]               // clip(R1[3] >> 12)
+        ldrh            w21, [x8, x21, lsl #1]      // B1[2] = 
gamma.rgb[clip(B1[2] >> 12)]
+        lsl             x21, x21, #4                // B1[2] << 4
+        umov            w25, v25.h[0]               // clip(B1[0] >> 12)
+        ldrh            w22, [x8, x22, lsl #1]      // G1[0] = 
gamma.rgb[clip(G1[0] >> 12)]
+        orr             x19, x19, x22, lsl #20      // R1[0] << 4, G1[0] << 4
+        umov            w22, v23.h[2]               // clip(R1[2] >> 12)
+        ldrh            w23, [x8, x23, lsl #1]      // B1[1] = 
gamma.rgb[clip(B1[1] >> 12)]
+
+        orr             x10, x10, x13, lsl #20      // G0[1] << 4, B0[1] << 4
+        umov            w13, v4.h[3]                // clip(G0[3] >> 12)
+        ldrh            w14, [x8, x14, lsl #1]      // R0[3] = 
gamma.rgb[clip(R0[3] >> 12)]
+        orr             x11, x11, x14, lsl #20      // B0[2] << 4, R0[3] << 4
+        umov            w14, v3.h[1]                // clip(R0[1] >> 12)
+        ldrh            w15, [x8, x15, lsl #1]      // B0[0] = 
gamma.rgb[clip(B0[0] >> 12)]
+        orr             x9,  x9,  x15, lsl #36      // R0[0] << 4, G0[0] << 4, 
B0[0] << 4
+        umov            w15, v4.h[2]                // clip(G0[2] >> 12)
+
+        orr             x20, x20, x23, lsl #20      // G1[1] << 4, B1[1] << 4
+        umov            w23, v24.h[3]               // clip(G1[3] >> 12)
+        ldrh            w24, [x8, x24, lsl #1]      // R1[3] = 
gamma.rgb[clip(R1[3] >> 12)]
+        orr             x21, x21, x24, lsl #20      // B1[2] << 4, R1[3] << 4
+        umov            w24, v23.h[1]               // clip(R1[1] >> 12)
+        ldrh            w25, [x8, x25, lsl #1]      // B1[0] = 
gamma.rgb[clip(B1[0] >> 12)]
+        orr             x19, x19, x25, lsl #36      // R1[0] << 4, G1[0] << 4, 
B1[0] << 4
+        umov            w25, v24.h[2]               // clip(G1[2] >> 12)
+
+        ldrh            w12, [x8, x12, lsl #1]      // R0[2] = 
gamma.rgb[clip(R0[2] >> 12)]
+        orr             x10, x10, x12, lsl #36      // G0[1] << 4, B0[1] << 4, 
R0[2] << 4
+        umov            w12, v5.h[3]                // clip(B0[3] >> 12)
+        ldrh            w13, [x8, x13, lsl #1]      // G0[3] = 
gamma.rgb[clip(G0[3] >> 12)]
+        orr             x11, x11, x13, lsl #36      // B0[2] << 4, R0[3] << 4, 
G0[3] << 4
+        ldrh            w14, [x8, x14, lsl #1]      // R0[1] = 
gamma.rgb[clip(R0[1] >> 12)]
+        orr             x9,  x9,  x14, lsl #52      // x9  = R0[0] << 4, G0[0] 
<< 4, B0[0] << 4, R0[1] << 4
+        ldrh            w15, [x8, x15, lsl #1]      // G0[2] = 
gamma.rgb[clip(G0[2] >> 12)]
+        orr             x10, x10, x15, lsl #52      // x10 = G0[1] << 4, B0[1] 
<< 4, R0[2] << 4, G0[2] << 4
+        ldrh            w12, [x8, x12, lsl #1]      // B0[3] = 
gamma.rgb[clip(B0[3] >> 12)]
+        orr             x11, x11, x12, lsl #52      // x11 = B0[2] << 4, R0[3] 
<< 4, G0[3] << 4, B0[3] << 4
+        stp             x9,  x10, [x1]
+        str             x11, [x1, #16]
+
+        ldrh            w22, [x8, x22, lsl #1]      // R1[2] = 
gamma.rgb[clip(R1[2] >> 12)]
+        orr             x20, x20, x22, lsl #36      // G1[1] << 4, B1[1] << 4, 
R1[2] << 4
+        umov            w22, v25.h[3]               // clip(B1[3] >> 12)
+        ldrh            w23, [x8, x23, lsl #1]      // G1[3] = 
gamma.rgb[clip(G1[3] >> 12)]
+        orr             x21, x21, x23, lsl #36      // B1[2] << 4, R1[3] << 4, 
G1[3] << 4
+        ldrh            w24, [x8, x24, lsl #1]      // R1[1] = 
gamma.rgb[clip(R1[1] >> 12)]
+        orr             x19, x19, x24, lsl #52      // x19 = R1[0] << 4, G1[0] 
<< 4, B1[0] << 4, R1[1] << 4
+        ldrh            w25, [x8, x25, lsl #1]      // G1[2] = 
gamma.rgb[clip(G1[2] >> 12)]
+        orr             x20, x20, x25, lsl #52      // x20 = G1[1] << 4, B1[1] 
<< 4, R1[2] << 4, G1[2] << 4
+        ldrh            w22, [x8, x22, lsl #1]      // B1[3] = 
gamma.rgb[clip(B1[3] >> 12)]
+        orr             x21, x21, x22, lsl #52      // x21 = B1[2] << 4, R1[3] 
<< 4, G1[3] << 4, B1[3] << 4
+        stp             x19, x20, [x16]
+        str             x21, [x16, #16]
+
+        add             x1,  x1,  #24
+        add             x16, x16, #24
+
+        subs            w0,  w0,  #4
+        b.ge            2b
+
+        .align JUMP_ALIGN
+3:
+        tst             w5,  #3
+        b.eq            5f                          // no residual pixels; (w 
& 3) == 0
+
+        ldr             w10, [x3]                   // w10 = X0[0] Y0[0]
+        ldrh            w11, [x3, #4]               // w11 = Z0[0]
+        add             x3,  x3,  #6
+        ldr             w20, [x17]                  // w20 = X1[0] Y1[0]
+        ldrh            w21, [x17, #4]              // w21 = Z1[0]
+        add             x17, x17, #6
+        ubfx            w9,  w10, #4,  #12          // X0[0] >> 4
+        ubfx            w10, w10, #20, #12          // Y0[0] >> 4
+        lsr             w11, w11, #4                // Z0[0] >> 4
+        ldr             h0,  [x7, x9,  lsl #1]      // v0.4h = gamma.xyz[X0[0] 
>> 4]
+        ldr             h1,  [x7, x10, lsl #1]      // v1.4h = gamma.xyz[Y0[0] 
>> 4]
+        ldr             h2,  [x7, x11, lsl #1]      // v2.4h = gamma.xyz[Z0[0] 
>> 4]
+        ubfx            w19, w20, #4,  #12          // X1[0] >> 4
+        ubfx            w20, w20, #20, #12          // Y1[0] >> 4
+        lsr             w21, w21, #4                // Z1[0] >> 4
+        ldr             h20, [x7, x19, lsl #1]      // v20.4h = 
gamma.xyz[X1[0] >> 4]
+        ldr             h21, [x7, x20, lsl #1]      // v21.4h = 
gamma.xyz[Y1[0] >> 4]
+        ldr             h22, [x7, x21, lsl #1]      // v22.4h = 
gamma.xyz[Z1[0] >> 4]
+
+        cmp             w0,  #-2
+        b.lt            4f                          // (w & 3) == 1
+
+        ldr             w10, [x3]                   // w10 = X0[1] Y0[1]
+        ldrh            w11, [x3, #4]               // w11 = Z0[1]
+        add             x3,  x3,  #6
+        ldr             w20, [x17]                  // w20 = X1[1] Y1[1]
+        ldrh            w21, [x17, #4]              // w21 = Z1[1]
+        add             x17, x17,  #6
+        ubfx            w9,  w10, #4,  #12          // X0[1] >> 4
+        ubfx            w10, w10, #20, #12          // Y0[1] >> 4
+        lsr             w11, w11, #4                // Z0[1] >> 4
+        ldr             h16, [x7, x9,  lsl #1]      // gamma.xyz[X0[1] >> 4]
+        ldr             h17, [x7, x10, lsl #1]      // gamma.xyz[Y0[1] >> 4]
+        ldr             h18, [x7, x11, lsl #1]      // gamma.xyz[Z0[1] >> 4]
+        ubfx            w19, w20, #4,  #12          // X1[1] >> 4
+        ubfx            w20, w20, #20, #12          // Y1[1] >> 4
+        lsr             w21, w21, #4                // Z1[1] >> 4
+        ldr             h23, [x7, x19, lsl #1]      // gamma.xyz[X1[1] >> 4]
+        ldr             h24, [x7, x20, lsl #1]      // gamma.xyz[Y1[1] >> 4]
+        ldr             h25, [x7, x21, lsl #1]      // gamma.xyz[Z1[1] >> 4]
+        mov             v0.h[1],  v16.h[0]          // v0.4h = 
gamma.xyz[X0[0..1] >> 4]
+        mov             v1.h[1],  v17.h[0]          // v1.4h = 
gamma.xyz[Y0[0..1] >> 4]
+        mov             v2.h[1],  v18.h[0]          // v2.4h = 
gamma.xyz[Z0[0..1] >> 4]
+        mov             v20.h[1], v23.h[0]          // v20.4h = 
gamma.xyz[X1[0..1] >> 4]
+        mov             v21.h[1], v24.h[0]          // v21.4h = 
gamma.xyz[Y1[0..1] >> 4]
+        mov             v22.h[1], v25.h[0]          // v22.4h = 
gamma.xyz[Z1[0..1] >> 4]
+
+        b.le            4f                          // (w & 3) == 2
+
+        ldr             w10, [x3]                   // w10 = X0[2] Y0[2]
+        ldrh            w11, [x3, #4]               // w11 = Z0[2]
+        add             x3,  x3,  #6
+        ldr             w20, [x17]                  // w20 = X1[2] Y1[2]
+        ldrh            w21, [x17, #4]              // w21 = Z1[2]
+        add             x17, x17, #6
+        ubfx            w9,  w10, #4,  #12          // X0[2] >> 4
+        ubfx            w10, w10, #20, #12          // Y0[2] >> 4
+        lsr             w11, w11, #4                // Z0[2] >> 4
+        ldr             h16, [x7, x9,  lsl #1]      // gamma.xyz[X0[2] >> 4]
+        ldr             h17, [x7, x10, lsl #1]      // gamma.xyz[Y0[2] >> 4]
+        ldr             h18, [x7, x11, lsl #1]      // gamma.xyz[Z0[2] >> 4]
+        ubfx            w19, w20, #4,  #12          // X1[2] >> 4
+        ubfx            w20, w20, #20, #12          // Y1[2] >> 4
+        lsr             w21, w21, #4                // Z1[2] >> 4
+        ldr             h23, [x7, x19, lsl #1]      // gamma.xyz[X1[2] >> 4]
+        ldr             h24, [x7, x20, lsl #1]      // gamma.xyz[Y1[2] >> 4]
+        ldr             h25, [x7, x21, lsl #1]      // gamma.xyz[Z1[2] >> 4]
+        mov             v0.h[2],  v16.h[0]          // v0.4h = 
gamma.xyz[X0[0..2] >> 4]
+        mov             v1.h[2],  v17.h[0]          // v1.4h = 
gamma.xyz[Y0[0..2] >> 4]
+        mov             v2.h[2],  v18.h[0]          // v2.4h = 
gamma.xyz[Z0[0..2] >> 4]
+        mov             v20.h[2], v23.h[0]          // v20.4h = 
gamma.xyz[X1[0..2] >> 4]
+        mov             v21.h[2], v24.h[0]          // v21.4h = 
gamma.xyz[Y1[0..2] >> 4]
+        mov             v22.h[2], v25.h[0]          // v22.4h = 
gamma.xyz[Z1[0..2] >> 4]
+
+        .align JUMP_ALIGN
+4:
+        umull           v3.4s,  v0.4h,  v6.h[0]     // R0[0..2] = 
gamma.xyz[X0[0..2] >> 4] * matrix[0][0]
+        umull           v5.4s,  v0.4h,  v6.h[6]     // B0[0..2] = 
gamma.xyz[X0[0..2] >> 4] * matrix[2][0]
+
+        umull           v23.4s, v20.4h, v6.h[0]     // R1[0..2] = 
gamma.xyz[X1[0..2] >> 4] * matrix[0][0]
+        umull           v25.4s, v20.4h, v6.h[6]     // B1[0..2] = 
gamma.xyz[X1[0..2] >> 4] * matrix[2][0]
+
+        umull           v4.4s,  v1.4h,  v6.h[4]     // G0[0..2]  = 
gamma.xyz[Y0[0..2] >> 4] * matrix[1][1]
+        umlsl           v3.4s,  v1.4h,  v6.h[1]     // R0[0..2] -= 
gamma.xyz[Y0[0..2] >> 4] * matrix[0][1]
+        umlsl           v4.4s,  v0.4h,  v6.h[3]     // G0[0..2] -= 
gamma.xyz[X0[0..2] >> 4] * matrix[1][0]
+        umlsl           v5.4s,  v1.4h,  v6.h[7]     // B0[0..2] -= 
gamma.xyz[Y0[0..2] >> 4] * matrix[2][1]
+
+        umull           v24.4s, v21.4h, v6.h[4]     // G1[0..2]  = 
gamma.xyz[Y1[0..2] >> 4] * matrix[1][1]
+        umlsl           v23.4s, v21.4h, v6.h[1]     // R1[0..2] -= 
gamma.xyz[Y1[0..2] >> 4] * matrix[0][1]
+        umlsl           v24.4s, v20.4h, v6.h[3]     // G1[0..2] -= 
gamma.xyz[X1[0..2] >> 4] * matrix[1][0]
+        umlsl           v25.4s, v21.4h, v6.h[7]     // B1[0..2] -= 
gamma.xyz[Y1[0..2] >> 4] * matrix[2][1]
+
+        umlsl           v3.4s,  v2.4h,  v6.h[2]     // R0[0..2] -= 
gamma.xyz[Z0[0..2] >> 4] * matrix[0][2]
+        sqshrun         v3.4h,  v3.4s,  #12         // clip(R0[0..2] >> 12)
+        umlal           v4.4s,  v2.4h,  v6.h[5]     // G0[0..2] += 
gamma.xyz[Z0[0..2] >> 4] * matrix[1][2]
+        sqshrun         v4.4h,  v4.4s,  #12         // clip(G0[0..2] >> 12)
+        umlal           v5.4s,  v2.4h,  v7.h[0]     // B0[0..2] += 
gamma.xyz[Z0[0..2] >> 4] * matrix[2][2]
+        sqshrun         v5.4h,  v5.4s,  #12         // clip(B0[0..2] >> 12)
+
+        umlsl           v23.4s, v22.4h, v6.h[2]     // R1[0..2] -= 
gamma.xyz[Z1[0..2] >> 4] * matrix[0][2]
+        sqshrun         v23.4h, v23.4s, #12         // clip(R1[0..2] >> 12)
+        umlal           v24.4s, v22.4h, v6.h[5]     // G1[0..2] += 
gamma.xyz[Z1[0..2] >> 4] * matrix[1][2]
+        sqshrun         v24.4h, v24.4s, #12         // clip(G1[0..2] >> 12)
+        umlal           v25.4s, v22.4h, v7.h[0]     // B1[0..2] += 
gamma.xyz[Z1[0..2] >> 4] * matrix[2][2]
+        sqshrun         v25.4h, v25.4s, #12         // clip(B1[0..2] >> 12)
+
+        umov            w9,  v3.h[0]                // clip(R0[0] >> 12)
+        umov            w10, v4.h[0]                // clip(G0[0] >> 12)
+        umov            w11, v5.h[0]                // clip(B0[0] >> 12)
+        ldrh            w9,  [x8, x9,  lsl #1]      // R0[0] = 
gamma.rgb[clip(R0[0] >> 12)]
+        ldrh            w10, [x8, x10, lsl #1]      // G0[0] = 
gamma.rgb[clip(G0[0] >> 12)]
+        ldrh            w11, [x8, x11, lsl #1]      // B0[0] = 
gamma.rgb[clip(B0[0] >> 12)]
+        umov            w19, v23.h[0]               // clip(R1[0] >> 12)
+        umov            w20, v24.h[0]               // clip(G1[0] >> 12)
+        umov            w21, v25.h[0]               // clip(B1[0] >> 12)
+        ldrh            w19, [x8, x19, lsl #1]      // R1[0] = 
gamma.rgb[clip(R1[0] >> 12)]
+        ldrh            w20, [x8, x20, lsl #1]      // G1[0] = 
gamma.rgb[clip(G1[0] >> 12)]
+        ldrh            w21, [x8, x21, lsl #1]      // B1[0] = 
gamma.rgb[clip(B1[0] >> 12)]
+        lsl             w9,  w9,  #4                // w9  = R0[0] << 4
+        lsl             w10, w10, #4                // w10 = G0[0] << 4
+        lsl             w11, w11, #4                // w11 = B0[0] << 4
+        strh            w9,  [x1]
+        strh            w10, [x1, #2]
+        strh            w11, [x1, #4]
+        lsl             w19, w19, #4                // w19 = R1[0] << 4
+        lsl             w20, w20, #4                // w20 = G1[0] << 4
+        lsl             w21, w21, #4                // w21 = B1[0] << 4
+        strh            w19, [x16]
+        strh            w20, [x16, #2]
+        strh            w21, [x16, #4]
+        add             x1,  x1,  #6
+        add             x16, x16, #6
+
+        cmp             w0,  #-2
+        b.lt            5f                          // (w & 3) == 1
+
+        umov            w9,  v3.h[1]                // clip(R0[1] >> 12)
+        umov            w10, v4.h[1]                // clip(G0[1] >> 12)
+        umov            w11, v5.h[1]                // clip(B0[1] >> 12)
+        ldrh            w9,  [x8, x9,  lsl #1]      // R0[1] = 
gamma.rgb[clip(R0[1] >> 12)]
+        ldrh            w10, [x8, x10, lsl #1]      // G0[1] = 
gamma.rgb[clip(G0[1] >> 12)]
+        ldrh            w11, [x8, x11, lsl #1]      // B0[1] = 
gamma.rgb[clip(B0[1] >> 12)]
+        umov            w19, v23.h[1]               // clip(R1[1] >> 12)
+        umov            w20, v24.h[1]               // clip(G1[1] >> 12)
+        umov            w21, v25.h[1]               // clip(B1[1] >> 12)
+        ldrh            w19, [x8, x19, lsl #1]      // R1[1] = 
gamma.rgb[clip(R1[1] >> 12)]
+        ldrh            w20, [x8, x20, lsl #1]      // G1[1] = 
gamma.rgb[clip(G1[1] >> 12)]
+        ldrh            w21, [x8, x21, lsl #1]      // B1[1] = 
gamma.rgb[clip(B1[1] >> 12)]
+        lsl             w9,  w9,  #4                // w9  = R0[1] << 4
+        lsl             w10, w10, #4                // w10 = G0[1] << 4
+        lsl             w11, w11, #4                // w11 = B0[1] << 4
+        strh            w9,  [x1]
+        strh            w10, [x1, #2]
+        strh            w11, [x1, #4]
+        lsl             w19, w19, #4                // w19 = R1[1] << 4
+        lsl             w20, w20, #4                // w20 = G1[1] << 4
+        lsl             w21, w21, #4                // w21 = B1[1] << 4
+        strh            w19, [x16]
+        strh            w20, [x16, #2]
+        strh            w21, [x16, #4]
+        add             x1,  x1,  #6
+        add             x16, x16, #6
+
+        b.le            5f                          // (w & 3) == 2
+
+        umov            w9,  v3.h[2]                // clip(R0[2] >> 12)
+        umov            w10, v4.h[2]                // clip(G0[2] >> 12)
+        umov            w11, v5.h[2]                // clip(B0[2] >> 12)
+        ldrh            w9,  [x8, x9,  lsl #1]      // R0[2] = 
gamma.rgb[clip(R0[2] >> 12)]
+        ldrh            w10, [x8, x10, lsl #1]      // G0[2] = 
gamma.rgb[clip(G0[2] >> 12)]
+        ldrh            w11, [x8, x11, lsl #1]      // B0[2] = 
gamma.rgb[clip(B0[2] >> 12)]
+        umov            w19, v23.h[2]               // clip(R1[2] >> 12)
+        umov            w20, v24.h[2]               // clip(G1[2] >> 12)
+        umov            w21, v25.h[2]               // clip(B1[2] >> 12)
+        ldrh            w19, [x8, x19, lsl #1]      // R1[2] = 
gamma.rgb[clip(R1[2] >> 12)]
+        ldrh            w20, [x8, x20, lsl #1]      // G1[2] = 
gamma.rgb[clip(G1[2] >> 12)]
+        ldrh            w21, [x8, x21, lsl #1]      // B1[2] = 
gamma.rgb[clip(B1[2] >> 12)]
+        lsl             w9,  w9,  #4                // w9  = R0[2] << 4
+        lsl             w10, w10, #4                // w10 = G0[2] << 4
+        lsl             w11, w11, #4                // w11 = B0[2] << 4
+        strh            w9,  [x1]
+        strh            w10, [x1, #2]
+        strh            w11, [x1, #4]
+        lsl             w19, w19, #4                // w19 = R1[2] << 4
+        lsl             w20, w20, #4                // w20 = G1[2] << 4
+        lsl             w21, w21, #4                // w21 = B1[2] << 4
+        strh            w19, [x16]
+        strh            w20, [x16, #2]
+        strh            w21, [x16, #4]
+        add             x1,  x1,  #6
+        add             x16, x16, #6
+
+        .align JUMP_ALIGN
+5:
+        add             x3,  x3,  x4
+        add             x17, x17, x4
+        add             x1,  x1,  x2
+        add             x16, x16, x2
+
+        subs            w6,  w6,  #2
+        b.ge            1b
+
+        ldp             x21, x22, [sp, #16]
+        ldp             x23, x24, [sp, #32]
+        ldr             x25, [sp, #48]
+        ldp             x19, x20, [sp], #64
+
+        .align JUMP_ALIGN
+6:
+        tbz             w6,  #0,  10f               // even number of lines; 
(h & 1) == 0
+
+        subs            w0,  w5,  #4
+        b.lt            8f                          // w < 4
+
+        .align LOOP_ALIGN
+7:      // loop for last odd line by 4 pixels: XYZ[0..3]
+        ldp             x9,  x10, [x3]              // x9  = X[0] Y[0] Z[0] 
X[1], x10 = Y[1] Z[1] X[2] Y[2]
+        ldr             x11, [x3, #16]              // x11 = Z[2] X[3] Y[3] 
Z[3]
+        add             x3,  x3,  #24
+
+        ubfx            x12, x9,  #4,  #12          // X[0] >> 4
+        lsr             x13, x9,  #52               // X[1] >> 4
+        ubfx            x14, x10, #36, #12          // X[2] >> 4
+        ubfx            x15, x11, #20, #12          // X[3] >> 4
+
+        ldr             h0,  [x7, x12, lsl #1]      // gamma.xyz[X[0] >> 4]
+        ubfx            x12, x9,  #20, #12          // Y[0] >> 4
+        ldr             h16, [x7, x13, lsl #1]      // gamma.xyz[X[1] >> 4]
+        ubfx            x13, x10, #4,  #12          // Y[1] >> 4
+        ldr             h17, [x7, x14, lsl #1]      // gamma.xyz[X[2] >> 4]
+        lsr             x14, x10, #52               // Y[2] >> 4
+        ldr             h18, [x7, x15, lsl #1]      // gamma.xyz[X[3] >> 4]
+        ubfx            x15, x11, #36, #12          // Y[3] >> 4
+        mov             v0.h[1],  v16.h[0]          // v0.4h  = 
gamma.xyz[X[0..1] >> 4]
+        mov             v17.h[1], v18.h[0]          // v17.4h = 
gamma.xyz[X[2..3] >> 4]
+        mov             v0.s[1],  v17.s[0]          // v0.4h  = 
gamma.xyz[X[0..3] >> 4]
+
+        umull           v3.4s,  v0.4h,  v6.h[0]     // R[0..3] = 
gamma.xyz[X[0..3] >> 4] * matrix[0][0]
+        umull           v5.4s,  v0.4h,  v6.h[6]     // B[0..3] = 
gamma.xyz[X[0..3] >> 4] * matrix[2][0]
+
+        ldr             h1,  [x7, x12, lsl #1]      // gamma.xyz[Y[0] >> 4]
+        ubfx            x12, x9,  #36, #12          // Z[0] >> 4
+        ldr             h16, [x7, x13, lsl #1]      // gamma.xyz[Y[1] >> 4]
+        ubfx            x13, x10, #20, #12          // Z[1] >> 4
+        ldr             h17, [x7, x14, lsl #1]      // gamma.xyz[Y[2] >> 4]
+        ubfx            x14, x11, #4,  #12          // Z[2] >> 4
+        ldr             h18, [x7, x15, lsl #1]      // gamma.xyz[Y[3] >> 4]
+        lsr             x15, x11, #52               // Z[3] >> 4
+        mov             v1.h[1],  v16.h[0]          // v1.4h  = 
gamma.xyz[Y[0..1] >> 4]
+        mov             v17.h[1], v18.h[0]          // v17.4h = 
gamma.xyz[Y[2..3] >> 4]
+        mov             v1.s[1],  v17.s[0]          // v1.4h  = 
gamma.xyz[Y[0..3] >> 4]
+
+        umull           v4.4s,  v1.4h,  v6.h[4]     // G[0..3]  = 
gamma.xyz[Y[0..3] >> 4] * matrix[1][1]
+        umlsl           v3.4s,  v1.4h,  v6.h[1]     // R[0..3] -= 
gamma.xyz[Y[0..3] >> 4] * matrix[0][1]
+        umlsl           v4.4s,  v0.4h,  v6.h[3]     // G[0..3] -= 
gamma.xyz[X[0..3] >> 4] * matrix[1][0]
+        umlsl           v5.4s,  v1.4h,  v6.h[7]     // B[0..3] -= 
gamma.xyz[Y[0..3] >> 4] * matrix[2][1]
+
+        ldr             h2,  [x7, x12, lsl #1]      // gamma.xyz[Z[0] >> 4]
+        ldr             h16, [x7, x13, lsl #1]      // gamma.xyz[Z[1] >> 4]
+        ldr             h17, [x7, x14, lsl #1]      // gamma.xyz[Z[2] >> 4]
+        ldr             h18, [x7, x15, lsl #1]      // gamma.xyz[Z[3] >> 4]
+        mov             v2.h[1],  v16.h[0]          // v2.4h  = 
gamma.xyz[Z[0..1] >> 4]
+        mov             v17.h[1], v18.h[0]          // v17.4h = 
gamma.xyz[Z[2..3] >> 4]
+        mov             v2.s[1],  v17.s[0]          // v2.4h  = 
gamma.xyz[Z[0..3] >> 4]
+
+        umlsl           v3.4s,  v2.4h,  v6.h[2]     // R[0..3] -= 
gamma.xyz[Z[0..3] >> 4] * matrix[0][2]
+        sqshrun         v3.4h,  v3.4s,  #12         // clip(R[0..3] >> 12)
+        umlal           v4.4s,  v2.4h,  v6.h[5]     // G[0..3] += 
gamma.xyz[Z[0..3] >> 4] * matrix[1][2]
+        sqshrun         v4.4h,  v4.4s,  #12         // clip(G[0..3] >> 12)
+        umlal           v5.4s,  v2.4h,  v7.h[0]     // B[0..3] += 
gamma.xyz[Z[0..3] >> 4] * matrix[2][2]
+        sqshrun         v5.4h,  v5.4s,  #12         // clip(B[0..3] >> 12)
+
+        umov            w9,  v3.h[0]                // clip(R[0] >> 12)
+        umov            w10, v4.h[1]                // clip(G[1] >> 12)
+        umov            w11, v5.h[2]                // clip(B[2] >> 12)
+
+        umov            w12, v4.h[0]                // clip(G[0] >> 12)
+        ldrh            w9,  [x8, x9,  lsl #1]      // R[0] = 
gamma.rgb[clip(R[0] >> 12)]
+        lsl             x9,  x9,  #4                // R[0] << 4
+        umov            w13, v5.h[1]                // clip(B[1] >> 12)
+        ldrh            w10, [x8, x10, lsl #1]      // G[1] = 
gamma.rgb[clip(G[1] >> 12)]
+        lsl             x10, x10, #4                // G[1] << 4
+        umov            w14, v3.h[3]                // clip(R[3] >> 12)
+        ldrh            w11, [x8, x11, lsl #1]      // B[2] = 
gamma.rgb[clip(B[2] >> 12)]
+        lsl             x11, x11, #4                // B[2] << 4
+
+        umov            w15, v5.h[0]                // clip(B[0] >> 12)
+        ldrh            w12, [x8, x12, lsl #1]      // G[0] = 
gamma.rgb[clip(G[0] >> 12)]
+        orr             x9,  x9,  x12, lsl #20      // R[0] << 4, G[0] << 4
+        umov            w12, v3.h[2]                // clip(R[2] >> 12)
+        ldrh            w13, [x8, x13, lsl #1]      // B[1] = 
gamma.rgb[clip(B[1] >> 12)]
+        orr             x10, x10, x13, lsl #20      // G[1] << 4, B[1] << 4
+        umov            w13, v4.h[3]                // clip(G[3] >> 12)
+        ldrh            w14, [x8, x14, lsl #1]      // R[3] = 
gamma.rgb[clip(R[3] >> 12)]
+        orr             x11, x11, x14, lsl #20      // B[2] << 4, R[3] << 4
+
+        umov            w14, v3.h[1]                // clip(R[1] >> 12)
+        ldrh            w15, [x8, x15, lsl #1]      // B[0] = 
gamma.rgb[clip(B[0] >> 12)]
+        orr             x9,  x9,  x15, lsl #36      // R[0] << 4, G[0] << 4, 
B[0] << 4
+        umov            w15, v4.h[2]                // clip(G[2] >> 12)
+        ldrh            w12, [x8, x12, lsl #1]      // R[2] = 
gamma.rgb[clip(R[2] >> 12)]
+        orr             x10, x10, x12, lsl #36      // G[1] << 4, B[1] << 4, 
R[2] << 4
+        umov            w12, v5.h[3]                // clip(B[3] >> 12)
+        ldrh            w13, [x8, x13, lsl #1]      // G[3] = 
gamma.rgb[clip(G[3] >> 12)]
+        orr             x11, x11, x13, lsl #36      // B[2] << 4, R[3] << 4, 
G[3] << 4
+
+        ldrh            w14, [x8, x14, lsl #1]      // R[1] = 
gamma.rgb[clip(R[1] >> 12)]
+        orr             x9,  x9,  x14, lsl #52      // x9  = R[0] << 4, G[0] 
<< 4, B[0] << 4, R[1] << 4
+        ldrh            w15, [x8, x15, lsl #1]      // G[2] = 
gamma.rgb[clip(G[2] >> 12)]
+        orr             x10, x10, x15, lsl #52      // x10 = G[1] << 4, B[1] 
<< 4, R[2] << 4, G[2] << 4
+        ldrh            w12, [x8, x12, lsl #1]      // B[3] = 
gamma.rgb[clip(B[3] >> 12)]
+        orr             x11, x11, x12, lsl #52      // x11 = B[2] << 4, R[3] 
<< 4, G[3] << 4, B[3] << 4
+
+        stp             x9,  x10, [x1]
+        str             x11, [x1, #16]
+        add             x1,  x1,  #24
+
+        subs            w0,  w0,  #4
+        b.ge            7b
+
+        .align JUMP_ALIGN
+8:
+        tst             w5,  #3
+        b.eq            10f                         // no residual pixels; (w 
& 3) == 0
+
+        ldr             w10, [x3]                   // w10 = X[0] Y[0]
+        ldrh            w11, [x3, #4]               // w11 = Z[0]
+        add             x3,  x3,  #6
+        ubfx            w9,  w10, #4,  #12          // X[0] >> 4
+        ubfx            w10, w10, #20, #12          // Y[0] >> 4
+        lsr             w11, w11, #4                // Z[0] >> 4
+        ldr             h0,  [x7, x9,  lsl #1]      // v0.4h = gamma.xyz[X[0] 
>> 4]
+        ldr             h1,  [x7, x10, lsl #1]      // v1.4h = gamma.xyz[Y[0] 
>> 4]
+        ldr             h2,  [x7, x11, lsl #1]      // v2.4h = gamma.xyz[Z[0] 
>> 4]
+
+        cmp             w0,  #-2
+        b.lt            9f                          // (w & 3) == 1
+
+        ldr             w10, [x3]                   // w10 = X[1] Y[1]
+        ldrh            w11, [x3, #4]               // w11 = Z[1]
+        add             x3,  x3,  #6
+        ubfx            w9,  w10, #4, #12           // X[1] >> 4
+        ubfx            w10, w10, #20, #12          // Y[1] >> 4
+        lsr             w11, w11, #4                // Z[1] >> 4
+        ldr             h16, [x7, x9,  lsl #1]      // gamma.xyz[X[1] >> 4]
+        ldr             h17, [x7, x10, lsl #1]      // gamma.xyz[Y[1] >> 4]
+        ldr             h18, [x7, x11, lsl #1]      // gamma.xyz[Z[1] >> 4]
+        mov             v0.h[1], v16.h[0]           // v0.4h = 
gamma.xyz[X[0..1] >> 4]
+        mov             v1.h[1], v17.h[0]           // v1.4h = 
gamma.xyz[Y[0..1] >> 4]
+        mov             v2.h[1], v18.h[0]           // v2.4h = 
gamma.xyz[Z[0..1] >> 4]
+
+        b.le            9f                          // (w & 3) == 2
+
+        ldr             w10, [x3]                   // w10 = X[2] Y[2]
+        ldrh            w11, [x3, #4]               // w11 = Z[2]
+        add             x3,  x3,  #6
+        ubfx            w9,  w10, #4, #12           // X[2] >> 4
+        ubfx            w10, w10, #20, #12          // Y[2] >> 4
+        lsr             w11, w11, #4                // Z[2] >> 4
+        ldr             h16, [x7, x9,  lsl #1]      // gamma.xyz[X[2] >> 4]
+        ldr             h17, [x7, x10, lsl #1]      // gamma.xyz[Y[2] >> 4]
+        ldr             h18, [x7, x11, lsl #1]      // gamma.xyz[Z[2] >> 4]
+        mov             v0.h[2], v16.h[0]           // v0.4h = 
gamma.xyz[X[0..2] >> 4]
+        mov             v1.h[2], v17.h[0]           // v1.4h = 
gamma.xyz[Y[0..2] >> 4]
+        mov             v2.h[2], v18.h[0]           // v2.4h = 
gamma.xyz[Z[0..2] >> 4]
+
+        .align JUMP_ALIGN
+9:
+        umull           v3.4s,  v0.4h,  v6.h[0]     // R[0..2] = 
gamma.xyz[X[0..2] >> 4] * matrix[0][0]
+        umull           v5.4s,  v0.4h,  v6.h[6]     // B[0..2] = 
gamma.xyz[X[0..2] >> 4] * matrix[2][0]
+
+        umull           v4.4s,  v1.4h,  v6.h[4]     // G[0..2]  = 
gamma.xyz[Y[0..2] >> 4] * matrix[1][1]
+        umlsl           v3.4s,  v1.4h,  v6.h[1]     // R[0..2] -= 
gamma.xyz[Y[0..2] >> 4] * matrix[0][1]
+        umlsl           v4.4s,  v0.4h,  v6.h[3]     // G[0..2] -= 
gamma.xyz[X[0..2] >> 4] * matrix[1][0]
+        umlsl           v5.4s,  v1.4h,  v6.h[7]     // B[0..2] -= 
gamma.xyz[Y[0..2] >> 4] * matrix[2][1]
+
+        umlsl           v3.4s,  v2.4h,  v6.h[2]     // R[0..2] -= 
gamma.xyz[Z[0..2] >> 4] * matrix[0][2]
+        sqshrun         v3.4h,  v3.4s,  #12         // clip(R[0..2] >> 12)
+        umlal           v4.4s,  v2.4h,  v6.h[5]     // G[0..2] += 
gamma.xyz[Z[0..2] >> 4] * matrix[1][2]
+        sqshrun         v4.4h,  v4.4s,  #12         // clip(G[0..2] >> 12)
+        umlal           v5.4s,  v2.4h,  v7.h[0]     // B[0..2] += 
gamma.xyz[Z[0..2] >> 4] * matrix[2][2]
+        sqshrun         v5.4h,  v5.4s,  #12         // clip(B[0..2] >> 12)
+
+        umov            w9,  v3.h[0]                // clip(R[0] >> 12)
+        umov            w10, v4.h[0]                // clip(G[0] >> 12)
+        umov            w11, v5.h[0]                // clip(B[0] >> 12)
+        ldrh            w9,  [x8, x9,  lsl #1]      // R[0] = 
gamma.rgb[clip(R[0] >> 12)]
+        ldrh            w10, [x8, x10, lsl #1]      // G[0] = 
gamma.rgb[clip(G[0] >> 12)]
+        ldrh            w11, [x8, x11, lsl #1]      // B[0] = 
gamma.rgb[clip(B[0] >> 12)]
+        lsl             w9,  w9,  #4                // w9  = R[0] << 4
+        lsl             w10, w10, #4                // w10 = G[0] << 4
+        lsl             w11, w11, #4                // w11 = B[0] << 4
+        strh            w9,  [x1]
+        strh            w10, [x1, #2]
+        strh            w11, [x1, #4]
+        add             x1,  x1,  #6
+
+        cmp             w0,  #-2
+        b.lt            10f                         // (w & 3) == 1
+
+        umov            w9,  v3.h[1]                // clip(R[1] >> 12)
+        umov            w10, v4.h[1]                // clip(G[1] >> 12)
+        umov            w11, v5.h[1]                // clip(B[1] >> 12)
+        ldrh            w9,  [x8, x9,  lsl #1]      // R[1] = 
gamma.rgb[clip(R[1] >> 12)]
+        ldrh            w10, [x8, x10, lsl #1]      // G[1] = 
gamma.rgb[clip(G[1] >> 12)]
+        ldrh            w11, [x8, x11, lsl #1]      // B[1] = 
gamma.rgb[clip(B[1] >> 12)]
+        lsl             w9,  w9,  #4                // w9  = R[1] << 4
+        lsl             w10, w10, #4                // w10 = G[1] << 4
+        lsl             w11, w11, #4                // w11 = B[1] << 4
+        strh            w9,  [x1]
+        strh            w10, [x1, #2]
+        strh            w11, [x1, #4]
+        add             x1,  x1,  #6
+
+        b.le            10f                         // (w & 3) == 2
+
+        umov            w9,  v3.h[2]                // clip(R[2] >> 12)
+        umov            w10, v4.h[2]                // clip(G[2] >> 12)
+        umov            w11, v5.h[2]                // clip(B[2] >> 12)
+        ldrh            w9,  [x8, x9,  lsl #1]      // R[2] = 
gamma.rgb[clip(R[2] >> 12)]
+        ldrh            w10, [x8, x10, lsl #1]      // G[2] = 
gamma.rgb[clip(G[2] >> 12)]
+        ldrh            w11, [x8, x11, lsl #1]      // B[2] = 
gamma.rgb[clip(B[2] >> 12)]
+        lsl             w9,  w9,  #4                // w9  = R[2] << 4
+        lsl             w10, w10, #4                // w10 = G[2] << 4
+        lsl             w11, w11, #4                // w11 = B[2] << 4
+        strh            w9,  [x1]
+        strh            w10, [x1, #2]
+        strh            w11, [x1, #4]
+        add             x1,  x1,  #6
+
+        .align JUMP_ALIGN
+10:
+        ret
+endfunc
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index c795427a83..fc4f1f6d0c 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -860,6 +860,10 @@ void ff_rgb48Toxyz12(const SwsInternal *c, uint8_t *dst, int dst_stride,
 av_cold void ff_sws_init_xyz2rgb(SwsInternal *c)
 {
     c->xyz12Torgb48 = xyz12Torgb48_c;
+
+#if ARCH_AARCH64
+    ff_sws_init_xyz2rgb_aarch64(c); /* may replace the C default set above */
+#endif
 }
 
 void ff_update_palette(SwsInternal *c, const uint32_t *pal)
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 107671feb2..d1aa15af36 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -729,6 +729,7 @@ av_cold void ff_sws_init_range_convert_riscv(SwsInternal *c);
 av_cold void ff_sws_init_range_convert_x86(SwsInternal *c);
 
 av_cold void ff_sws_init_xyz2rgb(SwsInternal *c);
+av_cold void ff_sws_init_xyz2rgb_aarch64(SwsInternal *c);
 
 SwsFunc ff_yuv2rgb_init_x86(SwsInternal *c);
 SwsFunc ff_yuv2rgb_init_ppc(SwsInternal *c);
-- 
2.43.0

From 10c3422e21eb34aafddb27ba2bb3ed72444c3cee Mon Sep 17 00:00:00 2001
From: Arpad Panyik <[email protected]>
Date: Wed, 26 Nov 2025 09:28:20 +0000
Subject: [PATCH 3/3] swscale: Add AArch64 Neon path for xyz12Torgb48 LE

Add an optimized Neon code path for the little-endian case of the
xyz12Torgb48 function. The innermost loop processes the data in 4x2
pixel blocks using software gathers, with the matrix multiplication
and clipping done by Neon.

Speedup factors in micro benchmarks after this patch on some
Cortex and Neoverse CPU cores:

 xyz12le_rgb48le    X1      X3      X4    X925      V2
 16x4_neon:       2.39x   4.04x   2.84x   3.27x   3.02x
 32x4_neon:       2.42x   3.34x   2.98x   3.34x   2.97x
 64x4_neon:       2.36x   3.12x   2.99x   3.32x   2.95x
 128x4_neon:      2.36x   3.08x   3.01x   3.34x   2.95x
 256x4_neon:      2.33x   3.08x   3.08x   3.41x   2.95x
 512x4_neon:      2.30x   3.04x   3.00x   3.54x   2.88x
 1024x4_neon:     2.28x   3.01x   2.88x   3.55x   3.07x
 1920x4_neon:     2.27x   2.94x   2.79x   3.53x   2.86x

 xyz12le_rgb48le   A76     A78    A715    A720    A725
 16x4_neon:       2.36x   2.20x   2.32x   2.99x   2.98x
 32x4_neon:       2.40x   2.25x   2.37x   2.99x   3.02x
 64x4_neon:       2.37x   2.22x   2.34x   2.97x   3.03x
 128x4_neon:      2.35x   2.23x   2.33x   2.93x   3.00x
 256x4_neon:      2.39x   2.23x   2.35x   2.88x   2.92x
 512x4_neon:      2.39x   2.21x   2.32x   2.81x   2.89x
 1024x4_neon:     2.37x   2.18x   2.31x   2.79x   2.89x
 1920x4_neon:     2.37x   2.17x   2.30x   2.77x   2.86x

 xyz12le_rgb48le   A55    A510    A520
 16x4_neon:       1.98x   1.96x   2.23x
 32x4_neon:       2.03x   1.96x   2.20x
 64x4_neon:       2.01x   1.95x   2.24x
 128x4_neon:      1.99x   1.91x   2.22x
 256x4_neon:      1.92x   1.86x   2.22x
 512x4_neon:      1.89x   1.80x   2.19x
 1024x4_neon:     1.90x   1.80x   2.19x
 1920x4_neon:     1.91x   1.79x   2.20x

Signed-off-by: Arpad Panyik <[email protected]>
---
 libswscale/aarch64/Makefile       |   1 +
 libswscale/aarch64/swscale.c      |  23 +
 libswscale/aarch64/xyz2rgb_neon.S | 709 ++++++++++++++++++++++++++++++
 libswscale/swscale.c              |   4 +
 libswscale/swscale_internal.h     |   1 +
 5 files changed, 738 insertions(+)
 create mode 100644 libswscale/aarch64/xyz2rgb_neon.S

diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
index 1de8c9c0d6..1c82e34e28 100644
--- a/libswscale/aarch64/Makefile
+++ b/libswscale/aarch64/Makefile
@@ -8,4 +8,5 @@ NEON-OBJS   += aarch64/hscale.o                 \
                aarch64/range_convert_neon.o     \
                aarch64/rgb2rgb_neon.o           \
                aarch64/swscale_unscaled_neon.o  \
+               aarch64/xyz2rgb_neon.o           \
                aarch64/yuv2rgb_neon.o           \
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index 55fff03a5a..80a89f7504 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -22,6 +22,18 @@
 #include "libswscale/swscale_internal.h"
 #include "libavutil/aarch64/cpu.h"
 
+void ff_xyz12Torgb48le_neon_asm(const ColorXform *c, uint8_t *dst,
+                                int dst_stride, const uint8_t *src,
+                                int src_stride, int w, int h);
+/* Adapter for c->xyz12Torgb48: hands &c->xyz2rgb to the Neon routine above. */
+static void xyz12Torgb48le_neon(const SwsInternal *c, uint8_t *dst,
+                                int dst_stride, const uint8_t *src,
+                                int src_stride, int w, int h)
+{
+    ff_xyz12Torgb48le_neon_asm(&c->xyz2rgb, dst, dst_stride,
+                               src, src_stride, w, h);
+}
+
 void ff_hscale16to15_4_neon_asm(int shift, int16_t *_dst, int dstW,
                       const uint8_t *_src, const int16_t *filter,
                       const int32_t *filterPos, int filterSize);
@@ -307,6 +319,17 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)
     }
 }
 
+av_cold void ff_sws_init_xyz2rgb_aarch64(SwsInternal *c)
+{
+    /* Without Neon there is nothing to install; c is left untouched. */
+    if (!have_neon(av_get_cpu_flags()))
+        return;
+    /* The Neon routine is only installed for non-big-endian source formats. */
+    if (av_pix_fmt_desc_get(c->opts.src_format)->flags & AV_PIX_FMT_FLAG_BE)
+        return;
+    c->xyz12Torgb48 = xyz12Torgb48le_neon;
+}
+
 av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
 {
     int cpu_flags = av_get_cpu_flags();
diff --git a/libswscale/aarch64/xyz2rgb_neon.S b/libswscale/aarch64/xyz2rgb_neon.S
new file mode 100644
index 0000000000..b23903c9eb
--- /dev/null
+++ b/libswscale/aarch64/xyz2rgb_neon.S
@@ -0,0 +1,709 @@
+/*
+ * Copyright (c) 2025 Arpad Panyik <[email protected]>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+#ifndef JUMP_ALIGN
+#define JUMP_ALIGN 2 /* .align argument placed before branch-target labels; overridable */
+#endif
+#ifndef LOOP_ALIGN
+#define LOOP_ALIGN 2 /* .align argument placed before loop-head labels; overridable */
+#endif
+
+#define GAMMA_XYZ 0 /* byte offset from the ColorXform pointer (x0): gamma.xyz table pointer */
+#define GAMMA_RGB 8 /* gamma.rgb table pointer; fetched by the same ldp as gamma.xyz */
+#define MATRIX_00 16 /* matrix[0][0]..[2][1], loaded as one 128-bit q register */
+#define MATRIX_22 32 /* matrix[2][2], loaded separately as a single halfword */
+
+function ff_xyz12Torgb48le_neon_asm, export=1
+// x0  const ColorXform *c
+// x1  uint8_t *dst
+// w2  int dst_stride
+// x3  const uint8_t *src
+// w4  int src_stride
+// w5  int w
+// w6  int h
+
+        ldp             x7,  x8, [x0, #(GAMMA_XYZ)] // gamma.xyz, gamma.rgb
+        ldr             q6,  [x0, #(MATRIX_00)]     // matrix[0][0]..[2][1]
+        ldr             h7,  [x0, #(MATRIX_22)]     // matrix[2][2]; > 0
+        add             w9,  w5,  w5, lsl #1        // w * 3
+        add             x17, x3,  w4, sxtw          // sr2 = src + src_stride
+        add             x16, x1,  w2, sxtw          // ds2 = dst + dst_stride
+        sub             w4,  w4,  w9                // src_stride - w * 3
+        sub             w2,  w2,  w9                // dst_stride - w * 3
+        abs             v6.8h,  v6.8h               // abs(matrix[0][0]..[2][1])
+        sbfiz           x4,  x4,  #1, #32           // src_stride * 2 - w * 6
+        sbfiz           x2,  x2,  #1, #32           // dst_stride * 2 - w * 6
+
+        subs            w6,  w6,  #2
+        b.lt            6f                          // h < 2
+
+        stp             x19, x20, [sp, #-64]!
+        stp             x21, x22, [sp, #16]
+        stp             x23, x24, [sp, #32]
+        str             x25, [sp, #48]
+
+        .align LOOP_ALIGN
+1:      // yp loop for 2x4 pixels
+        subs            w0,  w5,  #4
+        b.lt            3f                          // w < 4
+
+        .align LOOP_ALIGN
+2:      // xp loop for 2x4 pixels: XYZ0[0..3], XYZ1[0..3]
+        ldp             x9,  x10, [x3]              // x9  = X0[0] Y0[0] Z0[0] X0[1], x10 = Y0[1] Z0[1] X0[2] Y0[2]
+        ldr             x11, [x3, #16]              // x11 = Z0[2] X0[3] Y0[3] Z0[3]
+        add             x3,  x3,  #24
+        ubfx            x12, x9,  #4,  #12          // X0[0] >> 4
+        lsr             x13, x9,  #52               // X0[1] >> 4
+        ubfx            x14, x10, #36, #12          // X0[2] >> 4
+        ubfx            x15, x11, #20, #12          // X0[3] >> 4
+
+        ldp             x19, x20, [x17]             // x19 = X1[0] Y1[0] Z1[0] X1[1], x20 = Y1[1] Z1[1] X1[2] Y1[2]
+        ldr             x21, [x17, #16]             // x21 = Z1[2] X1[3] Y1[3] Z1[3]
+        add             x17, x17, #24
+        ubfx            x22, x19, #4, #12           // X1[0] >> 4
+        lsr             x23, x19, #52               // X1[1] >> 4
+        ubfx            x24, x20, #36, #12          // X1[2] >> 4
+        ubfx            x25, x21, #20, #12          // X1[3] >> 4
+
+        ldr             h0,  [x7, x12, lsl #1]      // gamma.xyz[X0[0] >> 4]
+        ubfx            x12, x9,  #20, #12          // Y0[0] >> 4
+        ldr             h16, [x7, x13, lsl #1]      // gamma.xyz[X0[1] >> 4]
+        ubfx            x13, x10, #4, #12           // Y0[1] >> 4
+        ldr             h17, [x7, x14, lsl #1]      // gamma.xyz[X0[2] >> 4]
+        lsr             x14, x10, #52               // Y0[2] >> 4
+        ldr             h18, [x7, x15, lsl #1]      // gamma.xyz[X0[3] >> 4]
+        ubfx            x15, x11, #36, #12          // Y0[3] >> 4
+
+        ldr             h20, [x7, x22, lsl #1]      // gamma.xyz[X1[0] >> 4]
+        ubfx            x22, x19, #20, #12          // Y1[0] >> 4
+        ldr             h26, [x7, x23, lsl #1]      // gamma.xyz[X1[1] >> 4]
+        ubfx            x23, x20, #4,  #12          // Y1[1] >> 4
+        ldr             h27, [x7, x24, lsl #1]      // gamma.xyz[X1[2] >> 4]
+        lsr             x24, x20, #52               // Y1[2] >> 4
+        ldr             h28, [x7, x25, lsl #1]      // gamma.xyz[X1[3] >> 4]
+        ubfx            x25, x21, #36, #12          // Y1[3] >> 4
+
+        mov             v0.h[1],  v16.h[0]          // v0.4h  = gamma.xyz[X0[0..1] >> 4]
+        mov             v17.h[1], v18.h[0]          // v17.4h = gamma.xyz[X0[2..3] >> 4]
+        mov             v0.s[1],  v17.s[0]          // v0.4h  = gamma.xyz[X0[0..3] >> 4]
+        ldr             h1,  [x7, x12, lsl #1]      // gamma.xyz[Y0[0] >> 4]
+        umull           v3.4s, v0.4h, v6.h[0]       // R0[0..3] = gamma.xyz[X0[0..3] >> 4] * matrix[0][0]
+        umull           v5.4s, v0.4h, v6.h[6]       // B0[0..3] = gamma.xyz[X0[0..3] >> 4] * matrix[2][0]
+        ubfx            x12, x9,  #36, #12          // Z0[0] >> 4
+        ldr             h16, [x7, x13, lsl #1]      // gamma.xyz[Y0[1] >> 4]
+
+        mov             v20.h[1], v26.h[0]          // v20.4h = gamma.xyz[X1[0..1] >> 4]
+        mov             v27.h[1], v28.h[0]          // v27.4h = gamma.xyz[X1[2..3] >> 4]
+        mov             v20.s[1], v27.s[0]          // v20.4h = gamma.xyz[X1[0..3] >> 4]
+        ldr             h21, [x7, x22, lsl #1]      // gamma.xyz[Y1[0] >> 4]
+        umull           v23.4s, v20.4h, v6.h[0]     // R1[0..3] = gamma.xyz[X1[0..3] >> 4] * matrix[0][0]
+        umull           v25.4s, v20.4h, v6.h[6]     // B1[0..3] = gamma.xyz[X1[0..3] >> 4] * matrix[2][0]
+        ubfx            x22, x19, #36, #12          // Z1[0] >> 4
+        ldr             h26, [x7, x23, lsl #1]      // gamma.xyz[Y1[1] >> 4]
+
+        ubfx            x13, x10, #20, #12          // Z0[1] >> 4
+        ldr             h17, [x7, x14, lsl #1]      // gamma.xyz[Y0[2] >> 4]
+        ubfx            x14, x11, #4,  #12          // Z0[2] >> 4
+        ldr             h18, [x7, x15, lsl #1]      // gamma.xyz[Y0[3] >> 4]
+        lsr             x15, x11, #52               // Z0[3] >> 4
+        mov             v1.h[1],  v16.h[0]          // v1.4h  = gamma.xyz[Y0[0..1] >> 4]
+        mov             v17.h[1], v18.h[0]          // v17.4h = gamma.xyz[Y0[2..3] >> 4]
+        mov             v1.s[1],  v17.s[0]          // v1.4h  = gamma.xyz[Y0[0..3] >> 4]
+
+        ubfx            x23, x20, #20, #12          // Z1[1] >> 4
+        ldr             h27, [x7, x24, lsl #1]      // gamma.xyz[Y1[2] >> 4]
+        ubfx            x24, x21, #4,  #12          // Z1[2] >> 4
+        ldr             h28, [x7, x25, lsl #1]      // gamma.xyz[Y1[3] >> 4]
+        umull           v4.4s,  v1.4h,  v6.h[4]     // G0[0..3]  = gamma.xyz[Y0[0..3] >> 4] * matrix[1][1]
+        umlsl           v3.4s,  v1.4h,  v6.h[1]     // R0[0..3] -= gamma.xyz[Y0[0..3] >> 4] * matrix[0][1]
+
+        lsr             x25, x21, #52               // Z1[3] >> 4
+        mov             v21.h[1], v26.h[0]          // v21.4h = gamma.xyz[Y1[0..1] >> 4]
+        mov             v27.h[1], v28.h[0]          // v27.4h = gamma.xyz[Y1[2..3] >> 4]
+        mov             v21.s[1], v27.s[0]          // v21.4h = gamma.xyz[Y1[0..3] >> 4]
+        umlsl           v4.4s,  v0.4h,  v6.h[3]     // G0[0..3] -= gamma.xyz[X0[0..3] >> 4] * matrix[1][0]
+        umlsl           v5.4s,  v1.4h,  v6.h[7]     // B0[0..3] -= gamma.xyz[Y0[0..3] >> 4] * matrix[2][1]
+
+        ldr             h2,  [x7, x12, lsl #1]      // gamma.xyz[Z0[0] >> 4]
+        ldr             h16, [x7, x13, lsl #1]      // gamma.xyz[Z0[1] >> 4]
+        ldr             h17, [x7, x14, lsl #1]      // gamma.xyz[Z0[2] >> 4]
+        ldr             h18, [x7, x15, lsl #1]      // gamma.xyz[Z0[3] >> 4]
+        umull           v24.4s, v21.4h, v6.h[4]     // G1[0..3]  = gamma.xyz[Y1[0..3] >> 4] * matrix[1][1]
+        umlsl           v23.4s, v21.4h, v6.h[1]     // R1[0..3] -= gamma.xyz[Y1[0..3] >> 4] * matrix[0][1]
+
+        mov             v2.h[1],  v16.h[0]          // v2.4h  = gamma.xyz[Z0[0..1] >> 4]
+        mov             v17.h[1], v18.h[0]          // v17.4h = gamma.xyz[Z0[2..3] >> 4]
+        mov             v2.s[1],  v17.s[0]          // v2.4h  = gamma.xyz[Z0[0..3] >> 4]
+        umlsl           v24.4s, v20.4h, v6.h[3]     // G1[0..3] -= gamma.xyz[X1[0..3] >> 4] * matrix[1][0]
+        umlsl           v25.4s, v21.4h, v6.h[7]     // B1[0..3] -= gamma.xyz[Y1[0..3] >> 4] * matrix[2][1]
+
+        ldr             h22, [x7, x22, lsl #1]      // gamma.xyz[Z1[0] >> 4]
+        ldr             h26, [x7, x23, lsl #1]      // gamma.xyz[Z1[1] >> 4]
+        ldr             h27, [x7, x24, lsl #1]      // gamma.xyz[Z1[2] >> 4]
+        ldr             h28, [x7, x25, lsl #1]      // gamma.xyz[Z1[3] >> 4]
+        mov             v22.h[1], v26.h[0]          // v22.4h = gamma.xyz[Z1[0..1] >> 4]
+        mov             v27.h[1], v28.h[0]          // v27.4h = gamma.xyz[Z1[2..3] >> 4]
+        mov             v22.s[1], v27.s[0]          // v22.4h = gamma.xyz[Z1[0..3] >> 4]
+
+        umlsl           v3.4s,  v2.4h,  v6.h[2]     // R0[0..3] -= gamma.xyz[Z0[0..3] >> 4] * matrix[0][2]
+        sqshrun         v3.4h,  v3.4s,  #12         // clip(R0[0..3] >> 12)
+        umlal           v4.4s,  v2.4h,  v6.h[5]     // G0[0..3] += gamma.xyz[Z0[0..3] >> 4] * matrix[1][2]
+        sqshrun         v4.4h,  v4.4s,  #12         // clip(G0[0..3] >> 12)
+        umov            w9,  v3.h[0]                // clip(R0[0] >> 12)
+        umov            w10, v4.h[1]                // clip(G0[1] >> 12)
+        umlal           v5.4s,  v2.4h,  v7.h[0]     // B0[0..3] += gamma.xyz[Z0[0..3] >> 4] * matrix[2][2]
+        sqshrun         v5.4h,  v5.4s,  #12         // clip(B0[0..3] >> 12)
+
+        umlsl           v23.4s, v22.4h, v6.h[2]     // R1[0..3] -= gamma.xyz[Z1[0..3] >> 4] * matrix[0][2]
+        sqshrun         v23.4h, v23.4s, #12         // clip(R1[0..3] >> 12)
+        umlal           v24.4s, v22.4h, v6.h[5]     // G1[0..3] += gamma.xyz[Z1[0..3] >> 4] * matrix[1][2]
+        sqshrun         v24.4h, v24.4s, #12         // clip(G1[0..3] >> 12)
+        umov            w19, v23.h[0]               // clip(R1[0] >> 12)
+        umov            w20, v24.h[1]               // clip(G1[1] >> 12)
+        umlal           v25.4s, v22.4h, v7.h[0]     // B1[0..3] += gamma.xyz[Z1[0..3] >> 4] * matrix[2][2]
+        sqshrun         v25.4h, v25.4s, #12         // clip(B1[0..3] >> 12)
+
+        umov            w11, v5.h[2]                // clip(B0[2] >> 12)
+        umov            w12, v4.h[0]                // clip(G0[0] >> 12)
+        ldrh            w9,  [x8, x9,  lsl #1]      // R0[0] = gamma.rgb[clip(R0[0] >> 12)]
+        lsl             x9,  x9,  #4                // R0[0] << 4
+        umov            w13, v5.h[1]                // clip(B0[1] >> 12)
+        ldrh            w10, [x8, x10, lsl #1]      // G0[1] = gamma.rgb[clip(G0[1] >> 12)]
+        lsl             x10, x10, #4                // G0[1] << 4
+
+        umov            w21, v25.h[2]               // clip(B1[2] >> 12)
+        umov            w22, v24.h[0]               // clip(G1[0] >> 12)
+        ldrh            w19, [x8, x19, lsl #1]      // R1[0] = gamma.rgb[clip(R1[0] >> 12)]
+        lsl             x19, x19, #4                // R1[0] << 4
+        umov            w23, v25.h[1]               // clip(B1[1] >> 12)
+        ldrh            w20, [x8, x20, lsl #1]      // G1[1] = gamma.rgb[clip(G1[1] >> 12)]
+        lsl             x20, x20, #4                // G1[1] << 4
+
+        umov            w14, v3.h[3]                // clip(R0[3] >> 12)
+        ldrh            w11, [x8, x11, lsl #1]      // B0[2] = gamma.rgb[clip(B0[2] >> 12)]
+        lsl             x11, x11, #4                // B0[2] << 4
+        umov            w15, v5.h[0]                // clip(B0[0] >> 12)
+        ldrh            w12, [x8, x12, lsl #1]      // G0[0] = gamma.rgb[clip(G0[0] >> 12)]
+        orr             x9,  x9,  x12, lsl #20      // R0[0] << 4, G0[0] << 4
+        umov            w12, v3.h[2]                // clip(R0[2] >> 12)
+        ldrh            w13, [x8, x13, lsl #1]      // B0[1] = gamma.rgb[clip(B0[1] >> 12)]
+
+        umov            w24, v23.h[3]               // clip(R1[3] >> 12)
+        ldrh            w21, [x8, x21, lsl #1]      // B1[2] = gamma.rgb[clip(B1[2] >> 12)]
+        lsl             x21, x21, #4                // B1[2] << 4
+        umov            w25, v25.h[0]               // clip(B1[0] >> 12)
+        ldrh            w22, [x8, x22, lsl #1]      // G1[0] = gamma.rgb[clip(G1[0] >> 12)]
+        orr             x19, x19, x22, lsl #20      // R1[0] << 4, G1[0] << 4
+        umov            w22, v23.h[2]               // clip(R1[2] >> 12)
+        ldrh            w23, [x8, x23, lsl #1]      // B1[1] = gamma.rgb[clip(B1[1] >> 12)]
+
+        orr             x10, x10, x13, lsl #20      // G0[1] << 4, B0[1] << 4
+        umov            w13, v4.h[3]                // clip(G0[3] >> 12)
+        ldrh            w14, [x8, x14, lsl #1]      // R0[3] = gamma.rgb[clip(R0[3] >> 12)]
+        orr             x11, x11, x14, lsl #20      // B0[2] << 4, R0[3] << 4
+        umov            w14, v3.h[1]                // clip(R0[1] >> 12)
+        ldrh            w15, [x8, x15, lsl #1]      // B0[0] = gamma.rgb[clip(B0[0] >> 12)]
+        orr             x9,  x9,  x15, lsl #36      // R0[0] << 4, G0[0] << 4, B0[0] << 4
+        umov            w15, v4.h[2]                // clip(G0[2] >> 12)
+
+        orr             x20, x20, x23, lsl #20      // G1[1] << 4, B1[1] << 4
+        umov            w23, v24.h[3]               // clip(G1[3] >> 12)
+        ldrh            w24, [x8, x24, lsl #1]      // R1[3] = gamma.rgb[clip(R1[3] >> 12)]
+        orr             x21, x21, x24, lsl #20      // B1[2] << 4, R1[3] << 4
+        umov            w24, v23.h[1]               // clip(R1[1] >> 12)
+        ldrh            w25, [x8, x25, lsl #1]      // B1[0] = gamma.rgb[clip(B1[0] >> 12)]
+        orr             x19, x19, x25, lsl #36      // R1[0] << 4, G1[0] << 4, B1[0] << 4
+        umov            w25, v24.h[2]               // clip(G1[2] >> 12)
+
+        ldrh            w12, [x8, x12, lsl #1]      // R0[2] = gamma.rgb[clip(R0[2] >> 12)]
+        orr             x10, x10, x12, lsl #36      // G0[1] << 4, B0[1] << 4, R0[2] << 4
+        umov            w12, v5.h[3]                // clip(B0[3] >> 12)
+        ldrh            w13, [x8, x13, lsl #1]      // G0[3] = gamma.rgb[clip(G0[3] >> 12)]
+        orr             x11, x11, x13, lsl #36      // B0[2] << 4, R0[3] << 4, G0[3] << 4
+        ldrh            w14, [x8, x14, lsl #1]      // R0[1] = gamma.rgb[clip(R0[1] >> 12)]
+        orr             x9,  x9,  x14, lsl #52      // x9  = R0[0] << 4, G0[0] << 4, B0[0] << 4, R0[1] << 4
+        ldrh            w15, [x8, x15, lsl #1]      // G0[2] = gamma.rgb[clip(G0[2] >> 12)]
+        orr             x10, x10, x15, lsl #52      // x10 = G0[1] << 4, B0[1] << 4, R0[2] << 4, G0[2] << 4
+        ldrh            w12, [x8, x12, lsl #1]      // B0[3] = gamma.rgb[clip(B0[3] >> 12)]
+        orr             x11, x11, x12, lsl #52      // x11 = B0[2] << 4, R0[3] << 4, G0[3] << 4, B0[3] << 4
+        stp             x9,  x10, [x1]
+        str             x11, [x1, #16]
+
+        ldrh            w22, [x8, x22, lsl #1]      // R1[2] = gamma.rgb[clip(R1[2] >> 12)]
+        orr             x20, x20, x22, lsl #36      // G1[1] << 4, B1[1] << 4, R1[2] << 4
+        umov            w22, v25.h[3]               // clip(B1[3] >> 12)
+        ldrh            w23, [x8, x23, lsl #1]      // G1[3] = gamma.rgb[clip(G1[3] >> 12)]
+        orr             x21, x21, x23, lsl #36      // B1[2] << 4, R1[3] << 4, G1[3] << 4
+        ldrh            w24, [x8, x24, lsl #1]      // R1[1] = gamma.rgb[clip(R1[1] >> 12)]
+        orr             x19, x19, x24, lsl #52      // x19 = R1[0] << 4, G1[0] << 4, B1[0] << 4, R1[1] << 4
+        ldrh            w25, [x8, x25, lsl #1]      // G1[2] = gamma.rgb[clip(G1[2] >> 12)]
+        orr             x20, x20, x25, lsl #52      // x20 = G1[1] << 4, B1[1] << 4, R1[2] << 4, G1[2] << 4
+        ldrh            w22, [x8, x22, lsl #1]      // B1[3] = gamma.rgb[clip(B1[3] >> 12)]
+        orr             x21, x21, x22, lsl #52      // x21 = B1[2] << 4, R1[3] << 4, G1[3] << 4, B1[3] << 4
+        stp             x19, x20, [x16]
+        str             x21, [x16, #16]
+
+        add             x1,  x1,  #24
+        add             x16, x16, #24
+
+        subs            w0,  w0,  #4
+        b.ge            2b
+
+        .align JUMP_ALIGN
+3:
+        tst             w5,  #3
+        b.eq            5f                          // no residual pixels; (w & 3) == 0
+
+        ldr             w10, [x3]                   // w10 = X0[0] Y0[0]
+        ldrh            w11, [x3, #4]               // w11 = Z0[0]
+        add             x3,  x3,  #6
+        ldr             w20, [x17]                  // w20 = X1[0] Y1[0]
+        ldrh            w21, [x17, #4]              // w21 = Z1[0]
+        add             x17, x17, #6
+        ubfx            w9,  w10, #4,  #12          // X0[0] >> 4
+        ubfx            w10, w10, #20, #12          // Y0[0] >> 4
+        lsr             w11, w11, #4                // Z0[0] >> 4
+        ldr             h0,  [x7, x9,  lsl #1]      // v0.4h = gamma.xyz[X0[0] >> 4]
+        ldr             h1,  [x7, x10, lsl #1]      // v1.4h = gamma.xyz[Y0[0] >> 4]
+        ldr             h2,  [x7, x11, lsl #1]      // v2.4h = gamma.xyz[Z0[0] >> 4]
+        ubfx            w19, w20, #4,  #12          // X1[0] >> 4
+        ubfx            w20, w20, #20, #12          // Y1[0] >> 4
+        lsr             w21, w21, #4                // Z1[0] >> 4
+        ldr             h20, [x7, x19, lsl #1]      // v20.4h = gamma.xyz[X1[0] >> 4]
+        ldr             h21, [x7, x20, lsl #1]      // v21.4h = gamma.xyz[Y1[0] >> 4]
+        ldr             h22, [x7, x21, lsl #1]      // v22.4h = gamma.xyz[Z1[0] >> 4]
+
+        cmp             w0,  #-2
+        b.lt            4f                          // (w & 3) == 1
+
+        ldr             w10, [x3]                   // w10 = X0[1] Y0[1]
+        ldrh            w11, [x3, #4]               // w11 = Z0[1]
+        add             x3,  x3,  #6
+        ldr             w20, [x17]                  // w20 = X1[1] Y1[1]
+        ldrh            w21, [x17, #4]              // w21 = Z1[1]
+        add             x17, x17,  #6
+        ubfx            w9,  w10, #4,  #12          // X0[1] >> 4
+        ubfx            w10, w10, #20, #12          // Y0[1] >> 4
+        lsr             w11, w11, #4                // Z0[1] >> 4
+        ldr             h16, [x7, x9,  lsl #1]      // gamma.xyz[X0[1] >> 4]
+        ldr             h17, [x7, x10, lsl #1]      // gamma.xyz[Y0[1] >> 4]
+        ldr             h18, [x7, x11, lsl #1]      // gamma.xyz[Z0[1] >> 4]
+        ubfx            w19, w20, #4,  #12          // X1[1] >> 4
+        ubfx            w20, w20, #20, #12          // Y1[1] >> 4
+        lsr             w21, w21, #4                // Z1[1] >> 4
+        ldr             h23, [x7, x19, lsl #1]      // gamma.xyz[X1[1] >> 4]
+        ldr             h24, [x7, x20, lsl #1]      // gamma.xyz[Y1[1] >> 4]
+        ldr             h25, [x7, x21, lsl #1]      // gamma.xyz[Z1[1] >> 4]
+        mov             v0.h[1],  v16.h[0]          // v0.4h = gamma.xyz[X0[0..1] >> 4]
+        mov             v1.h[1],  v17.h[0]          // v1.4h = gamma.xyz[Y0[0..1] >> 4]
+        mov             v2.h[1],  v18.h[0]          // v2.4h = gamma.xyz[Z0[0..1] >> 4]
+        mov             v20.h[1], v23.h[0]          // v20.4h = gamma.xyz[X1[0..1] >> 4]
+        mov             v21.h[1], v24.h[0]          // v21.4h = gamma.xyz[Y1[0..1] >> 4]
+        mov             v22.h[1], v25.h[0]          // v22.4h = gamma.xyz[Z1[0..1] >> 4]
+
+        b.le            4f                          // (w & 3) == 2
+
+        ldr             w10, [x3]                   // w10 = X0[2] Y0[2]
+        ldrh            w11, [x3, #4]               // w11 = Z0[2]
+        add             x3,  x3,  #6
+        ldr             w20, [x17]                  // w20 = X1[2] Y1[2]
+        ldrh            w21, [x17, #4]              // w21 = Z1[2]
+        add             x17, x17, #6
+        ubfx            w9,  w10, #4,  #12          // X0[2] >> 4
+        ubfx            w10, w10, #20, #12          // Y0[2] >> 4
+        lsr             w11, w11, #4                // Z0[2] >> 4
+        ldr             h16, [x7, x9,  lsl #1]      // gamma.xyz[X0[2] >> 4]
+        ldr             h17, [x7, x10, lsl #1]      // gamma.xyz[Y0[2] >> 4]
+        ldr             h18, [x7, x11, lsl #1]      // gamma.xyz[Z0[2] >> 4]
+        ubfx            w19, w20, #4,  #12          // X1[2] >> 4
+        ubfx            w20, w20, #20, #12          // Y1[2] >> 4
+        lsr             w21, w21, #4                // Z1[2] >> 4
+        ldr             h23, [x7, x19, lsl #1]      // gamma.xyz[X1[2] >> 4]
+        ldr             h24, [x7, x20, lsl #1]      // gamma.xyz[Y1[2] >> 4]
+        ldr             h25, [x7, x21, lsl #1]      // gamma.xyz[Z1[2] >> 4]
+        mov             v0.h[2],  v16.h[0]          // v0.4h = gamma.xyz[X0[0..2] >> 4]
+        mov             v1.h[2],  v17.h[0]          // v1.4h = gamma.xyz[Y0[0..2] >> 4]
+        mov             v2.h[2],  v18.h[0]          // v2.4h = gamma.xyz[Z0[0..2] >> 4]
+        mov             v20.h[2], v23.h[0]          // v20.4h = gamma.xyz[X1[0..2] >> 4]
+        mov             v21.h[2], v24.h[0]          // v21.4h = gamma.xyz[Y1[0..2] >> 4]
+        mov             v22.h[2], v25.h[0]          // v22.4h = gamma.xyz[Z1[0..2] >> 4]
+
+        .align JUMP_ALIGN
+4:
+        umull           v3.4s,  v0.4h,  v6.h[0]     // R0[0..2] = gamma.xyz[X0[0..2] >> 4] * matrix[0][0]
+        umull           v5.4s,  v0.4h,  v6.h[6]     // B0[0..2] = gamma.xyz[X0[0..2] >> 4] * matrix[2][0]
+
+        umull           v23.4s, v20.4h, v6.h[0]     // R1[0..2] = gamma.xyz[X1[0..2] >> 4] * matrix[0][0]
+        umull           v25.4s, v20.4h, v6.h[6]     // B1[0..2] = gamma.xyz[X1[0..2] >> 4] * matrix[2][0]
+
+        umull           v4.4s,  v1.4h,  v6.h[4]     // G0[0..2]  = gamma.xyz[Y0[0..2] >> 4] * matrix[1][1]
+        umlsl           v3.4s,  v1.4h,  v6.h[1]     // R0[0..2] -= gamma.xyz[Y0[0..2] >> 4] * matrix[0][1]
+        umlsl           v4.4s,  v0.4h,  v6.h[3]     // G0[0..2] -= gamma.xyz[X0[0..2] >> 4] * matrix[1][0]
+        umlsl           v5.4s,  v1.4h,  v6.h[7]     // B0[0..2] -= gamma.xyz[Y0[0..2] >> 4] * matrix[2][1]
+
+        umull           v24.4s, v21.4h, v6.h[4]     // G1[0..2]  = gamma.xyz[Y1[0..2] >> 4] * matrix[1][1]
+        umlsl           v23.4s, v21.4h, v6.h[1]     // R1[0..2] -= gamma.xyz[Y1[0..2] >> 4] * matrix[0][1]
+        umlsl           v24.4s, v20.4h, v6.h[3]     // G1[0..2] -= gamma.xyz[X1[0..2] >> 4] * matrix[1][0]
+        umlsl           v25.4s, v21.4h, v6.h[7]     // B1[0..2] -= gamma.xyz[Y1[0..2] >> 4] * matrix[2][1]
+
+        umlsl           v3.4s,  v2.4h,  v6.h[2]     // R0[0..2] -= gamma.xyz[Z0[0..2] >> 4] * matrix[0][2]
+        sqshrun         v3.4h,  v3.4s,  #12         // clip(R0[0..2] >> 12)
+        umlal           v4.4s,  v2.4h,  v6.h[5]     // G0[0..2] += gamma.xyz[Z0[0..2] >> 4] * matrix[1][2]
+        sqshrun         v4.4h,  v4.4s,  #12         // clip(G0[0..2] >> 12)
+        umlal           v5.4s,  v2.4h,  v7.h[0]     // B0[0..2] += gamma.xyz[Z0[0..2] >> 4] * matrix[2][2]
+        sqshrun         v5.4h,  v5.4s,  #12         // clip(B0[0..2] >> 12)
+
+        umlsl           v23.4s, v22.4h, v6.h[2]     // R1[0..2] -= gamma.xyz[Z1[0..2] >> 4] * matrix[0][2]
+        sqshrun         v23.4h, v23.4s, #12         // clip(R1[0..2] >> 12)
+        umlal           v24.4s, v22.4h, v6.h[5]     // G1[0..2] += gamma.xyz[Z1[0..2] >> 4] * matrix[1][2]
+        sqshrun         v24.4h, v24.4s, #12         // clip(G1[0..2] >> 12)
+        umlal           v25.4s, v22.4h, v7.h[0]     // B1[0..2] += gamma.xyz[Z1[0..2] >> 4] * matrix[2][2]
+        sqshrun         v25.4h, v25.4s, #12         // clip(B1[0..2] >> 12)
+
+        umov            w9,  v3.h[0]                // clip(R0[0] >> 12)
+        umov            w10, v4.h[0]                // clip(G0[0] >> 12)
+        umov            w11, v5.h[0]                // clip(B0[0] >> 12)
+        ldrh            w9,  [x8, x9,  lsl #1]      // R0[0] = gamma.rgb[clip(R0[0] >> 12)]
+        ldrh            w10, [x8, x10, lsl #1]      // G0[0] = gamma.rgb[clip(G0[0] >> 12)]
+        ldrh            w11, [x8, x11, lsl #1]      // B0[0] = gamma.rgb[clip(B0[0] >> 12)]
+        umov            w19, v23.h[0]               // clip(R1[0] >> 12)
+        umov            w20, v24.h[0]               // clip(G1[0] >> 12)
+        umov            w21, v25.h[0]               // clip(B1[0] >> 12)
+        ldrh            w19, [x8, x19, lsl #1]      // R1[0] = gamma.rgb[clip(R1[0] >> 12)]
+        ldrh            w20, [x8, x20, lsl #1]      // G1[0] = gamma.rgb[clip(G1[0] >> 12)]
+        ldrh            w21, [x8, x21, lsl #1]      // B1[0] = gamma.rgb[clip(B1[0] >> 12)]
+        lsl             w9,  w9,  #4                // w9  = R0[0] << 4
+        lsl             w10, w10, #4                // w10 = G0[0] << 4
+        lsl             w11, w11, #4                // w11 = B0[0] << 4
+        strh            w9,  [x1]
+        strh            w10, [x1, #2]
+        strh            w11, [x1, #4]
+        lsl             w19, w19, #4                // w19 = R1[0] << 4
+        lsl             w20, w20, #4                // w20 = G1[0] << 4
+        lsl             w21, w21, #4                // w21 = B1[0] << 4
+        strh            w19, [x16]
+        strh            w20, [x16, #2]
+        strh            w21, [x16, #4]
+        add             x1,  x1,  #6
+        add             x16, x16, #6
+
+        cmp             w0,  #-2
+        b.lt            5f                          // (w & 3) == 1
+
+        umov            w9,  v3.h[1]                // clip(R0[1] >> 12)
+        umov            w10, v4.h[1]                // clip(G0[1] >> 12)
+        umov            w11, v5.h[1]                // clip(B0[1] >> 12)
+        ldrh            w9,  [x8, x9,  lsl #1]      // R0[1] = gamma.rgb[clip(R0[1] >> 12)]
+        ldrh            w10, [x8, x10, lsl #1]      // G0[1] = gamma.rgb[clip(G0[1] >> 12)]
+        ldrh            w11, [x8, x11, lsl #1]      // B0[1] = gamma.rgb[clip(B0[1] >> 12)]
+        umov            w19, v23.h[1]               // clip(R1[1] >> 12)
+        umov            w20, v24.h[1]               // clip(G1[1] >> 12)
+        umov            w21, v25.h[1]               // clip(B1[1] >> 12)
+        ldrh            w19, [x8, x19, lsl #1]      // R1[1] = gamma.rgb[clip(R1[1] >> 12)]
+        ldrh            w20, [x8, x20, lsl #1]      // G1[1] = gamma.rgb[clip(G1[1] >> 12)]
+        ldrh            w21, [x8, x21, lsl #1]      // B1[1] = gamma.rgb[clip(B1[1] >> 12)]
+        lsl             w9,  w9,  #4                // w9  = R0[1] << 4
+        lsl             w10, w10, #4                // w10 = G0[1] << 4
+        lsl             w11, w11, #4                // w11 = B0[1] << 4
+        strh            w9,  [x1]
+        strh            w10, [x1, #2]
+        strh            w11, [x1, #4]
+        lsl             w19, w19, #4                // w19 = R1[1] << 4
+        lsl             w20, w20, #4                // w20 = G1[1] << 4
+        lsl             w21, w21, #4                // w21 = B1[1] << 4
+        strh            w19, [x16]
+        strh            w20, [x16, #2]
+        strh            w21, [x16, #4]
+        add             x1,  x1,  #6
+        add             x16, x16, #6
+
+        b.le            5f                          // (w & 3) == 2
+
+        umov            w9,  v3.h[2]                // clip(R0[2] >> 12)
+        umov            w10, v4.h[2]                // clip(G0[2] >> 12)
+        umov            w11, v5.h[2]                // clip(B0[2] >> 12)
+        ldrh            w9,  [x8, x9,  lsl #1]      // R0[2] = gamma.rgb[clip(R0[2] >> 12)]
+        ldrh            w10, [x8, x10, lsl #1]      // G0[2] = gamma.rgb[clip(G0[2] >> 12)]
+        ldrh            w11, [x8, x11, lsl #1]      // B0[2] = gamma.rgb[clip(B0[2] >> 12)]
+        umov            w19, v23.h[2]               // clip(R1[2] >> 12)
+        umov            w20, v24.h[2]               // clip(G1[2] >> 12)
+        umov            w21, v25.h[2]               // clip(B1[2] >> 12)
+        ldrh            w19, [x8, x19, lsl #1]      // R1[2] = gamma.rgb[clip(R1[2] >> 12)]
+        ldrh            w20, [x8, x20, lsl #1]      // G1[2] = gamma.rgb[clip(G1[2] >> 12)]
+        ldrh            w21, [x8, x21, lsl #1]      // B1[2] = gamma.rgb[clip(B1[2] >> 12)]
+        lsl             w9,  w9,  #4                // w9  = R0[2] << 4
+        lsl             w10, w10, #4                // w10 = G0[2] << 4
+        lsl             w11, w11, #4                // w11 = B0[2] << 4
+        strh            w9,  [x1]
+        strh            w10, [x1, #2]
+        strh            w11, [x1, #4]
+        lsl             w19, w19, #4                // w19 = R1[2] << 4
+        lsl             w20, w20, #4                // w20 = G1[2] << 4
+        lsl             w21, w21, #4                // w21 = B1[2] << 4
+        strh            w19, [x16]
+        strh            w20, [x16, #2]
+        strh            w21, [x16, #4]
+        add             x1,  x1,  #6
+        add             x16, x16, #6
+
+        .align JUMP_ALIGN
+5:
+        add             x3,  x3,  x4
+        add             x17, x17, x4
+        add             x1,  x1,  x2
+        add             x16, x16, x2
+
+        subs            w6,  w6,  #2
+        b.ge            1b
+
+        ldp             x21, x22, [sp, #16]
+        ldp             x23, x24, [sp, #32]
+        ldr             x25, [sp, #48]
+        ldp             x19, x20, [sp], #64
+
+        .align JUMP_ALIGN
+6:
+        tbz             w6,  #0,  10f               // even number of lines; (h & 1) == 0
+
+        subs            w0,  w5,  #4
+        b.lt            8f                          // w < 4
+
+        .align LOOP_ALIGN
+7:      // loop for last odd line by 4 pixels: XYZ[0..3]
+        ldp             x9,  x10, [x3]              // x9  = X[0] Y[0] Z[0] X[1], x10 = Y[1] Z[1] X[2] Y[2]
+        ldr             x11, [x3, #16]              // x11 = Z[2] X[3] Y[3] Z[3]
+        add             x3,  x3,  #24
+
+        ubfx            x12, x9,  #4,  #12          // X[0] >> 4
+        lsr             x13, x9,  #52               // X[1] >> 4
+        ubfx            x14, x10, #36, #12          // X[2] >> 4
+        ubfx            x15, x11, #20, #12          // X[3] >> 4
+
+        ldr             h0,  [x7, x12, lsl #1]      // gamma.xyz[X[0] >> 4]
+        ubfx            x12, x9,  #20, #12          // Y[0] >> 4
+        ldr             h16, [x7, x13, lsl #1]      // gamma.xyz[X[1] >> 4]
+        ubfx            x13, x10, #4,  #12          // Y[1] >> 4
+        ldr             h17, [x7, x14, lsl #1]      // gamma.xyz[X[2] >> 4]
+        lsr             x14, x10, #52               // Y[2] >> 4
+        ldr             h18, [x7, x15, lsl #1]      // gamma.xyz[X[3] >> 4]
+        ubfx            x15, x11, #36, #12          // Y[3] >> 4
+        mov             v0.h[1],  v16.h[0]          // v0.4h  = gamma.xyz[X[0..1] >> 4]
+        mov             v17.h[1], v18.h[0]          // v17.4h = gamma.xyz[X[2..3] >> 4]
+        mov             v0.s[1],  v17.s[0]          // v0.4h  = gamma.xyz[X[0..3] >> 4]
+
+        umull           v3.4s,  v0.4h,  v6.h[0]     // R[0..3] = gamma.xyz[X[0..3] >> 4] * matrix[0][0]
+        umull           v5.4s,  v0.4h,  v6.h[6]     // B[0..3] = gamma.xyz[X[0..3] >> 4] * matrix[2][0]
+
+        ldr             h1,  [x7, x12, lsl #1]      // gamma.xyz[Y[0] >> 4]
+        ubfx            x12, x9,  #36, #12          // Z[0] >> 4
+        ldr             h16, [x7, x13, lsl #1]      // gamma.xyz[Y[1] >> 4]
+        ubfx            x13, x10, #20, #12          // Z[1] >> 4
+        ldr             h17, [x7, x14, lsl #1]      // gamma.xyz[Y[2] >> 4]
+        ubfx            x14, x11, #4,  #12          // Z[2] >> 4
+        ldr             h18, [x7, x15, lsl #1]      // gamma.xyz[Y[3] >> 4]
+        lsr             x15, x11, #52               // Z[3] >> 4
+        mov             v1.h[1],  v16.h[0]          // v1.4h  = gamma.xyz[Y[0..1] >> 4]
+        mov             v17.h[1], v18.h[0]          // v17.4h = gamma.xyz[Y[2..3] >> 4]
+        mov             v1.s[1],  v17.s[0]          // v1.4h  = gamma.xyz[Y[0..3] >> 4]
+
+        umull           v4.4s,  v1.4h,  v6.h[4]     // G[0..3]  = gamma.xyz[Y[0..3] >> 4] * matrix[1][1]
+        umlsl           v3.4s,  v1.4h,  v6.h[1]     // R[0..3] -= gamma.xyz[Y[0..3] >> 4] * matrix[0][1]
+        umlsl           v4.4s,  v0.4h,  v6.h[3]     // G[0..3] -= gamma.xyz[X[0..3] >> 4] * matrix[1][0]
+        umlsl           v5.4s,  v1.4h,  v6.h[7]     // B[0..3] -= gamma.xyz[Y[0..3] >> 4] * matrix[2][1]
+
+        ldr             h2,  [x7, x12, lsl #1]      // gamma.xyz[Z[0] >> 4]
+        ldr             h16, [x7, x13, lsl #1]      // gamma.xyz[Z[1] >> 4]
+        ldr             h17, [x7, x14, lsl #1]      // gamma.xyz[Z[2] >> 4]
+        ldr             h18, [x7, x15, lsl #1]      // gamma.xyz[Z[3] >> 4]
+        mov             v2.h[1],  v16.h[0]          // v2.4h  = gamma.xyz[Z[0..1] >> 4]
+        mov             v17.h[1], v18.h[0]          // v17.4h = gamma.xyz[Z[2..3] >> 4]
+        mov             v2.s[1],  v17.s[0]          // v2.4h  = gamma.xyz[Z[0..3] >> 4]
+
+        umlsl           v3.4s,  v2.4h,  v6.h[2]     // R[0..3] -= gamma.xyz[Z[0..3] >> 4] * matrix[0][2]
+        sqshrun         v3.4h,  v3.4s,  #12         // clip(R[0..3] >> 12)
+        umlal           v4.4s,  v2.4h,  v6.h[5]     // G[0..3] += gamma.xyz[Z[0..3] >> 4] * matrix[1][2]
+        sqshrun         v4.4h,  v4.4s,  #12         // clip(G[0..3] >> 12)
+        umlal           v5.4s,  v2.4h,  v7.h[0]     // B[0..3] += gamma.xyz[Z[0..3] >> 4] * matrix[2][2]
+        sqshrun         v5.4h,  v5.4s,  #12         // clip(B[0..3] >> 12)
+
+        umov            w9,  v3.h[0]                // clip(R[0] >> 12)
+        umov            w10, v4.h[1]                // clip(G[1] >> 12)
+        umov            w11, v5.h[2]                // clip(B[2] >> 12)
+
+        umov            w12, v4.h[0]                // clip(G[0] >> 12)
+        ldrh            w9,  [x8, x9,  lsl #1]      // R[0] = gamma.rgb[clip(R[0] >> 12)]
+        lsl             x9,  x9,  #4                // R[0] << 4
+        umov            w13, v5.h[1]                // clip(B[1] >> 12)
+        ldrh            w10, [x8, x10, lsl #1]      // G[1] = gamma.rgb[clip(G[1] >> 12)]
+        lsl             x10, x10, #4                // G[1] << 4
+        umov            w14, v3.h[3]                // clip(R[3] >> 12)
+        ldrh            w11, [x8, x11, lsl #1]      // B[2] = gamma.rgb[clip(B[2] >> 12)]
+        lsl             x11, x11, #4                // B[2] << 4
+
+        umov            w15, v5.h[0]                // clip(B[0] >> 12)
+        ldrh            w12, [x8, x12, lsl #1]      // G[0] = gamma.rgb[clip(G[0] >> 12)]
+        orr             x9,  x9,  x12, lsl #20      // R[0] << 4, G[0] << 4
+        umov            w12, v3.h[2]                // clip(R[2] >> 12)
+        ldrh            w13, [x8, x13, lsl #1]      // B[1] = gamma.rgb[clip(B[1] >> 12)]
+        orr             x10, x10, x13, lsl #20      // G[1] << 4, B[1] << 4
+        umov            w13, v4.h[3]                // clip(G[3] >> 12)
+        ldrh            w14, [x8, x14, lsl #1]      // R[3] = gamma.rgb[clip(R[3] >> 12)]
+        orr             x11, x11, x14, lsl #20      // B[2] << 4, R[3] << 4
+
+        umov            w14, v3.h[1]                // clip(R[1] >> 12)
+        ldrh            w15, [x8, x15, lsl #1]      // B[0] = gamma.rgb[clip(B[0] >> 12)]
+        orr             x9,  x9,  x15, lsl #36      // R[0] << 4, G[0] << 4, B[0] << 4
+        umov            w15, v4.h[2]                // clip(G[2] >> 12)
+        ldrh            w12, [x8, x12, lsl #1]      // R[2] = gamma.rgb[clip(R[2] >> 12)]
+        orr             x10, x10, x12, lsl #36      // G[1] << 4, B[1] << 4, R[2] << 4
+        umov            w12, v5.h[3]                // clip(B[3] >> 12)
+        ldrh            w13, [x8, x13, lsl #1]      // G[3] = gamma.rgb[clip(G[3] >> 12)]
+        orr             x11, x11, x13, lsl #36      // B[2] << 4, R[3] << 4, G[3] << 4
+
+        ldrh            w14, [x8, x14, lsl #1]      // R[1] = gamma.rgb[clip(R[1] >> 12)]
+        orr             x9,  x9,  x14, lsl #52      // x9  = R[0] << 4, G[0] << 4, B[0] << 4, R[1] << 4
+        ldrh            w15, [x8, x15, lsl #1]      // G[2] = gamma.rgb[clip(G[2] >> 12)]
+        orr             x10, x10, x15, lsl #52      // x10 = G[1] << 4, B[1] << 4, R[2] << 4, G[2] << 4
+        ldrh            w12, [x8, x12, lsl #1]      // B[3] = gamma.rgb[clip(B[3] >> 12)]
+        orr             x11, x11, x12, lsl #52      // x11 = B[2] << 4, R[3] << 4, G[3] << 4, B[3] << 4
+
+        stp             x9,  x10, [x1]
+        str             x11, [x1, #16]
+        add             x1,  x1,  #24
+
+        subs            w0,  w0,  #4
+        b.ge            7b
+
+        .align JUMP_ALIGN
+8:
+        tst             w5,  #3
+        b.eq            10f                         // no residual pixels; (w & 3) == 0
+
+        ldr             w10, [x3]                   // w10 = X[0] Y[0]
+        ldrh            w11, [x3, #4]               // w11 = Z[0]
+        add             x3,  x3,  #6
+        ubfx            w9,  w10, #4,  #12          // X[0] >> 4
+        ubfx            w10, w10, #20, #12          // Y[0] >> 4
+        lsr             w11, w11, #4                // Z[0] >> 4
+        ldr             h0,  [x7, x9,  lsl #1]      // v0.4h = gamma.xyz[X[0] >> 4]
+        ldr             h1,  [x7, x10, lsl #1]      // v1.4h = gamma.xyz[Y[0] >> 4]
+        ldr             h2,  [x7, x11, lsl #1]      // v2.4h = gamma.xyz[Z[0] >> 4]
+
+        cmp             w0,  #-2
+        b.lt            9f                          // (w & 3) == 1
+
+        ldr             w10, [x3]                   // w10 = X[1] Y[1]
+        ldrh            w11, [x3, #4]               // w11 = Z[1]
+        add             x3,  x3,  #6
+        ubfx            w9,  w10, #4, #12           // X[1] >> 4
+        ubfx            w10, w10, #20, #12          // Y[1] >> 4
+        lsr             w11, w11, #4                // Z[1] >> 4
+        ldr             h16, [x7, x9,  lsl #1]      // gamma.xyz[X[1] >> 4]
+        ldr             h17, [x7, x10, lsl #1]      // gamma.xyz[Y[1] >> 4]
+        ldr             h18, [x7, x11, lsl #1]      // gamma.xyz[Z[1] >> 4]
+        mov             v0.h[1], v16.h[0]           // v0.4h = gamma.xyz[X[0..1] >> 4]
+        mov             v1.h[1], v17.h[0]           // v1.4h = gamma.xyz[Y[0..1] >> 4]
+        mov             v2.h[1], v18.h[0]           // v2.4h = gamma.xyz[Z[0..1] >> 4]
+
+        b.le            9f                          // (w & 3) == 2
+
+        ldr             w10, [x3]                   // w10 = X[2] Y[2]
+        ldrh            w11, [x3, #4]               // w11 = Z[2]
+        add             x3,  x3,  #6
+        ubfx            w9,  w10, #4, #12           // X[2] >> 4
+        ubfx            w10, w10, #20, #12          // Y[2] >> 4
+        lsr             w11, w11, #4                // Z[2] >> 4
+        ldr             h16, [x7, x9,  lsl #1]      // gamma.xyz[X[2] >> 4]
+        ldr             h17, [x7, x10, lsl #1]      // gamma.xyz[Y[2] >> 4]
+        ldr             h18, [x7, x11, lsl #1]      // gamma.xyz[Z[2] >> 4]
+        mov             v0.h[2], v16.h[0]           // v0.4h = gamma.xyz[X[0..2] >> 4]
+        mov             v1.h[2], v17.h[0]           // v1.4h = gamma.xyz[Y[0..2] >> 4]
+        mov             v2.h[2], v18.h[0]           // v2.4h = gamma.xyz[Z[0..2] >> 4]
+
+        .align JUMP_ALIGN
+9:
+        umull           v3.4s,  v0.4h,  v6.h[0]     // R[0..2] = gamma.xyz[X[0..2] >> 4] * matrix[0][0]
+        umull           v5.4s,  v0.4h,  v6.h[6]     // B[0..2] = gamma.xyz[X[0..2] >> 4] * matrix[2][0]
+
+        umull           v4.4s,  v1.4h,  v6.h[4]     // G[0..2]  = gamma.xyz[Y[0..2] >> 4] * matrix[1][1]
+        umlsl           v3.4s,  v1.4h,  v6.h[1]     // R[0..2] -= gamma.xyz[Y[0..2] >> 4] * matrix[0][1]
+        umlsl           v4.4s,  v0.4h,  v6.h[3]     // G[0..2] -= gamma.xyz[X[0..2] >> 4] * matrix[1][0]
+        umlsl           v5.4s,  v1.4h,  v6.h[7]     // B[0..2] -= gamma.xyz[Y[0..2] >> 4] * matrix[2][1]
+
+        umlsl           v3.4s,  v2.4h,  v6.h[2]     // R[0..2] -= gamma.xyz[Z[0..2] >> 4] * matrix[0][2]
+        sqshrun         v3.4h,  v3.4s,  #12         // clip(R[0..2] >> 12)
+        umlal           v4.4s,  v2.4h,  v6.h[5]     // G[0..2] += gamma.xyz[Z[0..2] >> 4] * matrix[1][2]
+        sqshrun         v4.4h,  v4.4s,  #12         // clip(G[0..2] >> 12)
+        umlal           v5.4s,  v2.4h,  v7.h[0]     // B[0..2] += gamma.xyz[Z[0..2] >> 4] * matrix[2][2]
+        sqshrun         v5.4h,  v5.4s,  #12         // clip(B[0..2] >> 12)
+
+        umov            w9,  v3.h[0]                // clip(R[0] >> 12)
+        umov            w10, v4.h[0]                // clip(G[0] >> 12)
+        umov            w11, v5.h[0]                // clip(B[0] >> 12)
+        ldrh            w9,  [x8, x9,  lsl #1]      // R[0] = gamma.rgb[clip(R[0] >> 12)]
+        ldrh            w10, [x8, x10, lsl #1]      // G[0] = gamma.rgb[clip(G[0] >> 12)]
+        ldrh            w11, [x8, x11, lsl #1]      // B[0] = gamma.rgb[clip(B[0] >> 12)]
+        lsl             w9,  w9,  #4                // w9  = R[0] << 4
+        lsl             w10, w10, #4                // w10 = G[0] << 4
+        lsl             w11, w11, #4                // w11 = B[0] << 4
+        strh            w9,  [x1]
+        strh            w10, [x1, #2]
+        strh            w11, [x1, #4]
+        add             x1,  x1,  #6
+
+        cmp             w0,  #-2
+        b.lt            10f                         // (w & 3) == 1
+
+        umov            w9,  v3.h[1]                // clip(R[1] >> 12)
+        umov            w10, v4.h[1]                // clip(G[1] >> 12)
+        umov            w11, v5.h[1]                // clip(B[1] >> 12)
+        ldrh            w9,  [x8, x9,  lsl #1]      // R[1] = gamma.rgb[clip(R[1] >> 12)]
+        ldrh            w10, [x8, x10, lsl #1]      // G[1] = gamma.rgb[clip(G[1] >> 12)]
+        ldrh            w11, [x8, x11, lsl #1]      // B[1] = gamma.rgb[clip(B[1] >> 12)]
+        lsl             w9,  w9,  #4                // w9  = R[1] << 4
+        lsl             w10, w10, #4                // w10 = G[1] << 4
+        lsl             w11, w11, #4                // w11 = B[1] << 4
+        strh            w9,  [x1]
+        strh            w10, [x1, #2]
+        strh            w11, [x1, #4]
+        add             x1,  x1,  #6
+
+        b.le            10f                         // (w & 3) == 2
+
+        umov            w9,  v3.h[2]                // clip(R[2] >> 12)
+        umov            w10, v4.h[2]                // clip(G[2] >> 12)
+        umov            w11, v5.h[2]                // clip(B[2] >> 12)
+        ldrh            w9,  [x8, x9,  lsl #1]      // R[2] = gamma.rgb[clip(R[2] >> 12)]
+        ldrh            w10, [x8, x10, lsl #1]      // G[2] = gamma.rgb[clip(G[2] >> 12)]
+        ldrh            w11, [x8, x11, lsl #1]      // B[2] = gamma.rgb[clip(B[2] >> 12)]
+        lsl             w9,  w9,  #4                // w9  = R[2] << 4
+        lsl             w10, w10, #4                // w10 = G[2] << 4
+        lsl             w11, w11, #4                // w11 = B[2] << 4
+        strh            w9,  [x1]
+        strh            w10, [x1, #2]
+        strh            w11, [x1, #4]
+        add             x1,  x1,  #6
+
+        .align JUMP_ALIGN
+10:
+        ret
+endfunc
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index c795427a83..fc4f1f6d0c 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -860,6 +860,10 @@ void ff_rgb48Toxyz12(const SwsInternal *c, uint8_t *dst, int dst_stride,
 av_cold void ff_sws_init_xyz2rgb(SwsInternal *c)
 {
     c->xyz12Torgb48 = xyz12Torgb48_c;
+
+#if ARCH_AARCH64
+    ff_sws_init_xyz2rgb_aarch64(c); /* runs after the C default is set; presumably overrides it - confirm */
+#endif /* ARCH_AARCH64 */
 }
 
 void ff_update_palette(SwsInternal *c, const uint32_t *pal)
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 107671feb2..d1aa15af36 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -729,6 +729,7 @@ av_cold void ff_sws_init_range_convert_riscv(SwsInternal *c);
 av_cold void ff_sws_init_range_convert_x86(SwsInternal *c);
 
 av_cold void ff_sws_init_xyz2rgb(SwsInternal *c);
+av_cold void ff_sws_init_xyz2rgb_aarch64(SwsInternal *c);
 
 SwsFunc ff_yuv2rgb_init_x86(SwsInternal *c);
 SwsFunc ff_yuv2rgb_init_ppc(SwsInternal *c);
-- 
2.43.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PATCH 3/3] swscale: Add AArch64 Neon path for xyz12Torgb48 LE

Reply via email to