This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 534757926f222804c543e037140189ab8eb677b0
Author: Ramiro Polla <[email protected]>
AuthorDate: Tue Mar 10 15:40:37 2026 +0100
Commit: Ramiro Polla <[email protected]>
CommitDate: Mon Mar 30 11:38:35 2026 +0000

swscale/aarch64: introduce ops_asmgen for NEON backend

The NEON sws_ops backend follows the same continuation-passing style design as the x86 backend. Unlike the C and x86 backends, which implement the various operation functions through the use of templates and preprocessor macros, the NEON backend uses a build-time code generator, which is introduced by this commit.

This code generator has two modes of operation:
-ops: Generates an assembly file in GNU assembler syntax targeting AArch64, which implements all the sws_ops functions the NEON backend supports.
-lookup: Generates a C function with a hierarchical condition chain that returns the pointer to one of the functions generated above, based on a given set of parameters derived from SwsOp.

This is the core of the NEON sws_ops backend.
Sponsored-by: Sovereign Tech Fund Signed-off-by: Ramiro Polla <[email protected]> --- libswscale/aarch64/.gitignore | 1 + libswscale/aarch64/Makefile | 2 + libswscale/aarch64/ops_asmgen.c | 1646 +++++++++++++++++++++++++++++++++++++++ libswscale/aarch64/ops_impl.c | 119 ++- libswscale/aarch64/ops_impl.h | 40 + 5 files changed, 1796 insertions(+), 12 deletions(-) diff --git a/libswscale/aarch64/.gitignore b/libswscale/aarch64/.gitignore new file mode 100644 index 0000000000..667c77c085 --- /dev/null +++ b/libswscale/aarch64/.gitignore @@ -0,0 +1 @@ +/ops_asmgen diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile index 1c82e34e28..3a6f40e845 100644 --- a/libswscale/aarch64/Makefile +++ b/libswscale/aarch64/Makefile @@ -10,3 +10,5 @@ NEON-OBJS += aarch64/hscale.o \ aarch64/swscale_unscaled_neon.o \ aarch64/xyz2rgb_neon.o \ aarch64/yuv2rgb_neon.o \ + +HOSTPROGS = aarch64/ops_asmgen diff --git a/libswscale/aarch64/ops_asmgen.c b/libswscale/aarch64/ops_asmgen.c new file mode 100644 index 0000000000..1ec7fc7b5e --- /dev/null +++ b/libswscale/aarch64/ops_asmgen.c @@ -0,0 +1,1646 @@ +/* + * Copyright (C) 2026 Ramiro Polla + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <assert.h> +#include <limits.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#ifdef _WIN32 +#include <io.h> +#include <fcntl.h> +#endif + +/** + * This file is compiled as a standalone build-time tool and must not depend + * on internal FFmpeg libraries. The necessary utils are redefined below using + * standard C equivalents. + */ + +#define AVUTIL_AVASSERT_H +#define AVUTIL_LOG_H +#define AVUTIL_MACROS_H +#define AVUTIL_MEM_H +#define av_assert0(cond) assert(cond) +#define av_malloc(s) malloc(s) +#define av_mallocz(s) calloc(1, s) +#define av_realloc(p, s) realloc(p, s) +#define av_strdup(s) strdup(s) +#define av_free(p) free(p) +#define FFMAX(a,b) ((a) > (b) ? (a) : (b)) +#define FFMIN(a,b) ((a) > (b) ? (b) : (a)) + +static void av_freep(void *ptr) +{ + void **pptr = (void **) ptr; + if (pptr) { + ptr = *pptr; + if (ptr) + free(ptr); + *pptr = NULL; + } +} + +#include "libavutil/dynarray.h" + +static void *av_dynarray2_add(void **tab_ptr, int *nb_ptr, size_t elem_size, + const uint8_t *elem_data) +{ + uint8_t *tab_elem_data = NULL; + + FF_DYNARRAY_ADD(INT_MAX, elem_size, *tab_ptr, *nb_ptr, { + tab_elem_data = (uint8_t *)*tab_ptr + (*nb_ptr) * elem_size; + if (elem_data) + memcpy(tab_elem_data, elem_data, elem_size); + }, { + av_freep(tab_ptr); + *nb_ptr = 0; + }); + return tab_elem_data; +} + +/*********************************************************************/ +#include "rasm.c" +#include "rasm_print.c" +#include "ops_impl.c" + +/** + * Implementation parameters for all exported functions. This list is + * compiled by performing a dummy run of all conversions in sws_ops and + * collecting all functions that need to be generated. 
This is achieved + * by running: + * make sws_ops_entries_aarch64 + */ +static const SwsAArch64OpImplParams impl_params[] = { +#include "ops_entries.c" + { .op = AARCH64_SWS_OP_NONE } +}; + +/*********************************************************************/ +static size_t aarch64_pixel_size(SwsAArch64PixelType fmt) +{ + switch (fmt) { + case AARCH64_PIXEL_U8: return 1; + case AARCH64_PIXEL_U16: return 2; + case AARCH64_PIXEL_U32: return 4; + case AARCH64_PIXEL_F32: return 4; + default: + av_assert0(!"Invalid pixel type!"); + break; + } + return 0; +} + +static void impl_func_name(char **buf, size_t *size, const SwsAArch64OpImplParams *params) +{ + buf_appendf(buf, size, "ff_sws"); + const ParamField **fields = op_fields[params->op]; + for (int i = 0; fields[i]; i++) { + const ParamField *field = fields[i]; + void *p = (void *) (((uintptr_t) params) + field->offset); + field->print_str(buf, size, p); + } + buf_appendf(buf, size, "_neon"); +} + +void aarch64_op_impl_func_name(char *buf, size_t size, const SwsAArch64OpImplParams *params) +{ + impl_func_name(&buf, &size, params); + av_assert0(size && "string buffer exhausted"); +} + +/*********************************************************************/ +typedef struct SwsAArch64Context { + RasmContext *rctx; + + /* SwsOpFunc arguments. */ + RasmOp exec; + RasmOp impl; + RasmOp bx_start; + RasmOp y_start; + RasmOp bx_end; + RasmOp y_end; + + /* Loop iterator variables. */ + RasmOp bx; + RasmOp y; + + /* Scratch registers. */ + RasmOp tmp0; + RasmOp tmp1; + + /* CPS-related variables. */ + RasmOp op0_func; + RasmOp op1_impl; + RasmOp cont; + + /* Vector registers. Two banks (low and high) are used. */ + RasmOp vl[ 4]; + RasmOp vh[ 4]; + RasmOp vt[12]; + + /* Read/Write data pointers and padding. */ + RasmOp in[4]; + RasmOp out[4]; + RasmOp in_bump[4]; + RasmOp out_bump[4]; + + /* Vector register dimensions. 
*/ + size_t el_size; + size_t el_count; + size_t vec_size; + bool use_vh; +} SwsAArch64Context; + +/*********************************************************************/ +/* Helpers functions. */ + +/* Looping when s->use_vh is set. */ +#define LOOP_VH(s, mask, idx) if (s->use_vh) LOOP(mask, idx) +#define LOOP_MASK_VH(s, p, idx) if (s->use_vh) LOOP_MASK(p, idx) +#define LOOP_MASK_BWD_VH(s, p, idx) if (s->use_vh) LOOP_MASK_BWD(p, idx) + +/* Inline rasm comments. */ +#define CMT(comment) rasm_annotate(r, comment) +#define CMTF(fmt, ...) rasm_annotatef(r, (char[128]){0}, 128, fmt, __VA_ARGS__) + +/* Reshape all vector registers for current SwsOp. */ +static void reshape_all_vectors(SwsAArch64Context *s, int el_count, int el_size) +{ + s->vl[ 0] = a64op_make_vec( 0, el_count, el_size); + s->vl[ 1] = a64op_make_vec( 1, el_count, el_size); + s->vl[ 2] = a64op_make_vec( 2, el_count, el_size); + s->vl[ 3] = a64op_make_vec( 3, el_count, el_size); + s->vh[ 0] = a64op_make_vec( 4, el_count, el_size); + s->vh[ 1] = a64op_make_vec( 5, el_count, el_size); + s->vh[ 2] = a64op_make_vec( 6, el_count, el_size); + s->vh[ 3] = a64op_make_vec( 7, el_count, el_size); + s->vt[ 0] = a64op_make_vec(16, el_count, el_size); + s->vt[ 1] = a64op_make_vec(17, el_count, el_size); + s->vt[ 2] = a64op_make_vec(18, el_count, el_size); + s->vt[ 3] = a64op_make_vec(19, el_count, el_size); + s->vt[ 4] = a64op_make_vec(20, el_count, el_size); + s->vt[ 5] = a64op_make_vec(21, el_count, el_size); + s->vt[ 6] = a64op_make_vec(22, el_count, el_size); + s->vt[ 7] = a64op_make_vec(23, el_count, el_size); + s->vt[ 8] = a64op_make_vec(24, el_count, el_size); + s->vt[ 9] = a64op_make_vec(25, el_count, el_size); + s->vt[10] = a64op_make_vec(26, el_count, el_size); + s->vt[11] = a64op_make_vec(27, el_count, el_size); +} + +/*********************************************************************/ +/* Function frame */ + +static unsigned clobbered_frame_size(unsigned n) +{ + return ((n + 1) >> 1) * 16; +} + +static 
void asmgen_prologue(SwsAArch64Context *s, const RasmOp *regs, unsigned n) +{ + RasmContext *r = s->rctx; + RasmOp sp = a64op_sp(); + unsigned frame_size = clobbered_frame_size(n); + RasmOp sp_pre = a64op_pre(sp, -frame_size); + + rasm_add_comment(r, "prologue"); + if (n == 0) { + /* no-op */ + } else if (n == 1) { + i_str(r, regs[0], sp_pre); + } else { + i_stp(r, regs[0], regs[1], sp_pre); + for (unsigned i = 2; i + 1 < n; i += 2) + i_stp(r, regs[i], regs[i + 1], a64op_off(sp, i * sizeof(uint64_t))); + if (n & 1) + i_str(r, regs[n - 1], a64op_off(sp, (n - 1) * sizeof(uint64_t))); + } +} + +static void asmgen_epilogue(SwsAArch64Context *s, const RasmOp *regs, unsigned n) +{ + RasmContext *r = s->rctx; + RasmOp sp = a64op_sp(); + unsigned frame_size = clobbered_frame_size(n); + RasmOp sp_post = a64op_post(sp, frame_size); + + rasm_add_comment(r, "epilogue"); + if (n == 0) { + /* no-op */ + } else if (n == 1) { + i_ldr(r, regs[0], sp_post); + } else { + if (n & 1) + i_ldr(r, regs[n - 1], a64op_off(sp, (n - 1) * sizeof(uint64_t))); + for (unsigned i = (n & ~1u) - 2; i >= 2; i -= 2) + i_ldp(r, regs[i], regs[i + 1], a64op_off(sp, i * sizeof(uint64_t))); + i_ldp(r, regs[0], regs[1], sp_post); + } +} + +/*********************************************************************/ +/* Callee-saved registers (r19-r28). 
*/ +#define MAX_SAVED_REGS 10 + +static void clobber_gpr(RasmOp regs[MAX_SAVED_REGS], unsigned *count, + RasmOp gpr) +{ + const int n = a64op_gpr_n(gpr); + if (n >= 19 && n <= 28) + regs[(*count)++] = gpr; +} + +static unsigned clobbered_gprs(const SwsAArch64Context *s, + const SwsAArch64OpImplParams *p, + RasmOp regs[MAX_SAVED_REGS]) +{ + unsigned count = 0; + LOOP_MASK(p, i) { + clobber_gpr(regs, &count, s->in[i]); + clobber_gpr(regs, &count, s->out[i]); + clobber_gpr(regs, &count, s->in_bump[i]); + clobber_gpr(regs, &count, s->out_bump[i]); + } + return count; +} + +static void asmgen_process(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + RasmContext *r = s->rctx; + char func_name[128]; + char buf[64]; + + /** + * The process/process_return functions for aarch64 work similarly + * to the x86 backend. The description in x86/ops_common.asm mostly + * holds as well here. + */ + + aarch64_op_impl_func_name(func_name, sizeof(func_name), p); + + rasm_func_begin(r, func_name, true); + + /* Function prologue */ + RasmOp saved_regs[MAX_SAVED_REGS]; + unsigned nsaved = clobbered_gprs(s, p, saved_regs); + if (nsaved) + asmgen_prologue(s, saved_regs, nsaved); + + /* Load values from impl. */ + i_ldr(r, s->op0_func, a64op_off(s->impl, offsetof_impl_cont)); CMT("SwsFuncPtr op0_func = impl->cont;"); + i_add(r, s->op1_impl, s->impl, IMM(sizeof_impl)); CMT("SwsOpImpl *op1_impl = impl + 1;"); + + /* Load values from exec. 
*/ + LOOP_MASK(p, i) { + rasm_annotate_nextf(r, buf, sizeof(buf), "in[%u] = exec->in[%u];", i, i); + i_ldr(r, s->in[i], a64op_off(s->exec, offsetof_exec_in + (i * sizeof(uint8_t *)))); + } + LOOP_MASK(p, i) { + rasm_annotate_nextf(r, buf, sizeof(buf), "out[%u] = exec->out[%u];", i, i); + i_ldr(r, s->out[i], a64op_off(s->exec, offsetof_exec_out + (i * sizeof(uint8_t *)))); + } + LOOP_MASK(p, i) { + rasm_annotate_nextf(r, buf, sizeof(buf), "in_bump[%u] = exec->in_bump[%u];", i, i); + i_ldr(r, s->in_bump[i], a64op_off(s->exec, offsetof_exec_in_bump + (i * sizeof(ptrdiff_t)))); + } + LOOP_MASK(p, i) { + rasm_annotate_nextf(r, buf, sizeof(buf), "out_bump[%u] = exec->out_bump[%u];", i, i); + i_ldr(r, s->out_bump[i], a64op_off(s->exec, offsetof_exec_out_bump + (i * sizeof(ptrdiff_t)))); + } + + /* Reset x and jump to first kernel. */ + i_mov(r, s->bx, s->bx_start); CMT("bx = bx_start;"); + i_mov(r, s->impl, s->op1_impl); CMT("impl = op1_impl;"); + i_br (r, s->op0_func); CMT("jump to op0_func"); +} + +static void asmgen_process_return(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + RasmContext *r = s->rctx; + char func_name[128]; + + aarch64_op_impl_func_name(func_name, sizeof(func_name), p); + + rasm_func_begin(r, func_name, true); + + /* Reset impl to first kernel. */ + i_mov(r, s->impl, s->op1_impl); CMT("impl = op1_impl;"); + + /* Perform horizontal loop. */ + int loop = rasm_new_label(r, NULL); + i_add(r, s->bx, s->bx, IMM(1)); CMT("bx += 1;"); + i_cmp(r, s->bx, s->bx_end); CMT("if (bx != bx_end)"); + i_bne(r, loop); CMT(" goto loop;"); + + /* Perform vertical loop. */ + int end = rasm_new_label(r, NULL); + i_add(r, s->y, s->y, IMM(1)); CMT("y += 1;"); + i_cmp(r, s->y, s->y_end); CMT("if (y == y_end)"); + i_beq(r, end); CMT(" goto end;"); + + /* Perform padding and reset x, preparing for next row. 
*/ + LOOP_MASK(p, i) { i_add(r, s->in[i], s->in[i], s->in_bump[i]); CMTF("in[%u] += in_bump[%u];", i, i); } + LOOP_MASK(p, i) { i_add(r, s->out[i], s->out[i], s->out_bump[i]); CMTF("out[%u] += out_bump[%u];", i, i); } + i_mov(r, s->bx, s->bx_start); CMT("bx = bx_start;"); + + /* Loop back or end of function. */ + rasm_add_label(r, loop); CMT("loop:"); + i_br (r, s->op0_func); CMT("jump to op0_func"); + rasm_add_label(r, end); CMT("end:"); + + /* Function epilogue */ + RasmOp saved_regs[MAX_SAVED_REGS]; + unsigned nsaved = clobbered_gprs(s, p, saved_regs); + if (nsaved) + asmgen_epilogue(s, saved_regs, nsaved); + + i_ret(r); +} + +/*********************************************************************/ +/* gather raw pixels from planes */ +/* AARCH64_SWS_OP_READ_BIT */ +/* AARCH64_SWS_OP_READ_NIBBLE */ +/* AARCH64_SWS_OP_READ_PACKED */ +/* AARCH64_SWS_OP_READ_PLANAR */ + +static void asmgen_op_read_bit(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + RasmContext *r = s->rctx; + RasmOp bitmask_vec = s->vt[1]; + RasmOp wtmp = a64op_w(s->tmp0); + AArch64VecViews vl[1]; + AArch64VecViews vtmp; + AArch64VecViews shift_vec; + + a64op_vec_views(s->vt[0], &shift_vec); + a64op_vec_views(s->vl[0], &vl[0]); + a64op_vec_views(s->vt[2], &vtmp); + + /* Note that shift_vec has negative values, so that using it with + * ushl actually performs a right shift. 
*/ + rasm_annotate_next(r, "v128 shift_vec = impl->priv.v128;"); + i_ldr(r, shift_vec.q, a64op_off(s->impl, offsetof_impl_priv)); + + if (p->block_size == 16) { + i_ldrh(r, wtmp, a64op_post(s->in[0], 2)); CMT("uint16_t tmp = *in[0]++;"); + i_movi(r, bitmask_vec, IMM(1)); CMT("v128 bitmask_vec = {1 <repeats 16 times>};"); + i_dup (r, vl[0].b8, wtmp); CMT("vl[0].lo = broadcast(tmp);"); + i_lsr (r, wtmp, wtmp, IMM(8)); CMT("tmp >>= 8;"); + i_dup (r, vtmp.b8, wtmp); CMT("vtmp.lo = broadcast(tmp);"); + i_ins (r, vl[0].de[1], vtmp.de[0]); CMT("vl[0].hi = vtmp.lo;"); + i_ushl(r, vl[0].b16, vl[0].b16, shift_vec.b16); CMT("vl[0] <<= shift_vec;"); + i_and (r, vl[0].b16, vl[0].b16, bitmask_vec); CMT("vl[0] &= bitmask_vec;"); + } else { + i_ldrb(r, wtmp, a64op_post(s->in[0], 1)); CMT("uint8_t tmp = *in[0]++;"); + i_movi(r, bitmask_vec, IMM(1)); CMT("v128 bitmask_vec = {1 <repeats 8 times>, 0 <repeats 8 times>};"); + i_dup (r, vl[0].b8, wtmp); CMT("vl[0].lo = broadcast(tmp);"); + i_ushl(r, vl[0].b8, vl[0].b8, shift_vec.b8); CMT("vl[0] <<= shift_vec;"); + i_and (r, vl[0].b8, vl[0].b8, bitmask_vec); CMT("vl[0] &= bitmask_vec;"); + } +} + +static void asmgen_op_read_nibble(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + RasmContext *r = s->rctx; + RasmOp nibble_mask = v_8b(s->vt[0]); + AArch64VecViews vl[1]; + AArch64VecViews vtmp; + + a64op_vec_views(s->vl[0], &vl[0]); + a64op_vec_views(s->vt[1], &vtmp); + + rasm_annotate_next(r, "v128 nibble_mask = {0xf <repeats 8 times>, 0x0 <repeats 8 times>};"); + i_movi(r, nibble_mask, IMM(0x0f)); + + if (p->block_size == 8) { + i_ldr (r, vl[0].s, a64op_post(s->in[0], 4)); CMT("vl[0] = *in[0]++;"); + i_ushr(r, vtmp.b8, vl[0].b8, IMM(4)); CMT("vtmp.lo = vl[0] >> 4;"); + i_and (r, vl[0].b8, vl[0].b8, nibble_mask); CMT("vl[0].lo &= nibble_mask;"); + i_zip1(r, vl[0].b8, vtmp.b8, vl[0].b8); CMT("interleave"); + } else { + i_ldr (r, vl[0].d, a64op_post(s->in[0], 8)); CMT("vl[0] = *in[0]++;"); + i_ushr(r, vtmp.b8, vl[0].b8, IMM(4)); 
CMT("vtmp.lo = vl[0] >> 4;"); + i_and (r, vl[0].b8, vl[0].b8, nibble_mask); CMT("vl[0].lo &= nibble_mask;"); + i_zip1(r, vl[0].b16, vtmp.b16, vl[0].b16); CMT("interleave"); + } +} + +static void asmgen_op_read_packed_1(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + RasmContext *r = s->rctx; + AArch64VecViews vl[1]; + AArch64VecViews vh[1]; + + a64op_vec_views(s->vl[0], &vl[0]); + a64op_vec_views(s->vh[0], &vh[0]); + + switch ((s->use_vh ? 0x100 : 0) | s->vec_size) { + case 0x008: i_ldr(r, vl[0].d, a64op_post(s->in[0], s->vec_size * 1)); break; + case 0x010: i_ldr(r, vl[0].q, a64op_post(s->in[0], s->vec_size * 1)); break; + case 0x108: i_ldp(r, vl[0].d, vh[0].d, a64op_post(s->in[0], s->vec_size * 2)); break; + case 0x110: i_ldp(r, vl[0].q, vh[0].q, a64op_post(s->in[0], s->vec_size * 2)); break; + } +} + +static void asmgen_op_read_packed_n(SwsAArch64Context *s, const SwsAArch64OpImplParams *p, RasmOp *vx) +{ + RasmContext *r = s->rctx; + + switch (p->mask) { + case 0x0011: i_ld2(r, vv_2(vx[0], vx[1]), a64op_post(s->in[0], s->vec_size * 2)); break; + case 0x0111: i_ld3(r, vv_3(vx[0], vx[1], vx[2]), a64op_post(s->in[0], s->vec_size * 3)); break; + case 0x1111: i_ld4(r, vv_4(vx[0], vx[1], vx[2], vx[3]), a64op_post(s->in[0], s->vec_size * 4)); break; + } +} + +static void asmgen_op_read_packed(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + if (p->mask == 0x0001) { + asmgen_op_read_packed_1(s, p); + } else { + asmgen_op_read_packed_n(s, p, s->vl); + if (s->use_vh) + asmgen_op_read_packed_n(s, p, s->vh); + } +} + +static void asmgen_op_read_planar(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + RasmContext *r = s->rctx; + AArch64VecViews vl[4]; + AArch64VecViews vh[4]; + + for (int i = 0; i < 4; i++) { + a64op_vec_views(s->vl[i], &vl[i]); + a64op_vec_views(s->vh[i], &vh[i]); + } + + LOOP_MASK(p, i) { + switch ((s->use_vh ? 
0x100 : 0) | s->vec_size) { + case 0x008: i_ldr(r, vl[i].d, a64op_post(s->in[i], s->vec_size * 1)); break; + case 0x010: i_ldr(r, vl[i].q, a64op_post(s->in[i], s->vec_size * 1)); break; + case 0x108: i_ldp(r, vl[i].d, vh[i].d, a64op_post(s->in[i], s->vec_size * 2)); break; + case 0x110: i_ldp(r, vl[i].q, vh[i].q, a64op_post(s->in[i], s->vec_size * 2)); break; + } + } +} + +/*********************************************************************/ +/* write raw pixels to planes */ +/* AARCH64_SWS_OP_WRITE_BIT */ +/* AARCH64_SWS_OP_WRITE_NIBBLE */ +/* AARCH64_SWS_OP_WRITE_PACKED */ +/* AARCH64_SWS_OP_WRITE_PLANAR */ + +static void asmgen_op_write_bit(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + RasmContext *r = s->rctx; + AArch64VecViews vl[1]; + AArch64VecViews shift_vec; + AArch64VecViews vtmp0; + AArch64VecViews vtmp1; + + a64op_vec_views(s->vl[0], &vl[0]); + a64op_vec_views(s->vt[0], &shift_vec); + a64op_vec_views(s->vt[1], &vtmp0); + a64op_vec_views(s->vt[2], &vtmp1); + + rasm_annotate_next(r, "v128 shift_vec = impl->priv.v128;"); + i_ldr(r, shift_vec.q, a64op_off(s->impl, offsetof_impl_priv)); + + if (p->block_size == 8) { + i_ushl(r, vl[0].b8, vl[0].b8, shift_vec.b8); CMT("vl[0] <<= shift_vec;"); + i_addv(r, vtmp0.b, vl[0].b8); CMT("vtmp0[0] = add_across(vl[0].lo);"); + i_str (r, vtmp0.b, a64op_post(s->out[0], 1)); CMT("*out[0]++ = vtmp0;"); + } else { + i_ushl(r, vl[0].b16, vl[0].b16, shift_vec.b16); CMT("vl[0] <<= shift_vec;"); + i_addv(r, vtmp0.b, vl[0].b8); CMT("vtmp0[0] = add_across(vl[0].lo);"); + i_ins (r, vtmp1.de[0], vl[0].de[1]); CMT("vtmp1.lo = vl[0].hi;"); + i_addv(r, vtmp1.b, vtmp1.b8); CMT("vtmp1[0] = add_across(vtmp1);"); + i_ins (r, vtmp0.be[1], vtmp1.be[0]); CMT("vtmp0[1] = vtmp1[0];"); + i_str (r, vtmp0.h, a64op_post(s->out[0], 2)); CMT("*out[0]++ = vtmp0;"); + } +} + +static void asmgen_op_write_nibble(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + RasmContext *r = s->rctx; + AArch64VecViews vl[4]; + AArch64VecViews 
vtmp0; + AArch64VecViews vtmp1; + + for (int i = 0; i < 4; i++) + a64op_vec_views(s->vl[i], &vl[i]); + a64op_vec_views(s->vt[0], &vtmp0); + a64op_vec_views(s->vt[1], &vtmp1); + + if (p->block_size == 8) { + i_shl (r, vtmp0.h4, vl[0].h4, IMM(4)); + i_ushr(r, vtmp1.h4, vl[0].h4, IMM(8)); + i_orr (r, vl[0].b8, vtmp0.b8, vtmp1.b8); + i_xtn (r, vtmp0.b8, vl[0].h8); + i_str (r, vtmp0.s, a64op_post(s->out[0], 4)); + } else { + i_shl (r, vtmp0.h8, vl[0].h8, IMM(4)); + i_ushr(r, vtmp1.h8, vl[0].h8, IMM(8)); + i_orr (r, vl[0].b16, vtmp0.b16, vtmp1.b16); + i_xtn (r, vtmp0.b8, vl[0].h8); + i_str (r, vtmp0.d, a64op_post(s->out[0], 8)); + } +} + +static void asmgen_op_write_packed_1(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + RasmContext *r = s->rctx; + AArch64VecViews vl[1]; + AArch64VecViews vh[1]; + + a64op_vec_views(s->vl[0], &vl[0]); + a64op_vec_views(s->vh[0], &vh[0]); + + switch ((s->use_vh ? 0x100 : 0) | s->vec_size) { + case 0x008: i_str(r, vl[0].d, a64op_post(s->out[0], s->vec_size * 1)); break; + case 0x010: i_str(r, vl[0].q, a64op_post(s->out[0], s->vec_size * 1)); break; + case 0x108: i_stp(r, vl[0].d, vh[0].d, a64op_post(s->out[0], s->vec_size * 2)); break; + case 0x110: i_stp(r, vl[0].q, vh[0].q, a64op_post(s->out[0], s->vec_size * 2)); break; + } +} + +static void asmgen_op_write_packed_n(SwsAArch64Context *s, const SwsAArch64OpImplParams *p, RasmOp *vx) +{ + RasmContext *r = s->rctx; + + switch (p->mask) { + case 0x0011: i_st2(r, vv_2(vx[0], vx[1]), a64op_post(s->out[0], s->vec_size * 2)); break; + case 0x0111: i_st3(r, vv_3(vx[0], vx[1], vx[2]), a64op_post(s->out[0], s->vec_size * 3)); break; + case 0x1111: i_st4(r, vv_4(vx[0], vx[1], vx[2], vx[3]), a64op_post(s->out[0], s->vec_size * 4)); break; + } +} + +static void asmgen_op_write_packed(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + if (p->mask == 0x0001) { + asmgen_op_write_packed_1(s, p); + } else { + asmgen_op_write_packed_n(s, p, s->vl); + if (s->use_vh) + 
asmgen_op_write_packed_n(s, p, s->vh); + } +} + +static void asmgen_op_write_planar(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + RasmContext *r = s->rctx; + AArch64VecViews vl[4]; + AArch64VecViews vh[4]; + + for (int i = 0; i < 4; i++) { + a64op_vec_views(s->vl[i], &vl[i]); + a64op_vec_views(s->vh[i], &vh[i]); + } + + LOOP_MASK(p, i) { + switch ((s->use_vh ? 0x100 : 0) | s->vec_size) { + case 0x008: i_str(r, vl[i].d, a64op_post(s->out[i], s->vec_size * 1)); break; + case 0x010: i_str(r, vl[i].q, a64op_post(s->out[i], s->vec_size * 1)); break; + case 0x108: i_stp(r, vl[i].d, vh[i].d, a64op_post(s->out[i], s->vec_size * 2)); break; + case 0x110: i_stp(r, vl[i].q, vh[i].q, a64op_post(s->out[i], s->vec_size * 2)); break; + } + } +} + +/*********************************************************************/ +/* swap byte order (for differing endianness) */ +/* AARCH64_SWS_OP_SWAP_BYTES */ + +static void asmgen_op_swap_bytes(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + RasmContext *r = s->rctx; + AArch64VecViews vl[4]; + AArch64VecViews vh[4]; + + for (int i = 0; i < 4; i++) { + a64op_vec_views(s->vl[i], &vl[i]); + a64op_vec_views(s->vh[i], &vh[i]); + } + + switch (aarch64_pixel_size(p->type)) { + case sizeof(uint16_t): + LOOP_MASK (p, i) i_rev16(r, vl[i].b16, vl[i].b16); + LOOP_MASK_VH(s, p, i) i_rev16(r, vh[i].b16, vh[i].b16); + break; + case sizeof(uint32_t): + LOOP_MASK (p, i) i_rev32(r, vl[i].b16, vl[i].b16); + LOOP_MASK_VH(s, p, i) i_rev32(r, vh[i].b16, vh[i].b16); + break; + } +} + +/*********************************************************************/ +/* rearrange channel order, or duplicate channels */ +/* AARCH64_SWS_OP_SWIZZLE */ + +#define SWIZZLE_TMP 0xf + +static const char *print_swizzle_v(char buf[8], uint8_t n, uint8_t vh) +{ + if (n == SWIZZLE_TMP) + snprintf(buf, sizeof(char[8]), "vtmp%c", vh ? 'h' : 'l'); + else + snprintf(buf, sizeof(char[8]), "v%c[%u]", vh ? 
'h' : 'l', n); + return buf; +} +#define PRINT_SWIZZLE_V(n, vh) print_swizzle_v((char[8]){ 0 }, n, vh) + +static RasmOp swizzle_a64op(SwsAArch64Context *s, uint8_t n, uint8_t vh) +{ + if (n == SWIZZLE_TMP) + return s->vt[vh]; + return vh ? s->vh[n] : s->vl[n]; +} + +static void swizzle_emit(SwsAArch64Context *s, uint8_t dst, uint8_t src) +{ + RasmContext *r = s->rctx; + RasmOp src_op[2] = { swizzle_a64op(s, src, 0), swizzle_a64op(s, src, 1) }; + RasmOp dst_op[2] = { swizzle_a64op(s, dst, 0), swizzle_a64op(s, dst, 1) }; + + i_mov (r, dst_op[0], src_op[0]); CMTF("%s = %s;", PRINT_SWIZZLE_V(dst, 0), PRINT_SWIZZLE_V(src, 0)); + if (s->use_vh) { + i_mov(r, dst_op[1], src_op[1]); CMTF("%s = %s;", PRINT_SWIZZLE_V(dst, 1), PRINT_SWIZZLE_V(src, 1)); + } +} + +static void asmgen_op_swizzle(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + /* Compute used vectors (src and dst) */ + uint8_t src_used[4] = { 0 }; + bool done[4] = { true, true, true, true }; + LOOP_MASK(p, dst) { + uint8_t src = MASK_GET(p->swizzle, dst); + src_used[src]++; + done[dst] = false; + } + + /* First perform unobstructed copies. */ + for (bool progress = true; progress; ) { + progress = false; + for (int dst = 0; dst < 4; dst++) { + if (done[dst] || src_used[dst]) + continue; + uint8_t src = MASK_GET(p->swizzle, dst); + swizzle_emit(s, dst, src); + src_used[src]--; + done[dst] = true; + progress = true; + } + } + + /* Then swap and rotate remaining operations. 
*/ + for (int dst = 0; dst < 4; dst++) { + if (done[dst]) + continue; + + swizzle_emit(s, SWIZZLE_TMP, dst); + + uint8_t cur_dst = dst; + uint8_t src = MASK_GET(p->swizzle, cur_dst); + while (src != dst) { + swizzle_emit(s, cur_dst, src); + done[cur_dst] = true; + cur_dst = src; + src = MASK_GET(p->swizzle, cur_dst); + } + + swizzle_emit(s, cur_dst, SWIZZLE_TMP); + done[cur_dst] = true; + } +} + +#undef SWIZZLE_TMP + +/*********************************************************************/ +/* split tightly packed data into components */ +/* AARCH64_SWS_OP_UNPACK */ + +static void asmgen_op_unpack(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + RasmContext *r = s->rctx; + RasmOp *vl = s->vl; + RasmOp *vh = s->vh; + RasmOp *vt = s->vt; + RasmOp mask_gpr = a64op_w(s->tmp0); + uint32_t mask_val[4] = { 0 }; + uint8_t mask_idx[4] = { 0 }; + uint8_t cur_vt = 0; + + const int offsets[4] = { + MASK_GET(p->pack, 3) + MASK_GET(p->pack, 2) + MASK_GET(p->pack, 1), + MASK_GET(p->pack, 3) + MASK_GET(p->pack, 2), + MASK_GET(p->pack, 3), + 0 + }; + + /* Generate masks. */ + rasm_add_comment(r, "generate masks"); + LOOP_MASK(p, i) { + uint32_t val = (1u << MASK_GET(p->pack, i)) - 1; + for (int j = 0; j < 4; j++) { + if (mask_val[j] == val) { + mask_val[i] = mask_val[j]; + mask_idx[i] = mask_idx[j]; + break; + } + } + if (!mask_val[i]) { + /** + * All-one values in movi only work up to 8-bit, and then + * at full 16- or 32-bit, but not for intermediate values + * like 10-bit. In those cases, we use mov + dup instead. + */ + if (val <= 0xff || val == 0xffff) { + i_movi(r, vt[cur_vt], IMM(val)); + } else { + i_mov (r, mask_gpr, IMM(val)); + i_dup (r, vt[cur_vt], mask_gpr); + } + mask_val[i] = val; + mask_idx[i] = cur_vt++; + } + } + + /* Loop backwards to avoid clobbering component 0. 
*/ + LOOP_MASK_BWD (p, i) { + if (offsets[i]) { + i_ushr (r, vl[i], vl[0], IMM(offsets[i])); CMTF("vl[%u] >>= %u;", i, offsets[i]); + } else if (i) { + i_mov16b(r, vl[i], vl[0]); CMTF("vl[%u] = vl[0];", i); + } + } + LOOP_MASK_BWD_VH(s, p, i) { + if (offsets[i]) { + i_ushr (r, vh[i], vh[0], IMM(offsets[i])); CMTF("vh[%u] >>= %u;", i, offsets[i]); + } else if (i) { + i_mov16b(r, vh[i], vh[0]); CMTF("vh[%u] = vh[0];", i); + } + } + + /* Apply masks. */ + reshape_all_vectors(s, 16, 1); + LOOP_MASK_BWD (p, i) { i_and(r, vl[i], vl[i], vt[mask_idx[i]]); CMTF("vl[%u] &= 0x%x;", i, mask_val[i]); } + LOOP_MASK_BWD_VH(s, p, i) { i_and(r, vh[i], vh[i], vt[mask_idx[i]]); CMTF("vh[%u] &= 0x%x;", i, mask_val[i]); } +} + +/*********************************************************************/ +/* compress components into tightly packed data */ +/* AARCH64_SWS_OP_PACK */ + +static void asmgen_op_pack(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + RasmContext *r = s->rctx; + RasmOp *vl = s->vl; + RasmOp *vh = s->vh; + + const int offsets[4] = { + MASK_GET(p->pack, 3) + MASK_GET(p->pack, 2) + MASK_GET(p->pack, 1), + MASK_GET(p->pack, 3) + MASK_GET(p->pack, 2), + MASK_GET(p->pack, 3), + 0 + }; + uint16_t offset_mask = 0; + LOOP_MASK(p, i) { + if (offsets[i]) + MASK_SET(offset_mask, i, 1); + } + + /* Perform left shift. */ + LOOP (offset_mask, i) { i_shl(r, vl[i], vl[i], IMM(offsets[i])); CMTF("vl[%u] <<= %u;", i, offsets[i]); } + LOOP_VH(s, offset_mask, i) { i_shl(r, vh[i], vh[i], IMM(offsets[i])); CMTF("vh[%u] <<= %u;", i, offsets[i]); } + + /* Combine components. 
*/ + reshape_all_vectors(s, 16, 1); + LOOP_MASK (p, i) { + if (i != 0) { + i_orr (r, vl[0], vl[0], vl[i]); CMTF("vl[0] |= vl[%u];", i); + if (s->use_vh) { + i_orr(r, vh[0], vh[0], vh[i]); CMTF("vh[0] |= vh[%u];", i); + } + } + } +} + +/*********************************************************************/ +/* logical left shift of raw pixel values by (u8) */ +/* AARCH64_SWS_OP_LSHIFT */ + +static void asmgen_op_lshift(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + RasmContext *r = s->rctx; + RasmOp *vl = s->vl; + RasmOp *vh = s->vh; + + LOOP_MASK (p, i) { i_shl(r, vl[i], vl[i], IMM(p->shift)); CMTF("vl[%u] <<= %u;", i, p->shift); } + LOOP_MASK_VH(s, p, i) { i_shl(r, vh[i], vh[i], IMM(p->shift)); CMTF("vh[%u] <<= %u;", i, p->shift); } +} + +/*********************************************************************/ +/* right shift of raw pixel values by (u8) */ +/* AARCH64_SWS_OP_RSHIFT */ + +static void asmgen_op_rshift(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + RasmContext *r = s->rctx; + RasmOp *vl = s->vl; + RasmOp *vh = s->vh; + + LOOP_MASK (p, i) { i_ushr(r, vl[i], vl[i], IMM(p->shift)); CMTF("vl[%u] >>= %u;", i, p->shift); } + LOOP_MASK_VH(s, p, i) { i_ushr(r, vh[i], vh[i], IMM(p->shift)); CMTF("vh[%u] >>= %u;", i, p->shift); } +} + +/*********************************************************************/ +/* clear pixel values */ +/* AARCH64_SWS_OP_CLEAR */ + +static void asmgen_op_clear(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + RasmContext *r = s->rctx; + RasmOp *vl = s->vl; + RasmOp *vh = s->vh; + RasmOp clear_vec = s->vt[0]; + + /** + * TODO + * - pack elements in impl->priv and perform smaller loads + * - if only 1 element and not vh, load directly with ld1r + */ + + i_ldr(r, v_q(clear_vec), a64op_off(s->impl, offsetof_impl_priv)); CMT("v128 clear_vec = impl->priv.v128;"); + + LOOP_MASK (p, i) { i_dup(r, vl[i], a64op_elem(clear_vec, i)); CMTF("vl[%u] = broadcast(clear_vec[%u])", i, i); } + LOOP_MASK_VH(s, p, i) 
{ i_dup(r, vh[i], a64op_elem(clear_vec, i)); CMTF("vh[%u] = broadcast(clear_vec[%u])", i, i); } +} + +/*********************************************************************/ +/* convert (cast) between formats */ +/* AARCH64_SWS_OP_CONVERT */ + +static void asmgen_op_convert(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + RasmContext *r = s->rctx; + AArch64VecViews vl[4]; + AArch64VecViews vh[4]; + + /** + * Since each instruction in the convert operation needs specific + * element types, it is simpler to use arrangement specifiers for + * each operand instead of reshaping all vectors. + */ + + for (int i = 0; i < 4; i++) { + a64op_vec_views(s->vl[i], &vl[i]); + a64op_vec_views(s->vh[i], &vh[i]); + } + + size_t src_el_size = s->el_size; + size_t dst_el_size = aarch64_pixel_size(p->to_type); + + /** + * This function assumes block_size is either 8 or 16, and that + * we're always using the most amount of vector registers possible. + * Therefore, u32 always uses the high vector bank. 
+ */ + if (p->type == AARCH64_PIXEL_F32) { + rasm_add_comment(r, "f32 -> u32"); + LOOP_MASK(p, i) i_fcvtzu(r, vl[i].s4, vl[i].s4); + LOOP_MASK(p, i) i_fcvtzu(r, vh[i].s4, vh[i].s4); + } + + if (p->block_size == 8) { + if (src_el_size == 1 && dst_el_size > src_el_size) { + rasm_add_comment(r, "u8 -> u16"); + LOOP_MASK(p, i) i_uxtl (r, vl[i].h8, vl[i].b8); + src_el_size = 2; + } else if (src_el_size == 4 && dst_el_size < src_el_size) { + rasm_add_comment(r, "u32 -> u16"); + LOOP_MASK(p, i) i_xtn (r, vl[i].h4, vl[i].s4); + LOOP_MASK(p, i) i_xtn (r, vh[i].h4, vh[i].s4); + LOOP_MASK(p, i) i_ins (r, vl[i].de[1], vh[i].de[0]); + src_el_size = 2; + } + if (src_el_size == 2 && dst_el_size == 4) { + rasm_add_comment(r, "u16 -> u32"); + LOOP_MASK(p, i) i_uxtl2(r, vh[i].s4, vl[i].h8); + LOOP_MASK(p, i) i_uxtl (r, vl[i].s4, vl[i].h4); + src_el_size = 4; + } else if (src_el_size == 2 && dst_el_size == 1) { + rasm_add_comment(r, "u16 -> u8"); + LOOP_MASK(p, i) i_xtn (r, vl[i].b8, vl[i].h8); + src_el_size = 1; + } + } else /* if (p->block_size == 16) */ { + if (src_el_size == 1 && dst_el_size == 2) { + rasm_add_comment(r, "u8 -> u16"); + LOOP_MASK(p, i) i_uxtl2(r, vh[i].h8, vl[i].b16); + LOOP_MASK(p, i) i_uxtl (r, vl[i].h8, vl[i].b8); + } else if (src_el_size == 2 && dst_el_size == 1) { + rasm_add_comment(r, "u16 -> u8"); + LOOP_MASK(p, i) i_xtn (r, vl[i].b8, vl[i].h8); + LOOP_MASK(p, i) i_xtn (r, vh[i].b8, vh[i].h8); + LOOP_MASK(p, i) i_ins (r, vl[i].de[1], vh[i].de[0]); + } + } + + /* See comment above for high vector bank usage for u32. 
*/ + if (p->to_type == AARCH64_PIXEL_F32) { + rasm_add_comment(r, "u32 -> f32"); + LOOP_MASK(p, i) i_ucvtf(r, vl[i].s4, vl[i].s4); + LOOP_MASK(p, i) i_ucvtf(r, vh[i].s4, vh[i].s4); + } +} + +/*********************************************************************/ +/* expand integers to the full range */ +/* AARCH64_SWS_OP_EXPAND */ + +static void asmgen_op_expand(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + RasmContext *r = s->rctx; + RasmOp *vl = s->vl; + RasmOp *vh = s->vh; + + size_t src_el_size = s->el_size; + size_t dst_el_size = aarch64_pixel_size(p->to_type); + size_t dst_total_size = p->block_size * dst_el_size; + size_t dst_vec_size = FFMIN(dst_total_size, 16); + + if (!s->use_vh) + s->use_vh = (dst_vec_size != dst_total_size); + + if (src_el_size == 1) { + rasm_add_comment(r, "u8 -> u16"); + reshape_all_vectors(s, 16, 1); + LOOP_MASK_VH(s, p, i) i_zip2(r, vh[i], vl[i], vl[i]); + LOOP_MASK (p, i) i_zip1(r, vl[i], vl[i], vl[i]); + } + if (dst_el_size == 4) { + rasm_add_comment(r, "u16 -> u32"); + reshape_all_vectors(s, 8, 2); + LOOP_MASK_VH(s, p, i) i_zip2(r, vh[i], vl[i], vl[i]); + LOOP_MASK (p, i) i_zip1(r, vl[i], vl[i], vl[i]); + } +} + +/*********************************************************************/ +/* numeric minimum (q4) */ +/* AARCH64_SWS_OP_MIN */ + +static void asmgen_op_min(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + RasmContext *r = s->rctx; + RasmOp *vl = s->vl; + RasmOp *vh = s->vh; + RasmOp *vt = s->vt; + RasmOp min_vec = s->vt[4]; + + i_ldr(r, v_q(min_vec), a64op_off(s->impl, offsetof_impl_priv)); CMT("v128 min_vec = impl->priv.v128;"); + LOOP_MASK(p, i) { i_dup(r, vt[i], a64op_elem(min_vec, i)); CMTF("v128 vmin%u = min_vec[%u];", i, i); } + + if (p->type == AARCH64_PIXEL_F32) { + LOOP_MASK (p, i) { i_fmin(r, vl[i], vl[i], vt[i]); CMTF("vl[%u] = min(vl[%u], vmin%u);", i, i, i); } + LOOP_MASK_VH(s, p, i) { i_fmin(r, vh[i], vh[i], vt[i]); CMTF("vh[%u] = min(vh[%u], vmin%u);", i, i, i); } + } else { + 
LOOP_MASK (p, i) { i_umin(r, vl[i], vl[i], vt[i]); CMTF("vl[%u] = min(vl[%u], vmin%u);", i, i, i); } + LOOP_MASK_VH(s, p, i) { i_umin(r, vh[i], vh[i], vt[i]); CMTF("vh[%u] = min(vh[%u], vmin%u);", i, i, i); } + } +} + +/*********************************************************************/ +/* numeric maximum (q4) */ +/* AARCH64_SWS_OP_MAX */ + +static void asmgen_op_max(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + RasmContext *r = s->rctx; + RasmOp *vl = s->vl; + RasmOp *vh = s->vh; + RasmOp *vt = s->vt; + RasmOp max_vec = s->vt[4]; + + i_ldr(r, v_q(max_vec), a64op_off(s->impl, offsetof_impl_priv)); CMT("v128 max_vec = impl->priv.v128;"); + LOOP_MASK(p, i) { i_dup(r, vt[i], a64op_elem(max_vec, i)); CMTF("v128 vmax%u = max_vec[%u];", i, i); } + + if (p->type == AARCH64_PIXEL_F32) { + LOOP_MASK (p, i) { i_fmax(r, vl[i], vl[i], vt[i]); CMTF("vl[%u] = max(vl[%u], vmax%u);", i, i, i); } + LOOP_MASK_VH(s, p, i) { i_fmax(r, vh[i], vh[i], vt[i]); CMTF("vh[%u] = max(vh[%u], vmax%u);", i, i, i); } + } else { + LOOP_MASK (p, i) { i_umax(r, vl[i], vl[i], vt[i]); CMTF("vl[%u] = max(vl[%u], vmax%u);", i, i, i); } + LOOP_MASK_VH(s, p, i) { i_umax(r, vh[i], vh[i], vt[i]); CMTF("vh[%u] = max(vh[%u], vmax%u);", i, i, i); } + } +} + +/*********************************************************************/ +/* multiplication by scalar (q) */ +/* AARCH64_SWS_OP_SCALE */ + +static void asmgen_op_scale(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + RasmContext *r = s->rctx; + RasmOp *vl = s->vl; + RasmOp *vh = s->vh; + RasmOp priv_ptr = s->tmp0; + RasmOp scale_vec = s->vt[0]; + + i_add (r, priv_ptr, s->impl, IMM(offsetof_impl_priv)); CMT("v128 *scale_vec_ptr = &impl->priv;"); + i_ld1r(r, vv_1(scale_vec), a64op_base(priv_ptr)); CMT("v128 scale_vec = broadcast(*scale_vec_ptr);"); + + if (p->type == AARCH64_PIXEL_F32) { + LOOP_MASK (p, i) { i_fmul(r, vl[i], vl[i], scale_vec); CMTF("vl[%u] *= scale_vec;", i); } + LOOP_MASK_VH(s, p, i) { i_fmul(r, vh[i], vh[i], 
scale_vec); CMTF("vh[%u] *= scale_vec;", i); } + } else { + LOOP_MASK (p, i) { i_mul (r, vl[i], vl[i], scale_vec); CMTF("vl[%u] *= scale_vec;", i); } + LOOP_MASK_VH(s, p, i) { i_mul (r, vh[i], vh[i], scale_vec); CMTF("vh[%u] *= scale_vec;", i); } + } +} + +/*********************************************************************/ +/* generalized linear affine transform */ +/* AARCH64_SWS_OP_LINEAR */ + +/** + * Performs one pass of the linear transform over a single vector bank + * (low or high). + */ +static void linear_pass(SwsAArch64Context *s, const SwsAArch64OpImplParams *p, + RasmOp *vt, RasmOp *vc, + int save_mask, bool vh_pass) +{ + RasmContext *r = s->rctx; + /** + * The intermediate registers for fmul+fadd (for when SWS_BITEXACT + * is set) start from temp vector 4. + */ + RasmOp *vtmp = &vt[4]; + RasmOp *vx = vh_pass ? s->vh : s->vl; + char cvh = vh_pass ? 'h' : 'l'; + + if (vh_pass && !s->use_vh) + return; + + /** + * Save rows that need to be used as input after they have been already + * written to. + */ + RasmOp src_vx[4] = { vx[0], vx[1], vx[2], vx[3] }; + if (save_mask) { + for (int i = 0; i < 4; i++) { + if (MASK_GET(save_mask, i)) { + src_vx[i] = vt[i]; + i_mov16b(r, vt[i], vx[i]); CMTF("vsrc[%u] = v%c[%u];", i, cvh, i); + } + } + } + + /** + * The non-zero coefficients have been packed in aarch64_setup_linear() + * in sequential order into the individual lanes of the coefficient + * vector registers. We must follow the same order of execution here. 
+ */ + int i_coeff = 0; + LOOP_MASK(p, i) { + bool first = true; + RasmNode *pre_mul = rasm_get_current_node(r); + for (int j = 0; j < 5; j++) { + if (!LINEAR_MASK_GET(p->linear.mask, i, j)) + continue; + bool is_offset = linear_index_is_offset(j); + int src_j = linear_index_to_vx(j); + RasmOp vsrc = src_vx[src_j]; + uint8_t vc_i = i_coeff / 4; + uint8_t vc_j = i_coeff & 3; + RasmOp vcoeff = a64op_elem(vc[vc_i], vc_j); + i_coeff++; + if (first && is_offset) { + i_dup (r, vx[i], vcoeff); CMTF("v%c[%u] = broadcast(vc[%u][%u]);", cvh, i, vc_i, vc_j); + } else if (first && !is_offset) { + if (LINEAR_MASK_GET(p->linear.mask, i, j) == LINEAR_MASK_1) { + i_mov16b(r, vx[i], vsrc); CMTF("v%c[%u] = vsrc[%u];", cvh, i, src_j); + } else { + i_fmul (r, vx[i], vsrc, vcoeff); CMTF("v%c[%u] = vsrc[%u] * vc[%u][%u];", cvh, i, src_j, vc_i, vc_j); + } + } else if (!p->linear.fmla) { + /** + * Split the multiply-accumulate into fmul+fadd. All + * multiplications are performed first into temporary + * registers, and only then added to the destination, + * to reduce the dependency chain. + * There is no need to perform multiplications by 1. + */ + if (LINEAR_MASK_GET(p->linear.mask, i, j) != LINEAR_MASK_1) { + pre_mul = rasm_set_current_node(r, pre_mul); + i_fmul(r, vtmp[vc_j], vsrc, vcoeff); CMTF("vtmp[%u] = vsrc[%u] * vc[%u][%u];", vc_j, src_j, vc_i, vc_j); + pre_mul = rasm_set_current_node(r, pre_mul); + i_fadd(r, vx[i], vx[i], vtmp[vc_j]); CMTF("v%c[%u] += vtmp[%u];", cvh, i, vc_j); + } else { + /* The annotation must name the source row (src_j), not the coefficient lane. */ + i_fadd(r, vx[i], vx[i], vsrc); CMTF("v%c[%u] += vsrc[%u];", cvh, i, src_j); + } + } else { + /** + * Most modern aarch64 cores have a fastpath for sequences + * of fmla instructions. This means that even if the coefficient + * is 1, it is still faster to use fmla by 1 instead of fadd. 
+ */ + i_fmla(r, vx[i], vsrc, vcoeff); CMTF("v%c[%u] += vsrc[%u] * vc[%u][%u];", cvh, i, src_j, vc_i, vc_j); + } + first = false; + } + } +} + +static void asmgen_op_linear(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + RasmContext *r = s->rctx; + RasmOp *vt = s->vt; + RasmOp *vc = &vt[8]; /* The coefficients are loaded starting from temp vector 8 */ + RasmOp ptr = s->tmp0; + RasmOp coeff_veclist; + + /* Preload coefficients from impl->priv. */ + const int num_vregs = linear_num_vregs(p); + /* The switch below only handles 1 to 4 registers; 0 would leave coeff_veclist uninitialized. */ + av_assert0(num_vregs >= 1 && num_vregs <= 4); + switch (num_vregs) { + case 1: coeff_veclist = vv_1(vc[0]); break; + case 2: coeff_veclist = vv_2(vc[0], vc[1]); break; + case 3: coeff_veclist = vv_3(vc[0], vc[1], vc[2]); break; + case 4: coeff_veclist = vv_4(vc[0], vc[1], vc[2], vc[3]); break; + } + i_ldr(r, ptr, a64op_off(s->impl, offsetof_impl_priv)); CMT("v128 *vcoeff_ptr = impl->priv.ptr;"); + i_ld1(r, coeff_veclist, a64op_base(ptr)); CMT("coeff_veclist = *vcoeff_ptr;"); + + /* Compute mask for rows that must be saved before being overwritten. */ + uint16_t save_mask = 0; + bool overwritten[4] = { false, false, false, false }; + LOOP_MASK(p, i) { + for (int j = 0; j < 5; j++) { + if (!LINEAR_MASK_GET(p->linear.mask, i, j)) + continue; + bool is_offset = linear_index_is_offset(j); + int src_j = linear_index_to_vx(j); + if (!is_offset && overwritten[src_j]) + MASK_SET(save_mask, j - 1, 1); + overwritten[i] = true; + } + } + + /* Perform linear passes for low and high vector banks. 
*/ + linear_pass(s, p, vt, vc, save_mask, false); + linear_pass(s, p, vt, vc, save_mask, true); +} + +/*********************************************************************/ +/* add dithering noise */ +/* AARCH64_SWS_OP_DITHER */ + +static void asmgen_op_dither(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + RasmContext *r = s->rctx; + RasmOp *vl = s->vl; + RasmOp *vh = s->vh; + RasmOp ptr = s->tmp0; + RasmOp tmp1 = s->tmp1; + RasmOp wtmp1 = a64op_w(tmp1); + RasmOp dither_vl = s->vt[0]; + RasmOp dither_vh = s->vt[1]; + RasmOp bx64 = a64op_x(s->bx); + RasmOp y64 = a64op_x(s->y); + + /** + * For a description of the matrix buffer layout, read the comments + * in aarch64_setup_dither() in aarch64/ops.c. + */ + + /** + * Sort components by y_offset value so that we can start dithering + * with the smallest value, and increment the pointer upwards for + * each new offset. The dither matrix is over-allocated and may be + * over-read at the top, but it cannot be over-read before the start + * of the buffer. Since we only mask the y offset once, this would + * be an issue if we tried to subtract a value larger than the + * initial y_offset. + */ + int sorted[4]; + int n_comps = 0; + /* Very cheap bucket sort. */ + int max_offset = 0; + LOOP_MASK(p, i) + max_offset = FFMAX(max_offset, MASK_GET(p->dither.y_offset, i)); + for (int y_off = 0; y_off <= max_offset; y_off++) { + LOOP_MASK(p, i) { + if (MASK_GET(p->dither.y_offset, i) == y_off) + sorted[n_comps++] = i; + } + } + + i_ldr(r, ptr, a64op_off(s->impl, offsetof_impl_priv)); CMT("void *ptr = impl->priv.ptr;"); + + /** + * We use ubfiz to mask and shift left in one single instruction: + * ubfiz <Wd>, <Wn>, #<lsb>, #<width> + * Wd = (Wn & ((1 << width) - 1)) << lsb; + * + * Given: + * block_size = 8, log2(block_size) = 3 + * dither_size = 16, log2(dither_size) = 4, dither_mask = 0b1111 + * sizeof(float) = 4, log2(sizeof(float)) = 2 + * + * Suppose we have bx = 0bvvvv. 
To get x, we left shift by + * log2(block_size) and end up with 0bvvvv000. Then we mask against + * dither_mask, and end up with 0bv000. Finally we multiply by + * sizeof(float), which is the same as shifting left by + * log2(sizeof(float)). The result is 0bv00000. + * + * Therefore: + * width = log2(dither_size) - log2(block_size) + * lsb = log2(block_size) + log2(sizeof(float)) + */ + const int block_size_log2 = (p->block_size == 16) ? 4 : 3; + const int dither_size_log2 = p->dither.size_log2; + const int sizeof_float_log2 = 2; + if (dither_size_log2 != block_size_log2) { + RasmOp lsb = IMM(block_size_log2 + sizeof_float_log2); + RasmOp width = IMM(dither_size_log2 - block_size_log2); + i_ubfiz(r, tmp1, bx64, lsb, width); CMT("tmp1 = (bx & ((dither_size / block_size) - 1)) * block_size * sizeof(float);"); + i_add (r, ptr, ptr, tmp1); CMT("ptr += tmp1;"); + } + + int last_y_off = -1; + int prev_i = 0; + for (int sorted_i = 0; sorted_i < n_comps; sorted_i++) { + int i = sorted[sorted_i]; + uint8_t y_off = MASK_GET(p->dither.y_offset, i); + bool do_load = (y_off != last_y_off); + + if (last_y_off < 0) { + /* On the first run, calculate pointer inside dither_matrix. */ + RasmOp lsb = IMM(dither_size_log2 + sizeof_float_log2); + RasmOp width = IMM(dither_size_log2); + /** + * The ubfiz instruction for the y offset performs masking + * by the dither matrix size and shifts by the stride. + */ + if (y_off == 0) { + i_ubfiz(r, tmp1, y64, lsb, width); CMT("tmp1 = (y & (dither_size - 1)) * dither_size * sizeof(float);"); + } else { + i_add (r, wtmp1, s->y, IMM(y_off)); CMTF("tmp1 = y + y_off[%u];", i); + i_ubfiz(r, tmp1, tmp1, lsb, width); CMT("tmp1 = (tmp1 & (dither_size - 1)) * dither_size * sizeof(float);"); + } + i_add(r, ptr, ptr, tmp1); CMT("ptr += tmp1;"); + } else if (do_load) { + /** + * On subsequent runs, just increment the pointer. + * The matrix is over-allocated, so we don't risk + * overreading. 
+ */ + int delta = (y_off - last_y_off) * (1 << dither_size_log2) * sizeof(float); + i_add(r, ptr, ptr, IMM(delta)); CMTF("ptr += (y_off[%u] - y_off[%u]) * dither_size * sizeof(float);", i, prev_i); + } + + if (do_load) { + RasmOp dither_vlq = v_q(dither_vl); + RasmOp dither_vhq = v_q(dither_vh); + i_ldp (r, dither_vlq, dither_vhq, a64op_base(ptr)); CMT("{ ditherl, ditherh } = *ptr;"); + } + + i_fadd (r, vl[i], vl[i], dither_vl); CMTF("vl[%u] += vditherl;", i); + if (s->use_vh) { + i_fadd(r, vh[i], vh[i], dither_vh); CMTF("vh[%u] += vditherh;", i); + } + + last_y_off = y_off; + prev_i = i; + } +} + +/*********************************************************************/ +static void asmgen_op_cps(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + RasmContext *r = s->rctx; + + char func_name[128]; + aarch64_op_impl_func_name(func_name, sizeof(func_name), p); + rasm_func_begin(r, func_name, true); + + /** + * Set up vector register dimensions and reshape all vectors + * accordingly. + */ + size_t el_size = aarch64_pixel_size(p->type); + size_t total_size = p->block_size * el_size; + + s->vec_size = FFMIN(total_size, 16); + s->use_vh = (s->vec_size != total_size); + + s->el_size = el_size; + s->el_count = s->vec_size / el_size; + reshape_all_vectors(s, s->el_count, el_size); + + /* Common start for continuation-passing style (CPS) functions. 
*/ + i_ldr(r, s->cont, a64op_off(s->impl, offsetof_impl_cont)); CMT("SwsFuncPtr cont = impl->cont;"); + + switch (p->op) { + case AARCH64_SWS_OP_READ_BIT: asmgen_op_read_bit(s, p); break; + case AARCH64_SWS_OP_READ_NIBBLE: asmgen_op_read_nibble(s, p); break; + case AARCH64_SWS_OP_READ_PACKED: asmgen_op_read_packed(s, p); break; + case AARCH64_SWS_OP_READ_PLANAR: asmgen_op_read_planar(s, p); break; + case AARCH64_SWS_OP_WRITE_BIT: asmgen_op_write_bit(s, p); break; + case AARCH64_SWS_OP_WRITE_NIBBLE: asmgen_op_write_nibble(s, p); break; + case AARCH64_SWS_OP_WRITE_PACKED: asmgen_op_write_packed(s, p); break; + case AARCH64_SWS_OP_WRITE_PLANAR: asmgen_op_write_planar(s, p); break; + case AARCH64_SWS_OP_SWAP_BYTES: asmgen_op_swap_bytes(s, p); break; + case AARCH64_SWS_OP_SWIZZLE: asmgen_op_swizzle(s, p); break; + case AARCH64_SWS_OP_UNPACK: asmgen_op_unpack(s, p); break; + case AARCH64_SWS_OP_PACK: asmgen_op_pack(s, p); break; + case AARCH64_SWS_OP_LSHIFT: asmgen_op_lshift(s, p); break; + case AARCH64_SWS_OP_RSHIFT: asmgen_op_rshift(s, p); break; + case AARCH64_SWS_OP_CLEAR: asmgen_op_clear(s, p); break; + case AARCH64_SWS_OP_CONVERT: asmgen_op_convert(s, p); break; + case AARCH64_SWS_OP_EXPAND: asmgen_op_expand(s, p); break; + case AARCH64_SWS_OP_MIN: asmgen_op_min(s, p); break; + case AARCH64_SWS_OP_MAX: asmgen_op_max(s, p); break; + case AARCH64_SWS_OP_SCALE: asmgen_op_scale(s, p); break; + case AARCH64_SWS_OP_LINEAR: asmgen_op_linear(s, p); break; + case AARCH64_SWS_OP_DITHER: asmgen_op_dither(s, p); break; + /* TODO implement AARCH64_SWS_OP_SHUFFLE */ + default: + break; + } + + /* Common end for CPS functions. 
*/ + i_add(r, s->impl, s->impl, IMM(sizeof_impl)); CMT("impl += 1;"); + i_br (r, s->cont); CMT("jump to cont"); +} + +static void asmgen_op(SwsAArch64Context *s, const SwsAArch64OpImplParams *p) +{ + switch (p->op) { + case AARCH64_SWS_OP_PROCESS: + asmgen_process(s, p); + break; + case AARCH64_SWS_OP_PROCESS_RETURN: + asmgen_process_return(s, p); + break; + default: + asmgen_op_cps(s, p); + break; + } +} + +/*********************************************************************/ +static void aarch64_op_impl_lookup_str(char *buf, size_t size, const SwsAArch64OpImplParams *params, + const SwsAArch64OpImplParams *prev, const char *p_str) +{ + int first_diff = 0; + int prev_levels = 0; + int levels = 0; + + /* Compute number of current levels. */ + if (params) { + const ParamField **fields = op_fields[params->op]; + while (fields[levels]) + levels++; + } + + /* Compute number of previous levels. */ + if (prev) { + const ParamField **prev_fields = op_fields[prev->op]; + while (prev_fields[prev_levels]) + prev_levels++; + } + + /* Walk up and check the conditions that match. */ + if (params && prev) { + const ParamField **fields = op_fields[params->op]; + first_diff = -1; + for (int i = 0; fields[i]; i++) { + const ParamField *field = fields[i]; + if (first_diff < 0) { + int diff = field->cmp_val((void *) (((uintptr_t) params) + field->offset), + (void *) (((uintptr_t) prev) + field->offset)); + if (diff) + first_diff = i; + } + } + } + + /* Walk back closing conditions. */ + if (prev) { + for (int i = prev_levels - 1; i > first_diff; i--) { + buf_appendf(&buf, &size, "%*sreturn NULL;\n", 4 * (i + 1), ""); + buf_appendf(&buf, &size, "%*s}\n", 4 * i, ""); + } + } + + /* Walk up adding conditions to return current function. 
*/ + if (params) { + const ParamField **fields = op_fields[params->op]; + for (int i = first_diff; i < levels; i++) { + const ParamField *field = fields[i]; + void *p = (void *) (((uintptr_t) params) + field->offset); + buf_appendf(&buf, &size, "%*sif (%s%s == ", 4 * (i + 1), "", p_str, field->name); + field->print_val(&buf, &size, p); + buf_appendf(&buf, &size, ")"); + if (i == (levels - 1)) { + buf_appendf(&buf, &size, " return "); + impl_func_name(&buf, &size, params); + buf_appendf(&buf, &size, ";\n"); + } else { + buf_appendf(&buf, &size, " {\n"); + } + } + } + + av_assert0(size && "string buffer exhausted"); +} + +static int lookup_gen(void) +{ + char buf[1024]; + + /** + * The lookup function matches the SwsAArch64OpImplParams from + * ops_entries.c to the exported functions generated by asmgen_op(). + * Each call to aarch64_op_impl_lookup_str() generates a code + * fragment to uniquely detect the current function, opening and/or + * closing conditions depending on the parameters of the previous + * function. + */ + + /* External function declarations. */ + printf("#include \"libswscale/aarch64/ops_lookup.h\"\n"); + printf("\n"); + for (const SwsAArch64OpImplParams *p = impl_params; p->op; p++) { + aarch64_op_impl_func_name(buf, sizeof(buf), p); + printf("extern void %s(void);\n", buf); + } + printf("\n"); + + /* Lookup function. 
*/ + printf("SwsFuncPtr ff_sws_aarch64_lookup(const SwsAArch64OpImplParams *p)\n"); + printf("{\n"); + const SwsAArch64OpImplParams *prev = NULL; + for (const SwsAArch64OpImplParams *p = impl_params; p->op; p++) { + aarch64_op_impl_lookup_str(buf, sizeof(buf), p, prev, "p->"); + printf("%s", buf); + prev = p; + } + aarch64_op_impl_lookup_str(buf, sizeof(buf), NULL, prev, "p->"); + printf("%s", buf); + printf(" return NULL;\n"); + printf("}\n"); + + return 0; +} + +/*********************************************************************/ + +/* Generate all functions described by ops_entries.c */ +static int asmgen(void) +{ + RasmContext *rctx = rasm_alloc(); + if (!rctx) + return AVERROR(ENOMEM); + + SwsAArch64Context s = { .rctx = rctx }; + int ret; + + /** + * The entry point of the SwsOpFunc is the `process` function. The + * kernel functions are chained by directly branching to the next + * operation, using a continuation-passing style design. The exit + * point of the SwsOpFunc is the `process_return` function. + * + * The GPRs used by the entire call-chain are listed below. + * + * Function arguments are passed in r0-r5. After the parameters + * from `exec` have been read, r0 is reused to branch to the + * continuation functions. After the original parameters from + * `impl` have been computed, r1 is reused as the `impl` pointer + * for each operation. + * + * Loop iterators are r6 for `bx` and r3 for `y`, reused from + * `y_start`, which doesn't need to be preserved. + * + * The intra-procedure-call temporary registers (r16 and r17) are + * used as scratch registers. They may be used by call veneers and + * PLT code inserted by the linker, so we cannot expect them to + * persist across branches between functions. + * + * The Platform Register (r18) is not used. + * + * The read/write data pointers and padding values first use up the + * remaining free caller-saved registers, and only then are the + * callee-saved registers (r19-r28) used. 
+ */ + + /* SwsOpFunc arguments. */ + s.exec = a64op_gpx(0); // const SwsOpExec *exec + s.impl = a64op_gpx(1); // const void *priv + s.bx_start = a64op_gpw(2); // int bx_start + s.y_start = a64op_gpw(3); // int y_start + s.bx_end = a64op_gpw(4); // int bx_end + s.y_end = a64op_gpw(5); // int y_end + + /* Loop iterator variables. */ + s.bx = a64op_gpw(6); + s.y = s.y_start; /* Reused from SwsOpFunc argument. */ + + /* Scratch registers. */ + s.tmp0 = a64op_gpx(16); /* IP0 */ + s.tmp1 = a64op_gpx(17); /* IP1 */ + + /* CPS-related variables. */ + s.op0_func = a64op_gpx(7); + s.op1_impl = a64op_gpx(8); + s.cont = s.exec; /* Reused from SwsOpFunc argument. */ + + /* Read/Write data pointers and padding. */ + s.in [0] = a64op_gpx(9); + s.out [0] = a64op_gpx(10); + s.in_bump [0] = a64op_gpx(11); + s.out_bump[0] = a64op_gpx(12); + s.in [1] = a64op_gpx(13); + s.out [1] = a64op_gpx(14); + s.in_bump [1] = a64op_gpx(15); + s.out_bump[1] = a64op_gpx(19); + s.in [2] = a64op_gpx(20); + s.out [2] = a64op_gpx(21); + s.in_bump [2] = a64op_gpx(22); + s.out_bump[2] = a64op_gpx(23); + s.in [3] = a64op_gpx(24); + s.out [3] = a64op_gpx(25); + s.in_bump [3] = a64op_gpx(26); + s.out_bump[3] = a64op_gpx(27); + + /* Generate all functions from ops_entries.c using rasm. */ + const SwsAArch64OpImplParams *params = impl_params; + while (params->op) { + asmgen_op(&s, params++); + if (rctx->error) { + ret = rctx->error; + goto error; + } + } + + /* Print all rasm functions to stdout. 
*/ + printf("#include \"libavutil/aarch64/asm.S\"\n"); + printf("\n"); + ret = rasm_print(s.rctx, stdout); + +error: + rasm_free(&s.rctx); + return ret; +} + +/*********************************************************************/ +int main(int argc, char *argv[]) +{ + bool lookup = false; + bool ops = false; + +#ifdef _WIN32 + _setmode(_fileno(stdout), _O_BINARY); +#endif + + for (int i = 1; i < argc; i++) { + if (!strcmp(argv[i], "-ops")) + ops = true; + else if (!strcmp(argv[i], "-lookup")) + lookup = true; + } + if ((lookup && ops) || (!lookup && !ops)) { + fprintf(stderr, "Exactly one of -ops or -lookup must be specified.\n"); + return -1; + } + + return lookup ? lookup_gen() : asmgen(); +} diff --git a/libswscale/aarch64/ops_impl.c b/libswscale/aarch64/ops_impl.c index 1e2f42ef14..f7e7b18dcf 100644 --- a/libswscale/aarch64/ops_impl.c +++ b/libswscale/aarch64/ops_impl.c @@ -18,6 +18,13 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/** + * This file is used both by sws_ops_aarch64 to generate ops_entries.c and + * by the standalone build-time tool that generates the static assembly + * functions (aarch64/ops_asmgen). Therefore, it must not depend on internal + * FFmpeg libraries. + */ + #include <inttypes.h> #include <stdarg.h> #include <stdbool.h> @@ -25,6 +32,10 @@ #include "libavutil/attributes.h" +/** + * NOTE: ops_asmgen contains header redefinitions to provide av_assert0 + * while not depending on internal FFmpeg libraries. 
+ */ #include "libavutil/avassert.h" #include "ops_impl.h" @@ -46,6 +57,22 @@ static const char *aarch64_pixel_type(SwsAArch64PixelType fmt) return pixel_types[fmt]; } +static const char pixel_type_names[AARCH64_PIXEL_TYPE_NB][4] = { + [AARCH64_PIXEL_U8 ] = "u8", + [AARCH64_PIXEL_U16] = "u16", + [AARCH64_PIXEL_U32] = "u32", + [AARCH64_PIXEL_F32] = "f32", +}; + +static const char *aarch64_pixel_type_name(SwsAArch64PixelType fmt) +{ + if (fmt >= AARCH64_PIXEL_TYPE_NB) { + av_assert0(!"Invalid pixel type!"); + return NULL; + } + return pixel_type_names[fmt]; +} + /*********************************************************************/ static const char op_types[AARCH64_SWS_OP_TYPE_NB][32] = { [AARCH64_SWS_OP_NONE ] = "AARCH64_SWS_OP_NONE", @@ -84,6 +111,43 @@ static const char *aarch64_op_type(SwsAArch64OpType op) return op_types[op]; } +static const char op_type_names[AARCH64_SWS_OP_TYPE_NB][16] = { + [AARCH64_SWS_OP_NONE ] = "none", + [AARCH64_SWS_OP_PROCESS ] = "process", + [AARCH64_SWS_OP_PROCESS_RETURN] = "process_return", + [AARCH64_SWS_OP_READ_BIT ] = "read_bit", + [AARCH64_SWS_OP_READ_NIBBLE ] = "read_nibble", + [AARCH64_SWS_OP_READ_PACKED ] = "read_packed", + [AARCH64_SWS_OP_READ_PLANAR ] = "read_planar", + [AARCH64_SWS_OP_WRITE_BIT ] = "write_bit", + [AARCH64_SWS_OP_WRITE_NIBBLE ] = "write_nibble", + [AARCH64_SWS_OP_WRITE_PACKED ] = "write_packed", + [AARCH64_SWS_OP_WRITE_PLANAR ] = "write_planar", + [AARCH64_SWS_OP_SWAP_BYTES ] = "swap_bytes", + [AARCH64_SWS_OP_SWIZZLE ] = "swizzle", + [AARCH64_SWS_OP_UNPACK ] = "unpack", + [AARCH64_SWS_OP_PACK ] = "pack", + [AARCH64_SWS_OP_LSHIFT ] = "lshift", + [AARCH64_SWS_OP_RSHIFT ] = "rshift", + [AARCH64_SWS_OP_CLEAR ] = "clear", + [AARCH64_SWS_OP_CONVERT ] = "convert", + [AARCH64_SWS_OP_EXPAND ] = "expand", + [AARCH64_SWS_OP_MIN ] = "min", + [AARCH64_SWS_OP_MAX ] = "max", + [AARCH64_SWS_OP_SCALE ] = "scale", + [AARCH64_SWS_OP_LINEAR ] = "linear", + [AARCH64_SWS_OP_DITHER ] = "dither", +}; + +static const char 
*aarch64_op_type_name(SwsAArch64OpType op) +{ + if (op == AARCH64_SWS_OP_NONE || op >= AARCH64_SWS_OP_TYPE_NB) { + av_assert0(!"Invalid op type!"); + return NULL; + } + return op_type_names[op]; +} + /*********************************************************************/ /* * Helper string concatenation function that does not depend on the @@ -127,12 +191,19 @@ typedef struct ParamField { const char *name; size_t offset; size_t size; + void (*print_str)(char **pbuf, size_t *prem, void *p); void (*print_val)(char **pbuf, size_t *prem, void *p); int (*cmp_val)(void *pa, void *pb); } ParamField; #define PARAM_FIELD(name) #name, offsetof(SwsAArch64OpImplParams, name), sizeof(((SwsAArch64OpImplParams *) 0)->name) +static void print_op_name(char **pbuf, size_t *prem, void *p) +{ + SwsAArch64OpType op = *(SwsAArch64OpType *) p; + buf_appendf(pbuf, prem, "_%s", aarch64_op_type_name(op)); +} + static void print_op_val(char **pbuf, size_t *prem, void *p) { SwsAArch64OpType op = *(SwsAArch64OpType *) p; @@ -149,6 +220,12 @@ static int cmp_op(void *pa, void *pb) return 0; } +static void print_pixel_name(char **pbuf, size_t *prem, void *p) +{ + SwsAArch64PixelType type = *(SwsAArch64PixelType *) p; + buf_appendf(pbuf, prem, "_%s", aarch64_pixel_type_name(type)); +} + static void print_pixel_val(char **pbuf, size_t *prem, void *p) { SwsAArch64PixelType type = *(SwsAArch64PixelType *) p; @@ -165,6 +242,12 @@ static int cmp_pixel(void *pa, void *pb) return 0; } +static void print_u8_name(char **pbuf, size_t *prem, void *p) +{ + uint8_t val = *(uint8_t *) p; + buf_appendf(pbuf, prem, "_%u", val); +} + static void print_u8_val(char **pbuf, size_t *prem, void *p) { uint8_t val = *(uint8_t *) p; @@ -181,6 +264,12 @@ static int cmp_u8(void *pa, void *pb) return 0; } +static void print_u16_name(char **pbuf, size_t *prem, void *p) +{ + uint16_t val = *(uint16_t *) p; + buf_appendf(pbuf, prem, "_%04x", val); +} + static void print_u16_val(char **pbuf, size_t *prem, void *p) { uint16_t val 
= *(uint16_t *) p; @@ -197,6 +286,12 @@ static int cmp_u16(void *pa, void *pb) return 0; } +static void print_u40_name(char **pbuf, size_t *prem, void *p) +{ + uint64_t val = *(uint64_t *) p; + buf_appendf(pbuf, prem, "_%010" PRIx64, val); +} + static void print_u40_val(char **pbuf, size_t *prem, void *p) { uint64_t val = *(uint64_t *) p; @@ -214,18 +309,18 @@ static int cmp_u40(void *pa, void *pb) } /*********************************************************************/ -static const ParamField field_op = { PARAM_FIELD(op), print_op_val, cmp_op }; -static const ParamField field_mask = { PARAM_FIELD(mask), print_u16_val, cmp_u16 }; -static const ParamField field_type = { PARAM_FIELD(type), print_pixel_val, cmp_pixel }; -static const ParamField field_block_size = { PARAM_FIELD(block_size), print_u8_val, cmp_u8 }; -static const ParamField field_shift = { PARAM_FIELD(shift), print_u8_val, cmp_u8 }; -static const ParamField field_swizzle = { PARAM_FIELD(swizzle), print_u16_val, cmp_u16 }; -static const ParamField field_pack = { PARAM_FIELD(pack), print_u16_val, cmp_u16 }; -static const ParamField field_to_type = { PARAM_FIELD(to_type), print_pixel_val, cmp_pixel }; -static const ParamField field_linear_mask = { PARAM_FIELD(linear.mask), print_u40_val, cmp_u40 }; -static const ParamField field_linear_fmla = { PARAM_FIELD(linear.fmla), print_u8_val, cmp_u8 }; -static const ParamField field_dither_y_offset = { PARAM_FIELD(dither.y_offset), print_u16_val, cmp_u16 }; -static const ParamField field_dither_size_log2 = { PARAM_FIELD(dither.size_log2), print_u8_val, cmp_u8 }; +static const ParamField field_op = { PARAM_FIELD(op), print_op_name, print_op_val, cmp_op }; +static const ParamField field_mask = { PARAM_FIELD(mask), print_u16_name, print_u16_val, cmp_u16 }; +static const ParamField field_type = { PARAM_FIELD(type), print_pixel_name, print_pixel_val, cmp_pixel }; +static const ParamField field_block_size = { PARAM_FIELD(block_size), print_u8_name, print_u8_val, cmp_u8 
}; +static const ParamField field_shift = { PARAM_FIELD(shift), print_u8_name, print_u8_val, cmp_u8 }; +static const ParamField field_swizzle = { PARAM_FIELD(swizzle), print_u16_name, print_u16_val, cmp_u16 }; +static const ParamField field_pack = { PARAM_FIELD(pack), print_u16_name, print_u16_val, cmp_u16 }; +static const ParamField field_to_type = { PARAM_FIELD(to_type), print_pixel_name, print_pixel_val, cmp_pixel }; +static const ParamField field_linear_mask = { PARAM_FIELD(linear.mask), print_u40_name, print_u40_val, cmp_u40 }; +static const ParamField field_linear_fmla = { PARAM_FIELD(linear.fmla), print_u8_name, print_u8_val, cmp_u8 }; +static const ParamField field_dither_y_offset = { PARAM_FIELD(dither.y_offset), print_u16_name, print_u16_val, cmp_u16 }; +static const ParamField field_dither_size_log2 = { PARAM_FIELD(dither.size_log2), print_u8_name, print_u8_val, cmp_u8 }; /* Fields needed to uniquely identify each SwsAArch64OpType. */ #define MAX_LEVELS 8 diff --git a/libswscale/aarch64/ops_impl.h b/libswscale/aarch64/ops_impl.h index 7bd23dd8e8..d50c2b4b7d 100644 --- a/libswscale/aarch64/ops_impl.h +++ b/libswscale/aarch64/ops_impl.h @@ -108,8 +108,20 @@ typedef struct SwsAArch64OpImplParams { /* SwsAArch64OpMask-related helpers. 
*/ +#define MASK_GET(mask, idx) (((mask) >> ((idx) << 2)) & 0xf) #define MASK_SET(mask, idx, val) do { (mask) |= (((val) & 0xf) << ((idx) << 2)); } while (0) +#define LOOP(mask, idx) \ + for (int idx = 0; idx < 4; idx++) \ + if (MASK_GET(mask, idx)) +#define LOOP_BWD(mask, idx) \ + for (int idx = 3; idx >= 0; idx--) \ + if (MASK_GET(mask, idx)) + +#define LOOP_MASK(p, idx) LOOP(p->mask, idx) +#define LOOP_MASK_BWD(p, idx) LOOP_BWD(p->mask, idx) + +#define LINEAR_MASK_GET(mask, idx, jdx) (((mask) >> (2 * ((5 * (idx) + (jdx))))) & 3) #define LINEAR_MASK_SET(mask, idx, jdx, val) do { \ (mask) |= ((((SwsAArch64LinearOpMask) (val)) & 3) << (2 * ((5 * (idx) + (jdx))))); \ } while (0) @@ -117,6 +129,34 @@ typedef struct SwsAArch64OpImplParams { #define LINEAR_MASK_1 1 #define LINEAR_MASK_X 3 +#define LOOP_LINEAR_MASK(p, idx, jdx) \ + LOOP_MASK(p, idx) \ + for (int jdx = 0; jdx < 5; jdx++) \ + if (LINEAR_MASK_GET(p->linear.mask, idx, jdx)) + +/* Compute number of vector registers needed to store all coefficients. */ +static inline int linear_num_vregs(const SwsAArch64OpImplParams *params) +{ + int count = 0; + LOOP_LINEAR_MASK(params, i, j) + count++; + return (count + 3) / 4; +} + +static inline int linear_index_is_offset(int idx) +{ + return (idx == 0); +} + +static inline int linear_index_to_vx(int idx) +{ + /* The offset shouldn't map to any vx, but to please UBSan we map + * it to 0. */ + if (linear_index_is_offset(idx)) + return 0; + return (idx - 1); +} + /** * These values will be used by ops_asmgen to access fields inside of * SwsOpExec and SwsOpImpl. The sizes are checked below when compiling _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
