ops: refactor process function (PR #23412)

Ramiro Polla via ffmpeg-devel Mon, 08 Jun 2026 12:28:29 -0700

PR #23412 opened by Ramiro Polla (ramiro)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23412
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23412.patch


Similarly to c29465bcb6 for x86, use a plain call/ret pair instead of awkwardly 
exporting and then jumping back to the return label.

Then also remove `AARCH64_SWS_OP_PROCESS` from `SwsAArch64OpType`. There was no 
good reason to have it there.


>From 7c49e70850db969bb75aedd5d41828540c9ec249 Mon Sep 17 00:00:00 2001
From: Ramiro Polla <[email protected]>
Date: Wed, 3 Jun 2026 19:40:06 +0200
Subject: [PATCH 1/5] swscale/tests/sws_ops_aarch64: fix skipping of scaling
 ops

Scaling ops were added to ff_sws_enum_op_lists() in 1d841635. But the
code that skipped scaling ops in convert_to_aarch64_impl() wasn't
taking into consideration that, in sws_ops_aarch64, the scaling ops
aren't folded into read ops.

Also updates libswscale/aarch64/ops_entries.c with the new entries.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Ramiro Polla <[email protected]>
---
 libswscale/aarch64/ops_entries.c   | 7 +++++++
 libswscale/aarch64/ops_impl_conv.c | 3 +++
 libswscale/tests/sws_ops_aarch64.c | 2 ++
 3 files changed, 12 insertions(+)

diff --git a/libswscale/aarch64/ops_entries.c b/libswscale/aarch64/ops_entries.c
index 61ff8bf760..70aad8ae89 100644
--- a/libswscale/aarch64/ops_entries.c
+++ b/libswscale/aarch64/ops_entries.c
@@ -116,6 +116,7 @@
 { .op = AARCH64_SWS_OP_SWIZZLE, .swizzle = 0x0213, .block_size = 16, .type = 
AARCH64_PIXEL_U8, .mask = 0x1001 },
 { .op = AARCH64_SWS_OP_SWIZZLE, .swizzle = 0x0213, .block_size = 32, .type = 
AARCH64_PIXEL_U8, .mask = 0x1001 },
 { .op = AARCH64_SWS_OP_SWIZZLE, .swizzle = 0x0231, .block_size = 8, .type = 
AARCH64_PIXEL_U8, .mask = 0x1011 },
+{ .op = AARCH64_SWS_OP_SWIZZLE, .swizzle = 0x0312, .block_size = 8, .type = 
AARCH64_PIXEL_U8, .mask = 0x1101 },
 { .op = AARCH64_SWS_OP_SWIZZLE, .swizzle = 0x0312, .block_size = 16, .type = 
AARCH64_PIXEL_U8, .mask = 0x1101 },
 { .op = AARCH64_SWS_OP_SWIZZLE, .swizzle = 0x0312, .block_size = 32, .type = 
AARCH64_PIXEL_U8, .mask = 0x1101 },
 { .op = AARCH64_SWS_OP_SWIZZLE, .swizzle = 0x0321, .block_size = 8, .type = 
AARCH64_PIXEL_U8, .mask = 0x1111 },
@@ -254,6 +255,7 @@
 { .op = AARCH64_SWS_OP_CLEAR, .block_size = 16, .type = AARCH64_PIXEL_U16, 
.mask = 0x0001 },
 { .op = AARCH64_SWS_OP_CLEAR, .block_size = 16, .type = AARCH64_PIXEL_U16, 
.mask = 0x0010 },
 { .op = AARCH64_SWS_OP_CLEAR, .block_size = 16, .type = AARCH64_PIXEL_U16, 
.mask = 0x1000 },
+{ .op = AARCH64_SWS_OP_CONVERT, .to_type = AARCH64_PIXEL_U8, .block_size = 8, 
.type = AARCH64_PIXEL_U16, .mask = 0x0111 },
 { .op = AARCH64_SWS_OP_CONVERT, .to_type = AARCH64_PIXEL_U8, .block_size = 8, 
.type = AARCH64_PIXEL_F32, .mask = 0x0001 },
 { .op = AARCH64_SWS_OP_CONVERT, .to_type = AARCH64_PIXEL_U8, .block_size = 8, 
.type = AARCH64_PIXEL_F32, .mask = 0x0011 },
 { .op = AARCH64_SWS_OP_CONVERT, .to_type = AARCH64_PIXEL_U8, .block_size = 8, 
.type = AARCH64_PIXEL_F32, .mask = 0x0111 },
@@ -262,6 +264,8 @@
 { .op = AARCH64_SWS_OP_CONVERT, .to_type = AARCH64_PIXEL_U8, .block_size = 8, 
.type = AARCH64_PIXEL_F32, .mask = 0x1111 },
 { .op = AARCH64_SWS_OP_CONVERT, .to_type = AARCH64_PIXEL_U8, .block_size = 16, 
.type = AARCH64_PIXEL_U16, .mask = 0x0111 },
 { .op = AARCH64_SWS_OP_CONVERT, .to_type = AARCH64_PIXEL_U16, .block_size = 8, 
.type = AARCH64_PIXEL_U8, .mask = 0x0111 },
+{ .op = AARCH64_SWS_OP_CONVERT, .to_type = AARCH64_PIXEL_U16, .block_size = 8, 
.type = AARCH64_PIXEL_U32, .mask = 0x0010 },
+{ .op = AARCH64_SWS_OP_CONVERT, .to_type = AARCH64_PIXEL_U16, .block_size = 8, 
.type = AARCH64_PIXEL_U32, .mask = 0x0100 },
 { .op = AARCH64_SWS_OP_CONVERT, .to_type = AARCH64_PIXEL_U16, .block_size = 8, 
.type = AARCH64_PIXEL_U32, .mask = 0x0111 },
 { .op = AARCH64_SWS_OP_CONVERT, .to_type = AARCH64_PIXEL_U16, .block_size = 8, 
.type = AARCH64_PIXEL_U32, .mask = 0x1110 },
 { .op = AARCH64_SWS_OP_CONVERT, .to_type = AARCH64_PIXEL_U16, .block_size = 8, 
.type = AARCH64_PIXEL_F32, .mask = 0x0001 },
@@ -316,6 +320,7 @@
 { .op = AARCH64_SWS_OP_MAX, .block_size = 8, .type = AARCH64_PIXEL_F32, .mask 
= 0x0011 },
 { .op = AARCH64_SWS_OP_MAX, .block_size = 8, .type = AARCH64_PIXEL_F32, .mask 
= 0x0111 },
 { .op = AARCH64_SWS_OP_MAX, .block_size = 8, .type = AARCH64_PIXEL_F32, .mask 
= 0x1001 },
+{ .op = AARCH64_SWS_OP_MAX, .block_size = 8, .type = AARCH64_PIXEL_F32, .mask 
= 0x1110 },
 { .op = AARCH64_SWS_OP_MAX, .block_size = 8, .type = AARCH64_PIXEL_F32, .mask 
= 0x1111 },
 { .op = AARCH64_SWS_OP_SCALE, .block_size = 8, .type = AARCH64_PIXEL_U32, 
.mask = 0x0001 },
 { .op = AARCH64_SWS_OP_SCALE, .block_size = 8, .type = AARCH64_PIXEL_U32, 
.mask = 0x0111 },
@@ -375,9 +380,11 @@
 { .op = AARCH64_SWS_OP_DITHER, .dither.y_offset = 0x3ff0, .dither.size_log2 = 
4, .block_size = 8, .type = AARCH64_PIXEL_F32, .mask = 0x1001 },
 { .op = AARCH64_SWS_OP_DITHER, .dither.y_offset = 0x5023, .dither.size_log2 = 
4, .block_size = 8, .type = AARCH64_PIXEL_F32, .mask = 0x1111 },
 { .op = AARCH64_SWS_OP_DITHER, .dither.y_offset = 0x5032, .dither.size_log2 = 
4, .block_size = 8, .type = AARCH64_PIXEL_F32, .mask = 0x1111 },
+{ .op = AARCH64_SWS_OP_DITHER, .dither.y_offset = 0x5203, .dither.size_log2 = 
4, .block_size = 8, .type = AARCH64_PIXEL_F32, .mask = 0x1111 },
 { .op = AARCH64_SWS_OP_DITHER, .dither.y_offset = 0x5230, .dither.size_log2 = 
4, .block_size = 8, .type = AARCH64_PIXEL_F32, .mask = 0x1111 },
 { .op = AARCH64_SWS_OP_DITHER, .dither.y_offset = 0x5ff0, .dither.size_log2 = 
4, .block_size = 8, .type = AARCH64_PIXEL_F32, .mask = 0x1001 },
 { .op = AARCH64_SWS_OP_DITHER, .dither.y_offset = 0x5fff, .dither.size_log2 = 
4, .block_size = 8, .type = AARCH64_PIXEL_F32, .mask = 0x1000 },
+{ .op = AARCH64_SWS_OP_DITHER, .dither.y_offset = 0xf000, .dither.size_log2 = 
4, .block_size = 8, .type = AARCH64_PIXEL_F32, .mask = 0x0111 },
 { .op = AARCH64_SWS_OP_DITHER, .dither.y_offset = 0xf023, .dither.size_log2 = 
4, .block_size = 8, .type = AARCH64_PIXEL_F32, .mask = 0x0111 },
 { .op = AARCH64_SWS_OP_DITHER, .dither.y_offset = 0xf032, .dither.size_log2 = 
4, .block_size = 8, .type = AARCH64_PIXEL_F32, .mask = 0x0111 },
 { .op = AARCH64_SWS_OP_DITHER, .dither.y_offset = 0xf203, .dither.size_log2 = 
4, .block_size = 8, .type = AARCH64_PIXEL_F32, .mask = 0x0111 },
diff --git a/libswscale/aarch64/ops_impl_conv.c 
b/libswscale/aarch64/ops_impl_conv.c
index 48504dc671..a66b91b6fb 100644
--- a/libswscale/aarch64/ops_impl_conv.c
+++ b/libswscale/aarch64/ops_impl_conv.c
@@ -124,6 +124,9 @@ static int convert_to_aarch64_impl(SwsContext *ctx, const 
SwsOpList *ops, int n,
     case SWS_OP_SCALE:      out->op = AARCH64_SWS_OP_SCALE;      break;
     case SWS_OP_LINEAR:     out->op = AARCH64_SWS_OP_LINEAR;     break;
     case SWS_OP_DITHER:     out->op = AARCH64_SWS_OP_DITHER;     break;
+    case SWS_OP_FILTER_H:
+    case SWS_OP_FILTER_V:
+        return AVERROR(ENOTSUP);
     }
 
     switch (out->op) {
diff --git a/libswscale/tests/sws_ops_aarch64.c 
b/libswscale/tests/sws_ops_aarch64.c
index 21948ca71b..ca6279e8cf 100644
--- a/libswscale/tests/sws_ops_aarch64.c
+++ b/libswscale/tests/sws_ops_aarch64.c
@@ -118,6 +118,8 @@ static int register_op(SwsContext *ctx, void *opaque, 
SwsOpList *ops)
     for (int i = 0; i < rest.num_ops; i++) {
         SwsAArch64OpImplParams params = { 0 };
         ret = convert_to_aarch64_impl(ctx, &rest, i, block_size, &params);
+        if (ret == AVERROR(ENOTSUP))
+            continue;
         if (ret < 0)
             goto end;
         ret = aarch64_collect_op(&params, root);
-- 
2.52.0


>From b6c92ed01e9b1272724ca915513273a1bded0f8a Mon Sep 17 00:00:00 2001
From: Ramiro Polla <[email protected]>
Date: Mon, 13 Apr 2026 15:14:29 +0200
Subject: [PATCH 2/5] swscale/aarch64/rasm: split conditional and unconditional
 branch instructions

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Ramiro Polla <[email protected]>
---
 libswscale/aarch64/rasm.h       | 36 +++++++++++++++++----------------
 libswscale/aarch64/rasm_print.c |  3 ++-
 2 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/libswscale/aarch64/rasm.h b/libswscale/aarch64/rasm.h
index 5a14d8cd64..a91fc3f291 100644
--- a/libswscale/aarch64/rasm.h
+++ b/libswscale/aarch64/rasm.h
@@ -248,6 +248,7 @@ typedef enum AArch64InsnId {
     AARCH64_INSN_ADR,
     AARCH64_INSN_AND,
     AARCH64_INSN_B,
+    AARCH64_INSN_BCOND,
     AARCH64_INSN_BR,
     AARCH64_INSN_CMP,
     AARCH64_INSN_CSEL,
@@ -537,7 +538,8 @@ static inline RasmOp a64cond_nv(void) { return 
a64op_cond(AARCH64_COND_NV); }
 #define i_addv(rctx,   op0, op1          ) rasm_add_insn(rctx, 
AARCH64_INSN_ADDV,   op0, op1, OPN, OPN)
 #define i_adr(rctx,    op0, op1          ) rasm_add_insn(rctx, 
AARCH64_INSN_ADR,    op0, op1, OPN, OPN)
 #define i_and(rctx,    op0, op1, op2     ) rasm_add_insn(rctx, 
AARCH64_INSN_AND,    op0, op1, op2, OPN)
-#define i_b(rctx,      op0, op1          ) rasm_add_insn(rctx, AARCH64_INSN_B, 
     op0, op1, OPN, OPN)
+#define i_b(rctx,      op0               ) rasm_add_insn(rctx, AARCH64_INSN_B, 
     op0, OPN, OPN, OPN)
+#define i_bcond(rctx,  op0, op1          ) rasm_add_insn(rctx, 
AARCH64_INSN_BCOND,  op0, op1, OPN, OPN)
 #define i_br(rctx,     op0               ) rasm_add_insn(rctx, 
AARCH64_INSN_BR,     op0, OPN, OPN, OPN)
 #define i_cmp(rctx,    op0, op1          ) rasm_add_insn(rctx, 
AARCH64_INSN_CMP,    op0, op1, OPN, OPN)
 #define i_csel(rctx,   op0, op1, op2, op3) rasm_add_insn(rctx, 
AARCH64_INSN_CSEL,   op0, op1, op2, op3)
@@ -592,22 +594,22 @@ static inline RasmOp a64cond_nv(void) { return 
a64op_cond(AARCH64_COND_NV); }
 #define i_zip2(rctx,   op0, op1, op2     ) rasm_add_insn(rctx, 
AARCH64_INSN_ZIP2,   op0, op1, op2, OPN)
 
 /* Branch helpers. */
-#define i_beq(rctx, id) i_b(rctx, a64cond_eq(), rasm_op_label(id))
-#define i_bne(rctx, id) i_b(rctx, a64cond_ne(), rasm_op_label(id))
-#define i_bhs(rctx, id) i_b(rctx, a64cond_hs(), rasm_op_label(id))
-#define i_bcs(rctx, id) i_b(rctx, a64cond_cs(), rasm_op_label(id))
-#define i_blo(rctx, id) i_b(rctx, a64cond_lo(), rasm_op_label(id))
-#define i_bcc(rctx, id) i_b(rctx, a64cond_cc(), rasm_op_label(id))
-#define i_bmi(rctx, id) i_b(rctx, a64cond_mi(), rasm_op_label(id))
-#define i_bpl(rctx, id) i_b(rctx, a64cond_pl(), rasm_op_label(id))
-#define i_bvs(rctx, id) i_b(rctx, a64cond_vs(), rasm_op_label(id))
-#define i_bvc(rctx, id) i_b(rctx, a64cond_vc(), rasm_op_label(id))
-#define i_bhi(rctx, id) i_b(rctx, a64cond_hi(), rasm_op_label(id))
-#define i_bls(rctx, id) i_b(rctx, a64cond_ls(), rasm_op_label(id))
-#define i_bge(rctx, id) i_b(rctx, a64cond_ge(), rasm_op_label(id))
-#define i_blt(rctx, id) i_b(rctx, a64cond_lt(), rasm_op_label(id))
-#define i_bgt(rctx, id) i_b(rctx, a64cond_gt(), rasm_op_label(id))
-#define i_ble(rctx, id) i_b(rctx, a64cond_le(), rasm_op_label(id))
+#define i_beq(rctx, id) i_bcond(rctx, a64cond_eq(), rasm_op_label(id))
+#define i_bne(rctx, id) i_bcond(rctx, a64cond_ne(), rasm_op_label(id))
+#define i_bhs(rctx, id) i_bcond(rctx, a64cond_hs(), rasm_op_label(id))
+#define i_bcs(rctx, id) i_bcond(rctx, a64cond_cs(), rasm_op_label(id))
+#define i_blo(rctx, id) i_bcond(rctx, a64cond_lo(), rasm_op_label(id))
+#define i_bcc(rctx, id) i_bcond(rctx, a64cond_cc(), rasm_op_label(id))
+#define i_bmi(rctx, id) i_bcond(rctx, a64cond_mi(), rasm_op_label(id))
+#define i_bpl(rctx, id) i_bcond(rctx, a64cond_pl(), rasm_op_label(id))
+#define i_bvs(rctx, id) i_bcond(rctx, a64cond_vs(), rasm_op_label(id))
+#define i_bvc(rctx, id) i_bcond(rctx, a64cond_vc(), rasm_op_label(id))
+#define i_bhi(rctx, id) i_bcond(rctx, a64cond_hi(), rasm_op_label(id))
+#define i_bls(rctx, id) i_bcond(rctx, a64cond_ls(), rasm_op_label(id))
+#define i_bge(rctx, id) i_bcond(rctx, a64cond_ge(), rasm_op_label(id))
+#define i_blt(rctx, id) i_bcond(rctx, a64cond_lt(), rasm_op_label(id))
+#define i_bgt(rctx, id) i_bcond(rctx, a64cond_gt(), rasm_op_label(id))
+#define i_ble(rctx, id) i_bcond(rctx, a64cond_le(), rasm_op_label(id))
 
 /* Extra helpers. */
 #define i_mov16b(rctx, op0, op1) i_mov(rctx, v_16b(op0), v_16b(op1))
diff --git a/libswscale/aarch64/rasm_print.c b/libswscale/aarch64/rasm_print.c
index 86f543b3c9..8f55d87401 100644
--- a/libswscale/aarch64/rasm_print.c
+++ b/libswscale/aarch64/rasm_print.c
@@ -271,6 +271,7 @@ static const char insn_names[AARCH64_INSN_NB][8] = {
     [AARCH64_INSN_ADR   ] = "adr",
     [AARCH64_INSN_AND   ] = "and",
     [AARCH64_INSN_B     ] = "b",
+    [AARCH64_INSN_BCOND ] = "b",
     [AARCH64_INSN_BR    ] = "br",
     [AARCH64_INSN_CMP   ] = "cmp",
     [AARCH64_INSN_CSEL  ] = "csel",
@@ -342,7 +343,7 @@ static void print_node_insn(const RasmContext *rctx,
     indent_to(fp, pos, line_start, INSTR_INDENT);
 
     int op_start = 0;
-    if (node->insn.id == AARCH64_INSN_B && rasm_op_type(node->insn.op[0]) == 
AARCH64_OP_COND) {
+    if (node->insn.id == AARCH64_INSN_BCOND) {
         pos_fprintf(fp, pos, "b.%-14s", 
cond_name(a64op_cond_val(node->insn.op[0])));
         op_start = 1;
     } else if (rasm_op_type(node->insn.op[0]) == RASM_OP_NONE) {
-- 
2.52.0


>From 9186784764da793be932b1b16548ee5871024831 Mon Sep 17 00:00:00 2001
From: Ramiro Polla <[email protected]>
Date: Mon, 13 Apr 2026 15:16:35 +0200
Subject: [PATCH 3/5] swscale/aarch64/rasm: add blr instruction

And a64op_lr() helper for LR register.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Ramiro Polla <[email protected]>
---
 libswscale/aarch64/rasm.h       | 3 +++
 libswscale/aarch64/rasm_print.c | 1 +
 2 files changed, 4 insertions(+)

diff --git a/libswscale/aarch64/rasm.h b/libswscale/aarch64/rasm.h
index a91fc3f291..2ced8d0e95 100644
--- a/libswscale/aarch64/rasm.h
+++ b/libswscale/aarch64/rasm.h
@@ -249,6 +249,7 @@ typedef enum AArch64InsnId {
     AARCH64_INSN_AND,
     AARCH64_INSN_B,
     AARCH64_INSN_BCOND,
+    AARCH64_INSN_BLR,
     AARCH64_INSN_BR,
     AARCH64_INSN_CMP,
     AARCH64_INSN_CSEL,
@@ -351,6 +352,7 @@ static inline uint8_t a64op_gpr_size(RasmOp op) { return 
op.u8[1]; }
 
 static inline RasmOp a64op_gpw(uint8_t n) { return a64op_make_gpr(n, 
sizeof(uint32_t)); }
 static inline RasmOp a64op_gpx(uint8_t n) { return a64op_make_gpr(n, 
sizeof(uint64_t)); }
+static inline RasmOp a64op_lr (void)      { return a64op_make_gpr(30, 
sizeof(uint64_t)); }
 static inline RasmOp a64op_sp (void)      { return a64op_make_gpr(31, 
sizeof(uint64_t)); }
 
 /* modifiers */
@@ -540,6 +542,7 @@ static inline RasmOp a64cond_nv(void) { return 
a64op_cond(AARCH64_COND_NV); }
 #define i_and(rctx,    op0, op1, op2     ) rasm_add_insn(rctx, 
AARCH64_INSN_AND,    op0, op1, op2, OPN)
 #define i_b(rctx,      op0               ) rasm_add_insn(rctx, AARCH64_INSN_B, 
     op0, OPN, OPN, OPN)
 #define i_bcond(rctx,  op0, op1          ) rasm_add_insn(rctx, 
AARCH64_INSN_BCOND,  op0, op1, OPN, OPN)
+#define i_blr(rctx,    op0               ) rasm_add_insn(rctx, 
AARCH64_INSN_BLR,    op0, OPN, OPN, OPN)
 #define i_br(rctx,     op0               ) rasm_add_insn(rctx, 
AARCH64_INSN_BR,     op0, OPN, OPN, OPN)
 #define i_cmp(rctx,    op0, op1          ) rasm_add_insn(rctx, 
AARCH64_INSN_CMP,    op0, op1, OPN, OPN)
 #define i_csel(rctx,   op0, op1, op2, op3) rasm_add_insn(rctx, 
AARCH64_INSN_CSEL,   op0, op1, op2, op3)
diff --git a/libswscale/aarch64/rasm_print.c b/libswscale/aarch64/rasm_print.c
index 8f55d87401..ff870f8a27 100644
--- a/libswscale/aarch64/rasm_print.c
+++ b/libswscale/aarch64/rasm_print.c
@@ -272,6 +272,7 @@ static const char insn_names[AARCH64_INSN_NB][8] = {
     [AARCH64_INSN_AND   ] = "and",
     [AARCH64_INSN_B     ] = "b",
     [AARCH64_INSN_BCOND ] = "b",
+    [AARCH64_INSN_BLR   ] = "blr",
     [AARCH64_INSN_BR    ] = "br",
     [AARCH64_INSN_CMP   ] = "cmp",
     [AARCH64_INSN_CSEL  ] = "csel",
-- 
2.52.0


>From 09f687d03db4f81de773980f2d663b3a3c3117a0 Mon Sep 17 00:00:00 2001
From: Ramiro Polla <[email protected]>
Date: Mon, 13 Apr 2026 15:28:32 +0200
Subject: [PATCH 4/5] swscale/aarch64/ops: use plain `ret` instruction

Use a call/ret pair instead of awkwardly exporting and then jumping
back to the return label.

This is similar to c29465bcb6, but for aarch64.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Ramiro Polla <[email protected]>
---
 libswscale/aarch64/ops.c           |  12 +--
 libswscale/aarch64/ops_asmgen.c    | 124 ++++++++++++++++-------------
 libswscale/aarch64/ops_entries.c   |   4 -
 libswscale/aarch64/ops_impl.c      |   3 -
 libswscale/aarch64/ops_impl.h      |   1 -
 libswscale/tests/sws_ops_aarch64.c |   7 +-
 6 files changed, 73 insertions(+), 78 deletions(-)

diff --git a/libswscale/aarch64/ops.c b/libswscale/aarch64/ops.c
index 4598a8db6b..c9d0ef58f1 100644
--- a/libswscale/aarch64/ops.c
+++ b/libswscale/aarch64/ops.c
@@ -220,7 +220,7 @@ static int aarch64_compile(SwsContext *ctx, SwsOpList *ops, 
SwsCompiledOp *out)
             goto error;
     }
 
-    /* Look up process/process_return functions. */
+    /* Look up process function. */
     const SwsOp *read  = ff_sws_op_list_input(&rest);
     const SwsOp *write = ff_sws_op_list_output(&rest);
     const int read_planes  = read ? (read->rw.packed ? 1 : read->rw.elems) : 0;
@@ -229,19 +229,13 @@ static int aarch64_compile(SwsContext *ctx, SwsOpList 
*ops, SwsCompiledOp *out)
     for (int i = 0; i < FFMAX(read_planes, write_planes); i++)
         MASK_SET(mask, i, 1);
 
-    SwsAArch64OpImplParams process_params = { .op = AARCH64_SWS_OP_PROCESS,    
    .mask = mask };
-    SwsAArch64OpImplParams return_params  = { .op = 
AARCH64_SWS_OP_PROCESS_RETURN, .mask = mask };
+    SwsAArch64OpImplParams process_params = { .op = AARCH64_SWS_OP_PROCESS, 
.mask = mask };
     SwsFuncPtr process_func = ff_sws_aarch64_lookup(&process_params);
-    SwsFuncPtr return_func  = ff_sws_aarch64_lookup(&return_params);
-    if (!process_func || !return_func) {
+    if (!process_func) {
         ret = AVERROR(ENOTSUP);
         goto error;
     }
 
-    ret = ff_sws_op_chain_append(chain, return_func, NULL, &(SwsOpPriv) { 0 });
-    if (ret < 0)
-        goto error;
-
     out->func      = (SwsOpFunc) process_func;
     out->cpu_flags = chain->cpu_flags;
 
diff --git a/libswscale/aarch64/ops_asmgen.c b/libswscale/aarch64/ops_asmgen.c
index e88a162de1..a1e379967d 100644
--- a/libswscale/aarch64/ops_asmgen.c
+++ b/libswscale/aarch64/ops_asmgen.c
@@ -260,14 +260,14 @@ static void asmgen_epilogue(SwsAArch64Context *s, const 
RasmOp *regs, unsigned n
 }
 
 /*********************************************************************/
-/* Callee-saved registers (r19-r28). */
-#define MAX_SAVED_REGS 10
+/* Callee-saved registers (r19-r28, fp, and lr). */
+#define MAX_SAVED_REGS 12
 
 static void clobber_gpr(RasmOp regs[MAX_SAVED_REGS], unsigned *count,
                         RasmOp gpr)
 {
     const int n = a64op_gpr_n(gpr);
-    if (n >= 19 && n <= 28)
+    if (n >= 19 && n <= 30)
         regs[(*count)++] = gpr;
 }
 
@@ -276,6 +276,7 @@ static unsigned clobbered_gprs(const SwsAArch64Context *s,
                                RasmOp regs[MAX_SAVED_REGS])
 {
     unsigned count = 0;
+    clobber_gpr(regs, &count, a64op_lr());
     LOOP_MASK(p, i) {
         clobber_gpr(regs, &count, s->in[i]);
         clobber_gpr(regs, &count, s->out[i]);
@@ -292,9 +293,8 @@ static void asmgen_process(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p
     char buf[64];
 
     /**
-     * The process/process_return functions for aarch64 work similarly
-     * to the x86 backend. The description in x86/ops_common.asm mostly
-     * holds as well here.
+     * The process function for aarch64 works similarly to the x86 backend.
+     * The description in x86/ops_common.asm mostly holds as well here.
      */
 
     aarch64_op_impl_func_name(func_name, sizeof(func_name), p);
@@ -329,49 +329,38 @@ static void asmgen_process(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p
         i_ldr(r, s->out_bump[i], a64op_off(s->exec, offsetof_exec_out_bump + 
(i * sizeof(ptrdiff_t))));
     }
 
-    /* Reset x and jump to first kernel. */
-    i_mov(r, s->bx, s->bx_start);   CMT("bx = bx_start;");
-    i_mov(r, s->impl, s->op1_impl); CMT("impl = op1_impl;");
-    i_br (r, s->op0_func);          CMT("jump to op0_func");
-}
+    int first_row  = rasm_new_label(r, NULL);
+    int next_row   = rasm_new_label(r, NULL);
+    int next_block = rasm_new_label(r, NULL);
 
-static void asmgen_process_return(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p)
-{
-    RasmContext *r = s->rctx;
-    char func_name[128];
+    /* Jump to first row (skips padding). */
+    i_b  (r, rasm_op_label(first_row));     CMT("goto first_row;");
 
-    aarch64_op_impl_func_name(func_name, sizeof(func_name), p);
-
-    rasm_func_begin(r, func_name, true, true);
-
-    /* Reset impl to first kernel. */
-    i_mov(r, s->impl, s->op1_impl);         CMT("impl = op1_impl;");
-
-    /* Perform horizontal loop. */
-    int loop = rasm_new_label(r, NULL);
-    i_add(r, s->bx, s->bx, IMM(1));         CMT("bx += 1;");
-    i_cmp(r, s->bx, s->bx_end);             CMT("if (bx != bx_end)");
-    i_bne(r, loop);                         CMT("    goto loop;");
-
-    /* Perform vertical loop. */
-    int end = rasm_new_label(r, NULL);
-    i_add(r, s->y, s->y, IMM(1));           CMT("y += 1;");
-    i_cmp(r, s->y, s->y_end);               CMT("if (y == y_end)");
-    i_beq(r, end);                          CMT("    goto end;");
-
-    /* Perform padding and reset x, preparing for next row. */
+    /* Perform padding, preparing for next row. */
+    rasm_add_label(r, next_row);            CMT("next_row:");
     LOOP_MASK(p, i) { i_add(r, s->in[i],  s->in[i],  s->in_bump[i]);  
CMTF("in[%u] += in_bump[%u];", i, i); }
     LOOP_MASK(p, i) { i_add(r, s->out[i], s->out[i], s->out_bump[i]); 
CMTF("out[%u] += out_bump[%u];", i, i); }
+
+    /* First row (reset x). */
+    rasm_add_label(r, first_row);           CMT("first_row:");
     i_mov(r, s->bx, s->bx_start);           CMT("bx = bx_start;");
 
-    /* Loop back or end of function. */
-    rasm_add_label(r, loop);                CMT("loop:");
-    i_br (r, s->op0_func);                  CMT("jump to op0_func");
-    rasm_add_label(r, end);                 CMT("end:");
+    /* Reset impl and call first kernel. */
+    rasm_add_label(r, next_block);          CMT("next_block:");
+    i_mov(r, s->impl, s->op1_impl);         CMT("impl = op1_impl;");
+    i_blr(r, s->op0_func);                  CMT("op0_func();");
+
+    /* Perform horizontal loop. */
+    i_add(r, s->bx, s->bx, IMM(1));         CMT("bx += 1;");
+    i_cmp(r, s->bx, s->bx_end);             CMT("if (bx != bx_end)");
+    i_bne(r, next_block);                   CMT("    goto next_block;");
+
+    /* Perform vertical loop. */
+    i_add(r, s->y, s->y, IMM(1));           CMT("y += 1;");
+    i_cmp(r, s->y, s->y_end);               CMT("if (y != y_end)");
+    i_bne(r, next_row);                     CMT("    goto next_row;");
 
     /* Function epilogue */
-    RasmOp saved_regs[MAX_SAVED_REGS];
-    unsigned nsaved = clobbered_gprs(s, p, saved_regs);
     if (nsaved)
         asmgen_epilogue(s, saved_regs, nsaved);
 
@@ -1367,9 +1356,28 @@ static void asmgen_op_cps(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p)
 {
     RasmContext *r = s->rctx;
 
+    bool is_read = false;
+    bool is_write = false;
+    switch (p->op) {
+    case AARCH64_SWS_OP_READ_BIT:
+    case AARCH64_SWS_OP_READ_NIBBLE:
+    case AARCH64_SWS_OP_READ_PACKED:
+    case AARCH64_SWS_OP_READ_PLANAR:
+        is_read = true;
+        break;
+    case AARCH64_SWS_OP_WRITE_BIT:
+    case AARCH64_SWS_OP_WRITE_NIBBLE:
+    case AARCH64_SWS_OP_WRITE_PACKED:
+    case AARCH64_SWS_OP_WRITE_PLANAR:
+        is_write = true;
+        break;
+    default:
+        break;
+    }
+
     char func_name[128];
     aarch64_op_impl_func_name(func_name, sizeof(func_name), p);
-    rasm_func_begin(r, func_name, true, true);
+    rasm_func_begin(r, func_name, true, !is_read);
 
     /**
      * Set up vector register dimensions and reshape all vectors
@@ -1416,14 +1424,18 @@ static void asmgen_op_cps(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p)
         break;
     }
 
-    /* Load continuation address and increment impl pointer. */
-    RasmNode *node = rasm_set_current_node(r, s->load_cont_node);
-    RasmOp impl_post = a64op_post(s->impl, sizeof_impl);
-    i_ldr(r, s->cont, impl_post);                   CMT("SwsFuncPtr cont = 
(impl++)->cont;");
-    rasm_set_current_node(r, node);
-
-    /* Common end for CPS functions. */
-    i_br (r, s->cont);                              CMT("jump to cont");
+    if (is_write) {
+        /* Write functions return directly. */
+        i_ret(r);
+    } else {
+        /* Load continuation address and increment impl pointer. */
+        RasmNode *node = rasm_set_current_node(r, s->load_cont_node);
+        RasmOp impl_post = a64op_post(s->impl, sizeof_impl);
+        i_ldr(r, s->cont, impl_post);                   CMT("SwsFuncPtr cont = 
(impl++)->cont;");
+        rasm_set_current_node(r, node);
+        /* Common end for remaining CPS functions. */
+        i_br (r, s->cont);                              CMT("jump to cont");
+    }
 }
 
 static void asmgen_op(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
@@ -1432,9 +1444,6 @@ static void asmgen_op(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p)
     case AARCH64_SWS_OP_PROCESS:
         asmgen_process(s, p);
         break;
-    case AARCH64_SWS_OP_PROCESS_RETURN:
-        asmgen_process_return(s, p);
-        break;
     default:
         asmgen_op_cps(s, p);
         break;
@@ -1561,9 +1570,11 @@ static int asmgen(void)
 
     /**
      * The entry point of the SwsOpFunc is the `process` function. The
+     * first kernel function is called from `process`, and subsequent
      * kernel functions are chained by directly branching to the next
-     * operation, using a continuation-passing style design. The exit
-     * point of the SwsOpFunc is the `process_return` function.
+     * operation, using a continuation-passing style design. The last
+     * operation must be a write operation, which returns from the call
+     * to the `process` function.
      *
      * The GPRs used by the entire call-chain are listed below.
      *
@@ -1586,6 +1597,9 @@ static int asmgen(void)
      * The read/write data pointers and padding values first use up the
      * remaining free caller-saved registers, and only then are the
      * caller-saved registers (r19-r28) used.
+     *
+     * The Link Register (r30) is used when calling the first kernel,
+     * so it must be saved.
      */
 
     /* SwsOpFunc arguments. */
diff --git a/libswscale/aarch64/ops_entries.c b/libswscale/aarch64/ops_entries.c
index 70aad8ae89..ae30ca8b57 100644
--- a/libswscale/aarch64/ops_entries.c
+++ b/libswscale/aarch64/ops_entries.c
@@ -7,10 +7,6 @@
 { .op = AARCH64_SWS_OP_PROCESS, .mask = 0x0011 },
 { .op = AARCH64_SWS_OP_PROCESS, .mask = 0x0111 },
 { .op = AARCH64_SWS_OP_PROCESS, .mask = 0x1111 },
-{ .op = AARCH64_SWS_OP_PROCESS_RETURN, .mask = 0x0001 },
-{ .op = AARCH64_SWS_OP_PROCESS_RETURN, .mask = 0x0011 },
-{ .op = AARCH64_SWS_OP_PROCESS_RETURN, .mask = 0x0111 },
-{ .op = AARCH64_SWS_OP_PROCESS_RETURN, .mask = 0x1111 },
 { .op = AARCH64_SWS_OP_READ_BIT, .block_size = 8, .type = AARCH64_PIXEL_U8, 
.mask = 0x0001 },
 { .op = AARCH64_SWS_OP_READ_BIT, .block_size = 16, .type = AARCH64_PIXEL_U8, 
.mask = 0x0001 },
 { .op = AARCH64_SWS_OP_READ_NIBBLE, .block_size = 8, .type = AARCH64_PIXEL_U8, 
.mask = 0x0001 },
diff --git a/libswscale/aarch64/ops_impl.c b/libswscale/aarch64/ops_impl.c
index f7e7b18dcf..26d6a8d954 100644
--- a/libswscale/aarch64/ops_impl.c
+++ b/libswscale/aarch64/ops_impl.c
@@ -77,7 +77,6 @@ static const char 
*aarch64_pixel_type_name(SwsAArch64PixelType fmt)
 static const char op_types[AARCH64_SWS_OP_TYPE_NB][32] = {
     [AARCH64_SWS_OP_NONE          ] = "AARCH64_SWS_OP_NONE",
     [AARCH64_SWS_OP_PROCESS       ] = "AARCH64_SWS_OP_PROCESS",
-    [AARCH64_SWS_OP_PROCESS_RETURN] = "AARCH64_SWS_OP_PROCESS_RETURN",
     [AARCH64_SWS_OP_READ_BIT      ] = "AARCH64_SWS_OP_READ_BIT",
     [AARCH64_SWS_OP_READ_NIBBLE   ] = "AARCH64_SWS_OP_READ_NIBBLE",
     [AARCH64_SWS_OP_READ_PACKED   ] = "AARCH64_SWS_OP_READ_PACKED",
@@ -114,7 +113,6 @@ static const char *aarch64_op_type(SwsAArch64OpType op)
 static const char op_type_names[AARCH64_SWS_OP_TYPE_NB][16] = {
     [AARCH64_SWS_OP_NONE          ] = "none",
     [AARCH64_SWS_OP_PROCESS       ] = "process",
-    [AARCH64_SWS_OP_PROCESS_RETURN] = "process_return",
     [AARCH64_SWS_OP_READ_BIT      ] = "read_bit",
     [AARCH64_SWS_OP_READ_NIBBLE   ] = "read_nibble",
     [AARCH64_SWS_OP_READ_PACKED   ] = "read_packed",
@@ -326,7 +324,6 @@ static const ParamField field_dither_size_log2 = { 
PARAM_FIELD(dither.size_log2)
 #define MAX_LEVELS 8
 static const ParamField *op_fields[AARCH64_SWS_OP_TYPE_NB][MAX_LEVELS] = {
     [AARCH64_SWS_OP_PROCESS       ] = { &field_op,                             
                                                     &field_mask },
-    [AARCH64_SWS_OP_PROCESS_RETURN] = { &field_op,                             
                                                     &field_mask },
     [AARCH64_SWS_OP_READ_BIT      ] = { &field_op,                             
                     &field_block_size, &field_type, &field_mask },
     [AARCH64_SWS_OP_READ_NIBBLE   ] = { &field_op,                             
                     &field_block_size, &field_type, &field_mask },
     [AARCH64_SWS_OP_READ_PACKED   ] = { &field_op,                             
                     &field_block_size, &field_type, &field_mask },
diff --git a/libswscale/aarch64/ops_impl.h b/libswscale/aarch64/ops_impl.h
index 67c4672812..f0bbc9f697 100644
--- a/libswscale/aarch64/ops_impl.h
+++ b/libswscale/aarch64/ops_impl.h
@@ -38,7 +38,6 @@ typedef enum SwsAArch64PixelType {
 typedef enum SwsAArch64OpType {
     AARCH64_SWS_OP_NONE = 0,
     AARCH64_SWS_OP_PROCESS,
-    AARCH64_SWS_OP_PROCESS_RETURN,
     AARCH64_SWS_OP_READ_BIT,
     AARCH64_SWS_OP_READ_NIBBLE,
     AARCH64_SWS_OP_READ_PACKED,
diff --git a/libswscale/tests/sws_ops_aarch64.c 
b/libswscale/tests/sws_ops_aarch64.c
index ca6279e8cf..84300c6af4 100644
--- a/libswscale/tests/sws_ops_aarch64.c
+++ b/libswscale/tests/sws_ops_aarch64.c
@@ -72,7 +72,7 @@ error:
     return ret;
 }
 
-/* Collect the parameters for the process/process_return functions. */
+/* Collect the parameters for the process function. */
 static int aarch64_collect_process(const SwsOpList *ops, struct AVTreeNode 
**root)
 {
     const SwsOp *read  = ff_sws_op_list_input(ops);
@@ -89,11 +89,6 @@ static int aarch64_collect_process(const SwsOpList *ops, 
struct AVTreeNode **roo
         .mask = mask,
     };
 
-    ret = aarch64_collect_op(&params, root);
-    if (ret < 0)
-        return ret;
-
-    params.op = AARCH64_SWS_OP_PROCESS_RETURN;
     ret = aarch64_collect_op(&params, root);
     if (ret < 0)
         return ret;
-- 
2.52.0


>From 1f87adf7b90591f8817ce8be17896ce8fdd4709e Mon Sep 17 00:00:00 2001
From: Ramiro Polla <[email protected]>
Date: Mon, 8 Jun 2026 21:10:03 +0200
Subject: [PATCH 5/5] swscale/aarch64/ops: simplify process function generation

There was no good reason to have it as an SwsAArch64OpType.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Ramiro Polla <[email protected]>
---
 libswscale/aarch64/ops.c           | 22 +++++++++-------
 libswscale/aarch64/ops_asmgen.c    | 42 +++++++++++++-----------------
 libswscale/aarch64/ops_entries.c   |  4 ---
 libswscale/aarch64/ops_impl.c      |  3 ---
 libswscale/aarch64/ops_impl.h      |  1 -
 libswscale/tests/sws_ops_aarch64.c | 28 --------------------
 6 files changed, 30 insertions(+), 70 deletions(-)

diff --git a/libswscale/aarch64/ops.c b/libswscale/aarch64/ops.c
index c9d0ef58f1..366f23efdf 100644
--- a/libswscale/aarch64/ops.c
+++ b/libswscale/aarch64/ops.c
@@ -221,22 +221,24 @@ static int aarch64_compile(SwsContext *ctx, SwsOpList 
*ops, SwsCompiledOp *out)
     }
 
     /* Look up process function. */
+    void ff_sws_process_0001_neon(void);
+    void ff_sws_process_0011_neon(void);
+    void ff_sws_process_0111_neon(void);
+    void ff_sws_process_1111_neon(void);
+
     const SwsOp *read  = ff_sws_op_list_input(&rest);
     const SwsOp *write = ff_sws_op_list_output(&rest);
     const int read_planes  = read ? (read->rw.packed ? 1 : read->rw.elems) : 0;
     const int write_planes = write->rw.packed ? 1 : write->rw.elems;
-    SwsAArch64OpMask mask = 0;
-    for (int i = 0; i < FFMAX(read_planes, write_planes); i++)
-        MASK_SET(mask, i, 1);
-
-    SwsAArch64OpImplParams process_params = { .op = AARCH64_SWS_OP_PROCESS, 
.mask = mask };
-    SwsFuncPtr process_func = ff_sws_aarch64_lookup(&process_params);
-    if (!process_func) {
-        ret = AVERROR(ENOTSUP);
-        goto error;
+    SwsOpFunc process_func = NULL;
+    switch (FFMAX(read_planes, write_planes)) {
+    case 1: process_func = (SwsOpFunc) ff_sws_process_0001_neon; break;
+    case 2: process_func = (SwsOpFunc) ff_sws_process_0011_neon; break;
+    case 3: process_func = (SwsOpFunc) ff_sws_process_0111_neon; break;
+    case 4: process_func = (SwsOpFunc) ff_sws_process_1111_neon; break;
     }
 
-    out->func      = (SwsOpFunc) process_func;
+    out->func      = process_func;
     out->cpu_flags = chain->cpu_flags;
 
 error:
diff --git a/libswscale/aarch64/ops_asmgen.c b/libswscale/aarch64/ops_asmgen.c
index a1e379967d..2c043dad65 100644
--- a/libswscale/aarch64/ops_asmgen.c
+++ b/libswscale/aarch64/ops_asmgen.c
@@ -272,12 +272,12 @@ static void clobber_gpr(RasmOp regs[MAX_SAVED_REGS], 
unsigned *count,
 }
 
 static unsigned clobbered_gprs(const SwsAArch64Context *s,
-                               const SwsAArch64OpImplParams *p,
+                               SwsAArch64OpMask mask,
                                RasmOp regs[MAX_SAVED_REGS])
 {
     unsigned count = 0;
     clobber_gpr(regs, &count, a64op_lr());
-    LOOP_MASK(p, i) {
+    LOOP(mask, i) {
         clobber_gpr(regs, &count, s->in[i]);
         clobber_gpr(regs, &count, s->out[i]);
         clobber_gpr(regs, &count, s->in_bump[i]);
@@ -286,7 +286,7 @@ static unsigned clobbered_gprs(const SwsAArch64Context *s,
     return count;
 }
 
-static void asmgen_process(SwsAArch64Context *s, const SwsAArch64OpImplParams 
*p)
+static void asmgen_process(SwsAArch64Context *s, SwsAArch64OpMask mask)
 {
     RasmContext *r = s->rctx;
     char func_name[128];
@@ -297,13 +297,13 @@ static void asmgen_process(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p
      * The description in x86/ops_common.asm mostly holds as well here.
      */
 
-    aarch64_op_impl_func_name(func_name, sizeof(func_name), p);
+    snprintf(func_name, sizeof(func_name), "ff_sws_process_%04x_neon", mask);
 
     rasm_func_begin(r, func_name, true, false);
 
     /* Function prologue */
     RasmOp saved_regs[MAX_SAVED_REGS];
-    unsigned nsaved = clobbered_gprs(s, p, saved_regs);
+    unsigned nsaved = clobbered_gprs(s, mask, saved_regs);
     if (nsaved)
         asmgen_prologue(s, saved_regs, nsaved);
 
@@ -312,19 +312,19 @@ static void asmgen_process(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p
     i_add(r, s->op1_impl, s->impl, IMM(sizeof_impl));               
CMT("SwsOpImpl *op1_impl = impl + 1;");
 
     /* Load values from exec. */
-    LOOP_MASK(p, i) {
+    LOOP(mask, i) {
         rasm_annotate_nextf(r, buf, sizeof(buf), "in[%u] = exec->in[%u];", i, 
i);
         i_ldr(r, s->in[i],       a64op_off(s->exec, offsetof_exec_in       + 
(i * sizeof(uint8_t *))));
     }
-    LOOP_MASK(p, i) {
+    LOOP(mask, i) {
         rasm_annotate_nextf(r, buf, sizeof(buf), "out[%u] = exec->out[%u];", 
i, i);
         i_ldr(r, s->out[i],      a64op_off(s->exec, offsetof_exec_out      + 
(i * sizeof(uint8_t *))));
     }
-    LOOP_MASK(p, i) {
+    LOOP(mask, i) {
         rasm_annotate_nextf(r, buf, sizeof(buf), "in_bump[%u] = 
exec->in_bump[%u];", i, i);
         i_ldr(r, s->in_bump[i],  a64op_off(s->exec, offsetof_exec_in_bump  + 
(i * sizeof(ptrdiff_t))));
     }
-    LOOP_MASK(p, i) {
+    LOOP(mask, i) {
         rasm_annotate_nextf(r, buf, sizeof(buf), "out_bump[%u] = 
exec->out_bump[%u];", i, i);
         i_ldr(r, s->out_bump[i], a64op_off(s->exec, offsetof_exec_out_bump + 
(i * sizeof(ptrdiff_t))));
     }
@@ -338,8 +338,8 @@ static void asmgen_process(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p
 
     /* Perform padding, preparing for next row. */
     rasm_add_label(r, next_row);            CMT("next_row:");
-    LOOP_MASK(p, i) { i_add(r, s->in[i],  s->in[i],  s->in_bump[i]);  
CMTF("in[%u] += in_bump[%u];", i, i); }
-    LOOP_MASK(p, i) { i_add(r, s->out[i], s->out[i], s->out_bump[i]); 
CMTF("out[%u] += out_bump[%u];", i, i); }
+    LOOP(mask, i) { i_add(r, s->in[i],  s->in[i],  s->in_bump[i]);  
CMTF("in[%u] += in_bump[%u];", i, i); }
+    LOOP(mask, i) { i_add(r, s->out[i], s->out[i], s->out_bump[i]); 
CMTF("out[%u] += out_bump[%u];", i, i); }
 
     /* First row (reset x). */
     rasm_add_label(r, first_row);           CMT("first_row:");
@@ -1438,18 +1438,6 @@ static void asmgen_op_cps(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p)
     }
 }
 
-static void asmgen_op(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
-{
-    switch (p->op) {
-    case AARCH64_SWS_OP_PROCESS:
-        asmgen_process(s, p);
-        break;
-    default:
-        asmgen_op_cps(s, p);
-        break;
-    }
-}
-
 /*********************************************************************/
 static void aarch64_op_impl_lookup_str(char *buf, size_t size, const 
SwsAArch64OpImplParams *params,
                                        const SwsAArch64OpImplParams *prev, 
const char *p_str)
@@ -1641,10 +1629,16 @@ static int asmgen(void)
     s.in_bump [3] = a64op_gpx(26);
     s.out_bump[3] = a64op_gpx(27);
 
+    /* Generate all process functions using rasm. */
+    asmgen_process(&s, 0x0001);
+    asmgen_process(&s, 0x0011);
+    asmgen_process(&s, 0x0111);
+    asmgen_process(&s, 0x1111);
+
     /* Generate all functions from ops_entries.c using rasm. */
     const SwsAArch64OpImplParams *params = impl_params;
     while (params->op) {
-        asmgen_op(&s, params++);
+        asmgen_op_cps(&s, params++);
         if (rctx->error) {
             ret = rctx->error;
             goto error;
diff --git a/libswscale/aarch64/ops_entries.c b/libswscale/aarch64/ops_entries.c
index ae30ca8b57..04a665a9f1 100644
--- a/libswscale/aarch64/ops_entries.c
+++ b/libswscale/aarch64/ops_entries.c
@@ -3,10 +3,6 @@
  * To regenerate, run: make sws_ops_entries_aarch64
  */
 
-{ .op = AARCH64_SWS_OP_PROCESS, .mask = 0x0001 },
-{ .op = AARCH64_SWS_OP_PROCESS, .mask = 0x0011 },
-{ .op = AARCH64_SWS_OP_PROCESS, .mask = 0x0111 },
-{ .op = AARCH64_SWS_OP_PROCESS, .mask = 0x1111 },
 { .op = AARCH64_SWS_OP_READ_BIT, .block_size = 8, .type = AARCH64_PIXEL_U8, 
.mask = 0x0001 },
 { .op = AARCH64_SWS_OP_READ_BIT, .block_size = 16, .type = AARCH64_PIXEL_U8, 
.mask = 0x0001 },
 { .op = AARCH64_SWS_OP_READ_NIBBLE, .block_size = 8, .type = AARCH64_PIXEL_U8, 
.mask = 0x0001 },
diff --git a/libswscale/aarch64/ops_impl.c b/libswscale/aarch64/ops_impl.c
index 26d6a8d954..d5be4563c6 100644
--- a/libswscale/aarch64/ops_impl.c
+++ b/libswscale/aarch64/ops_impl.c
@@ -76,7 +76,6 @@ static const char 
*aarch64_pixel_type_name(SwsAArch64PixelType fmt)
 /*********************************************************************/
 static const char op_types[AARCH64_SWS_OP_TYPE_NB][32] = {
     [AARCH64_SWS_OP_NONE          ] = "AARCH64_SWS_OP_NONE",
-    [AARCH64_SWS_OP_PROCESS       ] = "AARCH64_SWS_OP_PROCESS",
     [AARCH64_SWS_OP_READ_BIT      ] = "AARCH64_SWS_OP_READ_BIT",
     [AARCH64_SWS_OP_READ_NIBBLE   ] = "AARCH64_SWS_OP_READ_NIBBLE",
     [AARCH64_SWS_OP_READ_PACKED   ] = "AARCH64_SWS_OP_READ_PACKED",
@@ -112,7 +111,6 @@ static const char *aarch64_op_type(SwsAArch64OpType op)
 
 static const char op_type_names[AARCH64_SWS_OP_TYPE_NB][16] = {
     [AARCH64_SWS_OP_NONE          ] = "none",
-    [AARCH64_SWS_OP_PROCESS       ] = "process",
     [AARCH64_SWS_OP_READ_BIT      ] = "read_bit",
     [AARCH64_SWS_OP_READ_NIBBLE   ] = "read_nibble",
     [AARCH64_SWS_OP_READ_PACKED   ] = "read_packed",
@@ -323,7 +321,6 @@ static const ParamField field_dither_size_log2 = { 
PARAM_FIELD(dither.size_log2)
 /* Fields needed to uniquely identify each SwsAArch64OpType. */
 #define MAX_LEVELS 8
 static const ParamField *op_fields[AARCH64_SWS_OP_TYPE_NB][MAX_LEVELS] = {
-    [AARCH64_SWS_OP_PROCESS       ] = { &field_op,                             
                                                     &field_mask },
     [AARCH64_SWS_OP_READ_BIT      ] = { &field_op,                             
                     &field_block_size, &field_type, &field_mask },
     [AARCH64_SWS_OP_READ_NIBBLE   ] = { &field_op,                             
                     &field_block_size, &field_type, &field_mask },
     [AARCH64_SWS_OP_READ_PACKED   ] = { &field_op,                             
                     &field_block_size, &field_type, &field_mask },
diff --git a/libswscale/aarch64/ops_impl.h b/libswscale/aarch64/ops_impl.h
index f0bbc9f697..9ccacc60e7 100644
--- a/libswscale/aarch64/ops_impl.h
+++ b/libswscale/aarch64/ops_impl.h
@@ -37,7 +37,6 @@ typedef enum SwsAArch64PixelType {
 /* Similar to SwsOpType */
 typedef enum SwsAArch64OpType {
     AARCH64_SWS_OP_NONE = 0,
-    AARCH64_SWS_OP_PROCESS,
     AARCH64_SWS_OP_READ_BIT,
     AARCH64_SWS_OP_READ_NIBBLE,
     AARCH64_SWS_OP_READ_PACKED,
diff --git a/libswscale/tests/sws_ops_aarch64.c 
b/libswscale/tests/sws_ops_aarch64.c
index 84300c6af4..4fa10c7bb0 100644
--- a/libswscale/tests/sws_ops_aarch64.c
+++ b/libswscale/tests/sws_ops_aarch64.c
@@ -72,30 +72,6 @@ error:
     return ret;
 }
 
-/* Collect the parameters for the process function. */
-static int aarch64_collect_process(const SwsOpList *ops, struct AVTreeNode 
**root)
-{
-    const SwsOp *read  = ff_sws_op_list_input(ops);
-    const SwsOp *write = ff_sws_op_list_output(ops);
-    const int read_planes  = read ? (read->rw.packed ? 1 : read->rw.elems) : 0;
-    const int write_planes = write->rw.packed ? 1 : write->rw.elems;
-    int ret;
-
-    SwsAArch64OpMask mask = 0;
-    for (int i = 0; i < FFMAX(read_planes, write_planes); i++)
-        MASK_SET(mask, i, 1);
-    SwsAArch64OpImplParams params = {
-        .op   = AARCH64_SWS_OP_PROCESS,
-        .mask = mask,
-    };
-
-    ret = aarch64_collect_op(&params, root);
-    if (ret < 0)
-        return ret;
-
-    return 0;
-}
-
 static int register_op(SwsContext *ctx, void *opaque, SwsOpList *ops)
 {
     struct AVTreeNode **root = (struct AVTreeNode **) opaque;
@@ -106,10 +82,6 @@ static int register_op(SwsContext *ctx, void *opaque, 
SwsOpList *ops)
     /* Use at most two full vregs during the widest precision section */
     int block_size = (ff_sws_op_list_max_size(ops) == 4) ? 8 : 16;
 
-    ret = aarch64_collect_process(&rest, root);
-    if (ret < 0)
-        return ret;
-
     for (int i = 0; i < rest.num_ops; i++) {
         SwsAArch64OpImplParams params = { 0 };
         ret = convert_to_aarch64_impl(ctx, &rest, i, block_size, &params);
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PR] swscale/aarch64/ops: refactor process function (PR #23412)

Reply via email to