This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.

commit 19250a184601742ac1b60795f0323692a445c048
Author:     Ramiro Polla <[email protected]>
AuthorDate: Mon Apr 13 15:28:32 2026 +0200
Commit:     Ramiro Polla <[email protected]>
CommitDate: Wed Jun 10 01:47:10 2026 +0200

    swscale/aarch64/ops: use plain `ret` instruction
    
    Use a call/ret pair instead of awkwardly exporting and then jumping
    back to the return label.
    
    This is similar to c29465bcb6, but for aarch64.
    
    Sponsored-by: Sovereign Tech Fund
    Signed-off-by: Ramiro Polla <[email protected]>
---
 libswscale/aarch64/ops.c           |  12 +---
 libswscale/aarch64/ops_asmgen.c    | 112 +++++++++++++++++++++----------------
 libswscale/aarch64/ops_entries.c   |   4 --
 libswscale/aarch64/ops_impl.c      |   3 -
 libswscale/aarch64/ops_impl.h      |   1 -
 libswscale/tests/sws_ops_aarch64.c |   7 +--
 6 files changed, 67 insertions(+), 72 deletions(-)

diff --git a/libswscale/aarch64/ops.c b/libswscale/aarch64/ops.c
index a7e96b16c3..5a95792017 100644
--- a/libswscale/aarch64/ops.c
+++ b/libswscale/aarch64/ops.c
@@ -221,7 +221,7 @@ static int aarch64_compile(SwsContext *ctx, const SwsOpList *ops,
             goto error;
     }
 
-    /* Look up process/process_return functions. */
+    /* Look up process function. */
     const SwsOp *read  = ff_sws_op_list_input(&rest);
     const SwsOp *write = ff_sws_op_list_output(&rest);
     const int read_planes  = read ? (read->rw.packed ? 1 : read->rw.elems) : 0;
@@ -230,19 +230,13 @@ static int aarch64_compile(SwsContext *ctx, const SwsOpList *ops,
     for (int i = 0; i < FFMAX(read_planes, write_planes); i++)
         MASK_SET(mask, i, 1);
 
-    SwsAArch64OpImplParams process_params = { .op = AARCH64_SWS_OP_PROCESS,        .mask = mask };
-    SwsAArch64OpImplParams return_params  = { .op = AARCH64_SWS_OP_PROCESS_RETURN, .mask = mask };
+    SwsAArch64OpImplParams process_params = { .op = AARCH64_SWS_OP_PROCESS, .mask = mask };
     SwsFuncPtr process_func = ff_sws_aarch64_lookup(&process_params);
-    SwsFuncPtr return_func  = ff_sws_aarch64_lookup(&return_params);
-    if (!process_func || !return_func) {
+    if (!process_func) {
         ret = AVERROR(ENOTSUP);
         goto error;
     }
 
-    ret = ff_sws_op_chain_append(chain, return_func, NULL, &(SwsOpPriv) { 0 });
-    if (ret < 0)
-        goto error;
-
     out->func      = (SwsOpFunc) process_func;
     out->cpu_flags = chain->cpu_flags;
 
diff --git a/libswscale/aarch64/ops_asmgen.c b/libswscale/aarch64/ops_asmgen.c
index 7c5bb83f46..7d4182c909 100644
--- a/libswscale/aarch64/ops_asmgen.c
+++ b/libswscale/aarch64/ops_asmgen.c
@@ -260,14 +260,14 @@ static void asmgen_epilogue(SwsAArch64Context *s, const RasmOp *regs, unsigned n
 }
 
 /*********************************************************************/
-/* Callee-saved registers (r19-r28). */
-#define MAX_SAVED_REGS 10
+/* Callee-saved registers (r19-r28, fp, and lr). */
+#define MAX_SAVED_REGS 12
 
 static void clobber_gpr(RasmOp regs[MAX_SAVED_REGS], unsigned *count,
                         RasmOp gpr)
 {
     const int n = a64op_gpr_n(gpr);
-    if (n >= 19 && n <= 28)
+    if (n >= 19 && n <= 30)
         regs[(*count)++] = gpr;
 }
 
@@ -276,6 +276,7 @@ static unsigned clobbered_gprs(const SwsAArch64Context *s,
                                RasmOp regs[MAX_SAVED_REGS])
 {
     unsigned count = 0;
+    clobber_gpr(regs, &count, a64op_lr());
     LOOP_MASK(p, i) {
         clobber_gpr(regs, &count, s->in[i]);
         clobber_gpr(regs, &count, s->out[i]);
@@ -292,9 +293,8 @@ static void asmgen_process(SwsAArch64Context *s, const SwsAArch64OpImplParams *p
     char buf[64];
 
     /**
-     * The process/process_return functions for aarch64 work similarly
-     * to the x86 backend. The description in x86/ops_include.asm mostly
-     * holds as well here.
+     * The process function for aarch64 works similarly to the x86 backend.
+     * The description in x86/ops_include.asm mostly holds as well here.
      */
 
     aarch64_op_impl_func_name(func_name, sizeof(func_name), p);
@@ -329,49 +329,38 @@ static void asmgen_process(SwsAArch64Context *s, const SwsAArch64OpImplParams *p
         i_ldr(r, s->out_bump[i], a64op_off(s->exec, offsetof_exec_out_bump + (i * sizeof(ptrdiff_t))));
     }
 
-    /* Reset x and jump to first kernel. */
-    i_mov(r, s->bx, s->bx_start);   CMT("bx = bx_start;");
-    i_mov(r, s->impl, s->op1_impl); CMT("impl = op1_impl;");
-    i_br (r, s->op0_func);          CMT("jump to op0_func");
-}
+    int first_row  = rasm_new_label(r, NULL);
+    int next_row   = rasm_new_label(r, NULL);
+    int next_block = rasm_new_label(r, NULL);
 
-static void asmgen_process_return(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
-{
-    RasmContext *r = s->rctx;
-    char func_name[128];
+    /* Jump to first row (skips padding). */
+    i_b  (r, rasm_op_label(first_row));     CMT("goto first_row;");
 
-    aarch64_op_impl_func_name(func_name, sizeof(func_name), p);
+    /* Perform padding, preparing for next row. */
+    rasm_add_label(r, next_row);            CMT("next_row:");
+    LOOP_MASK(p, i) { i_add(r, s->in[i],  s->in[i],  s->in_bump[i]);  CMTF("in[%u] += in_bump[%u];", i, i); }
+    LOOP_MASK(p, i) { i_add(r, s->out[i], s->out[i], s->out_bump[i]); CMTF("out[%u] += out_bump[%u];", i, i); }
 
-    rasm_func_begin(r, func_name, true, true);
+    /* First row (reset x). */
+    rasm_add_label(r, first_row);           CMT("first_row:");
+    i_mov(r, s->bx, s->bx_start);           CMT("bx = bx_start;");
 
-    /* Reset impl to first kernel. */
+    /* Reset impl and call first kernel. */
+    rasm_add_label(r, next_block);          CMT("next_block:");
     i_mov(r, s->impl, s->op1_impl);         CMT("impl = op1_impl;");
+    i_blr(r, s->op0_func);                  CMT("op0_func();");
 
     /* Perform horizontal loop. */
-    int loop = rasm_new_label(r, NULL);
     i_add(r, s->bx, s->bx, IMM(1));         CMT("bx += 1;");
     i_cmp(r, s->bx, s->bx_end);             CMT("if (bx != bx_end)");
-    i_bne(r, loop);                         CMT("    goto loop;");
+    i_bne(r, next_block);                   CMT("    goto next_block;");
 
     /* Perform vertical loop. */
-    int end = rasm_new_label(r, NULL);
     i_add(r, s->y, s->y, IMM(1));           CMT("y += 1;");
-    i_cmp(r, s->y, s->y_end);               CMT("if (y == y_end)");
-    i_beq(r, end);                          CMT("    goto end;");
-
-    /* Perform padding and reset x, preparing for next row. */
-    LOOP_MASK(p, i) { i_add(r, s->in[i],  s->in[i],  s->in_bump[i]);  CMTF("in[%u] += in_bump[%u];", i, i); }
-    LOOP_MASK(p, i) { i_add(r, s->out[i], s->out[i], s->out_bump[i]); CMTF("out[%u] += out_bump[%u];", i, i); }
-    i_mov(r, s->bx, s->bx_start);           CMT("bx = bx_start;");
-
-    /* Loop back or end of function. */
-    rasm_add_label(r, loop);                CMT("loop:");
-    i_br (r, s->op0_func);                  CMT("jump to op0_func");
-    rasm_add_label(r, end);                 CMT("end:");
+    i_cmp(r, s->y, s->y_end);               CMT("if (y != y_end)");
+    i_bne(r, next_row);                     CMT("    goto next_row;");
 
     /* Function epilogue */
-    RasmOp saved_regs[MAX_SAVED_REGS];
-    unsigned nsaved = clobbered_gprs(s, p, saved_regs);
     if (nsaved)
         asmgen_epilogue(s, saved_regs, nsaved);
 
@@ -1367,9 +1356,28 @@ static void asmgen_op_cps(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
 {
     RasmContext *r = s->rctx;
 
+    bool is_read = false;
+    bool is_write = false;
+    switch (p->op) {
+    case AARCH64_SWS_OP_READ_BIT:
+    case AARCH64_SWS_OP_READ_NIBBLE:
+    case AARCH64_SWS_OP_READ_PACKED:
+    case AARCH64_SWS_OP_READ_PLANAR:
+        is_read = true;
+        break;
+    case AARCH64_SWS_OP_WRITE_BIT:
+    case AARCH64_SWS_OP_WRITE_NIBBLE:
+    case AARCH64_SWS_OP_WRITE_PACKED:
+    case AARCH64_SWS_OP_WRITE_PLANAR:
+        is_write = true;
+        break;
+    default:
+        break;
+    }
+
     char func_name[128];
     aarch64_op_impl_func_name(func_name, sizeof(func_name), p);
-    rasm_func_begin(r, func_name, true, true);
+    rasm_func_begin(r, func_name, true, !is_read);
 
     /**
      * Set up vector register dimensions and reshape all vectors
@@ -1416,14 +1424,18 @@ static void asmgen_op_cps(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
         break;
     }
 
-    /* Load continuation address and increment impl pointer. */
-    RasmNode *node = rasm_set_current_node(r, s->load_cont_node);
-    RasmOp impl_post = a64op_post(s->impl, sizeof_impl);
-    i_ldr(r, s->cont, impl_post);                   CMT("SwsFuncPtr cont = (impl++)->cont;");
-    rasm_set_current_node(r, node);
-
-    /* Common end for CPS functions. */
-    i_br (r, s->cont);                              CMT("jump to cont");
+    if (is_write) {
+        /* Write functions return directly. */
+        i_ret(r);
+    } else {
+        /* Load continuation address and increment impl pointer. */
+        RasmNode *node = rasm_set_current_node(r, s->load_cont_node);
+        RasmOp impl_post = a64op_post(s->impl, sizeof_impl);
+        i_ldr(r, s->cont, impl_post);                   CMT("SwsFuncPtr cont = (impl++)->cont;");
+        rasm_set_current_node(r, node);
+        /* Common end for remaining CPS functions. */
+        i_br (r, s->cont);                              CMT("jump to cont");
+    }
 }
 
 static void asmgen_op(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
@@ -1432,9 +1444,6 @@ static void asmgen_op(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
     case AARCH64_SWS_OP_PROCESS:
         asmgen_process(s, p);
         break;
-    case AARCH64_SWS_OP_PROCESS_RETURN:
-        asmgen_process_return(s, p);
-        break;
     default:
         asmgen_op_cps(s, p);
         break;
@@ -1561,9 +1570,11 @@ static int asmgen(void)
 
     /**
      * The entry point of the SwsOpFunc is the `process` function. The
+     * first kernel function is called from `process`, and subsequent
      * kernel functions are chained by directly branching to the next
-     * operation, using a continuation-passing style design. The exit
-     * point of the SwsOpFunc is the `process_return` function.
+     * operation, using a continuation-passing style design. The last
+     * operation must be a write operation, which returns from the call
+     * to the `process` function.
      *
      * The GPRs used by the entire call-chain are listed below.
      *
@@ -1586,6 +1597,9 @@ static int asmgen(void)
      * The read/write data pointers and padding values first use up the
      * remaining free caller-saved registers, and only then are the
      * callee-saved registers (r19-r28) used.
+     *
+     * The Link Register (r30) is used when calling the first kernel,
+     * so it must be saved.
      */
 
     /* SwsOpFunc arguments. */
diff --git a/libswscale/aarch64/ops_entries.c b/libswscale/aarch64/ops_entries.c
index 70aad8ae89..ae30ca8b57 100644
--- a/libswscale/aarch64/ops_entries.c
+++ b/libswscale/aarch64/ops_entries.c
@@ -7,10 +7,6 @@
 { .op = AARCH64_SWS_OP_PROCESS, .mask = 0x0011 },
 { .op = AARCH64_SWS_OP_PROCESS, .mask = 0x0111 },
 { .op = AARCH64_SWS_OP_PROCESS, .mask = 0x1111 },
-{ .op = AARCH64_SWS_OP_PROCESS_RETURN, .mask = 0x0001 },
-{ .op = AARCH64_SWS_OP_PROCESS_RETURN, .mask = 0x0011 },
-{ .op = AARCH64_SWS_OP_PROCESS_RETURN, .mask = 0x0111 },
-{ .op = AARCH64_SWS_OP_PROCESS_RETURN, .mask = 0x1111 },
 { .op = AARCH64_SWS_OP_READ_BIT, .block_size = 8, .type = AARCH64_PIXEL_U8, .mask = 0x0001 },
 { .op = AARCH64_SWS_OP_READ_BIT, .block_size = 16, .type = AARCH64_PIXEL_U8, .mask = 0x0001 },
 { .op = AARCH64_SWS_OP_READ_NIBBLE, .block_size = 8, .type = AARCH64_PIXEL_U8, .mask = 0x0001 },
diff --git a/libswscale/aarch64/ops_impl.c b/libswscale/aarch64/ops_impl.c
index f7e7b18dcf..26d6a8d954 100644
--- a/libswscale/aarch64/ops_impl.c
+++ b/libswscale/aarch64/ops_impl.c
@@ -77,7 +77,6 @@ static const char *aarch64_pixel_type_name(SwsAArch64PixelType fmt)
 static const char op_types[AARCH64_SWS_OP_TYPE_NB][32] = {
     [AARCH64_SWS_OP_NONE          ] = "AARCH64_SWS_OP_NONE",
     [AARCH64_SWS_OP_PROCESS       ] = "AARCH64_SWS_OP_PROCESS",
-    [AARCH64_SWS_OP_PROCESS_RETURN] = "AARCH64_SWS_OP_PROCESS_RETURN",
     [AARCH64_SWS_OP_READ_BIT      ] = "AARCH64_SWS_OP_READ_BIT",
     [AARCH64_SWS_OP_READ_NIBBLE   ] = "AARCH64_SWS_OP_READ_NIBBLE",
     [AARCH64_SWS_OP_READ_PACKED   ] = "AARCH64_SWS_OP_READ_PACKED",
@@ -114,7 +113,6 @@ static const char *aarch64_op_type(SwsAArch64OpType op)
 static const char op_type_names[AARCH64_SWS_OP_TYPE_NB][16] = {
     [AARCH64_SWS_OP_NONE          ] = "none",
     [AARCH64_SWS_OP_PROCESS       ] = "process",
-    [AARCH64_SWS_OP_PROCESS_RETURN] = "process_return",
     [AARCH64_SWS_OP_READ_BIT      ] = "read_bit",
     [AARCH64_SWS_OP_READ_NIBBLE   ] = "read_nibble",
     [AARCH64_SWS_OP_READ_PACKED   ] = "read_packed",
@@ -326,7 +324,6 @@ static const ParamField field_dither_size_log2 = { PARAM_FIELD(dither.size_log2)
 #define MAX_LEVELS 8
 static const ParamField *op_fields[AARCH64_SWS_OP_TYPE_NB][MAX_LEVELS] = {
     [AARCH64_SWS_OP_PROCESS       ] = { &field_op,                             
                                                     &field_mask },
-    [AARCH64_SWS_OP_PROCESS_RETURN] = { &field_op,                             
                                                     &field_mask },
     [AARCH64_SWS_OP_READ_BIT      ] = { &field_op,                             
                     &field_block_size, &field_type, &field_mask },
     [AARCH64_SWS_OP_READ_NIBBLE   ] = { &field_op,                             
                     &field_block_size, &field_type, &field_mask },
     [AARCH64_SWS_OP_READ_PACKED   ] = { &field_op,                             
                     &field_block_size, &field_type, &field_mask },
diff --git a/libswscale/aarch64/ops_impl.h b/libswscale/aarch64/ops_impl.h
index 67c4672812..f0bbc9f697 100644
--- a/libswscale/aarch64/ops_impl.h
+++ b/libswscale/aarch64/ops_impl.h
@@ -38,7 +38,6 @@ typedef enum SwsAArch64PixelType {
 typedef enum SwsAArch64OpType {
     AARCH64_SWS_OP_NONE = 0,
     AARCH64_SWS_OP_PROCESS,
-    AARCH64_SWS_OP_PROCESS_RETURN,
     AARCH64_SWS_OP_READ_BIT,
     AARCH64_SWS_OP_READ_NIBBLE,
     AARCH64_SWS_OP_READ_PACKED,
diff --git a/libswscale/tests/sws_ops_aarch64.c b/libswscale/tests/sws_ops_aarch64.c
index ca6279e8cf..84300c6af4 100644
--- a/libswscale/tests/sws_ops_aarch64.c
+++ b/libswscale/tests/sws_ops_aarch64.c
@@ -72,7 +72,7 @@ error:
     return ret;
 }
 
-/* Collect the parameters for the process/process_return functions. */
+/* Collect the parameters for the process function. */
 static int aarch64_collect_process(const SwsOpList *ops, struct AVTreeNode **root)
 {
     const SwsOp *read  = ff_sws_op_list_input(ops);
@@ -89,11 +89,6 @@ static int aarch64_collect_process(const SwsOpList *ops, struct AVTreeNode **roo
         .mask = mask,
     };
 
-    ret = aarch64_collect_op(&params, root);
-    if (ret < 0)
-        return ret;
-
-    params.op = AARCH64_SWS_OP_PROCESS_RETURN;
     ret = aarch64_collect_op(&params, root);
     if (ret < 0)
         return ret;

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to