On Thu, 19 Jun 2025, Thirumalai Nagalingam wrote:

> Hi Jeremy,
>
> Thanks again for the quick follow-up. `ldp` is the correct choice here; it's 
> a nice idea for reducing loads.
> I've updated the patch to use it for loading stackaddr and stackbase.
> Also added the Signed-off-by line to the commit message as requested.
>
> The patch is inlined below and also attached.
>
> In-lined patch:
>
> From 609cc27fa50700ab135dff421f08473c29dcb533 Mon Sep 17 00:00:00 2001
> From: Thirumalai Nagalingam <thirumalai.nagalin...@multicorewareinc.com>
> Date: Fri, 20 Jun 2025 02:12:51 +0530
> Subject: [PATCH] Aarch64: Add inline assembly pthread wrapper
>
> This patch adds AArch64-specific inline assembly block for the pthread
> wrapper used to bootstrap new threads. It sets up the thread stack,
> adjusts for __CYGTLS_PADSIZE__, releases the original stack via
> VirtualFree, and invokes the target thread function.
>
> Signed-off-by: Thirumalai Nagalingam 
> <thirumalai.nagalin...@multicorewareinc.com>
> ---
>  winsup/cygwin/create_posix_thread.cc | 18 +++++++++++++++++-
>  1 file changed, 17 insertions(+), 1 deletion(-)
>
> diff --git a/winsup/cygwin/create_posix_thread.cc 
> b/winsup/cygwin/create_posix_thread.cc
> index 3fcd61707..592aaf1a5 100644
> --- a/winsup/cygwin/create_posix_thread.cc
> +++ b/winsup/cygwin/create_posix_thread.cc
> @@ -75,7 +75,7 @@ pthread_wrapper (PVOID arg)
>    /* Initialize new _cygtls. */
>    _my_tls.init_thread (wrapper_arg.stackbase - __CYGTLS_PADSIZE__,
>                      (DWORD (*)(void*, void*)) wrapper_arg.func);
> -#ifdef __x86_64__
> +#if defined(__x86_64__)
>    __asm__ ("\n\
>          leaq  %[WRAPPER_ARG], %%rbx  # Load &wrapper_arg into rbx    \n\
>          movq  (%%rbx), %%r12         # Load thread func into r12     \n\
> @@ -99,6 +99,22 @@ pthread_wrapper (PVOID arg)
>          call  *%%r12                 # Call thread func              \n"
>          : : [WRAPPER_ARG] "o" (wrapper_arg),
>              [CYGTLS] "i" (__CYGTLS_PADSIZE__));
> +#elif defined(__aarch64__)
> +  /* Sets up a new thread stack, frees the original OS stack,
> +   * and calls the thread function with its arg using AArch64 ABI. */
> +  __asm__ __volatile__ ("\n\
> +        mov     x19, %[WRAPPER_ARG]  // x19 = &wrapper_arg              \n\
> +        ldp     x0, x10, [x19, #16]  // x0 = stackaddr, x10 = stackbase \n\
> +        sub     sp, x10, %[CYGTLS]   // sp = stackbase - (CYGTLS)       \n\
> +        mov     fp, xzr              // clear frame pointer (x29)       \n\
> +        mov     x1, xzr              // x1 = 0 (dwSize)                 \n\
> +        mov     x2, #0x8000          // x2 = MEM_RELEASE                \n\
> +        bl      VirtualFree          // free original stack             \n\
> +        ldp     x19, x0, [x19]       // x19 = func, x0 = arg            \n\
> +        blr     x19                  // call thread function            \n"
> +        : : [WRAPPER_ARG] "r" (&wrapper_arg),
> +            [CYGTLS] "r" (__CYGTLS_PADSIZE__)
> +        : "x0", "x1", "x2", "x10", "x19", "x29", "memory");
>  #else
>  #error unimplemented for this target
>  #endif
>

LGTM.  I'll wait at least a day before pushing in case somebody else has
any objections.

Reply via email to