On Thu, 19 Jun 2025, Thirumalai Nagalingam wrote: > Hi Jeremy, > > Thanks again for the quick follow-up. `ldp` is the correct choice here; it's > a nice idea for reducing loads. > I've updated the patch to use it for loading stackaddr and stackbase. > Also added the Signed-off-by line to the commit message as requested. > > The patch is inlined below and attached. > > Inlined patch: > > From 609cc27fa50700ab135dff421f08473c29dcb533 Mon Sep 17 00:00:00 2001 > From: Thirumalai Nagalingam <thirumalai.nagalin...@multicorewareinc.com> > Date: Fri, 20 Jun 2025 02:12:51 +0530 > Subject: [PATCH] Aarch64: Add inline assembly pthread wrapper > > This patch adds AArch64-specific inline assembly block for the pthread > wrapper used to bootstrap new threads. It sets up the thread stack, > adjusts for __CYGTLS_PADSIZE__, releases the original stack via > VirtualFree, and invokes the target thread function. > > Signed-off-by: Thirumalai Nagalingam > <thirumalai.nagalin...@multicorewareinc.com> > --- > winsup/cygwin/create_posix_thread.cc | 18 +++++++++++++++++- > 1 file changed, 17 insertions(+), 1 deletion(-) > > diff --git a/winsup/cygwin/create_posix_thread.cc > b/winsup/cygwin/create_posix_thread.cc > index 3fcd61707..592aaf1a5 100644 > --- a/winsup/cygwin/create_posix_thread.cc > +++ b/winsup/cygwin/create_posix_thread.cc > @@ -75,7 +75,7 @@ pthread_wrapper (PVOID arg) > /* Initialize new _cygtls. 
*/ > _my_tls.init_thread (wrapper_arg.stackbase - __CYGTLS_PADSIZE__, > (DWORD (*)(void*, void*)) wrapper_arg.func); > -#ifdef __x86_64__ > +#if defined(__x86_64__) > __asm__ ("\n\ > leaq %[WRAPPER_ARG], %%rbx # Load &wrapper_arg into rbx \n\ > movq (%%rbx), %%r12 # Load thread func into r12 \n\ > @@ -99,6 +99,22 @@ pthread_wrapper (PVOID arg) > call *%%r12 # Call thread func \n" > : : [WRAPPER_ARG] "o" (wrapper_arg), > [CYGTLS] "i" (__CYGTLS_PADSIZE__)); > +#elif defined(__aarch64__) > + /* Sets up a new thread stack, frees the original OS stack, > + * and calls the thread function with its arg using AArch64 ABI. */ > + __asm__ __volatile__ ("\n\ > + mov x19, %[WRAPPER_ARG] // x19 = &wrapper_arg \n\ > + ldp x0, x10, [x19, #16] // x0 = stackaddr, x10 = stackbase \n\ > + sub sp, x10, %[CYGTLS] // sp = stackbase - (CYGTLS) \n\ > + mov fp, xzr // clear frame pointer (x29) \n\ > + mov x1, xzr // x1 = 0 (dwSize) \n\ > + mov x2, #0x8000 // x2 = MEM_RELEASE \n\ > + bl VirtualFree // free original stack \n\ > + ldp x19, x0, [x19] // x19 = func, x0 = arg \n\ > + blr x19 // call thread function \n" > + : : [WRAPPER_ARG] "r" (&wrapper_arg), > + [CYGTLS] "r" (__CYGTLS_PADSIZE__) > + : "x0", "x1", "x2", "x10", "x19", "x29", "memory"); > #else > #error unimplemented for this target > #endif >
LGTM. I'll wait at least a day before pushing in case somebody else has any objections.