Hi all,
attached is a work-in-progress patch to include the LWP private data
pointer in ucontext. Platforms that don't require special pmap magic
define __lwp_getprivate_fast (in machine/mcontext.h) and
__HAVE___LWP_GETPRIVATE_FAST (in machine/types.h); this is used by
libpthread and will later be used by the TLS support code in rtld and libc.

amd64, i386 and sh3 are tested and committable. This includes the change
to libpthread, which fixes the stack-related issues on these platforms.

The mcontext part (except __lwp_getprivate_fast) on the platforms without
an existing entry in mcontext needs another pass to ensure that the size
doesn't change, but is otherwise safe to commit. I don't plan to hook up
__lwp_getprivate_fast or changes like cpu_lwp_setprivate at this point.
That needs careful checking and tests by someone with
the hardware. In many cases, the thread register is currently not
updated on context switch.

Someone with ARM and M68K knowledge has to decide whether SMP support is
desirable and, if so, how to teach the pmap about a CPU-specific page.

The only platform left out is VAX since it doesn't have any space in the
mcontext and I don't know if any existing entry can be abused/reused.

Joerg
Index: src/lib/libc/arch/alpha/gen/_lwp.c
===================================================================
--- src/lib/libc/arch/alpha/gen/_lwp.c
+++ src/lib/libc/arch/alpha/gen/_lwp.c
@@ -58,6 +58,9 @@
 	gr[_REG_T12] = (unsigned long) start;
 	gr[_REG_RA] = (unsigned long) _lwp_exit;
 	gr[_REG_A0] = (unsigned long) arg;
 	gr[_REG_SP] = ((unsigned long) (stack_base + stack_size)) & ~0x7;
 	gr[_REG_S6] = 0;
+	gr[_REG_UNIQUE] = (unsigned long)private;
+
+	u->uc_flags |= _UC_UNIQUE;
 }

Index: src/lib/libc/arch/arm/gen/_lwp.c
===================================================================
--- src/lib/libc/arch/arm/gen/_lwp.c
+++ src/lib/libc/arch/arm/gen/_lwp.c
@@ -66,6 +66,8 @@
 
 	u->uc_mcontext.__gregs[_REG_R0] = (__greg_t) arg;
 	u->uc_mcontext.__gregs[_REG_SP] = ((__greg_t) sp) & ~7;
 	u->uc_mcontext.__gregs[_REG_LR] = (__greg_t) _lwp_exit;
 	u->uc_mcontext.__gregs[_REG_PC] = (__greg_t) start;
+	u->uc_mcontext._mc_tlsbase = (uintptr_t)private;
+	u->uc_flags |= _UC_TLSBASE;
 }

Index: src/lib/libc/arch/hppa/gen/_lwp.c
===================================================================
--- src/lib/libc/arch/hppa/gen/_lwp.c
+++ src/lib/libc/arch/hppa/gen/_lwp.c
@@ -67,6 +67,7 @@
 	gr[_REG_PCOQH] = fp | HPPA_PC_PRIV_USER;
 	gr[_REG_PCOQT] = (fp + 4) | HPPA_PC_PRIV_USER;
 	gr[_REG_RP] = (__greg_t) _lwp_exit;
 	gr[_REG_ARG0] = (__greg_t) arg;
 	gr[_REG_SP] = (__greg_t) sp;
+	gr[_REG_CR27] = (__greg_t) private;
 }

Index: src/lib/libc/arch/i386/gen/_lwp.c
===================================================================
--- src/lib/libc/arch/i386/gen/_lwp.c
+++ src/lib/libc/arch/i386/gen/_lwp.c
@@ -64,7 +64,8 @@
 	*--sp = (void *) _lwp_exit;
 	
 	/* LINTED uintptr_t is safe */
 	u->uc_mcontext.__gregs[_REG_UESP] = (uintptr_t) sp;
 
-	/* LINTED private is currently unused */
+	u->uc_mcontext._mc_tlsbase = (uintptr_t)private;
+	u->uc_flags |= _UC_TLSBASE;
 }

Index: src/lib/libc/arch/m68k/gen/_lwp.c
===================================================================
--- src/lib/libc/arch/m68k/gen/_lwp.c
+++ src/lib/libc/arch/m68k/gen/_lwp.c
@@ -58,6 +58,8 @@
 	
 	*--sp = arg;
 	*--sp = (void *) _lwp_exit;
 
 	u->uc_mcontext.__gregs[_REG_A7] = (int) sp;
+	u->uc_mcontext._mc_tlsbase = (uintptr_t)private;
+	u->uc_flags |= _UC_TLSBASE;
 }

Index: src/lib/libc/arch/mips/gen/_lwp.c
===================================================================
--- src/lib/libc/arch/mips/gen/_lwp.c
+++ src/lib/libc/arch/mips/gen/_lwp.c
@@ -59,6 +59,8 @@
 	gr[_REG_EPC] = (unsigned long) start;
 	gr[_REG_T9] = (unsigned long) start; /* required for .abicalls */
 	gr[_REG_RA] = (unsigned long) _lwp_exit;
 	gr[_REG_A0] = (unsigned long) arg;
 	gr[_REG_SP] = (unsigned long) sp;
+	u->uc_mcontext._mc_tlsbase = (uintptr_t)private;
+	u->uc_flags |= _UC_TLSBASE;
 }

Index: src/lib/libc/arch/powerpc/gen/_lwp.c
===================================================================
--- src/lib/libc/arch/powerpc/gen/_lwp.c
+++ src/lib/libc/arch/powerpc/gen/_lwp.c
@@ -62,6 +62,7 @@
 
 	u->uc_mcontext.__gregs[3] = (int) arg;		/* arg1 */
 	u->uc_mcontext.__gregs[1] = ((int) sp) - 12;	/* stack */
 	u->uc_mcontext.__gregs[33] = (int) _lwp_exit;	/* LR */
 	u->uc_mcontext.__gregs[34] = (int) start;	/* PC */
+	u->uc_mcontext.__gregs[_REG_R2] = (__greg_t) private;
 }

Index: src/lib/libc/arch/sh3/gen/_lwp.c
===================================================================
--- src/lib/libc/arch/sh3/gen/_lwp.c
+++ src/lib/libc/arch/sh3/gen/_lwp.c
@@ -62,6 +62,7 @@
 
 	u->uc_mcontext.__gregs[_REG_R4] = (__greg_t) arg;
 	u->uc_mcontext.__gregs[_REG_SP] = ((__greg_t) sp) & ~3;
 	u->uc_mcontext.__gregs[_REG_PR] = (__greg_t) _lwp_exit;
 	u->uc_mcontext.__gregs[_REG_PC] = (__greg_t) start;
+	u->uc_mcontext.__gregs[_REG_GBR] = (__greg_t) private;
 }

Index: src/lib/libc/arch/sparc/gen/_lwp.c
===================================================================
--- src/lib/libc/arch/sparc/gen/_lwp.c
+++ src/lib/libc/arch/sparc/gen/_lwp.c
@@ -63,11 +63,12 @@
 	gr[_REG_PC] = (ulong) start;
 	gr[_REG_nPC] = (ulong) start + 4;
 	gr[_REG_O0] = (ulong)arg;
 	gr[_REG_O6] = (ulong)sp;
 	gr[_REG_O7] = (ulong)_lwp_exit - 8;
+	gr[_REG_G7] = (ulong)private;
 
 	/* XXX: uwe: why do we need this? */
 	/* create loopback in the window save area on the stack? */
 	sp[8+6] = (ulong)sp;		/* %i6 */
 	sp[8+7] = (ulong)_lwp_exit - 8;	/* %i7 */
 }

Index: src/lib/libc/arch/sparc64/gen/_lwp.c
===================================================================
--- src/lib/libc/arch/sparc64/gen/_lwp.c
+++ src/lib/libc/arch/sparc64/gen/_lwp.c
@@ -68,6 +68,7 @@
 	gr[_REG_nPC] = (ulong) start + 4;
 
 	gr[_REG_O0] = (ulong) arg;
 	gr[_REG_O6] = (ulong) sp;
 	gr[_REG_O7] = (ulong)_lwp_exit - 8;
+	gr[_REG_G7] = (ulong)private;
 }

Index: src/lib/libc/arch/x86_64/gen/_lwp.c
===================================================================
--- src/lib/libc/arch/x86_64/gen/_lwp.c
+++ src/lib/libc/arch/x86_64/gen/_lwp.c
@@ -64,7 +64,8 @@
 	*--sp = (void *) _lwp_exit;
 	
 	/* LINTED uintptr_t is safe */
 	gr[_REG_URSP] = (uintptr_t) sp;
 
-	/* LINTED private is currently unused */
+	u->uc_mcontext._mc_tlsbase = (uintptr_t)private;
+	u->uc_flags |= _UC_TLSBASE;
 }

Index: src/lib/libpthread/arch/i386/pthread_md.h
===================================================================
--- src/lib/libpthread/arch/i386/pthread_md.h
+++ src/lib/libpthread/arch/i386/pthread_md.h
@@ -73,29 +73,13 @@
 		(ucp)->uc_mcontext.__gregs[_REG_EFL] =			\
 		    ucur.uc_mcontext.__gregs[_REG_EFL];			\
 	} while (/*CONSTCOND*/0);
 
 #define	pthread__smt_pause()	__asm __volatile("rep; nop" ::: "memory")
-/*	#define	PTHREAD__HAVE_THREADREG	*/
 
 /* Don't need additional memory barriers. */
 #define	PTHREAD__ATOMIC_IS_MEMBAR
-
-static inline pthread_t
-#ifdef __GNUC__
-__attribute__ ((__const__))
-#endif
-pthread__threadreg_get(void)
-{
-	pthread_t self;
-
-	__asm volatile("movl %%gs:0, %0"
-		: "=r" (self)
-		:);
-
-	return self;
-}
 
 static inline void *
 _atomic_cas_ptr(volatile void *ptr, void *old, void *new)
 {
 	volatile uintptr_t *cast = ptr;

Index: src/lib/libpthread/pthread_int.h
===================================================================
--- src/lib/libpthread/pthread_int.h
+++ src/lib/libpthread/pthread_int.h
@@ -248,12 +248,16 @@
 	(ucp)->uc_flags = _UC_CPU | _UC_STACK;				\
 	_INITCONTEXT_U_MD(ucp)						\
 	} while (/*CONSTCOND*/0)
 
 
-#ifdef PTHREAD__HAVE_THREADREG
-#define	pthread__self()		pthread__threadreg_get()
+#ifdef __HAVE___LWP_GETPRIVATE_FAST
+static inline pthread_t __constfunc
+pthread__self(void)
+{
+	return (pthread_t)__lwp_getprivate_fast();
+}
 #else
 /* Stack location of pointer to a particular thread */
 extern vaddr_t	pthread__mainbase;
 extern vaddr_t	pthread__mainstruct;
 static inline pthread_t

Index: src/sys/arch/alpha/alpha/machdep.c
===================================================================
--- src/sys/arch/alpha/alpha/machdep.c
+++ src/sys/arch/alpha/alpha/machdep.c
@@ -1926,16 +1926,12 @@
 		else
 			pcb->pcb_hw.apcb_usp = gr[_REG_SP];
 		frame->tf_regs[FRAME_PC] = gr[_REG_PC];
 		frame->tf_regs[FRAME_PS] = gr[_REG_PS];
 	}
-	if (flags & _UC_UNIQUE) {
-		if (l == curlwp)
-			alpha_pal_wrunique(gr[_REG_UNIQUE]);
-		else
-			pcb->pcb_hw.apcb_unique = gr[_REG_UNIQUE];
-	}
+	if (flags & _UC_UNIQUE)
+		lwp_setprivate(l, (void *)(uintptr_t)gr[_REG_UNIQUE]);
 	/* Restore floating point register context, if any. */
 	if (flags & _UC_FPU) {
 		/* If we have an FP register context, get rid of it. */
 		if (pcb->pcb_fpcpu != NULL)
 			fpusave_proc(l, 0);

Index: src/sys/arch/alpha/alpha/sys_machdep.c
===================================================================
--- src/sys/arch/alpha/alpha/sys_machdep.c
+++ src/sys/arch/alpha/alpha/sys_machdep.c
@@ -236,7 +236,9 @@
 {
 	struct pcb *pcb;
 
 	pcb = lwp_getpcb(l);
 	pcb->pcb_hw.apcb_unique = (unsigned long)addr;
+	if (l == curlwp)
+		alpha_pal_wrunique(pcb->pcb_hw.apcb_unique);
 	return 0;
 }

Index: src/sys/arch/alpha/include/mcontext.h
===================================================================
--- src/sys/arch/alpha/include/mcontext.h
+++ src/sys/arch/alpha/include/mcontext.h
@@ -98,7 +98,19 @@
 #define _UC_MACHINE_SP(uc)	((uc)->uc_mcontext.__gregs[_REG_SP])
 #define _UC_MACHINE_PC(uc)	((uc)->uc_mcontext.__gregs[_REG_PC])
 #define _UC_MACHINE_INTRV(uc)	((uc)->uc_mcontext.__gregs[_REG_V0])
 
 #define	_UC_MACHINE_SET_PC(uc, pc)	_UC_MACHINE_PC(uc) = (pc)
+
+static inline void *
+__lwp_getprivate_fast(void)
+{
+	register void *__tmp __asm("$0");
+
+	__asm volatile("call_pal %1 # PAL_rdunique"
+		: "=r" (__tmp)
+		: "i" (0x009e /* PAL_rdunique */));
+
+	return __tmp;
+}
 
 #endif	/* !_ALPHA_MCONTEXT_H_ */

Index: src/sys/arch/alpha/include/types.h
===================================================================
--- src/sys/arch/alpha/include/types.h
+++ src/sys/arch/alpha/include/types.h
@@ -74,11 +74,12 @@
 #define	__HAVE_SYSCALL_INTERN
 #define	__HAVE_MINIMAL_EMUL
 #define	__HAVE_AST_PERPROC
 #define	__HAVE_ATOMIC64_OPS
 #define	__HAVE_CPU_LWP_SETPRIVATE
+#define	__HAVE___LWP_GETPRIVATE_FAST
 
 #if defined(_KERNEL)
 #define	__HAVE_RAS
 #endif
 
 #endif	/* _MACHTYPES_H_ */

Index: src/sys/arch/amd64/amd64/machdep.c
===================================================================
--- src/sys/arch/amd64/amd64/machdep.c
+++ src/sys/arch/amd64/amd64/machdep.c
@@ -1605,10 +1605,13 @@
 	    (void *) mcp->__gregs[_REG_RIP])) != -1)
 		mcp->__gregs[_REG_RIP] = ras_rip;
 
 	*flags |= _UC_CPU;
 
+	mcp->_mc_tlsbase = (uintptr_t)l->l_private;
+	*flags |= _UC_TLSBASE;	/* marks _mc_tlsbase as valid for setmcontext */
+
 	if ((l->l_md.md_flags & MDP_USEDFPU) != 0) {
 		struct pcb *pcb = lwp_getpcb(l);
 
 		if (pcb->pcb_fpcpu) {
 			fpusave_lwp(l, true);
@@ -1671,10 +1674,13 @@
 	if ((flags & _UC_FPU) != 0) {
 		memcpy(&pcb->pcb_savefpu.fp_fxsave, mcp->__fpregs,
 		    sizeof (mcp->__fpregs));
 		l->l_md.md_flags |= MDP_USEDFPU;
 	}
+
+	if ((flags & _UC_TLSBASE) != 0)
+		lwp_setprivate(l, (void *)(uintptr_t)mcp->_mc_tlsbase);
 
 	mutex_enter(p->p_lock);
 	if (flags & _UC_SETSTACK)
 		l->l_sigstk.ss_flags |= SS_ONSTACK;
 	if (flags & _UC_CLRSTACK)

Index: src/sys/arch/amd64/amd64/netbsd32_machdep.c
===================================================================
--- src/sys/arch/amd64/amd64/netbsd32_machdep.c
+++ src/sys/arch/amd64/amd64/netbsd32_machdep.c
@@ -155,11 +155,11 @@
 
 	tf = l->l_md.md_regs;
 	tf->tf_ds = LSEL(LUDATA32_SEL, SEL_UPL);
 	tf->tf_es = LSEL(LUDATA32_SEL, SEL_UPL);
 	cpu_fsgs_zero(l);
-	cpu_fsgs_reload(l, tf->tf_ds, tf->tf_ds);
+	cpu_fsgs_reload(l, tf->tf_ds, tf->tf_es);
 	tf->tf_rdi = 0;
 	tf->tf_rsi = 0;
 	tf->tf_rbp = 0;
 	tf->tf_rbx = (uint64_t)p->p_psstr;
 	tf->tf_rdx = 0;
@@ -857,10 +857,13 @@
 		tf->tf_cs     = gr[_REG32_CS];
 		tf->tf_rsp    = gr[_REG32_UESP];
 		tf->tf_ss     = gr[_REG32_SS];
 	}
 
+	if ((flags & _UC_TLSBASE) != 0)
+		lwp_setprivate(l, (void *)(uintptr_t)mcp->_mc_tlsbase);
+
 	/* Restore floating point register context, if any. */
 	if ((flags & _UC_FPU) != 0) {
 		struct pcb *pcb = lwp_getpcb(l);
 
 		/*
@@ -868,11 +871,11 @@
 		 */
 		if (pcb->pcb_fpcpu != NULL) {
 			fpusave_lwp(l, false);
 		}
 		memcpy(&pcb->pcb_savefpu.fp_fxsave, &mcp->__fpregs,
-		    sizeof (mcp->__fpregs));
+		    sizeof (pcb->pcb_savefpu.fp_fxsave));
 		/* If not set already. */
 		l->l_md.md_flags |= MDP_USEDFPU;
 	}
 
 	mutex_enter(p->p_lock);
@@ -917,19 +920,22 @@
 	    (void *) (uintptr_t)gr[_REG32_EIP])) != -1)
 		gr[_REG32_EIP] = ras_eip;
 
 	*flags |= _UC_CPU;
 
+	mcp->_mc_tlsbase = (uint32_t)(uintptr_t)l->l_private;
+	*flags |= _UC_TLSBASE;
+
 	/* Save floating point register context, if any. */
 	if ((l->l_md.md_flags & MDP_USEDFPU) != 0) {
 		struct pcb *pcb = lwp_getpcb(l);
 
 		if (pcb->pcb_fpcpu) {
 			fpusave_lwp(l, true);
 		}
 		memcpy(&mcp->__fpregs, &pcb->pcb_savefpu.fp_fxsave,
-		    sizeof (mcp->__fpregs));
+		    sizeof (pcb->pcb_savefpu.fp_fxsave));
 		*flags |= _UC_FPU;
 	}
 }
 
 void

Index: src/sys/arch/amd64/include/mcontext.h
===================================================================
--- src/sys/arch/amd64/include/mcontext.h
+++ src/sys/arch/amd64/include/mcontext.h
@@ -60,11 +60,11 @@
  * within ucontext_t.
  */
 
 typedef struct {
 	__gregset_t	__gregs;
-	long 		__pad;
+	__greg_t	_mc_tlsbase;
 	__fpregset_t	__fpregs;
 } mcontext_t;
 
 #define _UC_UCONTEXT_ALIGN	(~0xf)
 
@@ -72,16 +72,27 @@
 #define _UC_MACHINE_PC(uc)	((uc)->uc_mcontext.__gregs[_REG_RIP])
 #define _UC_MACHINE_INTRV(uc)	((uc)->uc_mcontext.__gregs[_REG_RAX])
 
 #define	_UC_MACHINE_SET_PC(uc, pc)	_UC_MACHINE_PC(uc) = (pc)
 
+#define	_UC_TLSBASE	0x00080000
+
 /*
  * mcontext extensions to handle signal delivery.
  */
 #define _UC_SETSTACK	0x00010000
 #define _UC_CLRSTACK	0x00020000
 
+static inline void *
+__lwp_getprivate_fast(void)
+{
+	void *__tmp;
+
+	__asm volatile("movq %%fs:0, %0" : "=r" (__tmp));
+
+	return __tmp;
+}
 
 #ifdef _KERNEL
 
 /*
  * 32bit context definitions.
@@ -114,18 +125,35 @@
 #define _UC_MACHINE32_SP(uc)	((uc)->uc_mcontext.__gregs[_REG32_UESP])
 
 /*
  * Floating point register state
  */
-typedef struct fxsave64 __fpregset32_t;
+typedef struct {
+	union {
+		struct {
+			int	__fp_state[27];	/* Environment and registers */
+			int	__fp_status;	/* Software status word */
+		} __fpchip_state;
+		struct {
+			char	__fp_emul[246];
+			char	__fp_epad[2];
+		} __fp_emul_space;
+		struct {
+			char	__fp_xmm[512];
+		} __fp_xmm_state;
+		int	__fp_fpregs[128];
+	} __fp_reg_set;
+	int	__fp_wregs[33];			/* Weitek? */
+} __fpregset32_t;
 
 typedef struct {
 	__gregset32_t	__gregs;
 	__fpregset32_t	__fpregs;
+	uint32_t	_mc_tlsbase;
 } mcontext32_t;
 
-#define _UC_MACHINE_PAD32	5
+#define _UC_MACHINE_PAD32	4
 
 struct trapframe;
 struct lwp;
 int check_mcontext(struct lwp *, const mcontext_t *, struct trapframe *);
 

Index: src/sys/arch/amd64/include/types.h
===================================================================
--- src/sys/arch/amd64/include/types.h
+++ src/sys/arch/amd64/include/types.h
@@ -83,10 +83,11 @@
 #define	__HAVE_SYSCALL_INTERN
 #define	__HAVE_MINIMAL_EMUL
 #define	__HAVE_ATOMIC64_OPS
 #define	__HAVE_ATOMIC_AS_MEMBAR
 #define	__HAVE_CPU_LWP_SETPRIVATE
+#define	__HAVE___LWP_GETPRIVATE_FAST
 #define	__HAVE_INTR_CONTROL
 
 #ifdef _KERNEL_OPT
 #include "opt_xen.h"
 #define	__HAVE_RAS

Index: src/sys/arch/arm/arm/sig_machdep.c
===================================================================
--- src/sys/arch/arm/arm/sig_machdep.c
+++ src/sys/arch/arm/arm/sig_machdep.c
@@ -197,10 +197,13 @@
 #ifdef ARMFPE
 	/* Save Floating Point Register context. */
 	arm_fpe_getcontext(p, (struct fpreg *)(void *)&mcp->fpregs);
 	*flags |= _UC_FPU;
 #endif
+
+	mcp->_mc_tlsbase = (uintptr_t)l->l_private;
+	*flags |= _UC_TLSBASE;
 }
 
 int
 cpu_setmcontext(struct lwp *l, const mcontext_t *mcp, unsigned int flags)
 {
@@ -237,10 +240,13 @@
 	if ((flags & _UC_FPU) != 0) {
 		/* Restore Floating Point Register context. */
 		arm_fpe_setcontext(p, (struct fpreg *)(void *)&mcp->__fpregs);
 	}
 #endif
+
+	if ((flags & _UC_TLSBASE) != 0)
+		lwp_setprivate(l, (void *)(uintptr_t)mcp->_mc_tlsbase);
 
 	mutex_enter(p->p_lock);
 	if (flags & _UC_SETSTACK)
 		l->l_sigstk.ss_flags |= SS_ONSTACK;
 	if (flags & _UC_CLRSTACK)

Index: src/sys/arch/arm/include/mcontext.h
===================================================================
--- src/sys/arch/arm/include/mcontext.h
+++ src/sys/arch/arm/include/mcontext.h
@@ -85,23 +85,26 @@
 	__gregset_t	__gregs;
 	union {
 		__fpregset_t __fpregs;
 		__vfpregset_t __vfpregs;
 	} __fpu;
+	__greg_t	_mc_tlsbase;
 } mcontext_t;
 
 /* Machine-dependent uc_flags */
 #define	_UC_ARM_VFP	0x00010000	/* FPU field is VFP */
 
 /* used by signal delivery to indicate status of signal stack */
 #define _UC_SETSTACK	0x00020000
 #define _UC_CLRSTACK	0x00040000
 
-#define _UC_MACHINE_PAD	3		/* Padding appended to ucontext_t */
+#define	_UC_TLSBASE	0x00080000
+
+#define _UC_MACHINE_PAD	2		/* Padding appended to ucontext_t */
 
 #define _UC_MACHINE_SP(uc)	((uc)->uc_mcontext.__gregs[_REG_SP])
 #define _UC_MACHINE_PC(uc)	((uc)->uc_mcontext.__gregs[_REG_PC])
 #define _UC_MACHINE_INTRV(uc)	((uc)->uc_mcontext.__gregs[_REG_R0])
 
 #define	_UC_MACHINE_SET_PC(uc, pc)	_UC_MACHINE_PC(uc) = (pc)
 
 #endif	/* !_ARM_MCONTEXT_H_ */

Index: src/sys/arch/hppa/hppa/hppa_machdep.c
===================================================================
--- src/sys/arch/hppa/hppa/hppa_machdep.c
+++ src/sys/arch/hppa/hppa/hppa_machdep.c
@@ -186,13 +186,13 @@
 	gr[_REG_SR0] = tf->tf_sr0;
 	gr[_REG_SR1] = tf->tf_sr1;
 	gr[_REG_SR2] = tf->tf_sr2;
 	gr[_REG_SR3] = tf->tf_sr3;
 	gr[_REG_SR4] = tf->tf_sr4;
+	gr[_REG_CR27] = tf->tf_cr27;
 #if 0
 	gr[_REG_CR26] = tf->tf_cr26;
-	gr[_REG_CR27] = tf->tf_cr27;
 #endif
 
 	ras_pc = (__greg_t)ras_lookup(l->l_proc,
 	    (void *)(gr[_REG_PCOQH] & ~HPPA_PC_PRIV_MASK));
 	if (ras_pc != -1) {
@@ -301,18 +301,20 @@
 			tf->tf_iioq_tail &= ~HPPA_PC_PRIV_MASK;
 		} else {
 			tf->tf_iioq_tail |= HPPA_PC_PRIV_USER;
 		}
 
+		lwp_setprivate(l, (void *)(uintptr_t)gr[_REG_CR27]);
+		tf->tf_cr27	= gr[_REG_CR27];
+
 #if 0
 		tf->tf_sr0	= gr[_REG_SR0];
 		tf->tf_sr1	= gr[_REG_SR1];
 		tf->tf_sr2	= gr[_REG_SR2];
 		tf->tf_sr3	= gr[_REG_SR3];
 		tf->tf_sr4	= gr[_REG_SR4];
 		tf->tf_cr26	= gr[_REG_CR26];
-		tf->tf_cr27	= gr[_REG_CR27];
 #endif
 	}
 
 	if ((flags & _UC_FPU) != 0) {
 		struct pcb *pcb = lwp_getpcb(l);

Index: src/sys/arch/hppa/hppa/trap.S
===================================================================
--- src/sys/arch/hppa/hppa/trap.S
+++ src/sys/arch/hppa/hppa/trap.S
@@ -364,10 +364,13 @@
 	mfctl	%pidr4, %arg3
 	stw	%arg2, TF_CR12-TRAPFRAME_SIZEOF(%sr1, %t3)
 	stw	%arg3, TF_CR13-TRAPFRAME_SIZEOF(%sr1, %t3)
 #endif
 
+	mfctl	CR_TLS, %arg0
+	stw	%arg0, TF_CR27-TRAPFRAME_SIZEOF(%sr1, %t3)
+
 #if defined(DDB) || defined(KGDB)
 	/*
 	 * Save v2p translation table pointer
 	 */
 	mfctl	%eirr, %arg0
@@ -505,10 +508,13 @@
 	ldw	TF_CR12(%sr3, %t3), %t1
 	ldw	TF_CR13(%sr3, %t3), %t2
 	mtctl	%t1, %pidr3
 	mtctl	%t2, %pidr4
 #endif
+	ldw	TF_CR27(%sr3, %t3), %t1
+	mtctl	%t1, CR_TLS
+
 	ldw	TF_CR0(%sr3, %t3), %t1
 	mtctl	%t1, %rctr
 
 	ldw	TF_CR30(%sr3, %t3), %t1
 	mtctl	%t1, CR_FPPADDR
@@ -2085,10 +2091,13 @@
 
 	mfsp	%sr7, %t1
 	mfctl	%pidr2, %t2
 	stw	%t1, TF_SR7(%sr3, %t3)
 	stw	%t2, TF_CR9(%sr3, %t3)
+
+	mfctl	CR_TLS, %t1
+	stw	%t1, TF_CR27(%sr3, %t3)
 
 	mtsp	%r0, %sr0
 	mtsp	%r0, %sr1
 	mtsp	%r0, %sr2
 	mtsp	%r0, %sr4

Index: src/sys/arch/hppa/hppa/vm_machdep.c
===================================================================
--- src/sys/arch/hppa/hppa/vm_machdep.c
+++ src/sys/arch/hppa/hppa/vm_machdep.c
@@ -40,10 +40,11 @@
 #include <sys/vnode.h>
 #include <sys/ptrace.h>
 #include <sys/exec.h>
 #include <sys/core.h>
 #include <sys/pool.h>
+#include <sys/cpu.h>
 
 #include <machine/cpufunc.h>
 #include <machine/pmap.h>
 #include <machine/pcb.h>
 
@@ -299,6 +300,16 @@
 	pmap_remove(pmap, kva, kva + len);
 	pmap_update(pmap);
 	uvm_km_free(phys_map, kva, len, UVM_KMF_VAONLY);
 	bp->b_data = bp->b_saveaddr;
 	bp->b_saveaddr = NULL;
+}
+
+int
+cpu_lwp_setprivate(lwp_t *l, void *addr)
+{
+
+	l->l_md.md_regs->tf_cr27 = (u_int)addr;	/* saved cr27 slot in l's register frame */
+	if (l == curlwp)	/* l is the current LWP: also load CR_TLS right away */
+		mtctl(addr, CR_TLS);
+	return 0;	/* unconditionally reports success */
 }

Index: src/sys/arch/hppa/include/mcontext.h
===================================================================
--- src/sys/arch/hppa/include/mcontext.h
+++ src/sys/arch/hppa/include/mcontext.h
@@ -54,14 +54,24 @@
 #define	_UC_MACHINE_SET_PC(uc, pc)					\
 do {									\
 	(uc)->uc_mcontext.__gregs[_REG_PCOQH] = (pc);			\
 	(uc)->uc_mcontext.__gregs[_REG_PCOQT] = (pc) + 4;		\
 } while (/*CONSTCOND*/0)
+
+static inline void *
+__lwp_getprivate_fast(void)
+{
+	register void *__tmp;
+
+	__asm volatile("mfctl\t27 /* CR_TLS */, %0" : "=r" (__tmp));
+
+	return __tmp;
+}
 
 #endif /* !__ASSEMBLER__ */
 
 #define	_OFFSETOF_UC_GREGS 40
 
 #define	_UC_SETSTACK	0x00010000
 #define	_UC_CLRSTACK	0x00020000
 
 #endif /* _HPPA_MCONTEXT_H_ */

Index: src/sys/arch/hppa/include/types.h
===================================================================
--- src/sys/arch/hppa/include/types.h
+++ src/sys/arch/hppa/include/types.h
@@ -90,7 +90,10 @@
  * will construct PLABELs for them.  Make them "const char []" instead.
  */
 
 #define	RAS_DECL(name)							\
 extern const char __CONCAT(name,_ras_start[]), __CONCAT(name,_ras_end[])
+
+#define	__HAVE_CPU_LWP_SETPRIVATE
+#define	__HAVE___LWP_GETPRIVATE_FAST
 
 #endif	/* _HPPA_TYPES_H_ */

Index: src/sys/arch/i386/i386/machdep.c
===================================================================
--- src/sys/arch/i386/i386/machdep.c
+++ src/sys/arch/i386/i386/machdep.c
@@ -1753,10 +1753,13 @@
 	    (void *) gr[_REG_EIP])) != -1)
 		gr[_REG_EIP] = ras_eip;
 
 	*flags |= _UC_CPU;
 
+	mcp->_mc_tlsbase = (uintptr_t)l->l_private;
+	*flags |= _UC_TLSBASE;
+
 	/* Save floating point register context, if any. */
 	if ((l->l_md.md_flags & MDL_USEDFPU) != 0) {
 		struct pcb *pcb = lwp_getpcb(l);
 #if NNPX > 0
 
@@ -1842,10 +1845,13 @@
 		tf->tf_eip    = gr[_REG_EIP];
 		tf->tf_cs     = gr[_REG_CS];
 		tf->tf_esp    = gr[_REG_UESP];
 		tf->tf_ss     = gr[_REG_SS];
 	}
+
+	if ((flags & _UC_TLSBASE) != 0)
+		lwp_setprivate(l, (void *)(uintptr_t)mcp->_mc_tlsbase);
 
 #if NNPX > 0
 	/*
 	 * If we were using the FPU, forget that we were.
 	 */

Index: src/sys/arch/i386/include/mcontext.h
===================================================================
--- src/sys/arch/i386/include/mcontext.h
+++ src/sys/arch/i386/include/mcontext.h
@@ -37,10 +37,11 @@
  * mcontext extensions to handle signal delivery.
  */
 #define _UC_SETSTACK	0x00010000
 #define _UC_CLRSTACK	0x00020000
 #define _UC_VM		0x00040000
+#define	_UC_TLSBASE	0x00080000
 
 /*
  * Layout of mcontext_t according to the System V Application Binary Interface,
  * Intel386(tm) Architecture Processor Supplement, Fourth Edition.
  */  
@@ -94,15 +95,16 @@
 } __fpregset_t;
 
 typedef struct {
 	__gregset_t	__gregs;
 	__fpregset_t	__fpregs;
+	__greg_t	_mc_tlsbase;
 } mcontext_t;
 
 #define _UC_FXSAVE	0x20	/* FP state is in FXSAVE format in XMM space */
 
-#define _UC_MACHINE_PAD	5	/* Padding appended to ucontext_t */
+#define _UC_MACHINE_PAD	4	/* Padding appended to ucontext_t */
 
 #define _UC_UCONTEXT_ALIGN	(~0xf)
 
 #ifdef _KERNEL_OPT
 #include "opt_vm86.h"
@@ -120,7 +122,17 @@
 #endif
 #define _UC_MACHINE_PC(uc)	((uc)->uc_mcontext.__gregs[_REG_EIP])
 #define _UC_MACHINE_INTRV(uc)	((uc)->uc_mcontext.__gregs[_REG_EAX])
 
 #define	_UC_MACHINE_SET_PC(uc, pc)	_UC_MACHINE_PC(uc) = (pc)
+
+static inline void *
+__lwp_getprivate_fast(void)
+{
+	void *__tmp;
+
+	__asm volatile("movl %%gs:0, %0" : "=r" (__tmp));
+
+	return __tmp;
+}
 
 #endif	/* !_I386_MCONTEXT_H_ */

Index: src/sys/arch/i386/include/types.h
===================================================================
--- src/sys/arch/i386/include/types.h
+++ src/sys/arch/i386/include/types.h
@@ -110,11 +110,12 @@
 #define	__HAVE_OLD_DISKLABEL
 #define __HAVE_ATOMIC64_OPS
 #define	__HAVE_ATOMIC_AS_MEMBAR
 #define	__HAVE_CPU_LWP_SETPRIVATE
 #define	__HAVE_INTR_CONTROL
+#define	__HAVE___LWP_GETPRIVATE_FAST
 
 #if defined(_KERNEL)
 #define	__HAVE_RAS
 #endif
 
 #endif	/* _I386_MACHTYPES_H_ */

Index: src/sys/arch/m68k/include/mcontext.h
===================================================================
--- src/sys/arch/m68k/include/mcontext.h
+++ src/sys/arch/m68k/include/mcontext.h
@@ -77,11 +77,11 @@
 
 typedef struct {
 	__gregset_t	__gregs;	/* General Register set */
 	__fpregset_t	__fpregs;	/* Floating Point Register set */
 	union {
-		long	__mc_state[202];	/* Only need 308 bytes... */
+		long	__mc_state[201];	/* Only need 308 bytes... */
 #if defined(_KERNEL) || defined(__M68K_MCONTEXT_PRIVATE)
 		struct {
 			/* Rest of the frame. */
 			unsigned int	__mcf_format;
 			unsigned int	__mcf_vector;
@@ -90,19 +90,21 @@
 			union FPF_u1	__mcf_fpf_u1;
 			union FPF_u2	__mcf_fpf_u2;
 		} __mc_frame;
 #endif /* _KERNEL || __M68K_MCONTEXT_PRIVATE */
 	}		__mc_pad;
+	__greg_t	_mc_tlsbase;
 } mcontext_t;
 
 /* Note: no additional padding is to be performed in ucontext_t. */
 
 /* Machine-specific uc_flags value */
 #define _UC_M68K_UC_USER 0x40000000
+#define	_UC_TLSBASE	0x00080000
 
 #define _UC_MACHINE_SP(uc)	((uc)->uc_mcontext.__gregs[_REG_A7])
 #define _UC_MACHINE_PC(uc)	((uc)->uc_mcontext.__gregs[_REG_PC])
 #define _UC_MACHINE_INTRV(uc)	((uc)->uc_mcontext.__gregs[_REG_D0])
 
 #define	_UC_MACHINE_SET_PC(uc, pc)	_UC_MACHINE_PC(uc) = (pc)
 
 #endif	/* !_M68K_MCONTEXT_H_ */

Index: src/sys/arch/m68k/m68k/sig_machdep.c
===================================================================
--- src/sys/arch/m68k/m68k/sig_machdep.c
+++ src/sys/arch/m68k/m68k/sig_machdep.c
@@ -259,10 +259,13 @@
 	    (void *) gr[_REG_PC])) != -1)
 		gr[_REG_PC] = ras_pc;
 
 	*flags |= _UC_CPU;
 
+	mcp->_mc_tlsbase = (uintptr_t)l->l_private;
+	*flags |= _UC_TLSBASE;
+
 	/* Save exception frame information. */
 	mcp->__mc_pad.__mc_frame.__mcf_format = format;
 	if (format >= FMT4) {
 		mcp->__mc_pad.__mc_frame.__mcf_vector = frame->f_vector;
 		(void)memcpy(&mcp->__mc_pad.__mc_frame.__mcf_exframe,
@@ -418,10 +421,13 @@
 		 * (from the PCB) when this lwp is given the CPU.
 		 */
 		if (l == curlwp)
 			m68881_restore(fpf);
 	}
+
+	if ((flags & _UC_TLSBASE) != 0)
+		lwp_setprivate(l, (void *)(uintptr_t)mcp->_mc_tlsbase);
 
 	mutex_enter(l->l_proc->p_lock);
 	if (flags & _UC_SETSTACK)
 		l->l_sigstk.ss_flags |= SS_ONSTACK;
 	if (flags & _UC_CLRSTACK)

Index: src/sys/arch/mips/include/mcontext.h
===================================================================
--- src/sys/arch/mips/include/mcontext.h
+++ src/sys/arch/mips/include/mcontext.h
@@ -123,34 +123,38 @@
 #endif
 
 typedef struct {
 	__gregset_t	__gregs;
 	__fpregset_t	__fpregs;
+	__greg_t	_mc_tlsbase;
 } mcontext_t;
 
 #if defined(_KERNEL) && defined(_LP64)
 typedef	__int32_t	__greg32_t;
 typedef __greg32_t	__gregset32_t[_NGREG];
 
 typedef struct {
 	__gregset32_t		__gregs;
 	struct __fpregset_oabi	__fpregs;
+	__greg32_t		_mc_tlsbase;	/* 32-bit, same width as the __gregs entries */
 } mcontext_o32_t;
 
 typedef struct {
 	__gregset_t		__gregs;
 	struct __fpregset_nabi	__fpregs;
+	__greg_t		_mc_tlsbase;
 } mcontext32_t;
 
 #endif /* _KERNEL && _LP64 */
 
 #endif /* !__ASSEMBLER__ */
 
-#define _UC_MACHINE_PAD	16	/* Padding appended to ucontext_t */
+#define _UC_MACHINE_PAD	15	/* Padding appended to ucontext_t */
 
 #define	_UC_SETSTACK	0x00010000
 #define	_UC_CLRSTACK	0x00020000
+#define	_UC_TLSBASE	0x00040000
 
 #define _UC_MACHINE_SP(uc)	((uc)->uc_mcontext.__gregs[_REG_SP])
 #define _UC_MACHINE_PC(uc)	((uc)->uc_mcontext.__gregs[_REG_EPC])
 #define _UC_MACHINE_INTRV(uc)	((uc)->uc_mcontext.__gregs[_REG_V0])
 
@@ -159,7 +163,16 @@
 #define _UC_MACHINE32_SP(uc)	_UC_MACHINE_SP(uc)
 #define _UC_MACHINE32_PC(uc)	_UC_MACHINE_PC(uc)
 #define _UC_MACHINE32_INTRV(uc)	_UC_MACHINE_INTRV(uc)
 
 #define	_UC_MACHINE32_SET_PC(uc, pc)	_UC_MACHINE_PC((uc), (pc))
+
+static inline void *
+__lwp_getprivate_fast(void)
+{
+	register void *__tcb;
+
+	__asm volatile(".set push; .set mips32r2; rdhwr %0, $29; .set pop" : "=v"(__tcb));
+	return __tcb;
+}
 
 #endif	/* _MIPS_MCONTEXT_H_ */

Index: src/sys/arch/mips/include/types.h
===================================================================
--- src/sys/arch/mips/include/types.h
+++ src/sys/arch/mips/include/types.h
@@ -133,10 +133,11 @@
 #define	__HAVE_PROCESS_XFPREGS
 #define	__HAVE_CPU_DATA_FIRST
 #ifdef MIPS3_PLUS	/* XXX bogus! */
 #define	__HAVE_CPU_COUNTER
 #endif
+#define	__HAVE___LWP_GETPRIVATE_FAST
 
 #if !defined(__mips_o32)
 #define	__HAVE_ATOMIC64_OPS
 #endif
 

Index: src/sys/arch/mips/mips/mips_machdep.c
===================================================================
--- src/sys/arch/mips/mips/mips_machdep.c
+++ src/sys/arch/mips/mips/mips_machdep.c
@@ -2065,10 +2065,13 @@
 	    (void *) (intptr_t)gr[_REG_EPC])) != -1)
 		gr[_REG_EPC] = ras_pc;
 
 	*flags |= _UC_CPU;
 
+	mcp->_mc_tlsbase = (uintptr_t)l->l_private;
+	*flags |= _UC_TLSBASE;
+
 	/* Save floating point register context, if any. */
 	if (l->l_md.md_flags & MDP_FPUSED) {
 		struct pcb *pcb;
 		size_t fplen;
 
@@ -2142,10 +2145,13 @@
 		 * proper size of fpreg when copying.
 		 */
 		pcb = lwp_getpcb(l);
 		memcpy(&pcb->pcb_fpregs, &mcp->__fpregs, fplen);
 	}
+
+	if ((flags & _UC_TLSBASE) != 0)
+		lwp_setprivate(l, (void *)(uintptr_t)mcp->_mc_tlsbase);
 
 	mutex_enter(p->p_lock);
 	if (flags & _UC_SETSTACK)
 		l->l_sigstk.ss_flags |= SS_ONSTACK;
 	if (flags & _UC_CLRSTACK)

Index: src/sys/arch/mips/mips/netbsd32_machdep.c
===================================================================
--- src/sys/arch/mips/mips/netbsd32_machdep.c
+++ src/sys/arch/mips/mips/netbsd32_machdep.c
@@ -260,10 +260,12 @@
 	for (i = 0; i < __arraycount(mc.__gregs); i++)
 		mco32->__gregs[i] = mc.__gregs[i];
 	if (*flagsp & _UC_FPU)
 		memcpy(&mco32->__fpregs, &mc.__fpregs,
 		    sizeof(struct fpreg_oabi));
+	mco32->_mc_tlsbase = mc._mc_tlsbase;
+	*flagsp |= _UC_TLSBASE;	/* flagsp is this function's flag pointer (cf. the _UC_FPU test above) */
 }
 
 int
 cpu_setmcontext32(struct lwp *l, const mcontext32_t *mc32, unsigned int flags)
 {
@@ -277,10 +279,11 @@
 	for (i = 0; i < __arraycount(mc.__gregs); i++)
 		mc.__gregs[i] = mco32->__gregs[i];
 	if (flags & _UC_FPU)
 		memcpy(&mc.__fpregs, &mco32->__fpregs,
 		    sizeof(struct fpreg_oabi));
+	mc._mc_tlsbase = mco32->_mc_tlsbase;
 	return cpu_setmcontext(l, &mc, flags);
 }
 
 #ifdef COREDUMP
 /*

Index: src/sys/arch/powerpc/include/mcontext.h
===================================================================
--- src/sys/arch/powerpc/include/mcontext.h
+++ src/sys/arch/powerpc/include/mcontext.h
@@ -119,7 +119,15 @@
 #define _UC_MACHINE_SP(uc)	((uc)->uc_mcontext.__gregs[_REG_R1])
 #define _UC_MACHINE_PC(uc)	((uc)->uc_mcontext.__gregs[_REG_PC])
 #define _UC_MACHINE_INTRV(uc)	((uc)->uc_mcontext.__gregs[_REG_R3])
 
 #define	_UC_MACHINE_SET_PC(uc, pc)	_UC_MACHINE_PC(uc) = (pc)
+
+static inline void *
+__lwp_getprivate_fast(void)
+{
+	register void *__tmp __asm__("r2");
+
+	return __tmp;
+}
 
 #endif	/* !_POWERPC_MCONTEXT_H_ */

Index: src/sys/arch/powerpc/include/types.h
===================================================================
--- src/sys/arch/powerpc/include/types.h
+++ src/sys/arch/powerpc/include/types.h
@@ -76,7 +76,8 @@
 #define __HAVE_CPU_LWP_SETPRIVATE
 #define	__HAVE_CPU_DATA_FIRST
 #ifdef _LP64
 #define	__HAVE_ATOMIC64_OPS
 #endif
+#define	__HAVE___LWP_GETPRIVATE_FAST
 
 #endif	/* _MACHTYPES_H_ */

Index: src/sys/arch/sh3/include/mcontext.h
===================================================================
--- src/sys/arch/sh3/include/mcontext.h
+++ src/sys/arch/sh3/include/mcontext.h
@@ -91,7 +91,16 @@
  * Machine dependent uc_flags
  */
 #define	_UC_SETSTACK		0x10000
 #define	_UC_CLRSTACK		0x20000
 
+static inline void *
+__lwp_getprivate_fast(void)
+{
+	register void *__gbr;
+
+	__asm volatile("stc gbr, %0" : "=r" (__gbr));
+
+	return __gbr;
+}
 
 #endif /* !_SH3_MCONTEXT_H_ */

Index: src/sys/arch/sh3/include/types.h
===================================================================
--- src/sys/arch/sh3/include/types.h
+++ src/sys/arch/sh3/include/types.h
@@ -74,7 +74,10 @@
 #define	__HAVE_CPU_DATA_FIRST
 
 #if defined(_KERNEL)
 #define	__HAVE_RAS
 #endif
+
+#define	__HAVE_CPU_LWP_SETPRIVATE
+#define	__HAVE___LWP_GETPRIVATE_FAST
 
 #endif	/* !_SH3_TYPES_H_ */

Index: src/sys/arch/sh3/sh3/sh3_machdep.c
===================================================================
--- src/sys/arch/sh3/sh3/sh3_machdep.c
+++ src/sys/arch/sh3/sh3/sh3_machdep.c
@@ -86,10 +86,11 @@
 #include <sys/ras.h>
 #include <sys/sa.h>
 #include <sys/savar.h>
 #include <sys/syscallargs.h>
 #include <sys/ucontext.h>
+#include <sys/cpu.h>
 
 #ifdef KGDB
 #include <sys/kgdb.h>
 #ifndef KGDB_DEVNAME
 #define	KGDB_DEVNAME "nodev"
@@ -512,10 +513,12 @@
 		tf->tf_r3     = gr[_REG_R3];
 		tf->tf_r2     = gr[_REG_R2];
 		tf->tf_r1     = gr[_REG_R1];
 		tf->tf_r0     = gr[_REG_R0];
 		tf->tf_r15    = gr[_REG_R15];
+
+		lwp_setprivate(l, (void *)(uintptr_t)gr[_REG_GBR]);
 	}
 
 #if 0
 	/* XXX: FPU context is currently not handled by the kernel. */
 	if (flags & _UC_FPU) {
@@ -583,6 +586,14 @@
 
 #ifndef __lint__
 	goto *(void *)0xa0000000;
 #endif
 	/* NOTREACHED */
+}
+
+int
+cpu_lwp_setprivate(lwp_t *l, void *addr)
+{
+	/* Store addr as the GBR in l's md_regs frame; never fails. */
+	l->l_md.md_regs->tf_gbr = (int)addr;
+	return 0;
 }

Index: src/sys/arch/sparc/include/mcontext.h
===================================================================
--- src/sys/arch/sparc/include/mcontext.h
+++ src/sys/arch/sparc/include/mcontext.h
@@ -157,7 +157,17 @@
 #define	_UC_MACHINE_SET_PC(uc, pc)					\
 do {									\
 	(uc)->uc_mcontext.__gregs[_REG_PC] = (pc);			\
 	(uc)->uc_mcontext.__gregs[_REG_nPC] = (pc) + 4;			\
 } while (/*CONSTCOND*/0)
+
+/* Return the LWP private pointer, copied out of global register %g7. */
+static inline void *
+__lwp_getprivate_fast(void)
+{
+	void *__self;
+
+	__asm volatile("mov %%g7, %0" : "=r" (__self));
+	return __self;
+}
 
 #endif	/* !_SPARC_MCONTEXT_H_ */

Index: src/sys/arch/sparc/include/types.h
===================================================================
--- src/sys/arch/sparc/include/types.h
+++ src/sys/arch/sparc/include/types.h
@@ -125,7 +125,9 @@
 #if defined(_KERNEL)
 #define __HAVE_RAS
 #endif
 #endif
 
+#define	__HAVE_CPU_LWP_SETPRIVATE
+#define	__HAVE___LWP_GETPRIVATE_FAST
 
 #endif	/* _MACHTYPES_H_ */

Index: src/sys/arch/sparc/sparc/machdep.c
===================================================================
--- src/sys/arch/sparc/sparc/machdep.c
+++ src/sys/arch/sparc/sparc/machdep.c
@@ -775,10 +775,12 @@
 		tf->tf_out[3] = r[_REG_O3];
 		tf->tf_out[4] = r[_REG_O4];
 		tf->tf_out[5] = r[_REG_O5];
 		tf->tf_out[6] = r[_REG_O6];
 		tf->tf_out[7] = r[_REG_O7];
+
+		lwp_setprivate(l, (void *)(uintptr_t)r[_REG_G7]);
 	}
 
 #ifdef FPU_CONTEXT
 	if (flags & _UC_FPU) {
 		/*

Index: src/sys/arch/sparc/sparc/vm_machdep.c
===================================================================
--- src/sys/arch/sparc/sparc/vm_machdep.c
+++ src/sys/arch/sparc/sparc/vm_machdep.c
@@ -60,10 +60,11 @@
 #include <sys/malloc.h>
 #include <sys/buf.h>
 #include <sys/exec.h>
 #include <sys/vnode.h>
 #include <sys/simplelock.h>
+#include <sys/cpu.h>
 
 #include <uvm/uvm_extern.h>
 
 #include <machine/cpu.h>
 #include <machine/frame.h>
@@ -341,6 +342,16 @@
 
 	pcb->pcb_pc = (int)lwp_setfunc_trampoline - 8;
 	pcb->pcb_sp = (int)rp;
 	pcb->pcb_psr &= ~PSR_CWP;	/* Run in window #0 */
 	pcb->pcb_wim = 1;		/* Fence at window #1 */
+}
+
+/* Store addr in the saved %g7 of l's trapframe; always returns 0. */
+int
+cpu_lwp_setprivate(lwp_t *l, void *addr)
+{
+	struct trapframe * const frame = l->l_md.md_tf;
+
+	frame->tf_global[7] = (uintptr_t)addr;
+	return 0;
 }

Index: src/sys/arch/sparc64/sparc64/netbsd32_machdep.c
===================================================================
--- src/sys/arch/sparc64/sparc64/netbsd32_machdep.c
+++ src/sys/arch/sparc64/sparc64/netbsd32_machdep.c
@@ -1225,10 +1225,12 @@
 		tf->tf_out[4]    = (uint64_t)gr[_REG32_O4];
 		tf->tf_out[5]    = (uint64_t)gr[_REG32_O5];
 		tf->tf_out[6]    = (uint64_t)gr[_REG32_O6];
 		tf->tf_out[7]    = (uint64_t)gr[_REG32_O7];
 		/* %asi restored above; %fprs not yet supported. */
+
+		lwp_setprivate(l, (void *)(uintptr_t)gr[_REG32_G7]);
 
 		/* XXX mcp->__gwins */
 	}
 
 	/* Restore floating point register context, if any. */

Index: src/sys/arch/sparc64/sparc64/vm_machdep.c
===================================================================
--- src/sys/arch/sparc64/sparc64/vm_machdep.c
+++ src/sys/arch/sparc64/sparc64/vm_machdep.c
@@ -59,10 +59,11 @@
 #include <sys/proc.h>
 #include <sys/core.h>
 #include <sys/buf.h>
 #include <sys/exec.h>
 #include <sys/vnode.h>
+#include <sys/cpu.h>
 
 #include <uvm/uvm_extern.h>
 
 #include <machine/cpu.h>
 #include <machine/frame.h>
@@ -351,6 +352,16 @@
 {
 	struct fpstate64 *fs;
 
 	if ((fs = l->l_md.md_fpstate) != NULL)
 		pool_cache_put(fpstate_cache, fs);
+}
+
+/*
+ * Store addr in tf_global[7] of l's trapframe; always returns 0.
+ */
+int
+cpu_lwp_setprivate(lwp_t *l, void *addr)
+{
+	l->l_md.md_tf->tf_global[7] = (uintptr_t)addr;
+	return 0;
 }

Reply via email to