Re: RFC: Reducing the number of non volatile GPRs in the ppc64 kernel

2015-08-13 Thread Anton Blanchard
Hi,

Here is another instruction trace from a kernel context switch trace.
Quite a lot of register and CR save/restore code.

Regards,
Anton

c02943d8 fsnotify+0x8 mfcr r12
c02943dc fsnotify+0xc std r20,-96(r1)
c02943e0 fsnotify+0x10 std r21,-88(r1)
c02943e4 fsnotify+0x14 rldicl. r9,r4,63,63
c02943e8 fsnotify+0x18 std r22,-80(r1)
c02943ec fsnotify+0x1c mflr r0
c02943f0 fsnotify+0x20 std r24,-64(r1)
c02943f4 fsnotify+0x24 std r25,-56(r1)
c02943f8 fsnotify+0x28 std r26,-48(r1)
c02943fc fsnotify+0x2c std r27,-40(r1)
c0294400 fsnotify+0x30 std r31,-8(r1)
c0294404 fsnotify+0x34 std r15,-136(r1)
c0294408 fsnotify+0x38 stw r12,8(r1)
c029440c fsnotify+0x3c std r16,-128(r1)
c0294410 fsnotify+0x40 mcrf cr4,cr0
c0294414 fsnotify+0x44 std r0,16(r1)
c0294418 fsnotify+0x48 std r17,-120(r1)
c029441c fsnotify+0x4c std r18,-112(r1)
c0294420 fsnotify+0x50 std r19,-104(r1)
c0294424 fsnotify+0x54 std r23,-72(r1)
c0294428 fsnotify+0x58 std r28,-32(r1)
c029442c fsnotify+0x5c std r29,-24(r1)
c0294430 fsnotify+0x60 std r30,-16(r1)
c0294434 fsnotify+0x64 stdu r1,-272(r1)
c0294438 fsnotify+0x68 cmpwi   cr7,r6,1
c029443c fsnotify+0x6c rlwinm  r31,r4,4,1,31
c0294440 fsnotify+0x70 li  r9,0
c0294444 fsnotify+0x74 rotlwi  r31,r31,28
c0294448 fsnotify+0x78 mr  r24,r6
c029444c fsnotify+0x7c mr  r26,r4
c0294450 fsnotify+0x80 mr  r25,r3
c0294454 fsnotify+0x84 mr  r22,r5
c0294458 fsnotify+0x88 mr  r21,r7
c029445c fsnotify+0x8c mr  r20,r8
c0294460 fsnotify+0x90 std r9,120(r1)
c0294464 fsnotify+0x94 std r9,112(r1)
c0294468 fsnotify+0x98 clrldi  r27,r31,32
c029446c fsnotify+0x9c beq cr7,c0294888 fsnotify+0x4b8 
c0294888 fsnotify+0x4b8 ld  r29,0(r5)
c029488c fsnotify+0x4bc addi r29,r29,-32
c0294890 fsnotify+0x4c0 beq c0294478 fsnotify+0xa8 
c0294478 fsnotify+0xa8 lwz r9,516(r25)
c029447c fsnotify+0xac and r10,r9,r31
c0294480 fsnotify+0xb0 cmpwi   r10,0
c0294484 fsnotify+0xb4 bne c02945d0 fsnotify+0x200 
c0294488 fsnotify+0xb8 cmpdi   cr7,r29,0
c029448c fsnotify+0xbc beq cr7,c02948c4 fsnotify+0x4f4 
c0294490 fsnotify+0xc0 lwz r9,264(r29)
c0294494 fsnotify+0xc4 and r10,r9,r31
c0294498 fsnotify+0xc8 cmpwi   r10,0
c029449c fsnotify+0xcc beq c02948c4 fsnotify+0x4f4 
c02948c4 fsnotify+0x4f4 li  r3,0
c02948c8 fsnotify+0x4f8 b   c02947cc fsnotify+0x3fc 
c02947cc fsnotify+0x3fc addi r1,r1,272
c02947d0 fsnotify+0x400 ld  r0,16(r1)
c02947d4 fsnotify+0x404 lwz r12,8(r1)
c02947d8 fsnotify+0x408 ld  r15,-136(r1)
c02947dc fsnotify+0x40c ld  r16,-128(r1)
c02947e0 fsnotify+0x410 mtlr r0
c02947e4 fsnotify+0x414 ld  r17,-120(r1)
c02947e8 fsnotify+0x418 ld  r18,-112(r1)
c02947ec fsnotify+0x41c mtocrf  32,r12
c02947f0 fsnotify+0x420 mtocrf  16,r12
c02947f4 fsnotify+0x424 mtocrf  8,r12
c02947f8 fsnotify+0x428 ld  r19,-104(r1)
c02947fc fsnotify+0x42c ld  r20,-96(r1)
c0294800 fsnotify+0x430 ld  r21,-88(r1)
c0294804 fsnotify+0x434 ld  r22,-80(r1)
c0294808 fsnotify+0x438 ld  r23,-72(r1)
c029480c fsnotify+0x43c ld  r24,-64(r1)
c0294810 fsnotify+0x440 ld  r25,-56(r1)
c0294814 fsnotify+0x444 ld  r26,-48(r1)
c0294818 fsnotify+0x448 ld  r27,-40(r1)
c029481c fsnotify+0x44c ld  r28,-32(r1)
c0294820 fsnotify+0x450 ld  r29,-24(r1)
c0294824 fsnotify+0x454 ld  r30,-16(r1)
c0294828 fsnotify+0x458 ld  r31,-8(r1)
c029482c fsnotify+0x45c blr
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: RFC: Reducing the number of non volatile GPRs in the ppc64 kernel

2015-08-13 Thread Michael Ellerman
On Wed, 2015-08-05 at 14:03 +1000, Anton Blanchard wrote:
> Hi,
>
> While looking at traces of kernel workloads, I noticed places where gcc
> used a large number of non volatiles. Some of these functions
> did very little work, and we spent most of our time saving the
> non volatiles to the stack and reading them back.
>
> It made me wonder if we have the right ratio of volatile to non
> volatile GPRs. Since the kernel is completely self contained, we could
> potentially change that ratio.
>
> Attached is a quick hack to gcc and the kernel to decrease the number
> of non volatile GPRs to 8. I'm not sure if this is a good idea (and if
> the volatile to non volatile ratio is right), but this gives us
> something to play with.

OK, interesting idea. Can't say I'd ever thought of that.

I'm thinking we'd want some pretty solid analysis of the resulting code-gen and
real world perf before we made a switch like that.

Presumably it's going to hurt our null syscall, due to the added save/restores,
but hopefully help with paths that do actual work.

If the caller is actually using the non-volatiles then presumably it will be a
wash, because the caller will have to do the save anyway. Though maybe it would
still be a win because the caller can do the saves & restores when it needs to
rather than all in a block.

I'm also not clear on how it would affect folks who build modules separate from
the kernel. We'd have to make sure they had the right GCC, or things would go
badly wrong, unless it can be done with command line flags? I don't know how
much we care about that but distros presumably do.

cheers


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: RFC: Reducing the number of non volatile GPRs in the ppc64 kernel

2015-08-11 Thread Segher Boessenkool
On Mon, Aug 10, 2015 at 02:52:28PM +1000, Anton Blanchard wrote:
> Hi Bill, Segher,
>
> > I agree with Segher.  We already know we have opportunities to do a
> > better job with shrink-wrapping (pushing this kind of useless
> > activity down past early exits), so having examples of code to look
> > at to improve this would be useful.
>
> I'll look out for specific examples. I noticed this one today when
> analysing malloc(8). It is an instruction trace of _int_malloc().
>
> The overall function is pretty huge, which I assume leads to gcc using
> so many non volatiles.

That is one part of it; also GCC deals out volatiles too generously.

> Perhaps in this case we should separate out the
> slow path into another function marked noinline.

Or GCC could do that, effectively at least.

> This is just an upstream glibc build, but I'll send the preprocessed
> source off list.

Thanks :-)

[snip code]

After the prologue there are 46 insns executed before the epilogue.
Many of those are conditional branches (that are not executed); it is
all fall-through until it jumps to the tail (the few insns before
the epilogue).  GCC knows how to duplicate a tail so that it can do
shrink-wrapping (the original tail needs to be followed by an epilogue,
the duplicated one does not want one); but it can only do it in very
simple cases (one basic block or at least no control flow), and that
is not the case here.  We need to handle more generic tails.

This seems related to (if not the same as!) http://gcc.gnu.org/PR51982.


Segher
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: RFC: Reducing the number of non volatile GPRs in the ppc64 kernel

2015-08-11 Thread Segher Boessenkool
On Tue, Aug 11, 2015 at 03:08:29PM -0500, Segher Boessenkool wrote:
> [snip code]
>
> After the prologue there are 46 insns executed before the epilogue.
> Many of those are conditional branches (that are not executed); it is
> all fall-through until it jumps to the tail (the few insns before
> the epilogue).  GCC knows how to duplicate a tail so that it can do
> shrink-wrapping (the original tail needs to be followed by an epilogue,
> the duplicated one does not want one); but it can only do it in very
> simple cases (one basic block or at least no control flow), and that
> is not the case here.  We need to handle more generic tails.

And never mind the elephant in the room: the fastpath instructions
already use a few non-volatile registers, and the shrink-wrap pass
(which runs after register allocation) cannot fix that.  Ugh.

> This seems related to (if not the same as!) http://gcc.gnu.org/PR51982.

This has that same problem, too.


Segher
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: RFC: Reducing the number of non volatile GPRs in the ppc64 kernel

2015-08-09 Thread Anton Blanchard
Hi Bill, Segher,

> I agree with Segher.  We already know we have opportunities to do a
> better job with shrink-wrapping (pushing this kind of useless
> activity down past early exits), so having examples of code to look
> at to improve this would be useful.

I'll look out for specific examples. I noticed this one today when
analysing malloc(8). It is an instruction trace of _int_malloc().

The overall function is pretty huge, which I assume leads to gcc using
so many non volatiles. Perhaps in this case we should separate out the
slow path into another function marked noinline.

This is just an upstream glibc build, but I'll send the preprocessed
source off list.

Anton
--

0x410d538   mflr    r0
0x410d53c   li  r9,-65
0x410d540   std r14,-144(r1) # 0x000fff00efe0
0x410d544   std r15,-136(r1) # 0x000fff00efe8
0x410d548   cmpld   cr7,r4,r9
0x410d54c   std r16,-128(r1) # 0x000fff00eff0
0x410d550   std r17,-120(r1) # 0x000fff00eff8
0x410d554   std r18,-112(r1) # 0x000fff00f000
0x410d558   std r19,-104(r1) # 0x000fff00f008
0x410d55c   std r20,-96(r1)  # 0x000fff00f010
0x410d560   std r21,-88(r1)  # 0x000fff00f018
0x410d564   std r22,-80(r1)  # 0x000fff00f020
0x410d568   std r23,-72(r1)  # 0x000fff00f028
0x410d56c   std r0,16(r1)# 0x000fff00f080
0x410d570   std r24,-64(r1)  # 0x000fff00f030
0x410d574   std r25,-56(r1)  # 0x000fff00f038
0x410d578   std r26,-48(r1)  # 0x000fff00f040
0x410d57c   std r27,-40(r1)  # 0x000fff00f048
0x410d580   std r28,-32(r1)  # 0x000fff00f050
0x410d584   std r29,-24(r1)  # 0x000fff00f058
0x410d588   std r30,-16(r1)  # 0x000fff00f060
0x410d58c   std r31,-8(r1)   # 0x000fff00f068
0x410d590   stdu    r1,-224(r1)  # 0x000fff00ef90
0x410d594   bgt cr7,0x410dda4
0x410d598   addi    r9,r4,23
0x410d59c   li  r16,32
0x410d5a0   cmpldi  cr7,r9,31
0x410d5a4   bgt cr7,0x410d700
0x410d5a8   cmpdi   cr7,r3,0
0x410d5ac   mr  r14,r3
0x410d5b0   mr  r30,r4
0x410d5b4   beq cr7,0x410ddc0
0x410d5b8   nop
0x410d5bc   ld  r9,-19136(r2)# 0x04222840
0x410d5c0   rlwinm  r29,r16,28,4,31
0x410d5c4   cmpld   cr7,r16,r9
0x410d5c8   bgt cr7,0x410d650
0x410d5cc   addi    r6,r29,-2
0x410d5d0   clrldi  r9,r6,32
0x410d5d4   rldicr  r10,r9,3,60
0x410d5d8   addi    r7,r9,1
0x410d5dc   add r10,r3,r10
0x410d5e0   rldicr  r7,r7,3,60
0x410d5e4   add r7,r3,r7
0x410d5e8   ld  r9,8(r10)# 0x04220ce0
0x410d5ec   cmpdi   cr7,r9,0
0x410d5f0   beq cr7,0x410d650
0x410d5f4   ld  r10,16(r9)   # 0x10030010
0x410d5f8   ldarx   r15,0,r7,1   # 0x04220ce0
0x410d5fc   cmpd    r15,r9
0x410d600   bne 0x410d60c
0x410d604   stdcx.  r10,0,r7 # 0x04220ce0
0x410d608   bne-    0x410d5f8
0x410d60c   isync
0x410d610   cmpld   cr7,r15,r9
0x410d614   bne cr7,0x410d648
0x410d618   b   0x410da40
0x410da40   ld  r9,8(r15)# 0x10030008
0x410da44   rlwinm  r9,r9,28,4,31
0x410da48   addi    r9,r9,-2
0x410da4c   cmplw   cr7,r9,r6
0x410da50   bne cr7,0x410de08
0x410da54   nop
0x410da58   addi    r31,r15,16
0x410da5c   lwa r9,-19080(r2)# 0x04222878
0x410da60   cmpdi   cr7,r9,0
0x410da64   bne cr7,0x410d6e4
0x410da68   addi    r1,r1,224
0x410da6c   mr  r3,r31
0x410da70   ld  r0,16(r1)# 0x000fff00f080
0x410da74   ld  r14,-144(r1) # 0x000fff00efe0
0x410da78   ld  r15,-136(r1) # 0x000fff00efe8
0x410da7c   ld  r16,-128(r1) # 0x000fff00eff0
0x410da80   ld  r17,-120(r1) # 0x000fff00eff8
0x410da84   ld  r18,-112(r1) # 0x000fff00f000
0x410da88   ld  r19,-104(r1) # 0x000fff00f008
0x410da8c   ld  r20,-96(r1)  # 0x000fff00f010
0x410da90   ld  r21,-88(r1)  # 0x000fff00f018
0x410da94   ld  r22,-80(r1)  # 0x000fff00f020
0x410da98   ld  r23,-72(r1)  # 0x000fff00f028
0x410da9c   ld  r24,-64(r1)  # 0x000fff00f030
0x410daa0   mtlr    r0
0x410da70   ld  r0,16(r1)# 0x000fff00f080
0x410da74   ld  r14,-144(r1) # 0x000fff00efe0
0x410da78   ld  r15,-136(r1) # 0x000fff00efe8
0x410da7c   ld  r16,-128(r1) # 0x000fff00eff0
0x410da80   ld  r17,-120(r1) # 0x000fff00eff8
0x410da84   ld  r18,-112(r1) # 0x000fff00f000
0x410da88   ld  r19,-104(r1) # 0x000fff00f008
0x410da8c   ld  r20,-96(r1)  

Re: RFC: Reducing the number of non volatile GPRs in the ppc64 kernel

2015-08-07 Thread Bill Schmidt

I agree with Segher.  We already know we have opportunities to do a better
job with shrink-wrapping (pushing this kind of useless activity down past
early exits), so having examples of code to look at to improve this would
be useful.

-- Bill

Bill Schmidt, Ph.D.
Linux on Power Toolchain
IBM Linux Technology Center
wschm...@us.ibm.com   (507) 319-6873




From:   Segher Boessenkool seg...@kernel.crashing.org
To: Anton Blanchard an...@samba.org
Cc: linuxppc-dev@lists.ozlabs.org, Michael
Gschwind/Watson/IBM@IBMUS, Alan Modra amo...@gmail.com, Bill
Schmidt/Rochester/IBM@IBMUS, Ulrich Weigand
ulrich.weig...@de.ibm.com, pau...@samba.org
Date:   08/05/2015 06:20 AM
Subject:Re: RFC: Reducing the number of non volatile GPRs in the ppc64
kernel



Hi Anton,

On Wed, Aug 05, 2015 at 02:03:00PM +1000, Anton Blanchard wrote:
> While looking at traces of kernel workloads, I noticed places where gcc
> used a large number of non volatiles. Some of these functions
> did very little work, and we spent most of our time saving the
> non volatiles to the stack and reading them back.

That is something that should be fixed in GCC -- do you have an example
of such a function?

> It made me wonder if we have the right ratio of volatile to non
> volatile GPRs. Since the kernel is completely self contained, we could
> potentially change that ratio.
>
> Attached is a quick hack to gcc and the kernel to decrease the number
> of non volatile GPRs to 8. I'm not sure if this is a good idea (and if
> the volatile to non volatile ratio is right), but this gives us
> something to play with.

Instead of the GCC hack you can add a bunch of -fcall-used-r14 etc.
options; does that not work for you?


Segher


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: RFC: Reducing the number of non volatile GPRs in the ppc64 kernel

2015-08-04 Thread Segher Boessenkool
Hi Anton,

On Wed, Aug 05, 2015 at 02:03:00PM +1000, Anton Blanchard wrote:
> While looking at traces of kernel workloads, I noticed places where gcc
> used a large number of non volatiles. Some of these functions
> did very little work, and we spent most of our time saving the
> non volatiles to the stack and reading them back.

That is something that should be fixed in GCC -- do you have an example
of such a function?

> It made me wonder if we have the right ratio of volatile to non
> volatile GPRs. Since the kernel is completely self contained, we could
> potentially change that ratio.
>
> Attached is a quick hack to gcc and the kernel to decrease the number
> of non volatile GPRs to 8. I'm not sure if this is a good idea (and if
> the volatile to non volatile ratio is right), but this gives us
> something to play with.

Instead of the GCC hack you can add a bunch of -fcall-used-r14 etc.
options; does that not work for you?


Segher
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

RFC: Reducing the number of non volatile GPRs in the ppc64 kernel

2015-08-04 Thread Anton Blanchard
Hi,

While looking at traces of kernel workloads, I noticed places where gcc
used a large number of non volatiles. Some of these functions
did very little work, and we spent most of our time saving the
non volatiles to the stack and reading them back.

It made me wonder if we have the right ratio of volatile to non
volatile GPRs. Since the kernel is completely self contained, we could
potentially change that ratio.

Attached is a quick hack to gcc and the kernel to decrease the number
of non volatile GPRs to 8. I'm not sure if this is a good idea (and if
the volatile to non volatile ratio is right), but this gives us
something to play with.

Anton

powerpc: Reduce the number of non volatiles GPRs to 8

This requires a hacked gcc.

Signed-off-by: Anton Blanchard an...@samba.org
--

Index: linux.junk/arch/powerpc/include/asm/exception-64s.h
===
--- linux.junk.orig/arch/powerpc/include/asm/exception-64s.h
+++ linux.junk/arch/powerpc/include/asm/exception-64s.h
@@ -336,6 +336,7 @@ do_kvm_##n:\
 	std	r2,GPR2(r1);		/* save r2 in stackframe	*/ \
 	SAVE_4GPRS(3, r1);		/* save r3 - r6 in stackframe   */ \
 	SAVE_2GPRS(7, r1);		/* save r7, r8 in stackframe	*/ \
+	SAVE_10GPRS(14, r1);		   \
 	mflr	r9;			/* Get LR, later save to stack	*/ \
 	ld	r2,PACATOC(r13);	/* get kernel TOC into r2	*/ \
 	std	r9,_LINK(r1);		   \
Index: linux.junk/arch/powerpc/include/asm/ppc_asm.h
===
--- linux.junk.orig/arch/powerpc/include/asm/ppc_asm.h
+++ linux.junk/arch/powerpc/include/asm/ppc_asm.h
@@ -77,8 +77,8 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLP
 #ifdef __powerpc64__
 #define SAVE_GPR(n, base)	std	n,GPR0+8*(n)(base)
 #define REST_GPR(n, base)	ld	n,GPR0+8*(n)(base)
-#define SAVE_NVGPRS(base)	SAVE_8GPRS(14, base); SAVE_10GPRS(22, base)
-#define REST_NVGPRS(base)	REST_8GPRS(14, base); REST_10GPRS(22, base)
+#define SAVE_NVGPRS(base)	SAVE_8GPRS(24, base)
+#define REST_NVGPRS(base)	REST_8GPRS(24, base)
 #else
 #define SAVE_GPR(n, base)	stw	n,GPR0+4*(n)(base)
 #define REST_GPR(n, base)	lwz	n,GPR0+4*(n)(base)
Index: linux.junk/arch/powerpc/kernel/asm-offsets.c
===
--- linux.junk.orig/arch/powerpc/kernel/asm-offsets.c
+++ linux.junk/arch/powerpc/kernel/asm-offsets.c
@@ -289,7 +289,6 @@ int main(void)
 	DEFINE(GPR11, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, gpr[11]));
 	DEFINE(GPR12, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, gpr[12]));
 	DEFINE(GPR13, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, gpr[13]));
-#ifndef CONFIG_PPC64
 	DEFINE(GPR14, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, gpr[14]));
 	DEFINE(GPR15, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, gpr[15]));
 	DEFINE(GPR16, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, gpr[16]));
@@ -308,7 +307,6 @@ int main(void)
 	DEFINE(GPR29, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, gpr[29]));
 	DEFINE(GPR30, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, gpr[30]));
 	DEFINE(GPR31, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, gpr[31]));
-#endif /* CONFIG_PPC64 */
 	/*
 	 * Note: these symbols include _ because they overlap with special
 	 * register names
Index: linux.junk/arch/powerpc/kernel/entry_64.S
===
--- linux.junk.orig/arch/powerpc/kernel/entry_64.S
+++ linux.junk/arch/powerpc/kernel/entry_64.S
@@ -86,6 +86,18 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
 	std	r11,_XER(r1)
 	std	r11,_CTR(r1)
 	std	r9,GPR13(r1)
+
+	std	r14,GPR14(r1)
+	std	r15,GPR15(r1)
+	std	r16,GPR16(r1)
+	std	r17,GPR17(r1)
+	std	r18,GPR18(r1)
+	std	r19,GPR19(r1)
+	std	r20,GPR20(r1)
+	std	r21,GPR21(r1)
+	std	r22,GPR22(r1)
+	std	r23,GPR23(r1)
+
 	mflr	r10
 	/*
 	 * This clears CR0.SO (bit 28), which is the error indication on
@@ -112,6 +124,7 @@ BEGIN_FW_FTR_SECTION
 	cmpd	cr1,r11,r10
 	beq+	cr1,33f
 	bl	accumulate_stolen_time
+	trap
 	REST_GPR(0,r1)
 	REST_4GPRS(3,r1)
 	REST_2GPRS(7,r1)
@@ -225,7 +238,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECK
 	ACCOUNT_CPU_USER_EXIT(r11, r12)
 	HMT_MEDIUM_LOW_HAS_PPR
 	ld	r13,GPR13(r1)	/* only restore r13 if returning to usermode */
-1:	ld	r2,GPR2(r1)
+1:
+	REST_10GPRS(14, r1)
+	ld	r2,GPR2(r1)
 	ld	r1,GPR1(r1)
 	mtlr	r4
 	mtcr	r5
@@ -405,10 +420,10 @@ _GLOBAL(ret_from_fork)
 _GLOBAL(ret_from_kernel_thread)
 	bl	schedule_tail
 	REST_NVGPRS(r1)
-	mtlr	r14
-	mr	r3,r15
+	mtlr	r24
+	mr	r3,r25
 #if defined(_CALL_ELF) && _CALL_ELF == 2
-	mr	r12,r14
+	mr	r12,r24
 #endif
 	blrl
 	li	r3,0
@@ -540,8 +555,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEG
 	mtcrf	0xFF,r6
 
 	/* r3-r13 are destroyed -- Cort */
-	REST_8GPRS(14, r1)
-	REST_10GPRS(22, r1)
+	REST_8GPRS(24, r1)
 
 	/* convert old thread to its task_struct for return value */
 	addi	r3,r3,-THREAD
@@ -771,6 +785,7 @@ fast_exception_return:
 	mtspr	SPRN_XER,r4
 
 	REST_8GPRS(5, r1)
+	REST_10GPRS(14, r1)
 
 	andi.	r0,r3,MSR_RI
 	beq-	unrecov_restore
Index: