Re: [Xen-devel] [PATCH v1 01/27] x86/crypto: Adapt assembly for PIE support
On Fri, Oct 20, 2017 at 1:28 AM, Ard Biesheuvel wrote: > On 20 October 2017 at 09:24, Ingo Molnar wrote: >> >> * Thomas Garnier wrote: >> >>> Change the assembly code to use only relative references of symbols for the >>> kernel to be PIE compatible. >>> >>> Position Independent Executable (PIE) support will allow to extended the >>> KASLR randomization range below the -2G memory limit. >> >>> diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S >>> b/arch/x86/crypto/aes-x86_64-asm_64.S >>> index 8739cf7795de..86fa068e5e81 100644 >>> --- a/arch/x86/crypto/aes-x86_64-asm_64.S >>> +++ b/arch/x86/crypto/aes-x86_64-asm_64.S >>> @@ -48,8 +48,12 @@ >>> #define R10 %r10 >>> #define R11 %r11 >>> >>> +/* Hold global for PIE suport */ >>> +#define RBASE%r12 >>> + >>> #define prologue(FUNC,KEY,B128,B192,r1,r2,r5,r6,r7,r8,r9,r10,r11) \ >>> ENTRY(FUNC);\ >>> + pushq RBASE; \ >>> movqr1,r2; \ >>> leaqKEY+48(r8),r9; \ >>> movqr10,r11;\ >>> @@ -74,54 +78,63 @@ >>> movlr6 ## E,4(r9); \ >>> movlr7 ## E,8(r9); \ >>> movlr8 ## E,12(r9); \ >>> + popqRBASE; \ >>> ret;\ >>> ENDPROC(FUNC); >>> >>> +#define round_mov(tab_off, reg_i, reg_o) \ >>> + leaqtab_off(%rip), RBASE; \ >>> + movl(RBASE,reg_i,4), reg_o; >>> + >>> +#define round_xor(tab_off, reg_i, reg_o) \ >>> + leaqtab_off(%rip), RBASE; \ >>> + xorl(RBASE,reg_i,4), reg_o; >>> + >>> #define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \ >>> movzbl r2 ## H,r5 ## E;\ >>> movzbl r2 ## L,r6 ## E;\ >>> - movlTAB+1024(,r5,4),r5 ## E;\ >>> + round_mov(TAB+1024, r5, r5 ## E)\ >>> movwr4 ## X,r2 ## X;\ >>> - movlTAB(,r6,4),r6 ## E; \ >>> + round_mov(TAB, r6, r6 ## E) \ >>> roll$16,r2 ## E;\ >>> shrl$16,r4 ## E;\ >>> movzbl r4 ## L,r7 ## E;\ >>> movzbl r4 ## H,r4 ## E;\ >>> xorlOFFSET(r8),ra ## E; \ >>> xorlOFFSET+4(r8),rb ## E; \ >>> - xorlTAB+3072(,r4,4),r5 ## E;\ >>> - xorlTAB+2048(,r7,4),r6 ## E;\ >>> + round_xor(TAB+3072, r4, r5 ## E)\ >>> + round_xor(TAB+2048, r7, r6 ## E)\ >>> movzbl r1 ## L,r7 ## E;\ >>> movzbl r1 ## H,r4 ## E;\ >>> 
- movlTAB+1024(,r4,4),r4 ## E;\ >>> + round_mov(TAB+1024, r4, r4 ## E)\ >>> movwr3 ## X,r1 ## X;\ >>> roll$16,r1 ## E;\ >>> shrl$16,r3 ## E;\ >>> - xorlTAB(,r7,4),r5 ## E; \ >>> + round_xor(TAB, r7, r5 ## E) \ >>> movzbl r3 ## L,r7 ## E;\ >>> movzbl r3 ## H,r3 ## E;\ >>> - xorlTAB+3072(,r3,4),r4 ## E;\ >>> - xorlTAB+2048(,r7,4),r5 ## E;\ >>> + round_xor(TAB+3072, r3, r4 ## E)\ >>> + round_xor(TAB+2048, r7, r5 ## E)\ >>> movzbl r1 ## L,r7 ## E;\ >>> movzbl r1 ## H,r3 ## E;\ >>> shrl$16,r1 ## E;\ >>> - xorlTAB+3072(,r3,4),r6 ## E;\ >>> - movlTAB+2048(,r7,4),r3 ## E;\ >>> + round_xor(TAB+3072, r3, r6 ## E)\ >>> + round_mov(TAB+2048, r7, r3 ## E)\ >>> movzbl r1 ## L,r7 ## E;\ >>> movzbl r1 ## H,r1 ## E;\ >>> - xorlTAB+1024(,r1,4),r6 ## E;\ >>> - xorlTAB(,r7,4),r3 ## E; \ >>> + round_xor(TAB+1024, r1, r6 ## E)\ >>> + round_xor(TAB, r7, r3 ## E) \ >>> movzbl r2 ## H,r1 ## E;\ >>> movzbl r2 ## L,r7 ## E;\ >>> shrl$16,r2 ## E;\ >>> - xorlTAB+3072(,r1,4),r3 ## E;\ >>> - xorlTAB+2048(,r7,4),r4 ## E;\ >>> + round_xor(TAB+3072, r1, r3 ## E)\ >>> + round_xor(TAB+2048, r7, r4 ## E)\ >>> movzbl r2 ## H,r1 ## E;\ >>> movzbl r2 ## L,r2 ## E;\ >>> xorlOFFSET+8(r8),rc ## E; \ >>> xorlOFFSET+12(r8),rd ## E; \ >>> - xorlTAB+1024(,r1,4),r3 ## E;\ >>> - xorlTAB(,r2,4),r4 ## E; >>> + round_xor(TAB+1024, r1, r3 ## E)\ >>> + round_xor(TAB, r2, r4 ## E) >> >> This appears to be adding unconditional overhead to a function that was >> moved to >> assembly to improve its performance. >> It adds couple extra instructions, how much overhead it creates is hard for me to tell. It would increase the code complexity if everything is ifdef. > > I did some benchmarking on this code a while ago and, interestingly, > it was slower than the generic C implementation (on a Pentium E2200), > so we may want to consider whether we still need this driver in the > first place. Interesting. -- Thomas ___ Xen-devel mailing list
Re: [Xen-devel] [PATCH v1 01/27] x86/crypto: Adapt assembly for PIE support
On 20 October 2017 at 09:24, Ingo Molnar wrote: > > * Thomas Garnier wrote: > >> Change the assembly code to use only relative references of symbols for the >> kernel to be PIE compatible. >> >> Position Independent Executable (PIE) support will allow to extended the >> KASLR randomization range below the -2G memory limit. > >> diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S >> b/arch/x86/crypto/aes-x86_64-asm_64.S >> index 8739cf7795de..86fa068e5e81 100644 >> --- a/arch/x86/crypto/aes-x86_64-asm_64.S >> +++ b/arch/x86/crypto/aes-x86_64-asm_64.S >> @@ -48,8 +48,12 @@ >> #define R10 %r10 >> #define R11 %r11 >> >> +/* Hold global for PIE suport */ >> +#define RBASE%r12 >> + >> #define prologue(FUNC,KEY,B128,B192,r1,r2,r5,r6,r7,r8,r9,r10,r11) \ >> ENTRY(FUNC);\ >> + pushq RBASE; \ >> movqr1,r2; \ >> leaqKEY+48(r8),r9; \ >> movqr10,r11;\ >> @@ -74,54 +78,63 @@ >> movlr6 ## E,4(r9); \ >> movlr7 ## E,8(r9); \ >> movlr8 ## E,12(r9); \ >> + popqRBASE; \ >> ret;\ >> ENDPROC(FUNC); >> >> +#define round_mov(tab_off, reg_i, reg_o) \ >> + leaqtab_off(%rip), RBASE; \ >> + movl(RBASE,reg_i,4), reg_o; >> + >> +#define round_xor(tab_off, reg_i, reg_o) \ >> + leaqtab_off(%rip), RBASE; \ >> + xorl(RBASE,reg_i,4), reg_o; >> + >> #define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \ >> movzbl r2 ## H,r5 ## E;\ >> movzbl r2 ## L,r6 ## E;\ >> - movlTAB+1024(,r5,4),r5 ## E;\ >> + round_mov(TAB+1024, r5, r5 ## E)\ >> movwr4 ## X,r2 ## X;\ >> - movlTAB(,r6,4),r6 ## E; \ >> + round_mov(TAB, r6, r6 ## E) \ >> roll$16,r2 ## E;\ >> shrl$16,r4 ## E;\ >> movzbl r4 ## L,r7 ## E;\ >> movzbl r4 ## H,r4 ## E;\ >> xorlOFFSET(r8),ra ## E; \ >> xorlOFFSET+4(r8),rb ## E; \ >> - xorlTAB+3072(,r4,4),r5 ## E;\ >> - xorlTAB+2048(,r7,4),r6 ## E;\ >> + round_xor(TAB+3072, r4, r5 ## E)\ >> + round_xor(TAB+2048, r7, r6 ## E)\ >> movzbl r1 ## L,r7 ## E;\ >> movzbl r1 ## H,r4 ## E;\ >> - movlTAB+1024(,r4,4),r4 ## E;\ >> + round_mov(TAB+1024, r4, r4 ## E)\ >> movwr3 ## X,r1 ## X;\ >> roll$16,r1 ## E;\ >> 
shrl$16,r3 ## E;\ >> - xorlTAB(,r7,4),r5 ## E; \ >> + round_xor(TAB, r7, r5 ## E) \ >> movzbl r3 ## L,r7 ## E;\ >> movzbl r3 ## H,r3 ## E;\ >> - xorlTAB+3072(,r3,4),r4 ## E;\ >> - xorlTAB+2048(,r7,4),r5 ## E;\ >> + round_xor(TAB+3072, r3, r4 ## E)\ >> + round_xor(TAB+2048, r7, r5 ## E)\ >> movzbl r1 ## L,r7 ## E;\ >> movzbl r1 ## H,r3 ## E;\ >> shrl$16,r1 ## E;\ >> - xorlTAB+3072(,r3,4),r6 ## E;\ >> - movlTAB+2048(,r7,4),r3 ## E;\ >> + round_xor(TAB+3072, r3, r6 ## E)\ >> + round_mov(TAB+2048, r7, r3 ## E)\ >> movzbl r1 ## L,r7 ## E;\ >> movzbl r1 ## H,r1 ## E;\ >> - xorlTAB+1024(,r1,4),r6 ## E;\ >> - xorlTAB(,r7,4),r3 ## E; \ >> + round_xor(TAB+1024, r1, r6 ## E)\ >> + round_xor(TAB, r7, r3 ## E) \ >> movzbl r2 ## H,r1 ## E;\ >> movzbl r2 ## L,r7 ## E;\ >> shrl$16,r2 ## E;\ >> - xorlTAB+3072(,r1,4),r3 ## E;\ >> - xorlTAB+2048(,r7,4),r4 ## E;\ >> + round_xor(TAB+3072, r1, r3 ## E)\ >> + round_xor(TAB+2048, r7, r4 ## E)\ >> movzbl r2 ## H,r1 ## E;\ >> movzbl r2 ## L,r2 ## E;\ >> xorlOFFSET+8(r8),rc ## E; \ >> xorlOFFSET+12(r8),rd ## E; \ >> - xorlTAB+1024(,r1,4),r3 ## E;\ >> - xorlTAB(,r2,4),r4 ## E; >> + round_xor(TAB+1024, r1, r3 ## E)\ >> + round_xor(TAB, r2, r4 ## E) > > This appears to be adding unconditional overhead to a function that was moved > to > assembly to improve its performance. > I did some benchmarking on this code a while ago and, interestingly, it was slower than the generic C implementation (on a Pentium E2200), so we may want to consider whether we still need this driver in the first place. ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [PATCH v1 01/27] x86/crypto: Adapt assembly for PIE support
* Thomas Garnier wrote: > Change the assembly code to use only relative references of symbols for the > kernel to be PIE compatible. > > Position Independent Executable (PIE) support will allow to extended the > KASLR randomization range below the -2G memory limit. > diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S > b/arch/x86/crypto/aes-x86_64-asm_64.S > index 8739cf7795de..86fa068e5e81 100644 > --- a/arch/x86/crypto/aes-x86_64-asm_64.S > +++ b/arch/x86/crypto/aes-x86_64-asm_64.S > @@ -48,8 +48,12 @@ > #define R10 %r10 > #define R11 %r11 > > +/* Hold global for PIE suport */ > +#define RBASE%r12 > + > #define prologue(FUNC,KEY,B128,B192,r1,r2,r5,r6,r7,r8,r9,r10,r11) \ > ENTRY(FUNC);\ > + pushq RBASE; \ > movqr1,r2; \ > leaqKEY+48(r8),r9; \ > movqr10,r11;\ > @@ -74,54 +78,63 @@ > movlr6 ## E,4(r9); \ > movlr7 ## E,8(r9); \ > movlr8 ## E,12(r9); \ > + popqRBASE; \ > ret;\ > ENDPROC(FUNC); > > +#define round_mov(tab_off, reg_i, reg_o) \ > + leaqtab_off(%rip), RBASE; \ > + movl(RBASE,reg_i,4), reg_o; > + > +#define round_xor(tab_off, reg_i, reg_o) \ > + leaqtab_off(%rip), RBASE; \ > + xorl(RBASE,reg_i,4), reg_o; > + > #define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \ > movzbl r2 ## H,r5 ## E;\ > movzbl r2 ## L,r6 ## E;\ > - movlTAB+1024(,r5,4),r5 ## E;\ > + round_mov(TAB+1024, r5, r5 ## E)\ > movwr4 ## X,r2 ## X;\ > - movlTAB(,r6,4),r6 ## E; \ > + round_mov(TAB, r6, r6 ## E) \ > roll$16,r2 ## E;\ > shrl$16,r4 ## E;\ > movzbl r4 ## L,r7 ## E;\ > movzbl r4 ## H,r4 ## E;\ > xorlOFFSET(r8),ra ## E; \ > xorlOFFSET+4(r8),rb ## E; \ > - xorlTAB+3072(,r4,4),r5 ## E;\ > - xorlTAB+2048(,r7,4),r6 ## E;\ > + round_xor(TAB+3072, r4, r5 ## E)\ > + round_xor(TAB+2048, r7, r6 ## E)\ > movzbl r1 ## L,r7 ## E;\ > movzbl r1 ## H,r4 ## E;\ > - movlTAB+1024(,r4,4),r4 ## E;\ > + round_mov(TAB+1024, r4, r4 ## E)\ > movwr3 ## X,r1 ## X;\ > roll$16,r1 ## E;\ > shrl$16,r3 ## E;\ > - xorlTAB(,r7,4),r5 ## E; \ > + round_xor(TAB, r7, r5 ## E) \ > movzbl r3 ## L,r7 ## E;\ > movzbl r3 
## H,r3 ## E;\ > - xorlTAB+3072(,r3,4),r4 ## E;\ > - xorlTAB+2048(,r7,4),r5 ## E;\ > + round_xor(TAB+3072, r3, r4 ## E)\ > + round_xor(TAB+2048, r7, r5 ## E)\ > movzbl r1 ## L,r7 ## E;\ > movzbl r1 ## H,r3 ## E;\ > shrl$16,r1 ## E;\ > - xorlTAB+3072(,r3,4),r6 ## E;\ > - movlTAB+2048(,r7,4),r3 ## E;\ > + round_xor(TAB+3072, r3, r6 ## E)\ > + round_mov(TAB+2048, r7, r3 ## E)\ > movzbl r1 ## L,r7 ## E;\ > movzbl r1 ## H,r1 ## E;\ > - xorlTAB+1024(,r1,4),r6 ## E;\ > - xorlTAB(,r7,4),r3 ## E; \ > + round_xor(TAB+1024, r1, r6 ## E)\ > + round_xor(TAB, r7, r3 ## E) \ > movzbl r2 ## H,r1 ## E;\ > movzbl r2 ## L,r7 ## E;\ > shrl$16,r2 ## E;\ > - xorlTAB+3072(,r1,4),r3 ## E;\ > - xorlTAB+2048(,r7,4),r4 ## E;\ > + round_xor(TAB+3072, r1, r3 ## E)\ > + round_xor(TAB+2048, r7, r4 ## E)\ > movzbl r2 ## H,r1 ## E;\ > movzbl r2 ## L,r2 ## E;\ > xorlOFFSET+8(r8),rc ## E; \ > xorlOFFSET+12(r8),rd ## E; \ > - xorlTAB+1024(,r1,4),r3 ## E;\ > - xorlTAB(,r2,4),r4 ## E; > + round_xor(TAB+1024, r1, r3 ## E)\ > + round_xor(TAB, r2, r4 ## E) This appears to be adding unconditional overhead to a function that was moved to assembly to improve its performance. Thanks, Ingo ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
[Xen-devel] [PATCH v1 01/27] x86/crypto: Adapt assembly for PIE support
Change the assembly code to use only relative references of symbols for the kernel to be PIE compatible. Position Independent Executable (PIE) support will allow to extended the KASLR randomization range below the -2G memory limit. Signed-off-by: Thomas Garnier --- arch/x86/crypto/aes-x86_64-asm_64.S | 45 - arch/x86/crypto/aesni-intel_asm.S| 14 ++-- arch/x86/crypto/aesni-intel_avx-x86_64.S | 6 +- arch/x86/crypto/camellia-aesni-avx-asm_64.S | 42 ++-- arch/x86/crypto/camellia-aesni-avx2-asm_64.S | 44 ++--- arch/x86/crypto/camellia-x86_64-asm_64.S | 8 ++- arch/x86/crypto/cast5-avx-x86_64-asm_64.S| 50 --- arch/x86/crypto/cast6-avx-x86_64-asm_64.S| 44 +++-- arch/x86/crypto/des3_ede-asm_64.S| 96 ++-- arch/x86/crypto/ghash-clmulni-intel_asm.S| 4 +- arch/x86/crypto/glue_helper-asm-avx.S| 4 +- arch/x86/crypto/glue_helper-asm-avx2.S | 6 +- 12 files changed, 211 insertions(+), 152 deletions(-) diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S b/arch/x86/crypto/aes-x86_64-asm_64.S index 8739cf7795de..86fa068e5e81 100644 --- a/arch/x86/crypto/aes-x86_64-asm_64.S +++ b/arch/x86/crypto/aes-x86_64-asm_64.S @@ -48,8 +48,12 @@ #define R10%r10 #define R11%r11 +/* Hold global for PIE suport */ +#define RBASE %r12 + #define prologue(FUNC,KEY,B128,B192,r1,r2,r5,r6,r7,r8,r9,r10,r11) \ ENTRY(FUNC);\ + pushq RBASE; \ movqr1,r2; \ leaqKEY+48(r8),r9; \ movqr10,r11;\ @@ -74,54 +78,63 @@ movlr6 ## E,4(r9); \ movlr7 ## E,8(r9); \ movlr8 ## E,12(r9); \ + popqRBASE; \ ret;\ ENDPROC(FUNC); +#define round_mov(tab_off, reg_i, reg_o) \ + leaqtab_off(%rip), RBASE; \ + movl(RBASE,reg_i,4), reg_o; + +#define round_xor(tab_off, reg_i, reg_o) \ + leaqtab_off(%rip), RBASE; \ + xorl(RBASE,reg_i,4), reg_o; + #define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \ movzbl r2 ## H,r5 ## E;\ movzbl r2 ## L,r6 ## E;\ - movlTAB+1024(,r5,4),r5 ## E;\ + round_mov(TAB+1024, r5, r5 ## E)\ movwr4 ## X,r2 ## X;\ - movlTAB(,r6,4),r6 ## E; \ + round_mov(TAB, r6, r6 ## E) \ roll$16,r2 ## E;\ shrl$16,r4 ## E;\ movzbl 
r4 ## L,r7 ## E;\ movzbl r4 ## H,r4 ## E;\ xorlOFFSET(r8),ra ## E; \ xorlOFFSET+4(r8),rb ## E; \ - xorlTAB+3072(,r4,4),r5 ## E;\ - xorlTAB+2048(,r7,4),r6 ## E;\ + round_xor(TAB+3072, r4, r5 ## E)\ + round_xor(TAB+2048, r7, r6 ## E)\ movzbl r1 ## L,r7 ## E;\ movzbl r1 ## H,r4 ## E;\ - movlTAB+1024(,r4,4),r4 ## E;\ + round_mov(TAB+1024, r4, r4 ## E)\ movwr3 ## X,r1 ## X;\ roll$16,r1 ## E;\ shrl$16,r3 ## E;\ - xorlTAB(,r7,4),r5 ## E; \ + round_xor(TAB, r7, r5 ## E) \ movzbl r3 ## L,r7 ## E;\ movzbl r3 ## H,r3 ## E;\ - xorlTAB+3072(,r3,4),r4 ## E;\ - xorlTAB+2048(,r7,4),r5 ## E;\ + round_xor(TAB+3072, r3, r4 ## E)\ + round_xor(TAB+2048, r7, r5 ## E)\ movzbl r1 ## L,r7 ## E;\ movzbl r1 ## H,r3 ## E;\ shrl$16,r1 ## E;\ - xorlTAB+3072(,r3,4),r6 ## E;\ - movlTAB+2048(,r7,4),r3 ## E;\ + round_xor(TAB+3072, r3, r6 ## E)\ + round_mov(TAB+2048, r7, r3 ## E)\ movzbl r1 ## L,r7 ## E;\ movzbl r1 ## H,r1 ## E;\ - xorlTAB+1024(,r1,4),r6 ## E;\ - xorlTAB(,r7,4),r3 ## E; \ + round_xor(TAB+1024, r1, r6 ## E)\ + round_xor(TAB, r7, r3 ## E) \ movzbl r2 ## H,r1 ## E;\ movzbl r2 ## L,r7 ## E;\ shrl$16,r2 ## E;\ - xorlTAB+3072(,r1,4),r3 ## E;\ - xorlTAB+2048(,r7,4),r4 ## E;\ + round_xor(TAB+3072, r1, r3 ## E)\ + round_xor(TAB+2048, r7, r4 ## E)\ movzbl r2 ## H,r1 ## E;\ movzbl r2 ## L,r2 ## E;\ xorlOFFSET+8(r8),rc ## E; \ xorlOFFSET+12(r8),rd ## E; \ - xorlTAB+1024(,r1,4),r3 ## E;\ - xorlTAB(,r2,4),r4 ## E; + round_xor(TAB+1024, r1, r3 ## E)\ + round_xor(TAB, r2, r4 ## E) #define move_regs(r1,r2,r3,r4) \ movlr3 ## E,r1 ## E;\ diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 16627fec80b2..5f73201dff32 100644 ---