Hi again! > Bottom line. Expect version 1.1 implementation after this weekend:-) And > OK, I can cut-n-paste together a v8 version as well if you want me to... Yeah, it's "after this weekend" now... Find two attached files. One is the SPARC v9 implementation and the other is the SPARC v8 implementation. Even though I intended to cut-n-paste the v8 version together from compiler assembler output and my "v9" bn_*_comba[48], I hand-coded the whole thing after all. Disappointingly enough, v8 got only 10% faster:-( I kind of hoped it would turn out more profitable... V9 in turn runs 30-35% faster now (was 25-30%). Well, that's about what I actually expected from the new unrolling method... In either case, see the comments in the v9 source code for details. And again, does anybody feel like discussing the following: > ... I don't feel comfortable with > bn_div_words. It looks to me that those functions invoking bn_div_words > would benefit more if *larger* portions of loop bodies surrounding the > call are implemented in assembler. Well, of course provided that bn_div_words is effectively folded into one instruction... > Any opposite opinions? Cheers. Andy.
.ident	"bn_asm.sparc.v8plus.S, Version 1.1"
.ident	"SPARC v9 ISA artwork by Andy Polyakov <[EMAIL PROTECTED]>"

/*
 * ====================================================================
 * Copyright (c) 1999 Andy Polyakov <[EMAIL PROTECTED]>.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted as long as above copyright notices are retained. Warranty
 * of any kind is (of course:-) disclaimed.
 * ====================================================================
 */

/*
 * This is my modest contribution to the OpenSSL project (see
 * http://www.openssl.org/ for more information about it) and is
 * a drop-in UltraSPARC ISA replacement for the crypto/bn/bn_asm.c
 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
 *
 * Questions-n-answers.
 *
 * Q. How to compile?
 * A. With SC4.x:
 *
 *	cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *
 *    and with gcc:
 *
 *	gcc -Wa,-xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *
 *    Quick-n-dirty way to fuse the module into the library.
 *    Provided that the library is already configured and built
 *    (in the 0.9.2 case with the no_asm option):
 *
 *	# cd crypto/bn
 *	# cp /some/place/bn_asm.sparc.v8plus.S .
 *	# cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *	# make
 *	# cd ../..
 *	# make; make test
 *
 *    Quick-n-dirty way to get rid of it:
 *
 *	# cd crypto/bn
 *	# touch bn_asm.c
 *	# make
 *	# cd ../..
 *	# make; make test
 *
 * Q. Why just UltraSPARC? What about SuperSPARC?
 * A. The original release did target UltraSPARC only. Now a SuperSPARC
 *    version is provided along with it. Both versions share the
 *    bn_*comba[48] implementations (see the comment later in the code
 *    for an explanation). But what's so special about this particular
 *    implementation? Why didn't I let the compiler do the job? Trouble
 *    is that none of the available compilers (not even the just
 *    introduced SC5.0) attempts to take advantage of 64-bit registers
 *    under 32-bit kernels, even though it's perfectly possible (see the
 *    next question).
 *
 * Q. 64-bit registers under 32-bit kernels? Does it work?
* A. You can't use *all* registers as 64-bit wide, as not all of 'em
 *    are preserved during context switch:-( It's only %o0-%o5 and
 *    %g1-%g4 you may rely upon *and* only in leaf functions, i.e. such
 *    as never call any other functions. All functions in this module
 *    are leaf and 10 registers is a handful. As a matter of fact the
 *    "comba" routines don't require even that much and I could even
 *    afford not to allocate own stack frame for 'em:-)
 *
 * Q. What about 64-bit kernels?
 * A. What about 'em? Just kidding:-) I unfortunately didn't have a
 *    chance to test it yet, but the code below is 64-bit safe and you
 *    shouldn't have any problem with it. What I probably am saying
 *    here is that I appreciate feedback on the matter... And yes,
 *    you have to feed the compiler the -xarch=v9 command line option
 *    instead of -xarch=v8plus.
 *
 * Q. What about sharable libraries?
 * A. What about 'em? Kidding again:-) The code does *not* contain any
 *    code position dependencies and it's safe to include it into a
 *    sharable library as is.
 *
 * Q. How much faster does it get?
 * A. Do you have a good benchmark? In either case I experience 30-35%
 *    improvement on UltraSPARC-1 with the crypto/bn/expspeed.c test
 *    program. See revision history for details.
 *
 */

/*
 * Revision history.
* * 1.0 - initial release * - 25-30% performance boost(*) * 1.1 - new loop unrolling model(**) * - some more fine tuning * - 30-35% performance boost(*) * * (*) bn_asm.c from OpenSSL 0.9.2b compiled with SC4.2, * -xarch=v8 -xstrconst -xO5 -xdepend flags was used as a * reference * (**) Originally unrolled loop looked like this: * for (;;) { * op(p+0); if (--n==0) break; * op(p+1); if (--n==0) break; * op(p+2); if (--n==0) break; * op(p+3); if (--n==0) break; * p+=4; * } * I unroll according to following: * while (n&~3) { * op(p+0); op(p+1); op(p+2); op(p+3); * p+=4; n=-4; * } * if (n) { * op(p+0); if (--n==0) return; * op(p+2); if (--n==0) return; * op(p+3); return; * } */ .section ".text",#alloc,#execinstr .file "bn_asm.sparc.v8plus.S" .align 32 .global bn_mul_add_words /* * BN_ULONG bn_mul_add_words(rp,ap,num,w) * BN_ULONG *rp,*ap; * int num; * BN_ULONG w; */ bn_mul_add_words: brgz,a %o2,.L_bn_mul_add_words_proceed lduw [%o1],%g2 retl clr %o0 .L_bn_mul_add_words_proceed: clruw %o3 andcc %o2,-4,%g0 bz,pn %icc,.L_bn_mul_add_words_tail clr %o5 .L_bn_mul_add_words_loop: ! wow! 32 aligned! 
lduw [%o0],%o4 mulx %o3,%g2,%g2 add %o4,%o5,%o4 add %o4,%g2,%o4 lduw [%o1+4],%g3 nop stuw %o4,[%o0] srlx %o4,32,%o5 lduw [%o0+4],%o4 mulx %o3,%g3,%g3 dec 4,%o2 add %o4,%o5,%o4 lduw [%o1+8],%g2 add %o4,%g3,%o4 stuw %o4,[%o0+4] srlx %o4,32,%o5 lduw [%o0+8],%o4 mulx %o3,%g2,%g2 inc 16,%o1 add %o4,%o5,%o4 lduw [%o1-4],%g3 add %o4,%g2,%o4 stuw %o4,[%o0+8] srlx %o4,32,%o5 lduw [%o0+12],%o4 mulx %o3,%g3,%g3 add %o4,%o5,%o4 inc 16,%o0 add %o4,%g3,%o4 srlx %o4,32,%o5 stuw %o4,[%o0-4] andcc %o2,-4,%g0 bnz,a,pt %icc,.L_bn_mul_add_words_loop lduw [%o1],%g2 brnz,a,pn %o2,.L_bn_mul_add_words_tail lduw [%o1],%g2 .L_bn_mul_add_words_return: retl mov %o5,%o0 .L_bn_mul_add_words_tail: lduw [%o0],%o4 mulx %o3,%g2,%g2 add %o4,%o5,%o4 dec %o2 add %o4,%g2,%o4 srlx %o4,32,%o5 brz,pt %o2,.L_bn_mul_add_words_return stuw %o4,[%o0] lduw [%o1+4],%g2 mulx %o3,%g2,%g2 lduw [%o0+4],%o4 add %o4,%o5,%o4 dec %o2 add %o4,%g2,%o4 srlx %o4,32,%o5 brz,pt %o2,.L_bn_mul_add_words_return stuw %o4,[%o0+4] lduw [%o1+8],%g2 mulx %o3,%g2,%g2 lduw [%o0+8],%o4 add %o4,%o5,%o4 add %o4,%g2,%o4 stuw %o4,[%o0+8] retl srlx %o4,32,%o0 .type bn_mul_add_words,2 .size bn_mul_add_words,(.-bn_mul_add_words) .align 32 .global bn_mul_words /* * BN_ULONG bn_mul_words(rp,ap,num,w) * BN_ULONG *rp,*ap; * int num; * BN_ULONG w; */ bn_mul_words: brgz,a %o2,.L_bn_mul_words_proceeed lduw [%o1],%g2 retl clr %o0 .L_bn_mul_words_proceeed: clruw %o3 andcc %o2,-4,%g0 bz,pn %icc,.L_bn_mul_words_tail clr %o5 .L_bn_mul_words_loop: ! wow! 32 aligned! 
lduw [%o1+4],%g3 mulx %o3,%g2,%g2 add %g2,%o5,%g2 nop srlx %g2,32,%o5 stuw %g2,[%o0] lduw [%o1+8],%g2 mulx %o3,%g3,%g3 add %g3,%o5,%g3 dec 4,%o2 stuw %g3,[%o0+4] srlx %g3,32,%o5 lduw [%o1+12],%g3 mulx %o3,%g2,%g2 add %g2,%o5,%g2 inc 16,%o1 stuw %g2,[%o0+8] srlx %g2,32,%o5 mulx %o3,%g3,%g3 inc 16,%o0 add %g3,%o5,%g3 nop stuw %g3,[%o0-4] srlx %g3,32,%o5 andcc %o2,-4,%g0 bnz,a,pt %icc,.L_bn_mul_words_loop lduw [%o1],%g2 nop brnz,a,pn %o2,.L_bn_mul_words_tail lduw [%o1],%g2 .L_bn_mul_words_return: retl mov %o5,%o0 .L_bn_mul_words_tail: mulx %o3,%g2,%g2 add %g2,%o5,%g2 dec %o2 srlx %g2,32,%o5 brz,pt %o2,.L_bn_mul_words_return stuw %g2,[%o0] lduw [%o1+4],%g2 mulx %o3,%g2,%g2 add %g2,%o5,%g2 dec %o2 srlx %g2,32,%o5 brz,pt %o2,.L_bn_mul_words_return stuw %g2,[%o0+4] lduw [%o1+8],%g2 mulx %o3,%g2,%g2 add %g2,%o5,%g2 stuw %g2,[%o0+8] retl srlx %g2,32,%o0 .type bn_mul_words,2 .size bn_mul_words,(.-bn_mul_words) .align 32 .global bn_sqr_words /* * void bn_sqr_words(r,a,n) * BN_ULONG *r,*a; * int n; */ bn_sqr_words: brgz,a %o2,.L_bn_sqr_words_proceeed lduw [%o1],%g2 retl clr %o0 .L_bn_sqr_words_proceeed: andcc %o2,-4,%g0 nop bz,pn %icc,.L_bn_sqr_words_tail nop .L_bn_sqr_words_loop: ! wow! 32 aligned! 
lduw [%o1+4],%g3 mulx %g2,%g2,%o4 stuw %o4,[%o0] srlx %o4,32,%o5 stuw %o5,[%o0+4] nop lduw [%o1+8],%g2 mulx %g3,%g3,%o4 dec 4,%o2 stuw %o4,[%o0+8] srlx %o4,32,%o5 nop stuw %o5,[%o0+12] lduw [%o1+12],%g3 mulx %g2,%g2,%o4 srlx %o4,32,%o5 stuw %o4,[%o0+16] inc 16,%o1 stuw %o5,[%o0+20] mulx %g3,%g3,%o4 inc 32,%o0 stuw %o4,[%o0-8] srlx %o4,32,%o5 andcc %o2,-4,%g2 stuw %o5,[%o0-4] bnz,a,pt %icc,.L_bn_sqr_words_loop lduw [%o1],%g2 nop brnz,a,pn %o2,.L_bn_sqr_words_tail lduw [%o1],%g2 .L_bn_sqr_words_return: retl clr %o0 .L_bn_sqr_words_tail: mulx %g2,%g2,%o4 dec %o2 stuw %o4,[%o0] srlx %o4,32,%o5 brz,pt %o2,.L_bn_sqr_words_return stuw %o5,[%o0+4] lduw [%o1+4],%g2 mulx %g2,%g2,%o4 dec %o2 stuw %o4,[%o0+8] srlx %o4,32,%o5 brz,pt %o2,.L_bn_sqr_words_return stuw %o5,[%o0+12] lduw [%o1+8],%g2 mulx %g2,%g2,%o4 srlx %o4,32,%o5 stuw %o4,[%o0+16] stuw %o5,[%o0+20] retl clr %o0 .type bn_sqr_words,2 .size bn_sqr_words,(.-bn_sqr_words) .align 32 .global bn_div_words /* * BN_ULONG bn_div_words(h,l,d) * BN_ULONG h,l,d; */ bn_div_words: sllx %o0,32,%o0 or %o0,%o1,%o0 udivx %o0,%o2,%o0 retl clruw %o0 .type bn_div_words,2 .size bn_div_words,(.-bn_div_words) .align 32 .global bn_add_words /* * BN_ULONG bn_add_words(rp,ap,bp,n) * BN_ULONG *rp,*ap,*bp; * int n; */ bn_add_words: brgz,a %o3,.L_bn_add_words_proceed lduw [%o1],%o4 retl clr %o0 .L_bn_add_words_proceed: andcc %o3,-4,%g0 bz,pn %icc,.L_bn_add_words_tail addcc %g0,0,%g0 ! clear carry flag nop lduw [%o2],%o5 dec 4,%o3 addcc %o5,%o4,%o5 nop stuw %o5,[%o0] ba .L_bn_add_words_warm_loop lduw [%o1+4],%o4 nop .L_bn_add_words_loop: ! wow! 32 aligned! 
dec 4,%o3 lduw [%o2],%o5 nop addccc %o5,%o4,%o5 stuw %o5,[%o0] lduw [%o1+4],%o4 .L_bn_add_words_warm_loop: inc 16,%o1 nop lduw [%o2+4],%o5 addccc %o5,%o4,%o5 stuw %o5,[%o0+4] nop lduw [%o1-8],%o4 inc 16,%o2 lduw [%o2-8],%o5 addccc %o5,%o4,%o5 stuw %o5,[%o0+8] lduw [%o1-4],%o4 inc 16,%o0 nop lduw [%o2-4],%o5 addccc %o5,%o4,%o5 stuw %o5,[%o0-4] and %o3,-4,%g1 brnz,a,pt %g1,.L_bn_add_words_loop lduw [%o1],%o4 brnz,a,pn %o3,.L_bn_add_words_tail lduw [%o1],%o4 .L_bn_add_words_return: clr %o0 retl movcs %icc,1,%o0 nop .L_bn_add_words_tail: ! wow! 32 aligned! lduw [%o2],%o5 dec %o3 addccc %o5,%o4,%o5 brz,pt %o3,.L_bn_add_words_return stuw %o5,[%o0] lduw [%o1+4],%o4 lduw [%o2+4],%o5 addccc %o5,%o4,%o5 dec %o3 brz,pt %o3,.L_bn_add_words_return stuw %o5,[%o0+4] nop lduw [%o1+8],%o4 lduw [%o2+8],%o5 addccc %o5,%o4,%o5 nop stuw %o5,[%o0+8] clr %o0 retl movcs %icc,1,%o0 .type bn_add_words,2 .size bn_add_words,(.-bn_add_words) .global bn_sub_words /* * BN_ULONG bn_sub_words(rp,ap,bp,n) * BN_ULONG *rp,*ap,*bp; * int n; */ bn_sub_words: brgz,a %o3,.L_bn_sub_words_proceed lduw [%o1],%o4 retl clr %o0 .L_bn_sub_words_proceed: andcc %o3,-4,%g0 bz,pn %icc,.L_bn_sub_words_tail addcc %g0,0,%g0 ! clear carry flag nop lduw [%o2],%o5 dec 4,%o3 subcc %o4,%o5,%o5 nop stuw %o5,[%o0] ba .L_bn_sub_words_warm_loop lduw [%o1+4],%o4 nop .L_bn_sub_words_loop: ! wow! 32 aligned! dec 4,%o3 lduw [%o2],%o5 nop subccc %o4,%o5,%o5 stuw %o5,[%o0] lduw [%o1+4],%o4 .L_bn_sub_words_warm_loop: inc 16,%o1 nop lduw [%o2+4],%o5 subccc %o4,%o5,%o5 stuw %o5,[%o0+4] nop lduw [%o1-8],%o4 inc 16,%o2 lduw [%o2-8],%o5 subccc %o4,%o5,%o5 stuw %o5,[%o0+8] lduw [%o1-4],%o4 inc 16,%o0 nop lduw [%o2-4],%o5 subccc %o4,%o5,%o5 stuw %o5,[%o0-4] and %o3,-4,%g1 brnz,a,pt %g1,.L_bn_sub_words_loop lduw [%o1],%o4 brnz,a,pn %o3,.L_bn_sub_words_tail lduw [%o1],%o4 .L_bn_sub_words_return: clr %o0 retl movcs %icc,1,%o0 nop .L_bn_sub_words_tail: ! wow! 32 aligned! 
lduw [%o2],%o5 dec %o3 subccc %o4,%o5,%o5 brz,pt %o3,.L_bn_sub_words_return stuw %o5,[%o0] lduw [%o1+4],%o4 lduw [%o2+4],%o5 subccc %o4,%o5,%o5 dec %o3 brz,pt %o3,.L_bn_sub_words_return stuw %o5,[%o0+4] nop lduw [%o1+8],%o4 lduw [%o2+8],%o5 subccc %o4,%o5,%o5 nop stuw %o5,[%o0+8] clr %o0 retl movcs %icc,1,%o0 .type bn_sub_words,2 .size bn_sub_words,(.-bn_sub_words) /* * Following code is pure SPARC V8! Trouble is that it's not feasible * to implement the mumbo-jumbo in less "V9" instructions:-( At least not * under 32-bit kernel. The reason is that you'd have to shuffle registers * all the time as only few (well, 10:-) are fully (i.e. all 64 bits) * preserved by kernel during context switch. But even under 64-bit kernel * you won't gain much because in the lack of "add with extended carry" * instruction you'd have to issue 'clr %rx; movcs %xcc,1,%rx; * add %rd,%rx,%rd' sequence in place of 'addxcc %rx,%ry,%rx; * addx %rz,%g0,%rz' pair in 32-bit case. * * Andy. */ /* * Basically the only difference between 32-bit and 64-bit versions * is size of minimal stack frame that subroutine should allocate. */ #ifdef __sparcv9 #define FRAME_SIZE -192 #else #define FRAME_SIZE -96 #endif /* * Here is register usage map for *all* routines below. 
*/ #define a_0 %l0 #define a_0_ [%i1] #define a_1 %l1 #define a_1_ [%i1+4] #define a_2 %l2 #define a_2_ [%i1+8] #define a_3 %l3 #define a_3_ [%i1+12] #define a_4 %l4 #define a_4_ [%i1+16] #define a_5 %l5 #define a_5_ [%i1+20] #define a_6 %l6 #define a_6_ [%i1+24] #define a_7 %l7 #define a_7_ [%i1+28] #define b_0 %g1 #define b_0_ [%i2] #define b_1 %g2 #define b_1_ [%i2+4] #define b_2 %g3 #define b_2_ [%i2+8] #define b_3 %g4 #define b_3_ [%i2+12] #define b_4 %i3 #define b_4_ [%i2+16] #define b_5 %i4 #define b_5_ [%i2+20] #define b_6 %i5 #define b_6_ [%i2+24] #define b_7 %o5 #define b_7_ [%i2+28] #define c_1 %o2 #define c_2 %o3 #define c_3 %o4 #define t_1 %o0 #define t_2 %o1 .align 32 .global bn_mul_comba8 /* * void bn_mul_comba8(r,a,b) * BN_ULONG *r,*a,*b; */ bn_mul_comba8: save %sp,FRAME_SIZE,%sp ld a_0_,a_0 ld b_0_,b_0 umul a_0,b_0,c_1 !=!mul_add_c(a[0],b[0],c1,c2,c3); ld b_1_,b_1 rd %y,c_2 st c_1,[%i0] !r[0]=c1; umul a_0,b_1,t_1 !=!mul_add_c(a[0],b[1],c2,c3,c1); ld a_1_,a_1 addcc c_2,t_1,c_2 rd %y,t_2 addxcc %g0,t_2,c_3 != addx %g0,%g0,c_1 ld a_2_,a_2 umul a_1,b_0,t_1 !mul_add_c(a[1],b[0],c2,c3,c1); addcc c_2,t_1,c_2 != rd %y,t_2 addxcc c_3,t_2,c_3 st c_2,[%i0+4] !r[1]=c2; addx c_1,%g0,c_1 != umul a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 != addx %g0,%g0,c_2 ld b_2_,b_2 umul a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2); addcc c_3,t_1,c_3 != rd %y,t_2 addxcc c_1,t_2,c_1 ld b_3_,b_3 addx c_2,%g0,c_2 != umul a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 != addx c_2,%g0,c_2 st c_3,[%i0+8] !r[2]=c3; umul a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3); addcc c_1,t_1,c_1 != rd %y,t_2 addxcc c_2,t_2,c_2 addx %g0,%g0,c_3 umul a_1,b_2,t_1 !=!mul_add_c(a[1],b[2],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != ld a_3_,a_3 umul a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 != addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 ld a_4_,a_4 umul 
a_3,b_0,t_1 !mul_add_c(a[3],b[0],c1,c2,c3);!= addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != st c_1,[%i0+12] !r[3]=c1; umul a_4,b_0,t_1 !mul_add_c(a[4],b[0],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx %g0,%g0,c_1 umul a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1); addcc c_2,t_1,c_2 != rd %y,t_2 addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 umul a_2,b_2,t_1 !=!mul_add_c(a[2],b[2],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 != ld b_4_,b_4 umul a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 ld b_5_,b_5 umul a_0,b_4,t_1 !=!mul_add_c(a[0],b[4],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 != st c_2,[%i0+16] !r[4]=c2; umul a_0,b_5,t_1 !mul_add_c(a[0],b[5],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 != addxcc c_1,t_2,c_1 addx %g0,%g0,c_2 umul a_1,b_4,t_1 !mul_add_c(a[1],b[4],c3,c1,c2); addcc c_3,t_1,c_3 != rd %y,t_2 addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 umul a_2,b_3,t_1 !=!mul_add_c(a[2],b[3],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 != umul a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 != addx c_2,%g0,c_2 ld a_5_,a_5 umul a_4,b_1,t_1 !mul_add_c(a[4],b[1],c3,c1,c2); addcc c_3,t_1,c_3 != rd %y,t_2 addxcc c_1,t_2,c_1 ld a_6_,a_6 addx c_2,%g0,c_2 != umul a_5,b_0,t_1 !mul_add_c(a[5],b[0],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 != addx c_2,%g0,c_2 st c_3,[%i0+20] !r[5]=c3; umul a_6,b_0,t_1 !mul_add_c(a[6],b[0],c1,c2,c3); addcc c_1,t_1,c_1 != rd %y,t_2 addxcc c_2,t_2,c_2 addx %g0,%g0,c_3 umul a_5,b_1,t_1 !=!mul_add_c(a[5],b[1],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != umul a_4,b_2,t_1 !mul_add_c(a[4],b[2],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != addx c_3,%g0,c_3 umul a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 != addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 umul 
a_2,b_4,t_1 !mul_add_c(a[2],b[4],c1,c2,c3); addcc c_1,t_1,c_1 != rd %y,t_2 addxcc c_2,t_2,c_2 ld b_6_,b_6 addx c_3,%g0,c_3 != umul a_1,b_5,t_1 !mul_add_c(a[1],b[5],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != addx c_3,%g0,c_3 ld b_7_,b_7 umul a_0,b_6,t_1 !mul_add_c(a[0],b[6],c1,c2,c3); addcc c_1,t_1,c_1 != rd %y,t_2 addxcc c_2,t_2,c_2 st c_1,[%i0+24] !r[6]=c1; addx c_3,%g0,c_3 != umul a_0,b_7,t_1 !mul_add_c(a[0],b[7],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc c_3,t_2,c_3 != addx %g0,%g0,c_1 umul a_1,b_6,t_1 !mul_add_c(a[1],b[6],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 umul a_2,b_5,t_1 !mul_add_c(a[2],b[5],c2,c3,c1); addcc c_2,t_1,c_2 != rd %y,t_2 addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 umul a_3,b_4,t_1 !=!mul_add_c(a[3],b[4],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 != umul a_4,b_3,t_1 !mul_add_c(a[4],b[3],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc c_3,t_2,c_3 != addx c_1,%g0,c_1 umul a_5,b_2,t_1 !mul_add_c(a[5],b[2],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 ld a_7_,a_7 umul a_6,b_1,t_1 !=!mul_add_c(a[6],b[1],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 != umul a_7,b_0,t_1 !mul_add_c(a[7],b[0],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc c_3,t_2,c_3 != addx c_1,%g0,c_1 st c_2,[%i0+28] !r[7]=c2; umul a_7,b_1,t_1 !mul_add_c(a[7],b[1],c3,c1,c2); addcc c_3,t_1,c_3 != rd %y,t_2 addxcc c_1,t_2,c_1 addx %g0,%g0,c_2 umul a_6,b_2,t_1 !=!mul_add_c(a[6],b[2],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 != umul a_5,b_3,t_1 !mul_add_c(a[5],b[3],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 != addx c_2,%g0,c_2 umul a_4,b_4,t_1 !mul_add_c(a[4],b[4],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 != addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 umul a_3,b_5,t_1 !mul_add_c(a[3],b[5],c3,c1,c2); addcc c_3,t_1,c_3 != rd %y,t_2 addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 umul a_2,b_6,t_1 
!=!mul_add_c(a[2],b[6],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 != umul a_1,b_7,t_1 !mul_add_c(a[1],b[7],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 ! addx c_2,%g0,c_2 st c_3,[%i0+32] !r[8]=c3; umul a_2,b_7,t_1 !mul_add_c(a[2],b[7],c1,c2,c3); addcc c_1,t_1,c_1 != rd %y,t_2 addxcc c_2,t_2,c_2 addx %g0,%g0,c_3 umul a_3,b_6,t_1 !=!mul_add_c(a[3],b[6],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != umul a_4,b_5,t_1 !mul_add_c(a[4],b[5],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != addx c_3,%g0,c_3 umul a_5,b_4,t_1 !mul_add_c(a[5],b[4],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 != addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 umul a_6,b_3,t_1 !mul_add_c(a[6],b[3],c1,c2,c3); addcc c_1,t_1,c_1 != rd %y,t_2 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 umul a_7,b_2,t_1 !=!mul_add_c(a[7],b[2],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != st c_1,[%i0+36] !r[9]=c1; umul a_7,b_3,t_1 !mul_add_c(a[7],b[3],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx %g0,%g0,c_1 umul a_6,b_4,t_1 !mul_add_c(a[6],b[4],c2,c3,c1); addcc c_2,t_1,c_2 != rd %y,t_2 addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 umul a_5,b_5,t_1 !=!mul_add_c(a[5],b[5],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 != umul a_4,b_6,t_1 !mul_add_c(a[4],b[6],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc c_3,t_2,c_3 != addx c_1,%g0,c_1 umul a_3,b_7,t_1 !mul_add_c(a[3],b[7],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 st c_2,[%i0+40] !r[10]=c2; umul a_4,b_7,t_1 !=!mul_add_c(a[4],b[7],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 addx %g0,%g0,c_2 != umul a_5,b_6,t_1 !mul_add_c(a[5],b[6],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 != addx c_2,%g0,c_2 umul a_6,b_5,t_1 !mul_add_c(a[6],b[5],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 != addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 umul a_7,b_4,t_1 !mul_add_c(a[7],b[4],c3,c1,c2); addcc c_3,t_1,c_3 != rd 
%y,t_2 addxcc c_1,t_2,c_1 st c_3,[%i0+44] !r[11]=c3; addx c_2,%g0,c_2 != umul a_7,b_5,t_1 !mul_add_c(a[7],b[5],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != addx %g0,%g0,c_3 umul a_6,b_6,t_1 !mul_add_c(a[6],b[6],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 != addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 umul a_5,b_7,t_1 !mul_add_c(a[5],b[7],c1,c2,c3); addcc c_1,t_1,c_1 != rd %y,t_2 addxcc c_2,t_2,c_2 st c_1,[%i0+48] !r[12]=c1; addx c_3,%g0,c_3 != umul a_6,b_7,t_1 !mul_add_c(a[6],b[7],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc c_3,t_2,c_3 != addx %g0,%g0,c_1 umul a_7,b_6,t_1 !mul_add_c(a[7],b[6],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 st c_2,[%i0+52] !r[13]=c2; umul a_7,b_7,t_1 !=!mul_add_c(a[7],b[7],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 nop != st c_3,[%i0+56] !r[14]=c3; st c_1,[%i0+60] !r[15]=c1; ret restore %g0,%g0,%o0 .type bn_mul_comba8,2 .size bn_mul_comba8,(.-bn_mul_comba8) .align 32 .global bn_mul_comba4 /* * void bn_mul_comba4(r,a,b) * BN_ULONG *r,*a,*b; */ bn_mul_comba4: save %sp,FRAME_SIZE,%sp ld a_0_,a_0 ld b_0_,b_0 umul a_0,b_0,c_1 !=!mul_add_c(a[0],b[0],c1,c2,c3); ld b_1_,b_1 rd %y,c_2 st c_1,[%i0] !r[0]=c1; umul a_0,b_1,t_1 !=!mul_add_c(a[0],b[1],c2,c3,c1); ld a_1_,a_1 addcc c_2,t_1,c_2 rd %y,t_2 != addxcc %g0,t_2,c_3 addx %g0,%g0,c_1 ld a_2_,a_2 umul a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 != st c_2,[%i0+4] !r[1]=c2; umul a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 != addxcc c_1,t_2,c_1 addx %g0,%g0,c_2 ld b_2_,b_2 umul a_1,b_1,t_1 !=!mul_add_c(a[1],b[1],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 != ld b_3_,b_3 umul a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 != addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 st c_3,[%i0+8] !r[2]=c3; umul a_0,b_3,t_1 !=!mul_add_c(a[0],b[3],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 addx %g0,%g0,c_3 
!= umul a_1,b_2,t_1 !mul_add_c(a[1],b[2],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != addx c_3,%g0,c_3 ld a_3_,a_3 umul a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3); addcc c_1,t_1,c_1 != rd %y,t_2 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 umul a_3,b_0,t_1 !=!mul_add_c(a[3],b[0],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != st c_1,[%i0+12] !r[3]=c1; umul a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx %g0,%g0,c_1 umul a_2,b_2,t_1 !mul_add_c(a[2],b[2],c2,c3,c1); addcc c_2,t_1,c_2 != rd %y,t_2 addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 umul a_1,b_3,t_1 !=!mul_add_c(a[1],b[3],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 != st c_2,[%i0+16] !r[4]=c2; umul a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 != addxcc c_1,t_2,c_1 addx %g0,%g0,c_2 umul a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2); addcc c_3,t_1,c_3 != rd %y,t_2 addxcc c_1,t_2,c_1 st c_3,[%i0+20] !r[5]=c3; addx c_2,%g0,c_2 != umul a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != st c_1,[%i0+24] !r[6]=c1; st c_2,[%i0+28] !r[7]=c2; ret restore %g0,%g0,%o0 .type bn_mul_comba4,2 .size bn_mul_comba4,(.-bn_mul_comba4) .align 32 .global bn_sqr_comba8 bn_sqr_comba8: save %sp,FRAME_SIZE,%sp ld a_0_,a_0 ld a_1_,a_1 umul a_0,a_0,c_1 !=!sqr_add_c(a,0,c1,c2,c3); rd %y,c_2 st c_1,[%i0] !r[0]=c1; ld a_2_,a_2 umul a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc %g0,t_2,c_3 addx %g0,%g0,c_1 != addcc c_2,t_1,c_2 addxcc c_3,t_2,c_3 st c_2,[%i0+4] !r[1]=c2; addx c_1,%g0,c_1 != umul a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 != addx %g0,%g0,c_2 addcc c_3,t_1,c_3 addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 != ld a_3_,a_3 umul a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 != addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 st c_3,[%i0+8] !r[2]=c3; umul a_0,a_3,t_1 
!=!sqr_add_c2(a,3,0,c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 addx %g0,%g0,c_3 != addcc c_1,t_1,c_1 addxcc c_2,t_2,c_2 ld a_4_,a_4 addx c_3,%g0,c_3 != umul a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != addx c_3,%g0,c_3 addcc c_1,t_1,c_1 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != st c_1,[%i0+12] !r[3]=c1; umul a_4,a_0,t_1 !sqr_add_c2(a,4,0,c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx %g0,%g0,c_1 addcc c_2,t_1,c_2 addxcc c_3,t_2,c_3 != addx c_1,%g0,c_1 umul a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 addcc c_2,t_1,c_2 addxcc c_3,t_2,c_3 != addx c_1,%g0,c_1 ld a_5_,a_5 umul a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1); addcc c_2,t_1,c_2 != rd %y,t_2 addxcc c_3,t_2,c_3 st c_2,[%i0+16] !r[4]=c2; addx c_1,%g0,c_1 != umul a_0,a_5,t_1 !sqr_add_c2(a,5,0,c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 != addx %g0,%g0,c_2 addcc c_3,t_1,c_3 addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 != umul a_1,a_4,t_1 !sqr_add_c2(a,4,1,c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 != addx c_2,%g0,c_2 addcc c_3,t_1,c_3 addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 != ld a_6_,a_6 umul a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 != addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 addcc c_3,t_1,c_3 addxcc c_1,t_2,c_1 != addx c_2,%g0,c_2 st c_3,[%i0+20] !r[5]=c3; umul a_6,a_0,t_1 !sqr_add_c2(a,6,0,c1,c2,c3); addcc c_1,t_1,c_1 != rd %y,t_2 addxcc c_2,t_2,c_2 addx %g0,%g0,c_3 addcc c_1,t_1,c_1 != addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 umul a_5,a_1,t_1 !sqr_add_c2(a,5,1,c1,c2,c3); addcc c_1,t_1,c_1 != rd %y,t_2 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 addcc c_1,t_1,c_1 != addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 umul a_4,a_2,t_1 !sqr_add_c2(a,4,2,c1,c2,c3); addcc c_1,t_1,c_1 != rd %y,t_2 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 addcc c_1,t_1,c_1 != addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 ld a_7_,a_7 umul a_3,a_3,t_1 !=!sqr_add_c(a,3,c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 
addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != st c_1,[%i0+24] !r[6]=c1; umul a_0,a_7,t_1 !sqr_add_c2(a,7,0,c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx %g0,%g0,c_1 addcc c_2,t_1,c_2 addxcc c_3,t_2,c_3 != addx c_1,%g0,c_1 umul a_1,a_6,t_1 !sqr_add_c2(a,6,1,c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 addcc c_2,t_1,c_2 addxcc c_3,t_2,c_3 != addx c_1,%g0,c_1 umul a_2,a_5,t_1 !sqr_add_c2(a,5,2,c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 addcc c_2,t_1,c_2 addxcc c_3,t_2,c_3 != addx c_1,%g0,c_1 umul a_3,a_4,t_1 !sqr_add_c2(a,4,3,c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 addcc c_2,t_1,c_2 addxcc c_3,t_2,c_3 != addx c_1,%g0,c_1 st c_2,[%i0+28] !r[7]=c2; umul a_7,a_1,t_1 !sqr_add_c2(a,7,1,c3,c1,c2); addcc c_3,t_1,c_3 != rd %y,t_2 addxcc c_1,t_2,c_1 addx %g0,%g0,c_2 addcc c_3,t_1,c_3 != addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 umul a_6,a_2,t_1 !sqr_add_c2(a,6,2,c3,c1,c2); addcc c_3,t_1,c_3 != rd %y,t_2 addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 addcc c_3,t_1,c_3 != addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 umul a_5,a_3,t_1 !sqr_add_c2(a,5,3,c3,c1,c2); addcc c_3,t_1,c_3 != rd %y,t_2 addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 addcc c_3,t_1,c_3 != addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 umul a_4,a_4,t_1 !sqr_add_c(a,4,c3,c1,c2); addcc c_3,t_1,c_3 != rd %y,t_2 addxcc c_1,t_2,c_1 st c_3,[%i0+32] !r[8]=c3; addx c_2,%g0,c_2 != umul a_2,a_7,t_1 !sqr_add_c2(a,7,2,c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != addx %g0,%g0,c_3 addcc c_1,t_1,c_1 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != umul a_3,a_6,t_1 !sqr_add_c2(a,6,3,c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != addx c_3,%g0,c_3 addcc c_1,t_1,c_1 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != umul a_4,a_5,t_1 !sqr_add_c2(a,5,4,c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != addx c_3,%g0,c_3 addcc c_1,t_1,c_1 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != st c_1,[%i0+36] !r[9]=c1; umul a_7,a_3,t_1 
!sqr_add_c2(a,7,3,c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx %g0,%g0,c_1 addcc c_2,t_1,c_2 addxcc c_3,t_2,c_3 != addx c_1,%g0,c_1 umul a_6,a_4,t_1 !sqr_add_c2(a,6,4,c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 addcc c_2,t_1,c_2 addxcc c_3,t_2,c_3 != addx c_1,%g0,c_1 umul a_5,a_5,t_1 !sqr_add_c(a,5,c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 st c_2,[%i0+40] !r[10]=c2; umul a_4,a_7,t_1 !=!sqr_add_c2(a,7,4,c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 addx %g0,%g0,c_2 != addcc c_3,t_1,c_3 addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 umul a_5,a_6,t_1 !=!sqr_add_c2(a,6,5,c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 != addcc c_3,t_1,c_3 addxcc c_1,t_2,c_1 st c_3,[%i0+44] !r[11]=c3; addx c_2,%g0,c_2 != umul a_7,a_5,t_1 !sqr_add_c2(a,7,5,c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != addx %g0,%g0,c_3 addcc c_1,t_1,c_1 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != umul a_6,a_6,t_1 !sqr_add_c(a,6,c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != addx c_3,%g0,c_3 st c_1,[%i0+48] !r[12]=c1; umul a_6,a_7,t_1 !sqr_add_c2(a,7,6,c2,c3,c1); addcc c_2,t_1,c_2 != rd %y,t_2 addxcc c_3,t_2,c_3 addx %g0,%g0,c_1 addcc c_2,t_1,c_2 != rd %y,t_2 addxcc c_3,t_2,c_3 st c_2,[%i0+52] !r[13]=c2; addx c_1,%g0,c_1 != umul a_7,a_7,t_1 !sqr_add_c(a,7,c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 != st c_3,[%i0+56] !r[14]=c3; st c_1,[%i0+60] !r[15]=c1; ret restore %g0,%g0,%o0 .type bn_sqr_comba8,2 .size bn_sqr_comba8,(.-bn_sqr_comba8) .align 32 .global bn_sqr_comba4 /* * void bn_sqr_comba4(r,a) * BN_ULONG *r,*a; */ bn_sqr_comba4: save %sp,FRAME_SIZE,%sp ld a_0_,a_0 umul a_0,a_0,c_1 !sqr_add_c(a,0,c1,c2,c3); ld a_1_,a_1 != rd %y,c_2 st c_1,[%i0] !r[0]=c1; ld a_1_,a_1 umul a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc %g0,t_2,c_3 addx %g0,%g0,c_1 != ld a_2_,a_2 addcc c_2,t_1,c_2 addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 != st 
c_2,[%i0+4] !r[1]=c2; umul a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 != addxcc c_1,t_2,c_1 addx %g0,%g0,c_2 addcc c_3,t_1,c_3 addxcc c_1,t_2,c_1 != addx c_2,%g0,c_2 ld a_3_,a_3 umul a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2); addcc c_3,t_1,c_3 != rd %y,t_2 addxcc c_1,t_2,c_1 st c_3,[%i0+8] !r[2]=c3; addx c_2,%g0,c_2 != umul a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != addx %g0,%g0,c_3 addcc c_1,t_1,c_1 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != umul a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != addx c_3,%g0,c_3 addcc c_1,t_1,c_1 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != st c_1,[%i0+12] !r[3]=c1; umul a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx %g0,%g0,c_1 addcc c_2,t_1,c_2 addxcc c_3,t_2,c_3 != addx c_1,%g0,c_1 umul a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 st c_2,[%i0+16] !r[4]=c2; umul a_2,a_3,t_1 !=!sqr_add_c2(a,3,2,c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 addx %g0,%g0,c_2 != addcc c_3,t_1,c_3 addxcc c_1,t_2,c_1 st c_3,[%i0+20] !r[5]=c3; addx c_2,%g0,c_2 != umul a_3,a_3,t_1 !sqr_add_c(a,3,c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != st c_1,[%i0+24] !r[6]=c1; st c_2,[%i0+28] !r[7]=c2; ret restore %g0,%g0,%o0 .type bn_sqr_comba4,2 .size bn_sqr_comba4,(.-bn_sqr_comba4) .align 32
/*
 * NOTE(review): start of the SPARC v8 (SuperSPARC) attachment; same mail
 * flattening as above -- tokens preserved verbatim, needs re-wrapping.
 *
 * bn_mul_add_words(rp, ap, num, w): rp[i] += ap[i]*w for i in [0,num),
 * returning the final carry word.  Structure: early-out for num<=0, a
 * one-word prologue that "warms" the software pipeline, a 4x-unrolled main
 * loop entered at .L_bn_mul_add_words_warm_loop (loads for the next word
 * are overlapped with the carry propagation of the previous one; %o5
 * carries the running high word, %y read via "rd %y,%g1"), then a
 * word-at-a-time tail for num%4.  Return value (carry) in %o0.
 * The tail of bn_mul_words at the end of this span continues past it.
 */
.ident "bn_asm.sparc.v8.S, Version 1.1" .ident "SPARC v8 ISA artwork by Andy Polyakov <[EMAIL PROTECTED]>" /* * ==================================================================== * Copyright (c) 1999 Andy Polyakov <[EMAIL PROTECTED]>. * * Rights for redistribution and usage in source and binary forms are * granted as long as above copyright notices are retained. Warranty * of any kind is (of course:-) disclaimed. * ==================================================================== */ /* * This is my modest contribution to OpenSSL project (see * http://www.openssl.org/ for more information about it) and is * a drop-in SuperSPARC ISA replacement for crypto/bn/bn_asm.c * module. For updates see http://fy.chalmers.se/~appro/hpe/. * * See bn_asm.sparc.v8plus.S for more details. */ /* * Revision history. * * 1.1 - new loop unrolling model(*) * - 10% performance boost(*) * * (*) see bn_asm.sparc.v8plus.S for details */ .section ".text",#alloc,#execinstr .file "bn_asm.sparc.v8.S" .align 32 .global bn_mul_add_words /* * BN_ULONG bn_mul_add_words(rp,ap,num,w) * BN_ULONG *rp,*ap; * int num; * BN_ULONG w; */ bn_mul_add_words: cmp %o2,0 bg,a .L_bn_mul_add_words_proceed ld [%o1],%g2 retl clr %o0 .L_bn_mul_add_words_proceed: andcc %o2,-4,%g0 bz .L_bn_mul_add_words_tail clr %o5 umul %o3,%g2,%g2 ld [%o0],%o4 rd %y,%g1 addcc %o4,%g2,%o4 ld [%o1+4],%g3 addx %g1,0,%o5 ba .L_bn_mul_add_words_warm_loop st %o4,[%o0] .L_bn_mul_add_words_loop: ld [%o0],%o4 umul %o3,%g2,%g2 rd %y,%g1 addcc %o4,%o5,%o4 ld [%o1+4],%g3 addx %g1,0,%g1 addcc %o4,%g2,%o4 nop addx %g1,0,%o5 st %o4,[%o0] .L_bn_mul_add_words_warm_loop: ld [%o0+4],%o4 umul %o3,%g3,%g3 dec 4,%o2 rd %y,%g1 addcc %o4,%o5,%o4 ld [%o1+8],%g2 addx %g1,0,%g1 addcc %o4,%g3,%o4 addx %g1,0,%o5 st %o4,[%o0+4] ld [%o0+8],%o4 umul %o3,%g2,%g2 inc 16,%o1 rd %y,%g1 addcc %o4,%o5,%o4 ld [%o1-4],%g3 addx %g1,0,%g1 addcc %o4,%g2,%o4 addx %g1,0,%o5 st %o4,[%o0+8] ld [%o0+12],%o4 umul %o3,%g3,%g3 inc 16,%o0 rd %y,%g1 addcc %o4,%o5,%o4 addx %g1,0,%g1
addcc %o4,%g3,%o4 addx %g1,0,%o5 st %o4,[%o0-4] andcc %o2,-4,%g0 bnz,a .L_bn_mul_add_words_loop ld [%o1],%g2 tst %o2 bnz,a .L_bn_mul_add_words_tail ld [%o1],%g2 .L_bn_mul_add_words_return: retl mov %o5,%o0 nop .L_bn_mul_add_words_tail: ld [%o0],%o4 umul %o3,%g2,%g2 addcc %o4,%o5,%o4 rd %y,%g1 addx %g1,0,%g1 addcc %o4,%g2,%o4 addx %g1,0,%o5 deccc %o2 bz .L_bn_mul_add_words_return st %o4,[%o0] ld [%o1+4],%g2 umul %o3,%g2,%g2 ld [%o0+4],%o4 rd %y,%g1 addcc %o4,%o5,%o4 nop addx %g1,0,%g1 addcc %o4,%g2,%o4 addx %g1,0,%o5 deccc %o2 bz .L_bn_mul_add_words_return st %o4,[%o0+4] ld [%o1+8],%g2 umul %o3,%g2,%g2 ld [%o0+8],%o4 rd %y,%g1 addcc %o4,%o5,%o4 addx %g1,0,%g1 addcc %o4,%g2,%o4 st %o4,[%o0+8] retl addx %g1,0,%o0 .type bn_mul_add_words,2 .size bn_mul_add_words,(.-bn_mul_add_words) .align 32 .global bn_mul_words /* * BN_ULONG bn_mul_words(rp,ap,num,w) * BN_ULONG *rp,*ap; * int num; * BN_ULONG w; */ bn_mul_words: cmp %o2,0 bg,a .L_bn_mul_words_proceeed ld [%o1],%g2 retl clr %o0 .L_bn_mul_words_proceeed: andcc %o2,-4,%g0 bz .L_bn_mul_words_tail clr %o5 .L_bn_mul_words_loop: ld [%o1+4],%g3 umul %o3,%g2,%g2 addcc %g2,%o5,%g2 rd %y,%g1 addx %g1,0,%o5 st %g2,[%o0] ld [%o1+8],%g2 umul %o3,%g3,%g3 addcc %g3,%o5,%g3 rd %y,%g1 dec 4,%o2 addx %g1,0,%o5 st %g3,[%o0+4] ld [%o1+12],%g3 umul %o3,%g2,%g2 addcc %g2,%o5,%g2 rd %y,%g1 inc 16,%o1 st %g2,[%o0+8] addx %g1,0,%o5 umul %o3,%g3,%g3 addcc %g3,%o5,%g3 rd %y,%g1 inc 16,%o0 addx %g1,0,%o5 st %g3,[%o0-4] andcc %o2,-4,%g0 nop bnz,a .L_bn_mul_words_loop ld [%o1],%g2 tst %o2 bnz,a .L_bn_mul_words_tail ld [%o1],%g2 .L_bn_mul_words_return: retl mov %o5,%o0 nop .L_bn_mul_words_tail: umul %o3,%g2,%g2 addcc %g2,%o5,%g2 rd %y,%g1 addx %g1,0,%o5 deccc %o2 bz .L_bn_mul_words_return st %g2,[%o0] nop ld [%o1+4],%g2 umul %o3,%g2,%g2 addcc %g2,%o5,%g2 rd %y,%g1 addx %g1,0,%o5 deccc %o2 bz .L_bn_mul_words_return st %g2,[%o0+4] ld [%o1+8],%g2 umul %o3,%g2,%g2 addcc %g2,%o5,%g2 rd %y,%g1 st %g2,[%o0+8] retl addx %g1,0,%o0 .type bn_mul_words,2 .size
/*
 * Word-sized helper routines (flattened; tokens preserved verbatim).
 *
 * bn_sqr_words(r, a, n): r[2i],r[2i+1] = low/high word of a[i]^2; 4x
 * unrolled with a word-at-a-time tail.  Always returns 0.
 * NOTE(review): the loop's "andcc %o2,-4,%g2" writes %g2 where every
 * sibling routine discards into %g0; %g2 is immediately reloaded in the
 * annulled delay slot of both following branches, so it looks benign,
 * but the inconsistency should be confirmed/normalized upstream.
 * NOTE(review): label spelling ".L_bn_sqr_words_proceeed" (also
 * "..mul_words_proceeed" above) -- used consistently, so harmless.
 *
 * bn_div_words(h, l, d): (h:l)/d via "wr %o0,%y" + "udiv".  NOTE(review):
 * the SPARC V8 spec allows up to three delay instructions to be required
 * between a WRY and an instruction that depends on %y; here udiv follows
 * wr immediately with no nops -- confirm this is safe on target silicon.
 *
 * bn_add_words / bn_sub_words(rp, ap, bp, n): rp = ap +/- bp with the
 * carry/borrow kept in %g1 across the 4x-unrolled loop; "addcc %g1,-1,%g0"
 * re-materializes the saved carry into the condition codes (1-1 sets C,
 * 0-1 does not), and "addx %g0,0,%g1" captures it back.  Both return the
 * final carry/borrow in %o0.
 *
 * The span ends with "#define FRAME_SIZE -96" and the opening of the
 * register-usage map for the comba routines -- this file is run through
 * the C preprocessor (.S), hence the #defines.
 */
bn_mul_words,(.-bn_mul_words) .align 32 .global bn_sqr_words /* * void bn_sqr_words(r,a,n) * BN_ULONG *r,*a; * int n; */ bn_sqr_words: cmp %o2,0 bg,a .L_bn_sqr_words_proceeed ld [%o1],%g2 retl clr %o0 .L_bn_sqr_words_proceeed: andcc %o2,-4,%g0 bz .L_bn_sqr_words_tail clr %o5 .L_bn_sqr_words_loop: ld [%o1+4],%g3 umul %g2,%g2,%o4 st %o4,[%o0] rd %y,%o5 st %o5,[%o0+4] ld [%o1+8],%g2 umul %g3,%g3,%o4 dec 4,%o2 st %o4,[%o0+8] rd %y,%o5 st %o5,[%o0+12] nop ld [%o1+12],%g3 umul %g2,%g2,%o4 st %o4,[%o0+16] rd %y,%o5 inc 16,%o1 st %o5,[%o0+20] umul %g3,%g3,%o4 inc 32,%o0 st %o4,[%o0-8] rd %y,%o5 st %o5,[%o0-4] andcc %o2,-4,%g2 bnz,a .L_bn_sqr_words_loop ld [%o1],%g2 tst %o2 nop bnz,a .L_bn_sqr_words_tail ld [%o1],%g2 .L_bn_sqr_words_return: retl clr %o0 .L_bn_sqr_words_tail: umul %g2,%g2,%o4 st %o4,[%o0] deccc %o2 rd %y,%o5 bz .L_bn_sqr_words_return st %o5,[%o0+4] ld [%o1+4],%g2 umul %g2,%g2,%o4 st %o4,[%o0+8] deccc %o2 rd %y,%o5 nop bz .L_bn_sqr_words_return st %o5,[%o0+12] ld [%o1+8],%g2 umul %g2,%g2,%o4 st %o4,[%o0+16] rd %y,%o5 st %o5,[%o0+20] retl clr %o0 .type bn_sqr_words,2 .size bn_sqr_words,(.-bn_sqr_words) .align 32 .global bn_div_words /* * BN_ULONG bn_div_words(h,l,d) * BN_ULONG h,l,d; */ bn_div_words: wr %o0,%y udiv %o1,%o2,%o0 retl nop .type bn_div_words,2 .size bn_div_words,(.-bn_div_words) .align 32 .global bn_add_words /* * BN_ULONG bn_add_words(rp,ap,bp,n) * BN_ULONG *rp,*ap,*bp; * int n; */ bn_add_words: cmp %o3,0 bg,a .L_bn_add_words_proceed ld [%o1],%o4 retl clr %o0 .L_bn_add_words_proceed: andcc %o3,-4,%g0 bz .L_bn_add_words_tail clr %g1 ld [%o2],%o5 dec 4,%o3 addcc %o5,%o4,%o5 nop st %o5,[%o0] ba .L_bn_add_words_warm_loop ld [%o1+4],%o4 nop .L_bn_add_words_loop: ld [%o1],%o4 dec 4,%o3 ld [%o2],%o5 addxcc %o5,%o4,%o5 st %o5,[%o0] ld [%o1+4],%o4 .L_bn_add_words_warm_loop: inc 16,%o1 ld [%o2+4],%o5 addxcc %o5,%o4,%o5 st %o5,[%o0+4] ld [%o1-8],%o4 inc 16,%o2 ld [%o2-8],%o5 addxcc %o5,%o4,%o5 st %o5,[%o0+8] ld [%o1-4],%o4 inc 16,%o0 ld [%o2-4],%o5 addxcc
%o5,%o4,%o5 st %o5,[%o0-4] addx %g0,0,%g1 andcc %o3,-4,%g0 bnz,a .L_bn_add_words_loop addcc %g1,-1,%g0 tst %o3 nop bnz,a .L_bn_add_words_tail ld [%o1],%o4 .L_bn_add_words_return: retl mov %g1,%o0 .L_bn_add_words_tail: addcc %g1,-1,%g0 ld [%o2],%o5 addxcc %o5,%o4,%o5 addx %g0,0,%g1 deccc %o3 bz .L_bn_add_words_return st %o5,[%o0] nop ld [%o1+4],%o4 addcc %g1,-1,%g0 ld [%o2+4],%o5 addxcc %o5,%o4,%o5 addx %g0,0,%g1 deccc %o3 bz .L_bn_add_words_return st %o5,[%o0+4] ld [%o1+8],%o4 addcc %g1,-1,%g0 ld [%o2+8],%o5 addxcc %o5,%o4,%o5 st %o5,[%o0+8] retl addx %g0,0,%o0 .type bn_add_words,2 .size bn_add_words,(.-bn_add_words) .align 32 .global bn_sub_words /* * BN_ULONG bn_sub_words(rp,ap,bp,n) * BN_ULONG *rp,*ap,*bp; * int n; */ bn_sub_words: cmp %o3,0 bg,a .L_bn_sub_words_proceed ld [%o1],%o4 retl clr %o0 .L_bn_sub_words_proceed: andcc %o3,-4,%g0 bz .L_bn_sub_words_tail clr %g1 ld [%o2],%o5 dec 4,%o3 subcc %o4,%o5,%o5 nop st %o5,[%o0] ba .L_bn_sub_words_warm_loop ld [%o1+4],%o4 nop .L_bn_sub_words_loop: ld [%o1],%o4 dec 4,%o3 ld [%o2],%o5 subxcc %o4,%o5,%o5 st %o5,[%o0] ld [%o1+4],%o4 .L_bn_sub_words_warm_loop: inc 16,%o1 ld [%o2+4],%o5 subxcc %o4,%o5,%o5 st %o5,[%o0+4] ld [%o1-8],%o4 inc 16,%o2 ld [%o2-8],%o5 subxcc %o4,%o5,%o5 st %o5,[%o0+8] ld [%o1-4],%o4 inc 16,%o0 ld [%o2-4],%o5 subxcc %o4,%o5,%o5 st %o5,[%o0-4] addx %g0,0,%g1 andcc %o3,-4,%g0 bnz,a .L_bn_sub_words_loop addcc %g1,-1,%g0 tst %o3 nop bnz,a .L_bn_sub_words_tail ld [%o1],%o4 .L_bn_sub_words_return: retl mov %g1,%o0 .L_bn_sub_words_tail: addcc %g1,-1,%g0 ld [%o2],%o5 subxcc %o4,%o5,%o5 addx %g0,0,%g1 deccc %o3 bz .L_bn_sub_words_return st %o5,[%o0] nop ld [%o1+4],%o4 addcc %g1,-1,%g0 ld [%o2+4],%o5 subxcc %o4,%o5,%o5 addx %g0,0,%g1 deccc %o3 bz .L_bn_sub_words_return st %o5,[%o0+4] ld [%o1+8],%o4 addcc %g1,-1,%g0 ld [%o2+8],%o5 subxcc %o4,%o5,%o5 st %o5,[%o0+8] retl addx %g0,0,%o0 .type bn_sub_words,2 .size bn_sub_words,(.-bn_sub_words) #define FRAME_SIZE -96 /* * Here is register usage map for *all*
/*
 * Register map for the comba routines (a_i = cached a[i] in %l regs,
 * b_i = cached b[i], x_i_ = its memory operand; c_1/c_2/c_3 = rotating
 * result/carry accumulators; t_1/t_2 = product low/high temporaries),
 * followed by the bulk of
 *   void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 * -- full 8x8 schoolbook product, column by column (r[0]..r[10] here;
 * the remainder continues past this span).  Each mul_add_c group is:
 * umul -> addcc (low word), rd %y -> addxcc (high word), addx (carry),
 * with operand loads ("ld a_i_,a_i" / "ld b_i_,b_i") interleaved into
 * free slots ahead of first use.  "!=" comments mark the original
 * instruction-group boundaries; "!...;" comments each ended a source
 * line before the mail flattening (tokens below preserved verbatim).
 *
 * NOTE(review): in the mul_add_c(a[1],b[7],...) group a lone "!" marker
 * has been restored to "!=" to match every sibling group; this is a
 * comment-only repair of apparent transit damage.
 */
routines below. */ #define a_0 %l0 #define a_0_ [%i1] #define a_1 %l1 #define a_1_ [%i1+4] #define a_2 %l2 #define a_2_ [%i1+8] #define a_3 %l3 #define a_3_ [%i1+12] #define a_4 %l4 #define a_4_ [%i1+16] #define a_5 %l5 #define a_5_ [%i1+20] #define a_6 %l6 #define a_6_ [%i1+24] #define a_7 %l7 #define a_7_ [%i1+28] #define b_0 %g1 #define b_0_ [%i2] #define b_1 %g2 #define b_1_ [%i2+4] #define b_2 %g3 #define b_2_ [%i2+8] #define b_3 %g4 #define b_3_ [%i2+12] #define b_4 %i3 #define b_4_ [%i2+16] #define b_5 %i4 #define b_5_ [%i2+20] #define b_6 %i5 #define b_6_ [%i2+24] #define b_7 %o5 #define b_7_ [%i2+28] #define c_1 %o2 #define c_2 %o3 #define c_3 %o4 #define t_1 %o0 #define t_2 %o1 .align 32 .global bn_mul_comba8 /* * void bn_mul_comba8(r,a,b) * BN_ULONG *r,*a,*b; */ bn_mul_comba8: save %sp,FRAME_SIZE,%sp ld a_0_,a_0 ld b_0_,b_0 umul a_0,b_0,c_1 !=!mul_add_c(a[0],b[0],c1,c2,c3); ld b_1_,b_1 rd %y,c_2 st c_1,[%i0] !r[0]=c1; umul a_0,b_1,t_1 !=!mul_add_c(a[0],b[1],c2,c3,c1); ld a_1_,a_1 addcc c_2,t_1,c_2 rd %y,t_2 addxcc %g0,t_2,c_3 != addx %g0,%g0,c_1 ld a_2_,a_2 umul a_1,b_0,t_1 !mul_add_c(a[1],b[0],c2,c3,c1); addcc c_2,t_1,c_2 != rd %y,t_2 addxcc c_3,t_2,c_3 st c_2,[%i0+4] !r[1]=c2; addx c_1,%g0,c_1 != umul a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 != addx %g0,%g0,c_2 ld b_2_,b_2 umul a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2); addcc c_3,t_1,c_3 != rd %y,t_2 addxcc c_1,t_2,c_1 ld b_3_,b_3 addx c_2,%g0,c_2 != umul a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 != addx c_2,%g0,c_2 st c_3,[%i0+8] !r[2]=c3; umul a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3); addcc c_1,t_1,c_1 != rd %y,t_2 addxcc c_2,t_2,c_2 addx %g0,%g0,c_3 umul a_1,b_2,t_1 !=!mul_add_c(a[1],b[2],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != ld a_3_,a_3 umul a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 != addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 ld
a_4_,a_4 umul a_3,b_0,t_1 !mul_add_c(a[3],b[0],c1,c2,c3);!= addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != st c_1,[%i0+12] !r[3]=c1; umul a_4,b_0,t_1 !mul_add_c(a[4],b[0],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx %g0,%g0,c_1 umul a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1); addcc c_2,t_1,c_2 != rd %y,t_2 addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 umul a_2,b_2,t_1 !=!mul_add_c(a[2],b[2],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 != ld b_4_,b_4 umul a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 ld b_5_,b_5 umul a_0,b_4,t_1 !=!mul_add_c(a[0],b[4],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 != st c_2,[%i0+16] !r[4]=c2; umul a_0,b_5,t_1 !mul_add_c(a[0],b[5],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 != addxcc c_1,t_2,c_1 addx %g0,%g0,c_2 umul a_1,b_4,t_1 !mul_add_c(a[1],b[4],c3,c1,c2); addcc c_3,t_1,c_3 != rd %y,t_2 addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 umul a_2,b_3,t_1 !=!mul_add_c(a[2],b[3],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 != umul a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 != addx c_2,%g0,c_2 ld a_5_,a_5 umul a_4,b_1,t_1 !mul_add_c(a[4],b[1],c3,c1,c2); addcc c_3,t_1,c_3 != rd %y,t_2 addxcc c_1,t_2,c_1 ld a_6_,a_6 addx c_2,%g0,c_2 != umul a_5,b_0,t_1 !mul_add_c(a[5],b[0],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 != addx c_2,%g0,c_2 st c_3,[%i0+20] !r[5]=c3; umul a_6,b_0,t_1 !mul_add_c(a[6],b[0],c1,c2,c3); addcc c_1,t_1,c_1 != rd %y,t_2 addxcc c_2,t_2,c_2 addx %g0,%g0,c_3 umul a_5,b_1,t_1 !=!mul_add_c(a[5],b[1],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != umul a_4,b_2,t_1 !mul_add_c(a[4],b[2],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != addx c_3,%g0,c_3 umul a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 != addxcc c_2,t_2,c_2 addx
c_3,%g0,c_3 umul a_2,b_4,t_1 !mul_add_c(a[2],b[4],c1,c2,c3); addcc c_1,t_1,c_1 != rd %y,t_2 addxcc c_2,t_2,c_2 ld b_6_,b_6 addx c_3,%g0,c_3 != umul a_1,b_5,t_1 !mul_add_c(a[1],b[5],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != addx c_3,%g0,c_3 ld b_7_,b_7 umul a_0,b_6,t_1 !mul_add_c(a[0],b[6],c1,c2,c3); addcc c_1,t_1,c_1 != rd %y,t_2 addxcc c_2,t_2,c_2 st c_1,[%i0+24] !r[6]=c1; addx c_3,%g0,c_3 != umul a_0,b_7,t_1 !mul_add_c(a[0],b[7],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc c_3,t_2,c_3 != addx %g0,%g0,c_1 umul a_1,b_6,t_1 !mul_add_c(a[1],b[6],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 umul a_2,b_5,t_1 !mul_add_c(a[2],b[5],c2,c3,c1); addcc c_2,t_1,c_2 != rd %y,t_2 addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 umul a_3,b_4,t_1 !=!mul_add_c(a[3],b[4],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 != umul a_4,b_3,t_1 !mul_add_c(a[4],b[3],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc c_3,t_2,c_3 != addx c_1,%g0,c_1 umul a_5,b_2,t_1 !mul_add_c(a[5],b[2],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 ld a_7_,a_7 umul a_6,b_1,t_1 !=!mul_add_c(a[6],b[1],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 != umul a_7,b_0,t_1 !mul_add_c(a[7],b[0],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc c_3,t_2,c_3 != addx c_1,%g0,c_1 st c_2,[%i0+28] !r[7]=c2; umul a_7,b_1,t_1 !mul_add_c(a[7],b[1],c3,c1,c2); addcc c_3,t_1,c_3 != rd %y,t_2 addxcc c_1,t_2,c_1 addx %g0,%g0,c_2 umul a_6,b_2,t_1 !=!mul_add_c(a[6],b[2],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 != umul a_5,b_3,t_1 !mul_add_c(a[5],b[3],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 != addx c_2,%g0,c_2 umul a_4,b_4,t_1 !mul_add_c(a[4],b[4],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 != addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 umul a_3,b_5,t_1 !mul_add_c(a[3],b[5],c3,c1,c2); addcc c_3,t_1,c_3 != rd %y,t_2 addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 umul a_2,b_6,t_1
!=!mul_add_c(a[2],b[6],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 != umul a_1,b_7,t_1 !mul_add_c(a[1],b[7],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 != addx c_2,%g0,c_2 st c_3,[%i0+32] !r[8]=c3; umul a_2,b_7,t_1 !mul_add_c(a[2],b[7],c1,c2,c3); addcc c_1,t_1,c_1 != rd %y,t_2 addxcc c_2,t_2,c_2 addx %g0,%g0,c_3 umul a_3,b_6,t_1 !=!mul_add_c(a[3],b[6],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != umul a_4,b_5,t_1 !mul_add_c(a[4],b[5],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != addx c_3,%g0,c_3 umul a_5,b_4,t_1 !mul_add_c(a[5],b[4],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 != addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 umul a_6,b_3,t_1 !mul_add_c(a[6],b[3],c1,c2,c3); addcc c_1,t_1,c_1 != rd %y,t_2 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 umul a_7,b_2,t_1 !=!mul_add_c(a[7],b[2],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != st c_1,[%i0+36] !r[9]=c1; umul a_7,b_3,t_1 !mul_add_c(a[7],b[3],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx %g0,%g0,c_1 umul a_6,b_4,t_1 !mul_add_c(a[6],b[4],c2,c3,c1); addcc c_2,t_1,c_2 != rd %y,t_2 addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 umul a_5,b_5,t_1 !=!mul_add_c(a[5],b[5],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 != umul a_4,b_6,t_1 !mul_add_c(a[4],b[6],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc c_3,t_2,c_3 != addx c_1,%g0,c_1 umul a_3,b_7,t_1 !mul_add_c(a[3],b[7],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 st c_2,[%i0+40] !r[10]=c2; umul a_4,b_7,t_1 !=!mul_add_c(a[4],b[7],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 addx %g0,%g0,c_2 != umul a_5,b_6,t_1 !mul_add_c(a[5],b[6],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 != addx c_2,%g0,c_2 umul a_6,b_5,t_1 !mul_add_c(a[6],b[5],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 != addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 umul a_7,b_4,t_1 !mul_add_c(a[7],b[4],c3,c1,c2); addcc c_3,t_1,c_3 != rd
/*
 * Tail of bn_mul_comba8 (r[11]..r[15]; the lone "nop" before the final
 * two stores is scheduling filler), followed by the complete
 *   void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 * (4x4 product, same mul_add_c carry-chain pattern), and the head of
 * bn_sqr_comba8.  Same mail flattening as the rest of the attachment --
 * code tokens preserved verbatim, "!=" marks original group boundaries.
 * NOTE(review): bn_sqr_comba8 lacks the "void bn_sqr_comba8(r,a)"
 * prototype comment its sibling routines carry.
 */
%y,t_2 addxcc c_1,t_2,c_1 st c_3,[%i0+44] !r[11]=c3; addx c_2,%g0,c_2 != umul a_7,b_5,t_1 !mul_add_c(a[7],b[5],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != addx %g0,%g0,c_3 umul a_6,b_6,t_1 !mul_add_c(a[6],b[6],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 != addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 umul a_5,b_7,t_1 !mul_add_c(a[5],b[7],c1,c2,c3); addcc c_1,t_1,c_1 != rd %y,t_2 addxcc c_2,t_2,c_2 st c_1,[%i0+48] !r[12]=c1; addx c_3,%g0,c_3 != umul a_6,b_7,t_1 !mul_add_c(a[6],b[7],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc c_3,t_2,c_3 != addx %g0,%g0,c_1 umul a_7,b_6,t_1 !mul_add_c(a[7],b[6],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 st c_2,[%i0+52] !r[13]=c2; umul a_7,b_7,t_1 !=!mul_add_c(a[7],b[7],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 nop != st c_3,[%i0+56] !r[14]=c3; st c_1,[%i0+60] !r[15]=c1; ret restore %g0,%g0,%o0 .type bn_mul_comba8,2 .size bn_mul_comba8,(.-bn_mul_comba8) .align 32 .global bn_mul_comba4 /* * void bn_mul_comba4(r,a,b) * BN_ULONG *r,*a,*b; */ bn_mul_comba4: save %sp,FRAME_SIZE,%sp ld a_0_,a_0 ld b_0_,b_0 umul a_0,b_0,c_1 !=!mul_add_c(a[0],b[0],c1,c2,c3); ld b_1_,b_1 rd %y,c_2 st c_1,[%i0] !r[0]=c1; umul a_0,b_1,t_1 !=!mul_add_c(a[0],b[1],c2,c3,c1); ld a_1_,a_1 addcc c_2,t_1,c_2 rd %y,t_2 != addxcc %g0,t_2,c_3 addx %g0,%g0,c_1 ld a_2_,a_2 umul a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 != st c_2,[%i0+4] !r[1]=c2; umul a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 != addxcc c_1,t_2,c_1 addx %g0,%g0,c_2 ld b_2_,b_2 umul a_1,b_1,t_1 !=!mul_add_c(a[1],b[1],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 != ld b_3_,b_3 umul a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 != addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 st c_3,[%i0+8] !r[2]=c3; umul a_0,b_3,t_1 !=!mul_add_c(a[0],b[3],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 addx %g0,%g0,c_3
!= umul a_1,b_2,t_1 !mul_add_c(a[1],b[2],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != addx c_3,%g0,c_3 ld a_3_,a_3 umul a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3); addcc c_1,t_1,c_1 != rd %y,t_2 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 umul a_3,b_0,t_1 !=!mul_add_c(a[3],b[0],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != st c_1,[%i0+12] !r[3]=c1; umul a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx %g0,%g0,c_1 umul a_2,b_2,t_1 !mul_add_c(a[2],b[2],c2,c3,c1); addcc c_2,t_1,c_2 != rd %y,t_2 addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 umul a_1,b_3,t_1 !=!mul_add_c(a[1],b[3],c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 != st c_2,[%i0+16] !r[4]=c2; umul a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 != addxcc c_1,t_2,c_1 addx %g0,%g0,c_2 umul a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2); addcc c_3,t_1,c_3 != rd %y,t_2 addxcc c_1,t_2,c_1 st c_3,[%i0+20] !r[5]=c3; addx c_2,%g0,c_2 != umul a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != st c_1,[%i0+24] !r[6]=c1; st c_2,[%i0+28] !r[7]=c2; ret restore %g0,%g0,%o0 .type bn_mul_comba4,2 .size bn_mul_comba4,(.-bn_mul_comba4) .align 32 .global bn_sqr_comba8 bn_sqr_comba8: save %sp,FRAME_SIZE,%sp ld a_0_,a_0 ld a_1_,a_1 umul a_0,a_0,c_1 !=!sqr_add_c(a,0,c1,c2,c3); rd %y,c_2 st c_1,[%i0] !r[0]=c1; ld a_2_,a_2 umul a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc %g0,t_2,c_3 addx %g0,%g0,c_1 != addcc c_2,t_1,c_2 addxcc c_3,t_2,c_3 st c_2,[%i0+4] !r[1]=c2; addx c_1,%g0,c_1 != umul a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 != addx %g0,%g0,c_2 addcc c_3,t_1,c_3 addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 != ld a_3_,a_3 umul a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 != addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 st c_3,[%i0+8] !r[2]=c3; umul a_0,a_3,t_1
/*
 * Body and tail of the v8 bn_sqr_comba8 (r[3]..r[15]) and the complete v8
 *   void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
 * -- v8 copies of the v8plus squaring routines.  sqr_add_c folds a[i]^2
 * into the carry chain once; sqr_add_c2 folds a[i]*a[j] (i!=j) in twice
 * (the second addcc/addxcc/addx group reuses t_1/t_2 instead of doubling).
 * Mail flattening as elsewhere: code tokens preserved verbatim, "!=" are
 * the original instruction-group markers.
 *
 * NOTE(review): as in the v8plus copy, the sqr_add_c2(a,7,6,...) group
 * issues a second "rd %y,t_2" where every other doubled group reuses t_2;
 * %y is unchanged since the umul so the value read is identical -- confirm
 * intent.
 * NOTE(review): bn_sqr_comba4 performs "ld a_1_,a_1" twice; the second
 * load is redundant.
 */
!=!sqr_add_c2(a,3,0,c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 addx %g0,%g0,c_3 != addcc c_1,t_1,c_1 addxcc c_2,t_2,c_2 ld a_4_,a_4 addx c_3,%g0,c_3 != umul a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != addx c_3,%g0,c_3 addcc c_1,t_1,c_1 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != st c_1,[%i0+12] !r[3]=c1; umul a_4,a_0,t_1 !sqr_add_c2(a,4,0,c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx %g0,%g0,c_1 addcc c_2,t_1,c_2 addxcc c_3,t_2,c_3 != addx c_1,%g0,c_1 umul a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 addcc c_2,t_1,c_2 addxcc c_3,t_2,c_3 != addx c_1,%g0,c_1 ld a_5_,a_5 umul a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1); addcc c_2,t_1,c_2 != rd %y,t_2 addxcc c_3,t_2,c_3 st c_2,[%i0+16] !r[4]=c2; addx c_1,%g0,c_1 != umul a_0,a_5,t_1 !sqr_add_c2(a,5,0,c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 != addx %g0,%g0,c_2 addcc c_3,t_1,c_3 addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 != umul a_1,a_4,t_1 !sqr_add_c2(a,4,1,c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 != addx c_2,%g0,c_2 addcc c_3,t_1,c_3 addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 != ld a_6_,a_6 umul a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 != addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 addcc c_3,t_1,c_3 addxcc c_1,t_2,c_1 != addx c_2,%g0,c_2 st c_3,[%i0+20] !r[5]=c3; umul a_6,a_0,t_1 !sqr_add_c2(a,6,0,c1,c2,c3); addcc c_1,t_1,c_1 != rd %y,t_2 addxcc c_2,t_2,c_2 addx %g0,%g0,c_3 addcc c_1,t_1,c_1 != addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 umul a_5,a_1,t_1 !sqr_add_c2(a,5,1,c1,c2,c3); addcc c_1,t_1,c_1 != rd %y,t_2 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 addcc c_1,t_1,c_1 != addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 umul a_4,a_2,t_1 !sqr_add_c2(a,4,2,c1,c2,c3); addcc c_1,t_1,c_1 != rd %y,t_2 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 addcc c_1,t_1,c_1 != addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 ld a_7_,a_7 umul a_3,a_3,t_1 !=!sqr_add_c(a,3,c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2
addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != st c_1,[%i0+24] !r[6]=c1; umul a_0,a_7,t_1 !sqr_add_c2(a,7,0,c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx %g0,%g0,c_1 addcc c_2,t_1,c_2 addxcc c_3,t_2,c_3 != addx c_1,%g0,c_1 umul a_1,a_6,t_1 !sqr_add_c2(a,6,1,c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 addcc c_2,t_1,c_2 addxcc c_3,t_2,c_3 != addx c_1,%g0,c_1 umul a_2,a_5,t_1 !sqr_add_c2(a,5,2,c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 addcc c_2,t_1,c_2 addxcc c_3,t_2,c_3 != addx c_1,%g0,c_1 umul a_3,a_4,t_1 !sqr_add_c2(a,4,3,c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 addcc c_2,t_1,c_2 addxcc c_3,t_2,c_3 != addx c_1,%g0,c_1 st c_2,[%i0+28] !r[7]=c2; umul a_7,a_1,t_1 !sqr_add_c2(a,7,1,c3,c1,c2); addcc c_3,t_1,c_3 != rd %y,t_2 addxcc c_1,t_2,c_1 addx %g0,%g0,c_2 addcc c_3,t_1,c_3 != addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 umul a_6,a_2,t_1 !sqr_add_c2(a,6,2,c3,c1,c2); addcc c_3,t_1,c_3 != rd %y,t_2 addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 addcc c_3,t_1,c_3 != addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 umul a_5,a_3,t_1 !sqr_add_c2(a,5,3,c3,c1,c2); addcc c_3,t_1,c_3 != rd %y,t_2 addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 addcc c_3,t_1,c_3 != addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 umul a_4,a_4,t_1 !sqr_add_c(a,4,c3,c1,c2); addcc c_3,t_1,c_3 != rd %y,t_2 addxcc c_1,t_2,c_1 st c_3,[%i0+32] !r[8]=c3; addx c_2,%g0,c_2 != umul a_2,a_7,t_1 !sqr_add_c2(a,7,2,c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != addx %g0,%g0,c_3 addcc c_1,t_1,c_1 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != umul a_3,a_6,t_1 !sqr_add_c2(a,6,3,c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != addx c_3,%g0,c_3 addcc c_1,t_1,c_1 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != umul a_4,a_5,t_1 !sqr_add_c2(a,5,4,c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != addx c_3,%g0,c_3 addcc c_1,t_1,c_1 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != st c_1,[%i0+36] !r[9]=c1; umul a_7,a_3,t_1
!sqr_add_c2(a,7,3,c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx %g0,%g0,c_1 addcc c_2,t_1,c_2 addxcc c_3,t_2,c_3 != addx c_1,%g0,c_1 umul a_6,a_4,t_1 !sqr_add_c2(a,6,4,c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 addcc c_2,t_1,c_2 addxcc c_3,t_2,c_3 != addx c_1,%g0,c_1 umul a_5,a_5,t_1 !sqr_add_c(a,5,c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 st c_2,[%i0+40] !r[10]=c2; umul a_4,a_7,t_1 !=!sqr_add_c2(a,7,4,c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 addx %g0,%g0,c_2 != addcc c_3,t_1,c_3 addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 umul a_5,a_6,t_1 !=!sqr_add_c2(a,6,5,c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 addx c_2,%g0,c_2 != addcc c_3,t_1,c_3 addxcc c_1,t_2,c_1 st c_3,[%i0+44] !r[11]=c3; addx c_2,%g0,c_2 != umul a_7,a_5,t_1 !sqr_add_c2(a,7,5,c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != addx %g0,%g0,c_3 addcc c_1,t_1,c_1 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != umul a_6,a_6,t_1 !sqr_add_c(a,6,c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != addx c_3,%g0,c_3 st c_1,[%i0+48] !r[12]=c1; umul a_6,a_7,t_1 !sqr_add_c2(a,7,6,c2,c3,c1); addcc c_2,t_1,c_2 != rd %y,t_2 addxcc c_3,t_2,c_3 addx %g0,%g0,c_1 addcc c_2,t_1,c_2 != rd %y,t_2 addxcc c_3,t_2,c_3 st c_2,[%i0+52] !r[13]=c2; addx c_1,%g0,c_1 != umul a_7,a_7,t_1 !sqr_add_c(a,7,c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 != st c_3,[%i0+56] !r[14]=c3; st c_1,[%i0+60] !r[15]=c1; ret restore %g0,%g0,%o0 .type bn_sqr_comba8,2 .size bn_sqr_comba8,(.-bn_sqr_comba8) .align 32 .global bn_sqr_comba4 /* * void bn_sqr_comba4(r,a) * BN_ULONG *r,*a; */ bn_sqr_comba4: save %sp,FRAME_SIZE,%sp ld a_0_,a_0 umul a_0,a_0,c_1 !sqr_add_c(a,0,c1,c2,c3); ld a_1_,a_1 != rd %y,c_2 st c_1,[%i0] !r[0]=c1; ld a_1_,a_1 umul a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 addxcc %g0,t_2,c_3 addx %g0,%g0,c_1 != ld a_2_,a_2 addcc c_2,t_1,c_2 addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 != st
c_2,[%i0+4] !r[1]=c2; umul a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 != addxcc c_1,t_2,c_1 addx %g0,%g0,c_2 addcc c_3,t_1,c_3 addxcc c_1,t_2,c_1 != addx c_2,%g0,c_2 ld a_3_,a_3 umul a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2); addcc c_3,t_1,c_3 != rd %y,t_2 addxcc c_1,t_2,c_1 st c_3,[%i0+8] !r[2]=c3; addx c_2,%g0,c_2 != umul a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != addx %g0,%g0,c_3 addcc c_1,t_1,c_1 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != umul a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != addx c_3,%g0,c_3 addcc c_1,t_1,c_1 addxcc c_2,t_2,c_2 addx c_3,%g0,c_3 != st c_1,[%i0+12] !r[3]=c1; umul a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx %g0,%g0,c_1 addcc c_2,t_1,c_2 addxcc c_3,t_2,c_3 != addx c_1,%g0,c_1 umul a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1); addcc c_2,t_1,c_2 rd %y,t_2 != addxcc c_3,t_2,c_3 addx c_1,%g0,c_1 st c_2,[%i0+16] !r[4]=c2; umul a_2,a_3,t_1 !=!sqr_add_c2(a,3,2,c3,c1,c2); addcc c_3,t_1,c_3 rd %y,t_2 addxcc c_1,t_2,c_1 addx %g0,%g0,c_2 != addcc c_3,t_1,c_3 addxcc c_1,t_2,c_1 st c_3,[%i0+20] !r[5]=c3; addx c_2,%g0,c_2 != umul a_3,a_3,t_1 !sqr_add_c(a,3,c1,c2,c3); addcc c_1,t_1,c_1 rd %y,t_2 addxcc c_2,t_2,c_2 != st c_1,[%i0+24] !r[6]=c1; st c_2,[%i0+28] !r[7]=c2; ret restore %g0,%g0,%o0 .type bn_sqr_comba4,2 .size bn_sqr_comba4,(.-bn_sqr_comba4) .align 32
