Hi again!

> Bottom line. Expect version 1.1 implementation after this weekend:-) And
> OK, I can cut-n-paste together v8 version as well if you want me to...
Yeah, it's "after this weekend" now... Find two attached files. One is
SPARC v9 and another one is SPARC v8 implementations. Even though I
intended to cut-n-paste v8 version together from compiler assembler
output and my "v9" bn_*_comba[48], I hand-coded the whole thing after
all. Disappointingly enough v8 got only 10% faster:-( I kind of hoped it
would turn out more profitable... V9 in turn runs 30-35% faster now (was
25-30%). Well, that's about what I actually expected from new unrolling
method... In either case see comments in v9 source code for details.

And again. Anybody feels like discussing following:
> ... I don't feel comfortable with
> bn_div_words. It looks to me that those functions invoking bn_div_words
> would benefit more if *larger* portions of loop bodies surrounding the
> call are implemented in assembler.
Well, of course provided that bn_div_words is effectively folded into
one instruction...
> Any opposite opinions?

Cheers. Andy.
.ident  "bn_asm.sparc.v8plus.S, Version 1.1"
.ident  "SPARC v9 ISA artwork by Andy Polyakov <[EMAIL PROTECTED]>"

/*
 * ====================================================================
 * Copyright (c) 1999 Andy Polyakov <[EMAIL PROTECTED]>.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted as long as above copyright notices are retained. Warranty
 * of any kind is (of course:-) disclaimed.
 * ====================================================================
 */

/*
 * This is my modest contribution to OpenSSL project (see
 * http://www.openssl.org/ for more information about it) and is
 * a drop-in UltraSPARC ISA replacement for crypto/bn/bn_asm.c
 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
 *
 * Questions-n-answers.
 *
 * Q. How to compile?
 * A. With SC4.x:
 *
 *      cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *
 *    and with gcc:
 *
 *      gcc -Wa,-xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *
 *    Quick-n-dirty way to fuse the module into the library.
 *    Provided that the library is already configured and built
 *    (in 0.9.2 case with no_asm option):
 *
 *      # cd crypto/bn
 *      # cp /some/place/bn_asm.sparc.v8plus.S .
 *      # cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *      # make
 *      # cd ../..
 *      # make; make test
 *
 *    Quick-n-dirty way to get rid of it:
 *
 *      # cd crypto/bn
 *      # touch bn_asm.c
 *      # make
 *      # cd ../..
 *      # make; make test
 *
 * Q. Why just UltraSPARC? What about SuperSPARC?
 * A. Original release did target UltraSPARC only. Now SuperSPARC
 *    version is provided along. Both versions share bn_*comba[48]
 *    implementations (see comment later in code for explanation).
 *    But what's so special about this particular implementation?
 *    Why didn't I let compiler do the job? Trouble is that none of
 *    available compilers (not even just introduced SC5.0) attempts
 *    to take advantage of 64-bit registers under 32-bit kernels
 *    even though it's perfectly possible (see next question).
 *
 * Q. 64-bit registers under 32-bit kernels? Does it work?
 * A. You can't use *all* registers as 64-bit wide as not all of 'em
 *    are preserved during context switch:-( It's only %o0-%o5 and
 *    %g1-%g4 you may rely upon *and* only in leaf functions, i.e. such
 *    never calling any other functions. All functions in this module
 *    are leaf and 10 registers is a handful. As a matter of fact
 *    non-"comba" routines don't require even that much and I could even
 *    afford not to allocate own stack frame for 'em:-)
 *
 * Q. What about 64-bit kernels?
 * A. What about 'em? Just kidding:-) I unfortunately didn't have a
 *    chance to test it yet, but the below code is 64-bit safe and you
 *    shouldn't have any problem with it. What I probably am saying
 *    here is that I appreciate feedback on the matter... And yes,
 *    you have to feed compiler with -xarch=v9 command line option
 *    instead of -xarch=v8plus.
 *
 * Q. What about sharable libraries?
 * A. What about 'em? Kidding again:-) Code does *not* contain any
 *    code position dependencies and it's safe to include it into
 *    sharable library as is.
 *
 * Q. How much faster does it get?
 * A. Do you have a good benchmark? In either case I experience 30-35%
 *    improvement on UltraSPARC-1 with crypto/bn/expspeed.c test
 *    program. See revision history for details.
 *
 */

/*
 * Revision history.
 *
 * 1.0  - initial release
 *      - 25-30% performance boost(*)
 * 1.1  - new loop unrolling model(**)
 *      - some more fine tuning
 *      - 30-35% performance boost(*)
 *
 * (*)  bn_asm.c from OpenSSL 0.9.2b compiled with SC4.2,
 *      -xarch=v8 -xstrconst -xO5 -xdepend flags was used as a
 *      reference
 * (**) Originally unrolled loop looked like this:
 *          for (;;) {
 *              op(p+0); if (--n==0) break;
 *              op(p+1); if (--n==0) break;
 *              op(p+2); if (--n==0) break;
 *              op(p+3); if (--n==0) break;
 *              p+=4;
 *          }
 *      I unroll according to following:
 *          while (n&~3) {
 *              op(p+0); op(p+1); op(p+2); op(p+3);
 *              p+=4; n-=4;
 *          }
 *          if (n) {
 *              op(p+0); if (--n==0) return;
 *              op(p+1); if (--n==0) return;
 *              op(p+2); return;
 *          }
 */

.section        ".text",#alloc,#execinstr
.file           "bn_asm.sparc.v8plus.S"

.align  32

.global bn_mul_add_words
/*
 * BN_ULONG bn_mul_add_words(rp,ap,num,w)
 * BN_ULONG *rp,*ap;
 * int num;
 * BN_ULONG w;
 *
 * For every i in [0,num): rp[i] += ap[i]*w, where the upper 32 bits of
 * each 64-bit intermediate are carried into the next word. Returns the
 * final carry word, or 0 immediately when num <= 0.
 *
 * Leaf routine (no save/restore, returns with retl):
 *   %o0      rp, advanced by 16 bytes per unrolled iteration
 *   %o1      ap, ditto
 *   %o2      num, remaining word count
 *   %o3      w, zero-extended to 64 bits
 *   %o4      64-bit sum rp[i] + carry + ap[i]*w; cannot overflow since
 *            (2^32-1)^2 + 2*(2^32-1) = 2^64-1
 *   %o5      carry word (upper half of the previous sum)
 *   %g2,%g3  ap[] words, then their 64-bit products with w
 */
bn_mul_add_words:
        sra     %o2,%g0,%o2     ! sign-extend num: the register branches below test all 64 bits
        brgz,a  %o2,.L_bn_mul_add_words_proceed
        lduw    [%o1],%g2       ! annulled unless taken: %g2=ap[0]
        retl
        clr     %o0             ! num<=0, return 0
        nop                     ! never executed: 7 words of padding which,
        nop                     ! together with the sra above, move all the
        nop                     ! code below by exactly 32 bytes so that the
        nop                     ! aligned labels stay aligned
        nop
        nop
        nop

.L_bn_mul_add_words_proceed:
        clruw   %o3             ! upper half of w is not to be trusted
        andcc   %o2,-4,%g0
        bz,pn   %icc,.L_bn_mul_add_words_tail   ! fewer than 4 words
        clr     %o5             ! carry=0

.L_bn_mul_add_words_loop:       ! wow! 32 aligned!
        lduw    [%o0],%o4
        mulx    %o3,%g2,%g2
        add     %o4,%o5,%o4
        add     %o4,%g2,%o4
        lduw    [%o1+4],%g3
        nop
        stuw    %o4,[%o0]
        srlx    %o4,32,%o5

        lduw    [%o0+4],%o4
        mulx    %o3,%g3,%g3
        dec     4,%o2
        add     %o4,%o5,%o4
        lduw    [%o1+8],%g2
        add     %o4,%g3,%o4
        stuw    %o4,[%o0+4]
        srlx    %o4,32,%o5

        lduw    [%o0+8],%o4
        mulx    %o3,%g2,%g2
        inc     16,%o1          ! ap+=4, hence the negative offset below
        add     %o4,%o5,%o4
        lduw    [%o1-4],%g3
        add     %o4,%g2,%o4
        stuw    %o4,[%o0+8]
        srlx    %o4,32,%o5

        lduw    [%o0+12],%o4
        mulx    %o3,%g3,%g3
        add     %o4,%o5,%o4
        inc     16,%o0          ! rp+=4, hence the negative offset below
        add     %o4,%g3,%o4
        srlx    %o4,32,%o5
        stuw    %o4,[%o0-4]
        andcc   %o2,-4,%g0      ! at least 4 words left?
        bnz,a,pt        %icc,.L_bn_mul_add_words_loop
        lduw    [%o1],%g2       ! preload next ap[0]

        brnz,a,pn       %o2,.L_bn_mul_add_words_tail
        lduw    [%o1],%g2       ! 1 to 3 words left, preload ap[0]
.L_bn_mul_add_words_return:
        retl
        mov     %o5,%o0         ! return carry

.L_bn_mul_add_words_tail:       ! entered with %g2=ap[0] and 1<=num<=3
        lduw    [%o0],%o4
        mulx    %o3,%g2,%g2
        add     %o4,%o5,%o4
        dec     %o2
        add     %o4,%g2,%o4
        srlx    %o4,32,%o5
        brz,pt  %o2,.L_bn_mul_add_words_return
        stuw    %o4,[%o0]

        lduw    [%o1+4],%g2
        mulx    %o3,%g2,%g2
        lduw    [%o0+4],%o4
        add     %o4,%o5,%o4
        dec     %o2
        add     %o4,%g2,%o4
        srlx    %o4,32,%o5
        brz,pt  %o2,.L_bn_mul_add_words_return
        stuw    %o4,[%o0+4]

        lduw    [%o1+8],%g2     ! third and last tail word
        mulx    %o3,%g2,%g2
        lduw    [%o0+8],%o4
        add     %o4,%o5,%o4
        add     %o4,%g2,%o4
        stuw    %o4,[%o0+8]
        retl
        srlx    %o4,32,%o0      ! return carry

.type   bn_mul_add_words,2
.size   bn_mul_add_words,(.-bn_mul_add_words)

.align  32

.global bn_mul_words
/*
 * BN_ULONG bn_mul_words(rp,ap,num,w)
 * BN_ULONG *rp,*ap;
 * int num;
 * BN_ULONG w;
 *
 * For every i in [0,num): rp[i] = low 32 bits of (ap[i]*w + carry),
 * the upper 32 bits becoming the carry into the next word. Returns the
 * final carry word, or 0 immediately when num <= 0.
 *
 * Leaf routine (no save/restore, returns with retl):
 *   %o0      rp, advanced by 16 bytes per unrolled iteration
 *   %o1      ap, ditto
 *   %o2      num, remaining word count
 *   %o3      w, zero-extended to 64 bits
 *   %o5      carry word
 *   %g2,%g3  ap[] words, then 64-bit ap[i]*w + carry
 */
bn_mul_words:
        sra     %o2,%g0,%o2     ! sign-extend num: the register branches below test all 64 bits
        brgz,a  %o2,.L_bn_mul_words_proceed
        lduw    [%o1],%g2       ! annulled unless taken: %g2=ap[0]
        retl
        clr     %o0             ! num<=0, return 0
        nop                     ! never executed: 7 words of padding which,
        nop                     ! together with the sra above, move all the
        nop                     ! code below by exactly 32 bytes so that the
        nop                     ! aligned labels stay aligned
        nop
        nop
        nop

.L_bn_mul_words_proceed:
        clruw   %o3             ! upper half of w is not to be trusted
        andcc   %o2,-4,%g0
        bz,pn   %icc,.L_bn_mul_words_tail       ! fewer than 4 words
        clr     %o5             ! carry=0

.L_bn_mul_words_loop:           ! wow! 32 aligned!
        lduw    [%o1+4],%g3
        mulx    %o3,%g2,%g2
        add     %g2,%o5,%g2
        nop
        srlx    %g2,32,%o5
        stuw    %g2,[%o0]

        lduw    [%o1+8],%g2
        mulx    %o3,%g3,%g3
        add     %g3,%o5,%g3
        dec     4,%o2
        stuw    %g3,[%o0+4]
        srlx    %g3,32,%o5

        lduw    [%o1+12],%g3
        mulx    %o3,%g2,%g2
        add     %g2,%o5,%g2
        inc     16,%o1          ! ap+=4
        stuw    %g2,[%o0+8]
        srlx    %g2,32,%o5

        mulx    %o3,%g3,%g3
        inc     16,%o0          ! rp+=4, hence the negative offset below
        add     %g3,%o5,%g3
        nop
        stuw    %g3,[%o0-4]
        srlx    %g3,32,%o5
        andcc   %o2,-4,%g0      ! at least 4 words left?
        bnz,a,pt        %icc,.L_bn_mul_words_loop
        lduw    [%o1],%g2       ! preload next ap[0]
        nop

        brnz,a,pn       %o2,.L_bn_mul_words_tail
        lduw    [%o1],%g2       ! 1 to 3 words left, preload ap[0]
.L_bn_mul_words_return:
        retl
        mov     %o5,%o0         ! return carry

.L_bn_mul_words_tail:           ! entered with %g2=ap[0] and 1<=num<=3
        mulx    %o3,%g2,%g2
        add     %g2,%o5,%g2
        dec     %o2
        srlx    %g2,32,%o5
        brz,pt  %o2,.L_bn_mul_words_return
        stuw    %g2,[%o0]

        lduw    [%o1+4],%g2
        mulx    %o3,%g2,%g2
        add     %g2,%o5,%g2
        dec     %o2
        srlx    %g2,32,%o5
        brz,pt  %o2,.L_bn_mul_words_return
        stuw    %g2,[%o0+4]

        lduw    [%o1+8],%g2     ! third and last tail word
        mulx    %o3,%g2,%g2
        add     %g2,%o5,%g2
        stuw    %g2,[%o0+8]
        retl
        srlx    %g2,32,%o0      ! return carry

.type   bn_mul_words,2
.size   bn_mul_words,(.-bn_mul_words)

.align  32
.global bn_sqr_words
/*
 * void bn_sqr_words(r,a,n)
 * BN_ULONG *r,*a;
 * int n;
 *
 * For every i in [0,n): stores the 64-bit square of a[i] as two words,
 * r[2*i] = low half, r[2*i+1] = high half. Does nothing when n <= 0.
 * %o0 is cleared on every return path even though the routine is void.
 *
 * Leaf routine (no save/restore, returns with retl):
 *   %o0      r, advanced by 32 bytes per unrolled iteration
 *   %o1      a, advanced by 16 bytes per unrolled iteration
 *   %o2      n, remaining word count
 *   %g2,%g3  a[] words
 *   %o4      64-bit square
 *   %o5      upper half of the square
 */
bn_sqr_words:
        sra     %o2,%g0,%o2     ! sign-extend n: the register branches below test all 64 bits
        brgz,a  %o2,.L_bn_sqr_words_proceed
        lduw    [%o1],%g2       ! annulled unless taken: %g2=a[0]
        retl
        clr     %o0             ! n<=0, nothing to do
        nop                     ! never executed: 7 words of padding which,
        nop                     ! together with the sra above, move all the
        nop                     ! code below by exactly 32 bytes so that the
        nop                     ! aligned labels stay aligned
        nop
        nop
        nop

.L_bn_sqr_words_proceed:
        andcc   %o2,-4,%g0
        nop
        bz,pn   %icc,.L_bn_sqr_words_tail       ! fewer than 4 words
        nop

.L_bn_sqr_words_loop:           ! wow! 32 aligned!
        lduw    [%o1+4],%g3
        mulx    %g2,%g2,%o4
        stuw    %o4,[%o0]
        srlx    %o4,32,%o5
        stuw    %o5,[%o0+4]
        nop

        lduw    [%o1+8],%g2
        mulx    %g3,%g3,%o4
        dec     4,%o2
        stuw    %o4,[%o0+8]
        srlx    %o4,32,%o5
        nop
        stuw    %o5,[%o0+12]

        lduw    [%o1+12],%g3
        mulx    %g2,%g2,%o4
        srlx    %o4,32,%o5
        stuw    %o4,[%o0+16]
        inc     16,%o1          ! a+=4
        stuw    %o5,[%o0+20]

        mulx    %g3,%g3,%o4
        inc     32,%o0          ! r+=8, hence the negative offsets below
        stuw    %o4,[%o0-8]
        srlx    %o4,32,%o5
        andcc   %o2,-4,%g0      ! only the flags matter; result is discarded as in the sibling routines
        stuw    %o5,[%o0-4]
        bnz,a,pt        %icc,.L_bn_sqr_words_loop
        lduw    [%o1],%g2       ! preload next a[0]
        nop

        brnz,a,pn       %o2,.L_bn_sqr_words_tail
        lduw    [%o1],%g2       ! 1 to 3 words left, preload a[0]
.L_bn_sqr_words_return:
        retl
        clr     %o0

.L_bn_sqr_words_tail:           ! entered with %g2=a[0] and 1<=n<=3
        mulx    %g2,%g2,%o4
        dec     %o2
        stuw    %o4,[%o0]
        srlx    %o4,32,%o5
        brz,pt  %o2,.L_bn_sqr_words_return
        stuw    %o5,[%o0+4]

        lduw    [%o1+4],%g2
        mulx    %g2,%g2,%o4
        dec     %o2
        stuw    %o4,[%o0+8]
        srlx    %o4,32,%o5
        brz,pt  %o2,.L_bn_sqr_words_return
        stuw    %o5,[%o0+12]

        lduw    [%o1+8],%g2     ! third and last tail word
        mulx    %g2,%g2,%o4
        srlx    %o4,32,%o5
        stuw    %o4,[%o0+16]
        stuw    %o5,[%o0+20]
        retl
        clr     %o0

.type   bn_sqr_words,2
.size   bn_sqr_words,(.-bn_sqr_words)

.align  32
.global bn_div_words
/*
 * BN_ULONG bn_div_words(h,l,d)
 * BN_ULONG h,l,d;
 *
 * Returns the low 32 bits of the 64-bit quotient ((h<<32)|l)/d.
 * Leaf routine working entirely in %o0-%o2.
 *
 * l and d are zero-extended first, the same way w is treated by the
 * multiplication routines in this module: the upper halves of incoming
 * registers cannot be relied upon, and stray bits there would end up
 * in the 64-bit dividend or divisor. h needs no such treatment as sllx
 * shifts its upper half out.
 *
 * NOTE(review): d==0 is not checked here; per the SPARC V9 manual udivx
 * raises a division_by_zero trap in that case - confirm callers never
 * pass 0.
 */
bn_div_words:
        sllx    %o0,32,%o0      ! h to the upper half, old upper half is gone
        clruw   %o1             ! l must not leak into the upper half
        or      %o0,%o1,%o0     ! 64-bit dividend h:l
        clruw   %o2             ! 32-bit divisor
        udivx   %o0,%o2,%o0
        retl
        clruw   %o0             ! truncate quotient to 32 bits

.type   bn_div_words,2
.size   bn_div_words,(.-bn_div_words)

.align  32

.global bn_add_words
/*
 * BN_ULONG bn_add_words(rp,ap,bp,n)
 * BN_ULONG *rp,*ap,*bp;
 * int n;
 *
 * For every i in [0,n): rp[i] = ap[i] + bp[i] + carry, the carry being
 * kept in the 32-bit carry flag (%icc) between words. Returns the final
 * carry (0 or 1), or 0 immediately when n <= 0.
 *
 * Nothing between two consecutive additions may modify the condition
 * codes, which is why the counter is examined with and/brnz/brz on
 * registers rather than with cc-setting instructions.
 *
 * Leaf routine (no save/restore, returns with retl):
 *   %o0  rp, advanced by 16 bytes per unrolled iteration
 *   %o1  ap, ditto
 *   %o2  bp, ditto
 *   %o3  n, remaining word count
 *   %o4  ap[] word
 *   %o5  bp[] word, then the sum
 *   %g1  scratch for the "4 or more words left" test
 */
bn_add_words:
        sra     %o3,%g0,%o3     ! sign-extend n: the register branches below test all 64 bits
        brgz,a  %o3,.L_bn_add_words_proceed
        lduw    [%o1],%o4       ! annulled unless taken: %o4=ap[0]
        retl
        clr     %o0             ! n<=0, return 0
        nop                     ! never executed: 7 words of padding which,
        nop                     ! together with the sra above, move all the
        nop                     ! code below by exactly 32 bytes so that the
        nop                     ! aligned labels stay aligned
        nop
        nop
        nop

.L_bn_add_words_proceed:
        andcc   %o3,-4,%g0
        bz,pn   %icc,.L_bn_add_words_tail       ! fewer than 4 words
        addcc   %g0,0,%g0       ! clear carry flag
        nop
        lduw    [%o2],%o5
        dec     4,%o3
        addcc   %o5,%o4,%o5     ! very first word, no carry in
        nop
        stuw    %o5,[%o0]
        ba      .L_bn_add_words_warm_loop
        lduw    [%o1+4],%o4
        nop

.L_bn_add_words_loop:           ! wow! 32 aligned!
        dec     4,%o3
        lduw    [%o2],%o5
        nop
        addccc  %o5,%o4,%o5
        stuw    %o5,[%o0]

        lduw    [%o1+4],%o4
.L_bn_add_words_warm_loop:
        inc     16,%o1          ! ap+=4, hence the negative offsets below
        nop
        lduw    [%o2+4],%o5
        addccc  %o5,%o4,%o5
        stuw    %o5,[%o0+4]
        nop

        lduw    [%o1-8],%o4
        inc     16,%o2          ! bp+=4
        lduw    [%o2-8],%o5
        addccc  %o5,%o4,%o5
        stuw    %o5,[%o0+8]

        lduw    [%o1-4],%o4
        inc     16,%o0          ! rp+=4
        nop
        lduw    [%o2-4],%o5
        addccc  %o5,%o4,%o5
        stuw    %o5,[%o0-4]
        and     %o3,-4,%g1      ! plain and: carry flag must survive
        brnz,a,pt       %g1,.L_bn_add_words_loop
        lduw    [%o1],%o4       ! preload next ap[0]

        brnz,a,pn       %o3,.L_bn_add_words_tail
        lduw    [%o1],%o4       ! 1 to 3 words left, preload ap[0]
.L_bn_add_words_return:
        clr     %o0
        retl
        movcs   %icc,1,%o0      ! return the carry flag as 0 or 1
        nop

.L_bn_add_words_tail:           ! wow! 32 aligned!
        lduw    [%o2],%o5
        dec     %o3
        addccc  %o5,%o4,%o5
        brz,pt  %o3,.L_bn_add_words_return
        stuw    %o5,[%o0]

        lduw    [%o1+4],%o4
        lduw    [%o2+4],%o5
        addccc  %o5,%o4,%o5
        dec     %o3
        brz,pt  %o3,.L_bn_add_words_return
        stuw    %o5,[%o0+4]
        nop

        lduw    [%o1+8],%o4     ! third and last tail word
        lduw    [%o2+8],%o5
        addccc  %o5,%o4,%o5
        nop
        stuw    %o5,[%o0+8]
        clr     %o0
        retl
        movcs   %icc,1,%o0      ! return the carry flag as 0 or 1

.type   bn_add_words,2
.size   bn_add_words,(.-bn_add_words)

.align  32

.global bn_sub_words
/*
 * BN_ULONG bn_sub_words(rp,ap,bp,n)
 * BN_ULONG *rp,*ap,*bp;
 * int n;
 *
 * For every i in [0,n): rp[i] = ap[i] - bp[i] - borrow, the borrow
 * being kept in the 32-bit carry flag (%icc) between words. Returns the
 * final borrow (0 or 1), or 0 immediately when n <= 0.
 *
 * Nothing between two consecutive subtractions may modify the condition
 * codes, which is why the counter is examined with and/brnz/brz on
 * registers rather than with cc-setting instructions.
 *
 * Leaf routine (no save/restore, returns with retl):
 *   %o0  rp, advanced by 16 bytes per unrolled iteration
 *   %o1  ap, ditto
 *   %o2  bp, ditto
 *   %o3  n, remaining word count
 *   %o4  ap[] word
 *   %o5  bp[] word, then the difference
 *   %g1  scratch for the "4 or more words left" test
 *
 * The .align above is required for the alignment remarks on the labels
 * below to hold, as every other routine in this module is 32-byte
 * aligned as well.
 */
bn_sub_words:
        sra     %o3,%g0,%o3     ! sign-extend n: the register branches below test all 64 bits
        brgz,a  %o3,.L_bn_sub_words_proceed
        lduw    [%o1],%o4       ! annulled unless taken: %o4=ap[0]
        retl
        clr     %o0             ! n<=0, return 0
        nop                     ! never executed: 7 words of padding which,
        nop                     ! together with the sra above, place the
        nop                     ! labels marked as aligned below on 32-byte
        nop                     ! boundaries
        nop
        nop
        nop

.L_bn_sub_words_proceed:
        andcc   %o3,-4,%g0
        bz,pn   %icc,.L_bn_sub_words_tail       ! fewer than 4 words
        addcc   %g0,0,%g0       ! clear carry flag
        nop
        lduw    [%o2],%o5
        dec     4,%o3
        subcc   %o4,%o5,%o5     ! very first word, no borrow in
        nop
        stuw    %o5,[%o0]
        ba      .L_bn_sub_words_warm_loop
        lduw    [%o1+4],%o4
        nop

.L_bn_sub_words_loop:           ! wow! 32 aligned!
        dec     4,%o3
        lduw    [%o2],%o5
        nop
        subccc  %o4,%o5,%o5
        stuw    %o5,[%o0]

        lduw    [%o1+4],%o4
.L_bn_sub_words_warm_loop:
        inc     16,%o1          ! ap+=4, hence the negative offsets below
        nop
        lduw    [%o2+4],%o5
        subccc  %o4,%o5,%o5
        stuw    %o5,[%o0+4]
        nop

        lduw    [%o1-8],%o4
        inc     16,%o2          ! bp+=4
        lduw    [%o2-8],%o5
        subccc  %o4,%o5,%o5
        stuw    %o5,[%o0+8]

        lduw    [%o1-4],%o4
        inc     16,%o0          ! rp+=4
        nop
        lduw    [%o2-4],%o5
        subccc  %o4,%o5,%o5
        stuw    %o5,[%o0-4]
        and     %o3,-4,%g1      ! plain and: carry flag must survive
        brnz,a,pt       %g1,.L_bn_sub_words_loop
        lduw    [%o1],%o4       ! preload next ap[0]

        brnz,a,pn       %o3,.L_bn_sub_words_tail
        lduw    [%o1],%o4       ! 1 to 3 words left, preload ap[0]
.L_bn_sub_words_return:
        clr     %o0
        retl
        movcs   %icc,1,%o0      ! return the borrow as 0 or 1
        nop

.L_bn_sub_words_tail:           ! wow! 32 aligned!
        lduw    [%o2],%o5
        dec     %o3
        subccc  %o4,%o5,%o5
        brz,pt  %o3,.L_bn_sub_words_return
        stuw    %o5,[%o0]

        lduw    [%o1+4],%o4
        lduw    [%o2+4],%o5
        subccc  %o4,%o5,%o5
        dec     %o3
        brz,pt  %o3,.L_bn_sub_words_return
        stuw    %o5,[%o0+4]
        nop

        lduw    [%o1+8],%o4     ! third and last tail word
        lduw    [%o2+8],%o5
        subccc  %o4,%o5,%o5
        nop
        stuw    %o5,[%o0+8]
        clr     %o0
        retl
        movcs   %icc,1,%o0      ! return the borrow as 0 or 1

.type   bn_sub_words,2
.size   bn_sub_words,(.-bn_sub_words)

/*
 * Following code is pure SPARC V8! Trouble is that it's not feasible
 * to implement the mumbo-jumbo in less "V9" instructions:-( At least not
 * under 32-bit kernel. The reason is that you'd have to shuffle registers
 * all the time as only few (well, 10:-) are fully (i.e. all 64 bits)
 * preserved by kernel during context switch. But even under 64-bit kernel
 * you won't gain much because in the lack of "add with extended carry"
 * instruction you'd have to issue 'clr %rx; movcs %xcc,1,%rx;
 * add %rd,%rx,%rd' sequence in place of 'addxcc %rx,%ry,%rx;
 * addx %rz,%g0,%rz' pair in 32-bit case.
 *
 *                                                      Andy.
 */

/*
 * Basically the only difference between 32-bit and 64-bit versions
 * is size of minimal stack frame that subroutine should allocate.
 */
/*
 * FRAME_SIZE is negative because it is used directly as the increment
 * in 'save %sp,FRAME_SIZE,%sp'.
 */
#ifdef __sparcv9
#define FRAME_SIZE      -192
#else
#define FRAME_SIZE      -96
#endif

/*
 * Here is register usage map for *all* routines below.
 *
 * Every input word has two names: X_n is the register the word is kept
 * in and X_n_ is the memory operand it is loaded from, e.g.
 * 'ld a_0_,a_0'. The a words are addressed off %i1 and the b words off
 * %i2, i.e. the second and third arguments as seen after 'save'.
 *
 *   a_0..a_7   %l0-%l7
 *   b_0..b_3   %g1-%g4
 *   b_4..b_6   %i3-%i5
 *   b_7        %o5
 *   c_1..c_3   %o2-%o4, the three rotating accumulator words
 *   t_1,t_2    %o0,%o1, receive the umul result and the contents of %y
 */
#define a_0     %l0
#define a_0_    [%i1]
#define a_1     %l1
#define a_1_    [%i1+4]
#define a_2     %l2
#define a_2_    [%i1+8]
#define a_3     %l3
#define a_3_    [%i1+12]
#define a_4     %l4
#define a_4_    [%i1+16]
#define a_5     %l5
#define a_5_    [%i1+20]
#define a_6     %l6
#define a_6_    [%i1+24]
#define a_7     %l7
#define a_7_    [%i1+28]
#define b_0     %g1
#define b_0_    [%i2]
#define b_1     %g2
#define b_1_    [%i2+4]
#define b_2     %g3
#define b_2_    [%i2+8]
#define b_3     %g4
#define b_3_    [%i2+12]
#define b_4     %i3
#define b_4_    [%i2+16]
#define b_5     %i4
#define b_5_    [%i2+20]
#define b_6     %i5
#define b_6_    [%i2+24]
#define b_7     %o5
#define b_7_    [%i2+28]
#define c_1     %o2
#define c_2     %o3
#define c_3     %o4
#define t_1     %o0
#define t_2     %o1
.align  32
.global bn_mul_comba8
/*
 * void bn_mul_comba8(r,a,b)
 * BN_ULONG *r,*a,*b;
 *
 * Multiplies the 8-word numbers a and b and stores the 16-word product
 * to r[0..15], one result column at a time: column k accumulates every
 * a[i]*b[j] with i+j==k and is stored as r[k] once complete.
 *
 * Unlike the routines above this one takes a register window of its
 * own (save/restore). Words of a and b are loaded once each, shortly
 * before their first use, and stay in registers afterwards.
 *
 * One step, noted as mul_add_c(a[i],b[j],lo,mid,hi) in the comments,
 * is
 *      umul    a_i,b_j,t_1     product, its upper word goes to %y
 *      addcc   lo,t_1,lo
 *      rd      %y,t_2
 *      addxcc  mid,t_2,mid
 *      addx    hi,%g0,hi       (addx %g0,%g0,hi for the first step of
 *                               a column, which resets hi that way)
 * c_1, c_2 and c_3 take the lo/mid/hi roles in rotation from column to
 * column. Loads and stores scheduled in between do not touch the
 * condition codes, so the carry chain is not disturbed by them.
 *
 * The '!=' marks in the comment column recur every fourth instruction.
 */
bn_mul_comba8:
        save    %sp,FRAME_SIZE,%sp
        /* column 0 */
        ld      a_0_,a_0
        ld      b_0_,b_0
        umul    a_0,b_0,c_1     !=!mul_add_c(a[0],b[0],c1,c2,c3);
        ld      b_1_,b_1
        rd      %y,c_2
        st      c_1,[%i0]       !r[0]=c1;

        /* column 1 */
        umul    a_0,b_1,t_1     !=!mul_add_c(a[0],b[1],c2,c3,c1);
        ld      a_1_,a_1
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  %g0,t_2,c_3     !=
        addx    %g0,%g0,c_1
        ld      a_2_,a_2
        umul    a_1,b_0,t_1     !mul_add_c(a[1],b[0],c2,c3,c1);
        addcc   c_2,t_1,c_2     !=
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        st      c_2,[%i0+4]     !r[1]=c2;
        addx    c_1,%g0,c_1     !=

        /* column 2 */
        umul    a_2,b_0,t_1     !mul_add_c(a[2],b[0],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1     !=
        addx    %g0,%g0,c_2
        ld      b_2_,b_2
        umul    a_1,b_1,t_1     !mul_add_c(a[1],b[1],c3,c1,c2);
        addcc   c_3,t_1,c_3     !=
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        ld      b_3_,b_3
        addx    c_2,%g0,c_2     !=
        umul    a_0,b_2,t_1     !mul_add_c(a[0],b[2],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1     !=
        addx    c_2,%g0,c_2
        st      c_3,[%i0+8]     !r[2]=c3;

        /* column 3 */
        umul    a_0,b_3,t_1     !mul_add_c(a[0],b[3],c1,c2,c3);
        addcc   c_1,t_1,c_1     !=
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    %g0,%g0,c_3
        umul    a_1,b_2,t_1     !=!mul_add_c(a[1],b[2],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3     !=
        ld      a_3_,a_3
        umul    a_2,b_1,t_1     !mul_add_c(a[2],b[1],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2          !=
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3
        ld      a_4_,a_4
        umul    a_3,b_0,t_1     !mul_add_c(a[3],b[0],c1,c2,c3);!=
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3     !=
        st      c_1,[%i0+12]    !r[3]=c1;

        /* column 4 */
        umul    a_4,b_0,t_1     !mul_add_c(a[4],b[0],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    %g0,%g0,c_1
        umul    a_3,b_1,t_1     !mul_add_c(a[3],b[1],c2,c3,c1);
        addcc   c_2,t_1,c_2     !=
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        umul    a_2,b_2,t_1     !=!mul_add_c(a[2],b[2],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1     !=
        ld      b_4_,b_4
        umul    a_1,b_3,t_1     !mul_add_c(a[1],b[3],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        ld      b_5_,b_5
        umul    a_0,b_4,t_1     !=!mul_add_c(a[0],b[4],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1     !=
        st      c_2,[%i0+16]    !r[4]=c2;

        /* column 5 */
        umul    a_0,b_5,t_1     !mul_add_c(a[0],b[5],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2          !=
        addxcc  c_1,t_2,c_1
        addx    %g0,%g0,c_2
        umul    a_1,b_4,t_1     !mul_add_c(a[1],b[4],c3,c1,c2);
        addcc   c_3,t_1,c_3     !=
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2
        umul    a_2,b_3,t_1     !=!mul_add_c(a[2],b[3],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2     !=
        umul    a_3,b_2,t_1     !mul_add_c(a[3],b[2],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1     !=
        addx    c_2,%g0,c_2
        ld      a_5_,a_5
        umul    a_4,b_1,t_1     !mul_add_c(a[4],b[1],c3,c1,c2);
        addcc   c_3,t_1,c_3     !=
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        ld      a_6_,a_6
        addx    c_2,%g0,c_2     !=
        umul    a_5,b_0,t_1     !mul_add_c(a[5],b[0],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1     !=
        addx    c_2,%g0,c_2
        st      c_3,[%i0+20]    !r[5]=c3;

        /* column 6 */
        umul    a_6,b_0,t_1     !mul_add_c(a[6],b[0],c1,c2,c3);
        addcc   c_1,t_1,c_1     !=
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    %g0,%g0,c_3
        umul    a_5,b_1,t_1     !=!mul_add_c(a[5],b[1],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3     !=
        umul    a_4,b_2,t_1     !mul_add_c(a[4],b[2],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2     !=
        addx    c_3,%g0,c_3
        umul    a_3,b_3,t_1     !mul_add_c(a[3],b[3],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2          !=
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3
        umul    a_2,b_4,t_1     !mul_add_c(a[2],b[4],c1,c2,c3);
        addcc   c_1,t_1,c_1     !=
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        ld      b_6_,b_6
        addx    c_3,%g0,c_3     !=
        umul    a_1,b_5,t_1     !mul_add_c(a[1],b[5],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2     !=
        addx    c_3,%g0,c_3
        ld      b_7_,b_7
        umul    a_0,b_6,t_1     !mul_add_c(a[0],b[6],c1,c2,c3);
        addcc   c_1,t_1,c_1     !=
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        st      c_1,[%i0+24]    !r[6]=c1;
        addx    c_3,%g0,c_3     !=

        /* column 7 */
        umul    a_0,b_7,t_1     !mul_add_c(a[0],b[7],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3     !=
        addx    %g0,%g0,c_1
        umul    a_1,b_6,t_1     !mul_add_c(a[1],b[6],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        umul    a_2,b_5,t_1     !mul_add_c(a[2],b[5],c2,c3,c1);
        addcc   c_2,t_1,c_2     !=
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        umul    a_3,b_4,t_1     !=!mul_add_c(a[3],b[4],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1     !=
        umul    a_4,b_3,t_1     !mul_add_c(a[4],b[3],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3     !=
        addx    c_1,%g0,c_1
        umul    a_5,b_2,t_1     !mul_add_c(a[5],b[2],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        ld      a_7_,a_7
        umul    a_6,b_1,t_1     !=!mul_add_c(a[6],b[1],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1     !=
        umul    a_7,b_0,t_1     !mul_add_c(a[7],b[0],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3     !=
        addx    c_1,%g0,c_1
        st      c_2,[%i0+28]    !r[7]=c2;

        /* column 8 */
        umul    a_7,b_1,t_1     !mul_add_c(a[7],b[1],c3,c1,c2);
        addcc   c_3,t_1,c_3     !=
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    %g0,%g0,c_2
        umul    a_6,b_2,t_1     !=!mul_add_c(a[6],b[2],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2     !=
        umul    a_5,b_3,t_1     !mul_add_c(a[5],b[3],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1     !=
        addx    c_2,%g0,c_2
        umul    a_4,b_4,t_1     !mul_add_c(a[4],b[4],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2          !=
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2
        umul    a_3,b_5,t_1     !mul_add_c(a[3],b[5],c3,c1,c2);
        addcc   c_3,t_1,c_3     !=
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2
        umul    a_2,b_6,t_1     !=!mul_add_c(a[2],b[6],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2     !=
        umul    a_1,b_7,t_1     !mul_add_c(a[1],b[7],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1     !
        addx    c_2,%g0,c_2
        st      c_3,[%i0+32]    !r[8]=c3;

        /* column 9 */
        umul    a_2,b_7,t_1     !mul_add_c(a[2],b[7],c1,c2,c3);
        addcc   c_1,t_1,c_1     !=
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    %g0,%g0,c_3
        umul    a_3,b_6,t_1     !=!mul_add_c(a[3],b[6],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3     !=
        umul    a_4,b_5,t_1     !mul_add_c(a[4],b[5],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2     !=
        addx    c_3,%g0,c_3
        umul    a_5,b_4,t_1     !mul_add_c(a[5],b[4],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2          !=
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3
        umul    a_6,b_3,t_1     !mul_add_c(a[6],b[3],c1,c2,c3);
        addcc   c_1,t_1,c_1     !=
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3
        umul    a_7,b_2,t_1     !=!mul_add_c(a[7],b[2],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3     !=
        st      c_1,[%i0+36]    !r[9]=c1;

        /* column 10 */
        umul    a_7,b_3,t_1     !mul_add_c(a[7],b[3],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    %g0,%g0,c_1
        umul    a_6,b_4,t_1     !mul_add_c(a[6],b[4],c2,c3,c1);
        addcc   c_2,t_1,c_2     !=
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        umul    a_5,b_5,t_1     !=!mul_add_c(a[5],b[5],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1     !=
        umul    a_4,b_6,t_1     !mul_add_c(a[4],b[6],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3     !=
        addx    c_1,%g0,c_1
        umul    a_3,b_7,t_1     !mul_add_c(a[3],b[7],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        st      c_2,[%i0+40]    !r[10]=c2;

        /* column 11 */
        umul    a_4,b_7,t_1     !=!mul_add_c(a[4],b[7],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    %g0,%g0,c_2     !=
        umul    a_5,b_6,t_1     !mul_add_c(a[5],b[6],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1     !=
        addx    c_2,%g0,c_2
        umul    a_6,b_5,t_1     !mul_add_c(a[6],b[5],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2          !=
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2
        umul    a_7,b_4,t_1     !mul_add_c(a[7],b[4],c3,c1,c2);
        addcc   c_3,t_1,c_3     !=
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        st      c_3,[%i0+44]    !r[11]=c3;
        addx    c_2,%g0,c_2     !=

        /* column 12 */
        umul    a_7,b_5,t_1     !mul_add_c(a[7],b[5],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2     !=
        addx    %g0,%g0,c_3
        umul    a_6,b_6,t_1     !mul_add_c(a[6],b[6],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2          !=
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3
        umul    a_5,b_7,t_1     !mul_add_c(a[5],b[7],c1,c2,c3);
        addcc   c_1,t_1,c_1     !=
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        st      c_1,[%i0+48]    !r[12]=c1;
        addx    c_3,%g0,c_3     !=

        /* column 13 */
        umul    a_6,b_7,t_1     !mul_add_c(a[6],b[7],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3     !=
        addx    %g0,%g0,c_1
        umul    a_7,b_6,t_1     !mul_add_c(a[7],b[6],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        st      c_2,[%i0+52]    !r[13]=c2;

        /* column 14 and, as whatever is left in the next accumulator, r[15] */
        umul    a_7,b_7,t_1     !=!mul_add_c(a[7],b[7],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        nop                     !=
        st      c_3,[%i0+56]    !r[14]=c3;
        st      c_1,[%i0+60]    !r[15]=c1;

        /* the restore in the delay slot also zeroes the %o0 seen by the caller */
        ret
        restore %g0,%g0,%o0

.type   bn_mul_comba8,2
.size   bn_mul_comba8,(.-bn_mul_comba8)

.align  32

.global bn_mul_comba4
/*
 * void bn_mul_comba4(r,a,b)
 * BN_ULONG *r,*a,*b;
 *
 * Fully unrolled 4x4-word multiplication: the eight 32-bit words
 * r[0..7] receive the product of a[0..3] and b[0..3].  Results are
 * stored through %i0 (r); the operand words are fetched through the
 * a_N_/b_N_ memory-operand macros into the a_N/b_N register macros
 * (all of these, together with c_N, t_N and FRAME_SIZE, are
 * preprocessor macros defined outside this block).
 *
 * The product is built one result word ("column") at a time.  For
 * each partial product a[i]*b[j] belonging to the column:
 *
 *      umul    -> low word in t_1, high word left in %y
 *      addcc   -> add low word to the current column accumulator
 *      rd %y   -> fetch high word into t_2
 *      addxcc  -> add high word + carry to the next accumulator
 *      addx    -> collect the resulting carry in the third one
 *
 * c_1, c_2 and c_3 rotate through those three accumulator roles from
 * column to column, exactly as spelled out in the mul_add_c() notes on
 * each umul.  The first product of a column initialises the third
 * accumulator with "addx %g0,%g0,..." instead of adding to it.
 * ld/st/rd do not touch the condition codes, so they are freely
 * interleaved with the carry chain.
 *
 * The routine runs in its own register window (save/restore) and
 * hands 0 back in the caller's %o0.
 */
bn_mul_comba4:
        save    %sp,FRAME_SIZE,%sp
        ld      a_0_,a_0
        ld      b_0_,b_0
        /* column 0: a[0]*b[0]; nothing to accumulate into yet */
        umul    a_0,b_0,c_1     !=!mul_add_c(a[0],b[0],c1,c2,c3);
        ld      b_1_,b_1
        rd      %y,c_2
        st      c_1,[%i0]       !r[0]=c1;

        /* column 1: a[0]*b[1] + a[1]*b[0] */
        umul    a_0,b_1,t_1     !=!mul_add_c(a[0],b[1],c2,c3,c1);
        ld      a_1_,a_1
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  %g0,t_2,c_3
        addx    %g0,%g0,c_1
        ld      a_2_,a_2
        umul    a_1,b_0,t_1     !=!mul_add_c(a[1],b[0],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1     !=
        st      c_2,[%i0+4]     !r[1]=c2;

        /* column 2: a[2]*b[0] + a[1]*b[1] + a[0]*b[2] */
        umul    a_2,b_0,t_1     !mul_add_c(a[2],b[0],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2          !=
        addxcc  c_1,t_2,c_1
        addx    %g0,%g0,c_2
        ld      b_2_,b_2
        umul    a_1,b_1,t_1     !=!mul_add_c(a[1],b[1],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2     !=
        ld      b_3_,b_3
        umul    a_0,b_2,t_1     !mul_add_c(a[0],b[2],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2          !=
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2
        st      c_3,[%i0+8]     !r[2]=c3;

        /* column 3: a[0]*b[3] + a[1]*b[2] + a[2]*b[1] + a[3]*b[0] */
        umul    a_0,b_3,t_1     !=!mul_add_c(a[0],b[3],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    %g0,%g0,c_3     !=
        umul    a_1,b_2,t_1     !mul_add_c(a[1],b[2],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2     !=
        addx    c_3,%g0,c_3
        ld      a_3_,a_3
        umul    a_2,b_1,t_1     !mul_add_c(a[2],b[1],c1,c2,c3);
        addcc   c_1,t_1,c_1     !=
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3
        umul    a_3,b_0,t_1     !=!mul_add_c(a[3],b[0],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3     !=
        st      c_1,[%i0+12]    !r[3]=c1;

        /* column 4: a[3]*b[1] + a[2]*b[2] + a[1]*b[3] */
        umul    a_3,b_1,t_1     !mul_add_c(a[3],b[1],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    %g0,%g0,c_1
        umul    a_2,b_2,t_1     !mul_add_c(a[2],b[2],c2,c3,c1);
        addcc   c_2,t_1,c_2     !=
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        umul    a_1,b_3,t_1     !=!mul_add_c(a[1],b[3],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1     !=
        st      c_2,[%i0+16]    !r[4]=c2;

        /* column 5: a[2]*b[3] + a[3]*b[2] */
        umul    a_2,b_3,t_1     !mul_add_c(a[2],b[3],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2          !=
        addxcc  c_1,t_2,c_1
        addx    %g0,%g0,c_2
        umul    a_3,b_2,t_1     !mul_add_c(a[3],b[2],c3,c1,c2);
        addcc   c_3,t_1,c_3     !=
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        st      c_3,[%i0+20]    !r[5]=c3;
        addx    c_2,%g0,c_2     !=

        /* column 6: a[3]*b[3]; its high word plus carry becomes r[7] */
        umul    a_3,b_3,t_1     !mul_add_c(a[3],b[3],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2     !=
        st      c_1,[%i0+24]    !r[6]=c1;
        st      c_2,[%i0+28]    !r[7]=c2;
        
        /* pop the register window; the caller's %o0 is set to 0 */
        ret
        restore %g0,%g0,%o0

.type   bn_mul_comba4,2
.size   bn_mul_comba4,(.-bn_mul_comba4)

.align  32

.global bn_sqr_comba8
/*
 * void bn_sqr_comba8(r,a)
 * BN_ULONG *r,*a;
 *
 * Fully unrolled 8-word squaring: r[0..15] receive a[0..7] squared.
 * Results are stored through %i0 (r); a_N/a_N_, c_N, t_N and
 * FRAME_SIZE are preprocessor macros defined outside this block.
 *
 * The result is produced one word ("column") at a time with c_1, c_2
 * and c_3 rotating through the roles of current word, next word and
 * carry collector (see the sqr_add_c()/sqr_add_c2() note on every
 * umul):
 *
 *   sqr_add_c   a[i]*a[i] is added once:
 *               umul / addcc / rd %y / addxcc / addx
 *   sqr_add_c2  a[i]*a[j] (i != j) is added twice; the product is
 *               computed and read from %y only once, then the
 *               addcc / addxcc / addx triple is simply repeated.
 *
 * ld/st/rd do not modify the condition codes, which is why they may
 * sit in the middle of a carry chain.
 *
 * The routine runs in its own register window and leaves 0 in the
 * caller's %o0.
 */
bn_sqr_comba8:
        save    %sp,FRAME_SIZE,%sp
        ld      a_0_,a_0
        ld      a_1_,a_1
        /* column 0 */
        umul    a_0,a_0,c_1     !=!sqr_add_c(a,0,c1,c2,c3);
        rd      %y,c_2
        st      c_1,[%i0]       !r[0]=c1;

        /* column 1 */
        ld      a_2_,a_2
        umul    a_0,a_1,t_1     !=!sqr_add_c2(a,1,0,c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  %g0,t_2,c_3
        addx    %g0,%g0,c_1     !=
        addcc   c_2,t_1,c_2
        addxcc  c_3,t_2,c_3
        st      c_2,[%i0+4]     !r[1]=c2;
        addx    c_1,%g0,c_1     !=

        /* column 2 */
        umul    a_2,a_0,t_1     !sqr_add_c2(a,2,0,c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1     !=
        addx    %g0,%g0,c_2
        addcc   c_3,t_1,c_3
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2     !=
        ld      a_3_,a_3
        umul    a_1,a_1,t_1     !sqr_add_c(a,1,c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2          !=
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2
        st      c_3,[%i0+8]     !r[2]=c3;

        /* column 3 */
        umul    a_0,a_3,t_1     !=!sqr_add_c2(a,3,0,c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    %g0,%g0,c_3     !=
        addcc   c_1,t_1,c_1
        addxcc  c_2,t_2,c_2
        ld      a_4_,a_4
        addx    c_3,%g0,c_3     !=
        umul    a_1,a_2,t_1     !sqr_add_c2(a,2,1,c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2     !=
        addx    c_3,%g0,c_3
        addcc   c_1,t_1,c_1
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3     !=
        st      c_1,[%i0+12]    !r[3]=c1;

        /* column 4 */
        umul    a_4,a_0,t_1     !sqr_add_c2(a,4,0,c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    %g0,%g0,c_1
        addcc   c_2,t_1,c_2
        addxcc  c_3,t_2,c_3     !=
        addx    c_1,%g0,c_1
        umul    a_3,a_1,t_1     !sqr_add_c2(a,3,1,c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        addcc   c_2,t_1,c_2
        addxcc  c_3,t_2,c_3     !=
        addx    c_1,%g0,c_1
        ld      a_5_,a_5
        umul    a_2,a_2,t_1     !sqr_add_c(a,2,c2,c3,c1);
        addcc   c_2,t_1,c_2     !=
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        st      c_2,[%i0+16]    !r[4]=c2;
        addx    c_1,%g0,c_1     !=

        /* column 5 */
        umul    a_0,a_5,t_1     !sqr_add_c2(a,5,0,c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1     !=
        addx    %g0,%g0,c_2
        addcc   c_3,t_1,c_3
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2     !=
        umul    a_1,a_4,t_1     !sqr_add_c2(a,4,1,c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1     !=
        addx    c_2,%g0,c_2
        addcc   c_3,t_1,c_3
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2     !=
        ld      a_6_,a_6
        umul    a_2,a_3,t_1     !sqr_add_c2(a,3,2,c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2          !=
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2
        addcc   c_3,t_1,c_3
        addxcc  c_1,t_2,c_1     !=
        addx    c_2,%g0,c_2
        st      c_3,[%i0+20]    !r[5]=c3;

        /* column 6 */
        umul    a_6,a_0,t_1     !sqr_add_c2(a,6,0,c1,c2,c3);
        addcc   c_1,t_1,c_1     !=
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    %g0,%g0,c_3
        addcc   c_1,t_1,c_1     !=
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3
        umul    a_5,a_1,t_1     !sqr_add_c2(a,5,1,c1,c2,c3);
        addcc   c_1,t_1,c_1     !=
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3
        addcc   c_1,t_1,c_1     !=
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3
        umul    a_4,a_2,t_1     !sqr_add_c2(a,4,2,c1,c2,c3);
        addcc   c_1,t_1,c_1     !=
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3
        addcc   c_1,t_1,c_1     !=
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3
        ld      a_7_,a_7
        umul    a_3,a_3,t_1     !=!sqr_add_c(a,3,c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3     !=
        st      c_1,[%i0+24]    !r[6]=c1;

        /* column 7 */
        umul    a_0,a_7,t_1     !sqr_add_c2(a,7,0,c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    %g0,%g0,c_1
        addcc   c_2,t_1,c_2
        addxcc  c_3,t_2,c_3     !=
        addx    c_1,%g0,c_1
        umul    a_1,a_6,t_1     !sqr_add_c2(a,6,1,c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        addcc   c_2,t_1,c_2
        addxcc  c_3,t_2,c_3     !=
        addx    c_1,%g0,c_1
        umul    a_2,a_5,t_1     !sqr_add_c2(a,5,2,c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        addcc   c_2,t_1,c_2
        addxcc  c_3,t_2,c_3     !=
        addx    c_1,%g0,c_1
        umul    a_3,a_4,t_1     !sqr_add_c2(a,4,3,c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        addcc   c_2,t_1,c_2
        addxcc  c_3,t_2,c_3     !=
        addx    c_1,%g0,c_1
        st      c_2,[%i0+28]    !r[7]=c2;

        /* column 8 */
        umul    a_7,a_1,t_1     !sqr_add_c2(a,7,1,c3,c1,c2);
        addcc   c_3,t_1,c_3     !=
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    %g0,%g0,c_2
        addcc   c_3,t_1,c_3     !=
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2
        umul    a_6,a_2,t_1     !sqr_add_c2(a,6,2,c3,c1,c2);
        addcc   c_3,t_1,c_3     !=
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2
        addcc   c_3,t_1,c_3     !=
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2
        umul    a_5,a_3,t_1     !sqr_add_c2(a,5,3,c3,c1,c2);
        addcc   c_3,t_1,c_3     !=
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2
        addcc   c_3,t_1,c_3     !=
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2
        umul    a_4,a_4,t_1     !sqr_add_c(a,4,c3,c1,c2);
        addcc   c_3,t_1,c_3     !=
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        st      c_3,[%i0+32]    !r[8]=c3;
        addx    c_2,%g0,c_2     !=

        /* column 9 */
        umul    a_2,a_7,t_1     !sqr_add_c2(a,7,2,c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2     !=
        addx    %g0,%g0,c_3
        addcc   c_1,t_1,c_1
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3     !=
        umul    a_3,a_6,t_1     !sqr_add_c2(a,6,3,c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2     !=
        addx    c_3,%g0,c_3
        addcc   c_1,t_1,c_1
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3     !=
        umul    a_4,a_5,t_1     !sqr_add_c2(a,5,4,c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2     !=
        addx    c_3,%g0,c_3
        addcc   c_1,t_1,c_1
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3     !=
        st      c_1,[%i0+36]    !r[9]=c1;

        /* column 10 */
        umul    a_7,a_3,t_1     !sqr_add_c2(a,7,3,c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    %g0,%g0,c_1
        addcc   c_2,t_1,c_2
        addxcc  c_3,t_2,c_3     !=
        addx    c_1,%g0,c_1
        umul    a_6,a_4,t_1     !sqr_add_c2(a,6,4,c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        addcc   c_2,t_1,c_2
        addxcc  c_3,t_2,c_3     !=
        addx    c_1,%g0,c_1
        umul    a_5,a_5,t_1     !sqr_add_c(a,5,c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        st      c_2,[%i0+40]    !r[10]=c2;

        /* column 11 */
        umul    a_4,a_7,t_1     !=!sqr_add_c2(a,7,4,c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    %g0,%g0,c_2     !=
        addcc   c_3,t_1,c_3
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2
        umul    a_5,a_6,t_1     !=!sqr_add_c2(a,6,5,c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2     !=
        addcc   c_3,t_1,c_3
        addxcc  c_1,t_2,c_1
        st      c_3,[%i0+44]    !r[11]=c3;
        addx    c_2,%g0,c_2     !=

        /* column 12 */
        umul    a_7,a_5,t_1     !sqr_add_c2(a,7,5,c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2     !=
        addx    %g0,%g0,c_3
        addcc   c_1,t_1,c_1
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3     !=
        umul    a_6,a_6,t_1     !sqr_add_c(a,6,c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2     !=
        addx    c_3,%g0,c_3
        st      c_1,[%i0+48]    !r[12]=c1;

        /*
         * column 13.  t_2 still holds the high word from the single
         * "rd %y" below when the product is added the second time, so
         * %y is read only once here, like in every other sqr_add_c2
         * expansion.  The '!=' tags from this point on were re-flowed to
         * keep their every-fourth-instruction cadence.
         */
        umul    a_6,a_7,t_1     !sqr_add_c2(a,7,6,c2,c3,c1);
        addcc   c_2,t_1,c_2     !=
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        addx    %g0,%g0,c_1
        addcc   c_2,t_1,c_2     !=
        addxcc  c_3,t_2,c_3
        st      c_2,[%i0+52]    !r[13]=c2;
        addx    c_1,%g0,c_1

        /* column 14; its high word plus carry becomes r[15] */
        umul    a_7,a_7,t_1     !=!sqr_add_c(a,7,c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        st      c_3,[%i0+56]    !=!r[14]=c3;
        st      c_1,[%i0+60]    !r[15]=c1;

        /* pop the register window; the caller's %o0 is set to 0 */
        ret
        restore %g0,%g0,%o0

.type   bn_sqr_comba8,2
.size   bn_sqr_comba8,(.-bn_sqr_comba8)

.align  32

.global bn_sqr_comba4
/*
 * void bn_sqr_comba4(r,a)
 * BN_ULONG *r,*a;
 *
 * Fully unrolled 4-word squaring: r[0..7] receive a[0..3] squared.
 * Results are stored through %i0 (r); a_N/a_N_, c_N, t_N and
 * FRAME_SIZE are preprocessor macros defined outside this block.
 *
 * The result is produced one word ("column") at a time with c_1, c_2
 * and c_3 rotating through the roles of current word, next word and
 * carry collector (see the sqr_add_c()/sqr_add_c2() note on every
 * umul):
 *
 *   sqr_add_c   a[i]*a[i] is added once:
 *               umul / addcc / rd %y / addxcc / addx
 *   sqr_add_c2  a[i]*a[j] (i != j) is added twice; the product is
 *               computed and read from %y only once, then the
 *               addcc / addxcc / addx triple is repeated.
 *
 * ld/st/rd do not modify the condition codes, which is why they may
 * sit in the middle of a carry chain.
 *
 * Each a[i] is loaded exactly once (a duplicated load of a[1] was
 * dropped; the '!=' tags that mark every fourth instruction were
 * re-flowed accordingly).
 *
 * The routine runs in its own register window and leaves 0 in the
 * caller's %o0.
 */
bn_sqr_comba4:
        save    %sp,FRAME_SIZE,%sp
        ld      a_0_,a_0
        /* column 0 */
        umul    a_0,a_0,c_1     !sqr_add_c(a,0,c1,c2,c3);
        ld      a_1_,a_1        !=
        rd      %y,c_2
        st      c_1,[%i0]       !r[0]=c1;

        /* column 1 */
        umul    a_0,a_1,t_1     !sqr_add_c2(a,1,0,c2,c3,c1);
        addcc   c_2,t_1,c_2     !=
        rd      %y,t_2
        addxcc  %g0,t_2,c_3
        addx    %g0,%g0,c_1
        ld      a_2_,a_2        !=
        addcc   c_2,t_1,c_2
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        st      c_2,[%i0+4]     !=!r[1]=c2;

        /* column 2 */
        umul    a_2,a_0,t_1     !sqr_add_c2(a,2,0,c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1     !=
        addx    %g0,%g0,c_2
        addcc   c_3,t_1,c_3
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2     !=
        ld      a_3_,a_3
        umul    a_1,a_1,t_1     !sqr_add_c(a,1,c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2          !=
        addxcc  c_1,t_2,c_1
        st      c_3,[%i0+8]     !r[2]=c3;
        addx    c_2,%g0,c_2

        /* column 3 */
        umul    a_0,a_3,t_1     !=!sqr_add_c2(a,3,0,c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    %g0,%g0,c_3     !=
        addcc   c_1,t_1,c_1
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3
        umul    a_1,a_2,t_1     !=!sqr_add_c2(a,2,1,c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3     !=
        addcc   c_1,t_1,c_1
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3
        st      c_1,[%i0+12]    !=!r[3]=c1;

        /* column 4 */
        umul    a_3,a_1,t_1     !sqr_add_c2(a,3,1,c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3     !=
        addx    %g0,%g0,c_1
        addcc   c_2,t_1,c_2
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1     !=
        umul    a_2,a_2,t_1     !sqr_add_c(a,2,c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3     !=
        addx    c_1,%g0,c_1
        st      c_2,[%i0+16]    !r[4]=c2;

        /* column 5 */
        umul    a_2,a_3,t_1     !sqr_add_c2(a,3,2,c3,c1,c2);
        addcc   c_3,t_1,c_3     !=
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    %g0,%g0,c_2
        addcc   c_3,t_1,c_3     !=
        addxcc  c_1,t_2,c_1
        st      c_3,[%i0+20]    !r[5]=c3;
        addx    c_2,%g0,c_2

        /* column 6; its high word plus carry becomes r[7] */
        umul    a_3,a_3,t_1     !=!sqr_add_c(a,3,c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        st      c_1,[%i0+24]    !=!r[6]=c1;
        st      c_2,[%i0+28]    !r[7]=c2;

        /* pop the register window; the caller's %o0 is set to 0 */
        ret
        restore %g0,%g0,%o0

.type   bn_sqr_comba4,2
.size   bn_sqr_comba4,(.-bn_sqr_comba4)
.align  32
.ident  "bn_asm.sparc.v8.S, Version 1.1"
.ident  "SPARC v8 ISA artwork by Andy Polyakov <[EMAIL PROTECTED]>"

/*
 * ====================================================================
 * Copyright (c) 1999 Andy Polyakov <[EMAIL PROTECTED]>.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted as long as above copyright notices are retained. Warranty
 * of any kind is (of course:-) disclaimed.
 * ====================================================================
 */

/*
 * This is my modest contribution to the OpenSSL project (see
 * http://www.openssl.org/ for more information about it) and is
 * a drop-in SuperSPARC ISA replacement for crypto/bn/bn_asm.c
 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
 *
 * See bn_asm.sparc.v8plus.S for more details.
 */

/*
 * Revision history.
 *
 * 1.1  - new loop unrolling model(*)
 *      - 10% performance boost(*)
 *
 * (*)  see bn_asm.sparc.v8plus.S for details
 */

.section        ".text",#alloc,#execinstr
.file           "bn_asm.sparc.v8.S"

.align  32

.global bn_mul_add_words
/*
 * BN_ULONG bn_mul_add_words(rp,ap,num,w)
 * BN_ULONG *rp,*ap;
 * int num;
 * BN_ULONG w;
 *
 * For i = 0 .. num-1:  rp[i] = low32(rp[i] + ap[i]*w + carry), with
 * the carry word propagated from one element to the next.  The final
 * carry word is returned in %o0; 0 is returned when num <= 0.
 *
 * Leaf routine (no register window).  Register roles:
 *      %o0  rp (advanced by 16 per unrolled pass)
 *      %o1  ap (advanced by 16 per unrolled pass)
 *      %o2  remaining word count
 *      %o3  w
 *      %o4  word loaded from / stored to rp
 *      %o5  carry word carried between elements
 *      %g1  high half of the product (from %y), then + carry
 *      %g2, %g3  alternately: ap word, then low half of its product
 *
 * Every element needs two carry-producing additions (rp word + carry
 * word, then + low product half); both carries are folded into the
 * high product half with addx.
 *
 * The main loop handles four words per pass and is entered through
 * .L_bn_mul_add_words_warm_loop after a shortened first element that
 * skips the (zero) carry-in addition.  Up to three leftover words are
 * handled by the straight-line tail.
 */
bn_mul_add_words:
        cmp     %o2,0
        bg,a    .L_bn_mul_add_words_proceed
        ld      [%o1],%g2
        /* num <= 0: nothing to do (the annulled load above is skipped) */
        retl
        clr     %o0

.L_bn_mul_add_words_proceed:
        /* fewer than four words? go straight to the tail; carry = 0 */
        andcc   %o2,-4,%g0
        bz      .L_bn_mul_add_words_tail
        clr     %o5

        /* first element of the first pass: no carry-in to add yet */
        umul    %o3,%g2,%g2
        ld      [%o0],%o4
        rd      %y,%g1
        addcc   %o4,%g2,%o4
        ld      [%o1+4],%g3
        addx    %g1,0,%o5
        ba      .L_bn_mul_add_words_warm_loop
        st      %o4,[%o0]

.L_bn_mul_add_words_loop:
        /* element 0 of a pass; %g2 = ap[0] was loaded in the branch delay slot */
        ld      [%o0],%o4
        umul    %o3,%g2,%g2
        rd      %y,%g1
        addcc   %o4,%o5,%o4
        ld      [%o1+4],%g3
        addx    %g1,0,%g1
        addcc   %o4,%g2,%o4
        nop
        addx    %g1,0,%o5
        st      %o4,[%o0]

.L_bn_mul_add_words_warm_loop:
        /* element 1; the remaining count is decremented for the whole pass */
        ld      [%o0+4],%o4
        umul    %o3,%g3,%g3
        dec     4,%o2
        rd      %y,%g1
        addcc   %o4,%o5,%o4
        ld      [%o1+8],%g2
        addx    %g1,0,%g1
        addcc   %o4,%g3,%o4
        addx    %g1,0,%o5
        st      %o4,[%o0+4]

        /* element 2; ap is advanced here, so ap[3] is now at [%o1-4] */
        ld      [%o0+8],%o4
        umul    %o3,%g2,%g2
        inc     16,%o1
        rd      %y,%g1
        addcc   %o4,%o5,%o4
        ld      [%o1-4],%g3
        addx    %g1,0,%g1
        addcc   %o4,%g2,%o4
        addx    %g1,0,%o5
        st      %o4,[%o0+8]

        /* element 3; rp is advanced here, so rp[3] is now at [%o0-4] */
        ld      [%o0+12],%o4
        umul    %o3,%g3,%g3
        inc     16,%o0
        rd      %y,%g1
        addcc   %o4,%o5,%o4
        addx    %g1,0,%g1
        addcc   %o4,%g3,%o4
        addx    %g1,0,%o5
        st      %o4,[%o0-4]
        /* at least four words left? run another pass, preloading ap[0] */
        andcc   %o2,-4,%g0
        bnz,a   .L_bn_mul_add_words_loop
        ld      [%o1],%g2

        /* one to three words left? preload ap[0] and finish in the tail */
        tst     %o2
        bnz,a   .L_bn_mul_add_words_tail
        ld      [%o1],%g2
.L_bn_mul_add_words_return:
        /* return the carry word */
        retl
        mov     %o5,%o0
        nop

.L_bn_mul_add_words_tail:
        /* leftover word 0; expects %g2 = ap[0] and %o5 = carry on entry */
        ld      [%o0],%o4
        umul    %o3,%g2,%g2
        addcc   %o4,%o5,%o4
        rd      %y,%g1
        addx    %g1,0,%g1
        addcc   %o4,%g2,%o4
        addx    %g1,0,%o5
        deccc   %o2
        bz      .L_bn_mul_add_words_return
        st      %o4,[%o0]

        /* leftover word 1 */
        ld      [%o1+4],%g2
        umul    %o3,%g2,%g2
        ld      [%o0+4],%o4
        rd      %y,%g1
        addcc   %o4,%o5,%o4
        nop
        addx    %g1,0,%g1
        addcc   %o4,%g2,%o4
        addx    %g1,0,%o5
        deccc   %o2
        bz      .L_bn_mul_add_words_return
        st      %o4,[%o0+4]

        /* leftover word 2: last one, so the carry goes directly to %o0 */
        ld      [%o1+8],%g2
        umul    %o3,%g2,%g2
        ld      [%o0+8],%o4
        rd      %y,%g1
        addcc   %o4,%o5,%o4
        addx    %g1,0,%g1
        addcc   %o4,%g2,%o4
        st      %o4,[%o0+8]
        retl
        addx    %g1,0,%o0

.type   bn_mul_add_words,2
.size   bn_mul_add_words,(.-bn_mul_add_words)

.align  32

.global bn_mul_words
/*
 * BN_ULONG bn_mul_words(rp,ap,num,w)
 * BN_ULONG *rp,*ap;
 * int num;
 * BN_ULONG w;
 *
 * rp[i] = low32(ap[i]*w + carry) for i = 0 .. num-1, where the carry
 * word is the high half of the previous product plus the carry out
 * of the previous addition.  Returns the last carry word in %o0, or
 * 0 when num <= 0.
 *
 * Leaf routine.  %o0 = rp, %o1 = ap, %o2 = words left, %o3 = w,
 * %o5 = running carry word, %g1 = high product half read from %y,
 * %g2/%g3 = ap words and the low halves of their products.
 */
bn_mul_words:
        tst     %o2                     ! num > 0 ?
        bg,a    .L_bn_mul_words_start
        ld      [%o1],%g2               ! (taken only) preload ap[0]
        retl                            ! num <= 0: return 0
        mov     0,%o0

.L_bn_mul_words_start:
        andcc   %o2,-4,%g0              ! at least four words?
        bz      .L_bn_mul_words_leftover
        mov     0,%o5                   ! carry = 0 on either path

.L_bn_mul_words_by4:
        ! word 0 (%g2 = ap[0] already loaded)
        ld      [%o1+4],%g3
        umul    %o3,%g2,%g2
        addcc   %g2,%o5,%g2
        rd      %y,%g1
        addx    %g1,%g0,%o5
        st      %g2,[%o0]

        ! word 1
        ld      [%o1+8],%g2
        umul    %o3,%g3,%g3
        addcc   %g3,%o5,%g3
        rd      %y,%g1
        sub     %o2,4,%o2               ! four words consumed by this pass
        addx    %g1,%g0,%o5
        st      %g3,[%o0+4]

        ! word 2
        ld      [%o1+12],%g3
        umul    %o3,%g2,%g2
        addcc   %g2,%o5,%g2
        rd      %y,%g1
        add     %o1,16,%o1              ! ap += 4 words
        st      %g2,[%o0+8]
        addx    %g1,%g0,%o5

        ! word 3
        umul    %o3,%g3,%g3
        addcc   %g3,%o5,%g3
        rd      %y,%g1
        add     %o0,16,%o0              ! rp += 4 words
        addx    %g1,%g0,%o5
        st      %g3,[%o0-4]
        andcc   %o2,-4,%g0              ! another full pass?
        nop
        bnz,a   .L_bn_mul_words_by4
        ld      [%o1],%g2               ! (taken only) preload ap[0]

        cmp     %o2,0                   ! one to three words left?
        bnz,a   .L_bn_mul_words_leftover
        ld      [%o1],%g2               ! (taken only) preload ap[0]
.L_bn_mul_words_done:
        retl                            ! return the carry word
        or      %g0,%o5,%o0
        nop

.L_bn_mul_words_leftover:
        ! leftover word 0 (%g2 = ap[0], %o5 = carry on entry)
        umul    %o3,%g2,%g2
        addcc   %g2,%o5,%g2
        rd      %y,%g1
        addx    %g1,%g0,%o5
        subcc   %o2,1,%o2
        bz      .L_bn_mul_words_done
        st      %g2,[%o0]
        nop

        ! leftover word 1
        ld      [%o1+4],%g2
        umul    %o3,%g2,%g2
        addcc   %g2,%o5,%g2
        rd      %y,%g1
        addx    %g1,%g0,%o5
        subcc   %o2,1,%o2
        bz      .L_bn_mul_words_done
        st      %g2,[%o0+4]

        ! leftover word 2: last one, carry goes straight to %o0
        ld      [%o1+8],%g2
        umul    %o3,%g2,%g2
        addcc   %g2,%o5,%g2
        rd      %y,%g1
        st      %g2,[%o0+8]
        retl
        addx    %g1,%g0,%o0

.type   bn_mul_words,2
.size   bn_mul_words,(.-bn_mul_words)

.align  32
.global bn_sqr_words
/*
 * void bn_sqr_words(r,a,n)
 * BN_ULONG *r,*a;
 * int n;
 *
 * For i = 0 .. n-1 stores the 64-bit square of a[i] as two words:
 * r[2*i] = low half, r[2*i+1] = high half.  Does nothing when n <= 0.
 * %o0 is cleared on every return path.
 *
 * Leaf routine.  %o0 = r (advanced by 32 per unrolled pass),
 * %o1 = a (advanced by 16 per unrolled pass), %o2 = words left,
 * %g2/%g3 = a words, %o4 = low half of the square, %o5 = high half
 * read from %y.
 */
bn_sqr_words:
        cmp     %o2,0
        bg,a    .L_bn_sqr_words_proceed
        ld      [%o1],%g2               ! (taken only) preload a[0]
        retl                            ! n <= 0
        clr     %o0

.L_bn_sqr_words_proceed:
        andcc   %o2,-4,%g0              ! at least four words?
        bz      .L_bn_sqr_words_tail
        clr     %o5                     ! delay-slot filler; %o5 is rewritten by rd before any use

.L_bn_sqr_words_loop:
        ! word 0 (%g2 = a[0] already loaded)
        ld      [%o1+4],%g3
        umul    %g2,%g2,%o4
        st      %o4,[%o0]
        rd      %y,%o5
        st      %o5,[%o0+4]

        ! word 1
        ld      [%o1+8],%g2
        umul    %g3,%g3,%o4
        dec     4,%o2                   ! four words consumed by this pass
        st      %o4,[%o0+8]
        rd      %y,%o5
        st      %o5,[%o0+12]
        nop

        ! word 2
        ld      [%o1+12],%g3
        umul    %g2,%g2,%o4
        st      %o4,[%o0+16]
        rd      %y,%o5
        inc     16,%o1                  ! a += 4 words
        st      %o5,[%o0+20]

        ! word 3
        umul    %g3,%g3,%o4
        inc     32,%o0                  ! r += 8 words
        st      %o4,[%o0-8]
        rd      %y,%o5
        st      %o5,[%o0-4]
        ! Only the condition codes are wanted here, so the result goes
        ! to %g0 like in the sibling routines instead of clobbering %g2.
        andcc   %o2,-4,%g0              ! another full pass?
        bnz,a   .L_bn_sqr_words_loop
        ld      [%o1],%g2               ! (taken only) preload a[0]

        tst     %o2                     ! one to three words left?
        nop
        bnz,a   .L_bn_sqr_words_tail
        ld      [%o1],%g2               ! (taken only) preload a[0]
.L_bn_sqr_words_return:
        retl
        clr     %o0

.L_bn_sqr_words_tail:
        ! leftover word 0 (%g2 = a[0] on entry)
        umul    %g2,%g2,%o4
        st      %o4,[%o0]
        deccc   %o2
        rd      %y,%o5                  ! rd leaves the condition codes alone
        bz      .L_bn_sqr_words_return
        st      %o5,[%o0+4]

        ! leftover word 1
        ld      [%o1+4],%g2
        umul    %g2,%g2,%o4
        st      %o4,[%o0+8]
        deccc   %o2
        rd      %y,%o5
        nop
        bz      .L_bn_sqr_words_return
        st      %o5,[%o0+12]

        ! leftover word 2 (last)
        ld      [%o1+8],%g2
        umul    %g2,%g2,%o4
        st      %o4,[%o0+16]
        rd      %y,%o5
        st      %o5,[%o0+20]
        retl
        clr     %o0

.type   bn_sqr_words,2
.size   bn_sqr_words,(.-bn_sqr_words)

.align  32

.global bn_div_words
/*
 * BN_ULONG bn_div_words(h,l,d)
 * BN_ULONG h,l,d;
 *
 * Returns the 32-bit quotient of the 64-bit value (h:l) divided by d.
 * h is placed in %y, which udiv uses as the upper half of the
 * dividend; l (%o1) is the lower half and d (%o2) the divisor.
 *
 * Per the SPARC V8 manual, "wr ...,%y" is a delayed-write
 * instruction: the new %y value is only guaranteed to be visible
 * after the three instructions following the write.  udiv reads %y
 * implicitly, so it must not be issued inside that window - hence
 * the three nops.
 *
 * NOTE(review): behaviour for d == 0 or a quotient that does not fit
 * in 32 bits (h >= d) is whatever udiv does on the target CPU; this
 * routine adds no checks of its own.
 */
bn_div_words:
        wr      %o0,%y                  ! %y = h (upper dividend half)
        nop                             ! \
        nop                             !  > let the delayed %y write complete
        nop                             ! /
        udiv    %o1,%o2,%o0             ! %o0 = (%y:l) / d
        retl
        nop

.type   bn_div_words,2
.size   bn_div_words,(.-bn_div_words)

.align  32

.global bn_add_words
/*
 * BN_ULONG bn_add_words(rp,ap,bp,n)
 * BN_ULONG *rp,*ap,*bp;
 * int n;
 *
 * Multi-word addition: rp[i] = ap[i] + bp[i] + carry for
 * i = 0 .. n-1.  Returns the final carry (0 or 1) in %o0, or 0 when
 * n <= 0.
 *
 * Leaf routine.  %o0 = rp, %o1 = ap, %o2 = bp (each advanced by 16
 * per unrolled pass), %o3 = words left, %o4 = ap word, %o5 = bp word
 * and then the sum, %g1 = carry saved as 0/1.
 *
 * Inside an unrolled pass the carry lives in the condition codes and
 * is chained with addxcc; the interleaved ld/st/inc/dec do not modify
 * the condition codes.  The loop test (andcc) does, so the carry is
 * first saved into %g1 with "addx %g0,0,%g1" and later re-created
 * with "addcc %g1,-1,%g0", which produces a carry-out exactly when
 * %g1 is non-zero.
 */
bn_add_words:
        cmp     %o3,0
        bg,a    .L_bn_add_words_proceed
        ld      [%o1],%o4
        /* n <= 0: return 0 */
        retl
        clr     %o0

.L_bn_add_words_proceed:
        /* fewer than four words? use the tail only; saved carry = 0 */
        andcc   %o3,-4,%g0
        bz      .L_bn_add_words_tail
        clr     %g1
        /* first word of the first pass: plain addcc, there is no carry-in */
        ld      [%o2],%o5
        dec     4,%o3
        addcc   %o5,%o4,%o5
        nop
        st      %o5,[%o0]
        ba      .L_bn_add_words_warm_loop
        ld      [%o1+4],%o4
        nop

.L_bn_add_words_loop:
        /* word 0; the carry flag was restored in the branch delay slot */
        ld      [%o1],%o4
        dec     4,%o3
        ld      [%o2],%o5
        addxcc  %o5,%o4,%o5
        st      %o5,[%o0]

        ld      [%o1+4],%o4
.L_bn_add_words_warm_loop:
        /* word 1; ap is advanced here, so later ap words use negative offsets */
        inc     16,%o1
        ld      [%o2+4],%o5
        addxcc  %o5,%o4,%o5
        st      %o5,[%o0+4]
        
        /* word 2; bp is advanced here */
        ld      [%o1-8],%o4
        inc     16,%o2
        ld      [%o2-8],%o5
        addxcc  %o5,%o4,%o5
        st      %o5,[%o0+8]

        /* word 3; rp is advanced here */
        ld      [%o1-4],%o4
        inc     16,%o0
        ld      [%o2-4],%o5
        addxcc  %o5,%o4,%o5
        st      %o5,[%o0-4]
        /* save the carry before andcc overwrites the condition codes */
        addx    %g0,0,%g1
        andcc   %o3,-4,%g0
        bnz,a   .L_bn_add_words_loop
        addcc   %g1,-1,%g0

        /* one to three words left? preload ap[0] and finish in the tail */
        tst     %o3
        nop
        bnz,a   .L_bn_add_words_tail
        ld      [%o1],%o4
.L_bn_add_words_return:
        /* return the saved carry */
        retl
        mov     %g1,%o0

.L_bn_add_words_tail:
        /* leftover word 0; expects %o4 = ap[0] and %g1 = saved carry */
        addcc   %g1,-1,%g0
        ld      [%o2],%o5
        addxcc  %o5,%o4,%o5
        addx    %g0,0,%g1
        deccc   %o3
        bz      .L_bn_add_words_return
        st      %o5,[%o0]
        nop

        /* leftover word 1 */
        ld      [%o1+4],%o4
        addcc   %g1,-1,%g0
        ld      [%o2+4],%o5
        addxcc  %o5,%o4,%o5
        addx    %g0,0,%g1
        deccc   %o3
        bz      .L_bn_add_words_return
        st      %o5,[%o0+4]

        /* leftover word 2: last one, the carry goes directly to %o0 */
        ld      [%o1+8],%o4
        addcc   %g1,-1,%g0
        ld      [%o2+8],%o5
        addxcc  %o5,%o4,%o5
        st      %o5,[%o0+8]
        retl
        addx    %g0,0,%o0

.type   bn_add_words,2
.size   bn_add_words,(.-bn_add_words)

.align  32

.global bn_sub_words
/*
 * BN_ULONG bn_sub_words(rp,ap,bp,n)
 * BN_ULONG *rp,*ap,*bp;
 * int n;
 *
 * Multi-word subtraction: rp[i] = ap[i] - bp[i] - borrow for
 * i = 0 .. n-1.  Returns the final borrow (0 or 1) in %o0, or 0 when
 * n <= 0.
 *
 * Leaf routine.  %o0 = rp, %o1 = ap, %o2 = bp (each advanced by 16
 * per unrolled pass), %o3 = words left, %o4 = ap word, %o5 = bp word
 * and then the difference, %g1 = borrow saved as 0/1.
 *
 * Inside an unrolled pass the borrow lives in the carry flag and is
 * chained with subxcc; the interleaved ld/st/inc/dec do not modify
 * the condition codes.  The loop test (andcc) does, so the borrow is
 * first saved into %g1 with "addx %g0,0,%g1" and later re-created
 * with "addcc %g1,-1,%g0", which sets the carry flag exactly when
 * %g1 is non-zero.
 */
bn_sub_words:
        cmp     %o3,0
        bg,a    .L_bn_sub_words_proceed
        ld      [%o1],%o4
        /* n <= 0: return 0 */
        retl
        clr     %o0

.L_bn_sub_words_proceed:
        /* fewer than four words? use the tail only; saved borrow = 0 */
        andcc   %o3,-4,%g0
        bz      .L_bn_sub_words_tail
        clr     %g1
        /* first word of the first pass: plain subcc, there is no borrow-in */
        ld      [%o2],%o5
        dec     4,%o3
        subcc   %o4,%o5,%o5
        nop
        st      %o5,[%o0]
        ba      .L_bn_sub_words_warm_loop
        ld      [%o1+4],%o4
        nop

.L_bn_sub_words_loop:
        /* word 0; the borrow flag was restored in the branch delay slot */
        ld      [%o1],%o4
        dec     4,%o3
        ld      [%o2],%o5
        subxcc  %o4,%o5,%o5
        st      %o5,[%o0]

        ld      [%o1+4],%o4
.L_bn_sub_words_warm_loop:
        /* word 1; ap is advanced here, so later ap words use negative offsets */
        inc     16,%o1
        ld      [%o2+4],%o5
        subxcc  %o4,%o5,%o5
        st      %o5,[%o0+4]
        
        /* word 2; bp is advanced here */
        ld      [%o1-8],%o4
        inc     16,%o2
        ld      [%o2-8],%o5
        subxcc  %o4,%o5,%o5
        st      %o5,[%o0+8]

        /* word 3; rp is advanced here */
        ld      [%o1-4],%o4
        inc     16,%o0
        ld      [%o2-4],%o5
        subxcc  %o4,%o5,%o5
        st      %o5,[%o0-4]
        /* save the borrow before andcc overwrites the condition codes */
        addx    %g0,0,%g1
        andcc   %o3,-4,%g0
        bnz,a   .L_bn_sub_words_loop
        addcc   %g1,-1,%g0

        /* one to three words left? preload ap[0] and finish in the tail */
        tst     %o3
        nop
        bnz,a   .L_bn_sub_words_tail
        ld      [%o1],%o4
.L_bn_sub_words_return:
        /* return the saved borrow */
        retl
        mov     %g1,%o0

.L_bn_sub_words_tail:
        /* leftover word 0; expects %o4 = ap[0] and %g1 = saved borrow */
        addcc   %g1,-1,%g0
        ld      [%o2],%o5
        subxcc  %o4,%o5,%o5
        addx    %g0,0,%g1
        deccc   %o3
        bz      .L_bn_sub_words_return
        st      %o5,[%o0]
        nop

        /* leftover word 1 */
        ld      [%o1+4],%o4
        addcc   %g1,-1,%g0
        ld      [%o2+4],%o5
        subxcc  %o4,%o5,%o5
        addx    %g0,0,%g1
        deccc   %o3
        bz      .L_bn_sub_words_return
        st      %o5,[%o0+4]

        /* leftover word 2: last one, the borrow goes directly to %o0 */
        ld      [%o1+8],%o4
        addcc   %g1,-1,%g0
        ld      [%o2+8],%o5
        subxcc  %o4,%o5,%o5
        st      %o5,[%o0+8]
        retl
        addx    %g0,0,%o0

.type   bn_sub_words,2
.size   bn_sub_words,(.-bn_sub_words)

/*
 * Stack adjustment handed to "save %sp,FRAME_SIZE,%sp" by the comba
 * routines (negative because the stack grows downwards).
 * NOTE(review): 96 bytes looks like the minimal SPARC V8 ABI frame
 * (register-window save area, struct-return slot and outgoing
 * argument slots, rounded up to 8) - confirm against the ABI.
 */
#define FRAME_SIZE -96

/*
 * Here is register usage map for *all* routines below.
 *
 * All names are resolved by the C preprocessor and are meant to be
 * used after "save", i.e. with the routine's arguments in %i0-%i2:
 *
 *      a_N     register that caches word N of the first operand
 *      a_N_    memory operand for that word, [%i1 + 4*N]
 *      b_N     register that caches word N of the second operand
 *      b_N_    memory operand for that word, [%i2 + 4*N]
 *      c_1-c_3 rotating three-word column accumulator
 *      t_1,t_2 low / high half of the current partial product
 *
 * The result pointer has no macro; the routines store through %i0
 * directly.
 */
/* first operand: a[0..7] live in the local registers %l0-%l7 */
#define a_0     %l0
#define a_0_    [%i1]
#define a_1     %l1
#define a_1_    [%i1+4]
#define a_2     %l2
#define a_2_    [%i1+8]
#define a_3     %l3
#define a_3_    [%i1+12]
#define a_4     %l4
#define a_4_    [%i1+16]
#define a_5     %l5
#define a_5_    [%i1+20]
#define a_6     %l6
#define a_6_    [%i1+24]
#define a_7     %l7
#define a_7_    [%i1+28]
/*
 * second operand: b[0..3] in globals %g1-%g4, b[4..6] in the unused
 * incoming-argument registers %i3-%i5, b[7] in %o5.
 * NOTE(review): %g2-%g4 are, as far as I recall, application-reserved
 * in the SPARC ABI - confirm that clobbering them here is acceptable
 * for all callers.
 */
#define b_0     %g1
#define b_0_    [%i2]
#define b_1     %g2
#define b_1_    [%i2+4]
#define b_2     %g3
#define b_2_    [%i2+8]
#define b_3     %g4
#define b_3_    [%i2+12]
#define b_4     %i3
#define b_4_    [%i2+16]
#define b_5     %i4
#define b_5_    [%i2+20]
#define b_6     %i5
#define b_6_    [%i2+24]
#define b_7     %o5
#define b_7_    [%i2+28]
/* accumulators and temporaries live in the new window's %o registers */
#define c_1     %o2
#define c_2     %o3
#define c_3     %o4
#define t_1     %o0
#define t_2     %o1

.align  32
.global bn_mul_comba8
/*
 * void bn_mul_comba8(r,a,b)
 * BN_ULONG *r,*a,*b;
 *
 * Column-wise ("comba") multiplication of two 8-word operands:
 * r[0..15] = a[0..7] * b[0..7], one word being 32 bits (ld/st/umul).
 *
 * In:   %i0 = r, %i1 = a, %i2 = b (after the register-window save).
 * Out:  r[0..15] stored; the final restore writes 0 to the caller's %o0.
 *
 * For result column k every product a[i]*b[j] with i+j == k is added
 * into a three-word accumulator.  umul leaves the low word in t_1 and
 * the high word in %y (copied to t_2 by rd); addcc/addxcc/addx then
 * ripple the carry through the accumulator.  Once a column is complete
 * its low word is stored to r[k] and the roles of c_1,c_2,c_3 rotate,
 * as spelled out by the mul_add_c(...) tags on the umul lines.  The
 * first product of each column re-initialises the new top accumulator
 * word with "addx %g0,%g0,cX" (zero plus carry).
 *
 * Operand words are loaded on demand between the arithmetic; word k of
 * a[] and of b[] is always loaded before r[k] is stored.
 * The "!=" tags count off groups of four instructions (the tag on the
 * last product of column 8 is written as a bare "!").
 */
bn_mul_comba8:
        save    %sp,FRAME_SIZE,%sp
        /* column 0: single product, low word is r[0], high word seeds c_2 */
        ld      a_0_,a_0
        ld      b_0_,b_0
        umul    a_0,b_0,c_1     !=!mul_add_c(a[0],b[0],c1,c2,c3);
        ld      b_1_,b_1
        rd      %y,c_2
        st      c_1,[%i0]       !r[0]=c1;

        /* column 1: c_3 and c_1 are not live yet, so they start from %g0 */
        umul    a_0,b_1,t_1     !=!mul_add_c(a[0],b[1],c2,c3,c1);
        ld      a_1_,a_1
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  %g0,t_2,c_3     !=
        addx    %g0,%g0,c_1
        ld      a_2_,a_2
        umul    a_1,b_0,t_1     !mul_add_c(a[1],b[0],c2,c3,c1);
        addcc   c_2,t_1,c_2     !=
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        st      c_2,[%i0+4]     !r[1]=c2;
        addx    c_1,%g0,c_1     !=

        /* column 2 */
        umul    a_2,b_0,t_1     !mul_add_c(a[2],b[0],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1     !=
        addx    %g0,%g0,c_2
        ld      b_2_,b_2
        umul    a_1,b_1,t_1     !mul_add_c(a[1],b[1],c3,c1,c2);
        addcc   c_3,t_1,c_3     !=
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        ld      b_3_,b_3
        addx    c_2,%g0,c_2     !=
        umul    a_0,b_2,t_1     !mul_add_c(a[0],b[2],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1     !=
        addx    c_2,%g0,c_2
        st      c_3,[%i0+8]     !r[2]=c3;

        /* column 3 */
        umul    a_0,b_3,t_1     !mul_add_c(a[0],b[3],c1,c2,c3);
        addcc   c_1,t_1,c_1     !=
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    %g0,%g0,c_3
        umul    a_1,b_2,t_1     !=!mul_add_c(a[1],b[2],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3     !=
        ld      a_3_,a_3
        umul    a_2,b_1,t_1     !mul_add_c(a[2],b[1],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2          !=
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3
        ld      a_4_,a_4
        umul    a_3,b_0,t_1     !mul_add_c(a[3],b[0],c1,c2,c3);!=
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3     !=
        st      c_1,[%i0+12]    !r[3]=c1;

        /* column 4 */
        umul    a_4,b_0,t_1     !mul_add_c(a[4],b[0],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    %g0,%g0,c_1
        umul    a_3,b_1,t_1     !mul_add_c(a[3],b[1],c2,c3,c1);
        addcc   c_2,t_1,c_2     !=
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        umul    a_2,b_2,t_1     !=!mul_add_c(a[2],b[2],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1     !=
        ld      b_4_,b_4
        umul    a_1,b_3,t_1     !mul_add_c(a[1],b[3],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        ld      b_5_,b_5
        umul    a_0,b_4,t_1     !=!mul_add_c(a[0],b[4],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1     !=
        st      c_2,[%i0+16]    !r[4]=c2;

        /* column 5 */
        umul    a_0,b_5,t_1     !mul_add_c(a[0],b[5],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2          !=
        addxcc  c_1,t_2,c_1
        addx    %g0,%g0,c_2
        umul    a_1,b_4,t_1     !mul_add_c(a[1],b[4],c3,c1,c2);
        addcc   c_3,t_1,c_3     !=
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2
        umul    a_2,b_3,t_1     !=!mul_add_c(a[2],b[3],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2     !=
        umul    a_3,b_2,t_1     !mul_add_c(a[3],b[2],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1     !=
        addx    c_2,%g0,c_2
        ld      a_5_,a_5
        umul    a_4,b_1,t_1     !mul_add_c(a[4],b[1],c3,c1,c2);
        addcc   c_3,t_1,c_3     !=
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        ld      a_6_,a_6
        addx    c_2,%g0,c_2     !=
        umul    a_5,b_0,t_1     !mul_add_c(a[5],b[0],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1     !=
        addx    c_2,%g0,c_2
        st      c_3,[%i0+20]    !r[5]=c3;

        /* column 6 */
        umul    a_6,b_0,t_1     !mul_add_c(a[6],b[0],c1,c2,c3);
        addcc   c_1,t_1,c_1     !=
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    %g0,%g0,c_3
        umul    a_5,b_1,t_1     !=!mul_add_c(a[5],b[1],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3     !=
        umul    a_4,b_2,t_1     !mul_add_c(a[4],b[2],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2     !=
        addx    c_3,%g0,c_3
        umul    a_3,b_3,t_1     !mul_add_c(a[3],b[3],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2          !=
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3
        umul    a_2,b_4,t_1     !mul_add_c(a[2],b[4],c1,c2,c3);
        addcc   c_1,t_1,c_1     !=
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        ld      b_6_,b_6
        addx    c_3,%g0,c_3     !=
        umul    a_1,b_5,t_1     !mul_add_c(a[1],b[5],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2     !=
        addx    c_3,%g0,c_3
        ld      b_7_,b_7
        umul    a_0,b_6,t_1     !mul_add_c(a[0],b[6],c1,c2,c3);
        addcc   c_1,t_1,c_1     !=
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        st      c_1,[%i0+24]    !r[6]=c1;
        addx    c_3,%g0,c_3     !=

        /* column 7: the longest column, eight products */
        umul    a_0,b_7,t_1     !mul_add_c(a[0],b[7],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3     !=
        addx    %g0,%g0,c_1
        umul    a_1,b_6,t_1     !mul_add_c(a[1],b[6],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        umul    a_2,b_5,t_1     !mul_add_c(a[2],b[5],c2,c3,c1);
        addcc   c_2,t_1,c_2     !=
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        umul    a_3,b_4,t_1     !=!mul_add_c(a[3],b[4],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1     !=
        umul    a_4,b_3,t_1     !mul_add_c(a[4],b[3],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3     !=
        addx    c_1,%g0,c_1
        umul    a_5,b_2,t_1     !mul_add_c(a[5],b[2],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        ld      a_7_,a_7
        umul    a_6,b_1,t_1     !=!mul_add_c(a[6],b[1],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1     !=
        umul    a_7,b_0,t_1     !mul_add_c(a[7],b[0],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3     !=
        addx    c_1,%g0,c_1
        st      c_2,[%i0+28]    !r[7]=c2;

        /* column 8: all operand words are in registers from here on */
        umul    a_7,b_1,t_1     !mul_add_c(a[7],b[1],c3,c1,c2);
        addcc   c_3,t_1,c_3     !=
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    %g0,%g0,c_2
        umul    a_6,b_2,t_1     !=!mul_add_c(a[6],b[2],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2     !=
        umul    a_5,b_3,t_1     !mul_add_c(a[5],b[3],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1     !=
        addx    c_2,%g0,c_2
        umul    a_4,b_4,t_1     !mul_add_c(a[4],b[4],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2          !=
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2
        umul    a_3,b_5,t_1     !mul_add_c(a[3],b[5],c3,c1,c2);
        addcc   c_3,t_1,c_3     !=
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2
        umul    a_2,b_6,t_1     !=!mul_add_c(a[2],b[6],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2     !=
        umul    a_1,b_7,t_1     !mul_add_c(a[1],b[7],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1     !
        addx    c_2,%g0,c_2
        st      c_3,[%i0+32]    !r[8]=c3;

        /* column 9 */
        umul    a_2,b_7,t_1     !mul_add_c(a[2],b[7],c1,c2,c3);
        addcc   c_1,t_1,c_1     !=
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    %g0,%g0,c_3
        umul    a_3,b_6,t_1     !=!mul_add_c(a[3],b[6],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3     !=
        umul    a_4,b_5,t_1     !mul_add_c(a[4],b[5],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2     !=
        addx    c_3,%g0,c_3
        umul    a_5,b_4,t_1     !mul_add_c(a[5],b[4],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2          !=
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3
        umul    a_6,b_3,t_1     !mul_add_c(a[6],b[3],c1,c2,c3);
        addcc   c_1,t_1,c_1     !=
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3
        umul    a_7,b_2,t_1     !=!mul_add_c(a[7],b[2],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3     !=
        st      c_1,[%i0+36]    !r[9]=c1;

        /* column 10 */
        umul    a_7,b_3,t_1     !mul_add_c(a[7],b[3],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    %g0,%g0,c_1
        umul    a_6,b_4,t_1     !mul_add_c(a[6],b[4],c2,c3,c1);
        addcc   c_2,t_1,c_2     !=
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        umul    a_5,b_5,t_1     !=!mul_add_c(a[5],b[5],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1     !=
        umul    a_4,b_6,t_1     !mul_add_c(a[4],b[6],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3     !=
        addx    c_1,%g0,c_1
        umul    a_3,b_7,t_1     !mul_add_c(a[3],b[7],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        st      c_2,[%i0+40]    !r[10]=c2;

        /* column 11 */
        umul    a_4,b_7,t_1     !=!mul_add_c(a[4],b[7],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    %g0,%g0,c_2     !=
        umul    a_5,b_6,t_1     !mul_add_c(a[5],b[6],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1     !=
        addx    c_2,%g0,c_2
        umul    a_6,b_5,t_1     !mul_add_c(a[6],b[5],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2          !=
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2
        umul    a_7,b_4,t_1     !mul_add_c(a[7],b[4],c3,c1,c2);
        addcc   c_3,t_1,c_3     !=
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        st      c_3,[%i0+44]    !r[11]=c3;
        addx    c_2,%g0,c_2     !=

        /* column 12 */
        umul    a_7,b_5,t_1     !mul_add_c(a[7],b[5],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2     !=
        addx    %g0,%g0,c_3
        umul    a_6,b_6,t_1     !mul_add_c(a[6],b[6],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2          !=
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3
        umul    a_5,b_7,t_1     !mul_add_c(a[5],b[7],c1,c2,c3);
        addcc   c_1,t_1,c_1     !=
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        st      c_1,[%i0+48]    !r[12]=c1;
        addx    c_3,%g0,c_3     !=

        /* column 13 */
        umul    a_6,b_7,t_1     !mul_add_c(a[6],b[7],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3     !=
        addx    %g0,%g0,c_1
        umul    a_7,b_6,t_1     !mul_add_c(a[7],b[6],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        st      c_2,[%i0+52]    !r[13]=c2;

        /*
         * column 14: last product.  Only two result words remain, so no
         * third accumulator word is updated; c_1 ends up as r[15].
         * NOTE(review): the nop looks like filler in the slot where the
         * other columns have an addx - confirm before removing it.
         */
        umul    a_7,b_7,t_1     !=!mul_add_c(a[7],b[7],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        nop                     !=
        st      c_3,[%i0+56]    !r[14]=c3;
        st      c_1,[%i0+60]    !r[15]=c1;

        /* restore executes in the delay slot of ret and zeroes the caller's %o0 */
        ret
        restore %g0,%g0,%o0

.type   bn_mul_comba8,2
.size   bn_mul_comba8,(.-bn_mul_comba8)

.align  32

.global bn_mul_comba4
/*
 * void bn_mul_comba4(r,a,b)
 * BN_ULONG *r,*a,*b;
 *
 * Column-wise ("comba") multiplication of two 4-word operands:
 * r[0..7] = a[0..3] * b[0..3], one word being 32 bits (ld/st/umul).
 *
 * In:   %i0 = r, %i1 = a, %i2 = b (after the register-window save).
 * Out:  r[0..7] stored; the final restore writes 0 to the caller's %o0.
 *
 * For result column k every product a[i]*b[j] with i+j == k is added
 * into a three-word accumulator: umul leaves the low word in t_1 and
 * the high word in %y (copied to t_2 by rd), and addcc/addxcc/addx
 * ripple the carry upwards.  After each column its low word is stored
 * to r[k] and the roles of c_1,c_2,c_3 rotate, as spelled out by the
 * mul_add_c(...) tags.  Only the a_0..a_3 / b_0..b_3 entries of the
 * register map are used here.
 *
 * Word k of a[] and of b[] is always loaded before r[k] is stored.
 * The "!=" tags count off groups of four instructions.
 */
bn_mul_comba4:
        save    %sp,FRAME_SIZE,%sp
        /* column 0: single product, low word is r[0], high word seeds c_2 */
        ld      a_0_,a_0
        ld      b_0_,b_0
        umul    a_0,b_0,c_1     !=!mul_add_c(a[0],b[0],c1,c2,c3);
        ld      b_1_,b_1
        rd      %y,c_2
        st      c_1,[%i0]       !r[0]=c1;

        /* column 1: c_3 and c_1 are not live yet, so they start from %g0 */
        umul    a_0,b_1,t_1     !=!mul_add_c(a[0],b[1],c2,c3,c1);
        ld      a_1_,a_1
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  %g0,t_2,c_3
        addx    %g0,%g0,c_1
        ld      a_2_,a_2
        umul    a_1,b_0,t_1     !=!mul_add_c(a[1],b[0],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1     !=
        st      c_2,[%i0+4]     !r[1]=c2;

        /* column 2 */
        umul    a_2,b_0,t_1     !mul_add_c(a[2],b[0],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2          !=
        addxcc  c_1,t_2,c_1
        addx    %g0,%g0,c_2
        ld      b_2_,b_2
        umul    a_1,b_1,t_1     !=!mul_add_c(a[1],b[1],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2     !=
        ld      b_3_,b_3
        umul    a_0,b_2,t_1     !mul_add_c(a[0],b[2],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2          !=
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2
        st      c_3,[%i0+8]     !r[2]=c3;

        /* column 3: the longest column, four products */
        umul    a_0,b_3,t_1     !=!mul_add_c(a[0],b[3],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    %g0,%g0,c_3     !=
        umul    a_1,b_2,t_1     !mul_add_c(a[1],b[2],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2     !=
        addx    c_3,%g0,c_3
        ld      a_3_,a_3
        umul    a_2,b_1,t_1     !mul_add_c(a[2],b[1],c1,c2,c3);
        addcc   c_1,t_1,c_1     !=
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3
        umul    a_3,b_0,t_1     !=!mul_add_c(a[3],b[0],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3     !=
        st      c_1,[%i0+12]    !r[3]=c1;

        /* column 4: all operand words are in registers from here on */
        umul    a_3,b_1,t_1     !mul_add_c(a[3],b[1],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    %g0,%g0,c_1
        umul    a_2,b_2,t_1     !mul_add_c(a[2],b[2],c2,c3,c1);
        addcc   c_2,t_1,c_2     !=
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        umul    a_1,b_3,t_1     !=!mul_add_c(a[1],b[3],c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1     !=
        st      c_2,[%i0+16]    !r[4]=c2;

        /* column 5 */
        umul    a_2,b_3,t_1     !mul_add_c(a[2],b[3],c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2          !=
        addxcc  c_1,t_2,c_1
        addx    %g0,%g0,c_2
        umul    a_3,b_2,t_1     !mul_add_c(a[3],b[2],c3,c1,c2);
        addcc   c_3,t_1,c_3     !=
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        st      c_3,[%i0+20]    !r[5]=c3;
        addx    c_2,%g0,c_2     !=

        /*
         * column 6: last product.  Only two result words remain, so no
         * third accumulator word is updated; c_2 ends up as r[7].
         */
        umul    a_3,b_3,t_1     !mul_add_c(a[3],b[3],c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2     !=
        st      c_1,[%i0+24]    !r[6]=c1;
        st      c_2,[%i0+28]    !r[7]=c2;
        
        /* restore executes in the delay slot of ret and zeroes the caller's %o0 */
        ret
        restore %g0,%g0,%o0

.type   bn_mul_comba4,2
.size   bn_mul_comba4,(.-bn_mul_comba4)

.align  32

.global bn_sqr_comba8
/*
 * void bn_sqr_comba8(r,a)
 * BN_ULONG *r,*a;
 *
 * Column-wise ("comba") squaring of an 8-word operand:
 * r[0..15] = a[0..7] * a[0..7], one word being 32 bits (ld/st/umul).
 *
 * In:   %i0 = r, %i1 = a (after the register-window save).
 * Out:  r[0..15] stored; the final restore writes 0 to the caller's %o0.
 *
 * For result column k the products a[i]*a[j] with i+j == k, i >= j are
 * accumulated into three words (roles of c_1,c_2,c_3 rotate per column):
 *   sqr_add_c   (i == j): the product is added once;
 *   sqr_add_c2  (i != j): the product is added twice.  The multiply is
 *               done once; the second addcc/addxcc/addx triple reuses
 *               t_1 (low word) and t_2 (high word read from %y).
 * The first product of a column re-initialises the new top accumulator
 * word with "addx %g0,%g0,cX" (zero plus carry).
 *
 * Word k of a[] is always loaded before r[k] is stored.
 * The "!=" tags count off groups of four instructions.
 */
bn_sqr_comba8:
        save    %sp,FRAME_SIZE,%sp
        ld      a_0_,a_0
        ld      a_1_,a_1
        umul    a_0,a_0,c_1     !=!sqr_add_c(a,0,c1,c2,c3);
        rd      %y,c_2
        st      c_1,[%i0]       !r[0]=c1;

        ld      a_2_,a_2
        umul    a_0,a_1,t_1     !=!sqr_add_c2(a,1,0,c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  %g0,t_2,c_3
        addx    %g0,%g0,c_1     !=
        addcc   c_2,t_1,c_2
        addxcc  c_3,t_2,c_3
        st      c_2,[%i0+4]     !r[1]=c2;
        addx    c_1,%g0,c_1     !=

        umul    a_2,a_0,t_1     !sqr_add_c2(a,2,0,c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1     !=
        addx    %g0,%g0,c_2
        addcc   c_3,t_1,c_3
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2     !=
        ld      a_3_,a_3
        umul    a_1,a_1,t_1     !sqr_add_c(a,1,c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2          !=
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2
        st      c_3,[%i0+8]     !r[2]=c3;

        umul    a_0,a_3,t_1     !=!sqr_add_c2(a,3,0,c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    %g0,%g0,c_3     !=
        addcc   c_1,t_1,c_1
        addxcc  c_2,t_2,c_2
        ld      a_4_,a_4
        addx    c_3,%g0,c_3     !=
        umul    a_1,a_2,t_1     !sqr_add_c2(a,2,1,c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2     !=
        addx    c_3,%g0,c_3
        addcc   c_1,t_1,c_1
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3     !=
        st      c_1,[%i0+12]    !r[3]=c1;

        umul    a_4,a_0,t_1     !sqr_add_c2(a,4,0,c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    %g0,%g0,c_1
        addcc   c_2,t_1,c_2
        addxcc  c_3,t_2,c_3     !=
        addx    c_1,%g0,c_1
        umul    a_3,a_1,t_1     !sqr_add_c2(a,3,1,c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        addcc   c_2,t_1,c_2
        addxcc  c_3,t_2,c_3     !=
        addx    c_1,%g0,c_1
        ld      a_5_,a_5
        umul    a_2,a_2,t_1     !sqr_add_c(a,2,c2,c3,c1);
        addcc   c_2,t_1,c_2     !=
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        st      c_2,[%i0+16]    !r[4]=c2;
        addx    c_1,%g0,c_1     !=

        umul    a_0,a_5,t_1     !sqr_add_c2(a,5,0,c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1     !=
        addx    %g0,%g0,c_2
        addcc   c_3,t_1,c_3
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2     !=
        umul    a_1,a_4,t_1     !sqr_add_c2(a,4,1,c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1     !=
        addx    c_2,%g0,c_2
        addcc   c_3,t_1,c_3
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2     !=
        ld      a_6_,a_6
        umul    a_2,a_3,t_1     !sqr_add_c2(a,3,2,c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2          !=
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2
        addcc   c_3,t_1,c_3
        addxcc  c_1,t_2,c_1     !=
        addx    c_2,%g0,c_2
        st      c_3,[%i0+20]    !r[5]=c3;

        umul    a_6,a_0,t_1     !sqr_add_c2(a,6,0,c1,c2,c3);
        addcc   c_1,t_1,c_1     !=
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    %g0,%g0,c_3
        addcc   c_1,t_1,c_1     !=
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3
        umul    a_5,a_1,t_1     !sqr_add_c2(a,5,1,c1,c2,c3);
        addcc   c_1,t_1,c_1     !=
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3
        addcc   c_1,t_1,c_1     !=
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3
        umul    a_4,a_2,t_1     !sqr_add_c2(a,4,2,c1,c2,c3);
        addcc   c_1,t_1,c_1     !=
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3
        addcc   c_1,t_1,c_1     !=
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3
        ld      a_7_,a_7
        umul    a_3,a_3,t_1     !=!sqr_add_c(a,3,c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3     !=
        st      c_1,[%i0+24]    !r[6]=c1;

        umul    a_0,a_7,t_1     !sqr_add_c2(a,7,0,c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    %g0,%g0,c_1
        addcc   c_2,t_1,c_2
        addxcc  c_3,t_2,c_3     !=
        addx    c_1,%g0,c_1
        umul    a_1,a_6,t_1     !sqr_add_c2(a,6,1,c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        addcc   c_2,t_1,c_2
        addxcc  c_3,t_2,c_3     !=
        addx    c_1,%g0,c_1
        umul    a_2,a_5,t_1     !sqr_add_c2(a,5,2,c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        addcc   c_2,t_1,c_2
        addxcc  c_3,t_2,c_3     !=
        addx    c_1,%g0,c_1
        umul    a_3,a_4,t_1     !sqr_add_c2(a,4,3,c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        addcc   c_2,t_1,c_2
        addxcc  c_3,t_2,c_3     !=
        addx    c_1,%g0,c_1
        st      c_2,[%i0+28]    !r[7]=c2;

        umul    a_7,a_1,t_1     !sqr_add_c2(a,7,1,c3,c1,c2);
        addcc   c_3,t_1,c_3     !=
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    %g0,%g0,c_2
        addcc   c_3,t_1,c_3     !=
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2
        umul    a_6,a_2,t_1     !sqr_add_c2(a,6,2,c3,c1,c2);
        addcc   c_3,t_1,c_3     !=
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2
        addcc   c_3,t_1,c_3     !=
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2
        umul    a_5,a_3,t_1     !sqr_add_c2(a,5,3,c3,c1,c2);
        addcc   c_3,t_1,c_3     !=
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2
        addcc   c_3,t_1,c_3     !=
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2
        umul    a_4,a_4,t_1     !sqr_add_c(a,4,c3,c1,c2);
        addcc   c_3,t_1,c_3     !=
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        st      c_3,[%i0+32]    !r[8]=c3;
        addx    c_2,%g0,c_2     !=

        umul    a_2,a_7,t_1     !sqr_add_c2(a,7,2,c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2     !=
        addx    %g0,%g0,c_3
        addcc   c_1,t_1,c_1
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3     !=
        umul    a_3,a_6,t_1     !sqr_add_c2(a,6,3,c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2     !=
        addx    c_3,%g0,c_3
        addcc   c_1,t_1,c_1
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3     !=
        umul    a_4,a_5,t_1     !sqr_add_c2(a,5,4,c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2     !=
        addx    c_3,%g0,c_3
        addcc   c_1,t_1,c_1
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3     !=
        st      c_1,[%i0+36]    !r[9]=c1;

        umul    a_7,a_3,t_1     !sqr_add_c2(a,7,3,c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    %g0,%g0,c_1
        addcc   c_2,t_1,c_2
        addxcc  c_3,t_2,c_3     !=
        addx    c_1,%g0,c_1
        umul    a_6,a_4,t_1     !sqr_add_c2(a,6,4,c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        addcc   c_2,t_1,c_2
        addxcc  c_3,t_2,c_3     !=
        addx    c_1,%g0,c_1
        umul    a_5,a_5,t_1     !sqr_add_c(a,5,c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2          !=
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        st      c_2,[%i0+40]    !r[10]=c2;

        umul    a_4,a_7,t_1     !=!sqr_add_c2(a,7,4,c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    %g0,%g0,c_2     !=
        addcc   c_3,t_1,c_3
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2
        umul    a_5,a_6,t_1     !=!sqr_add_c2(a,6,5,c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2     !=
        addcc   c_3,t_1,c_3
        addxcc  c_1,t_2,c_1
        st      c_3,[%i0+44]    !r[11]=c3;
        addx    c_2,%g0,c_2     !=

        umul    a_7,a_5,t_1     !sqr_add_c2(a,7,5,c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2     !=
        addx    %g0,%g0,c_3
        addcc   c_1,t_1,c_1
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3     !=
        umul    a_6,a_6,t_1     !sqr_add_c(a,6,c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2     !=
        addx    c_3,%g0,c_3
        st      c_1,[%i0+48]    !r[12]=c1;

        umul    a_6,a_7,t_1     !sqr_add_c2(a,7,6,c2,c3,c1);
        addcc   c_2,t_1,c_2     !=
        rd      %y,t_2
        addxcc  c_3,t_2,c_3
        addx    %g0,%g0,c_1
        /* second addition of the same product: t_2 still holds its high word */
        addcc   c_2,t_1,c_2     !=
        addxcc  c_3,t_2,c_3
        st      c_2,[%i0+52]    !r[13]=c2;
        addx    c_1,%g0,c_1

        /* last column: only two result words remain, c_1 ends up as r[15] */
        umul    a_7,a_7,t_1     !=!sqr_add_c(a,7,c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        st      c_3,[%i0+56]    !=!r[14]=c3;
        st      c_1,[%i0+60]    !r[15]=c1;

        /* restore executes in the delay slot of ret and zeroes the caller's %o0 */
        ret
        restore %g0,%g0,%o0

.type   bn_sqr_comba8,2
.size   bn_sqr_comba8,(.-bn_sqr_comba8)

.align  32

.global bn_sqr_comba4
/*
 * void bn_sqr_comba4(r,a)
 * BN_ULONG *r,*a;
 *
 * Column-wise ("comba") squaring of a 4-word operand:
 * r[0..7] = a[0..3] * a[0..3], one word being 32 bits (ld/st/umul).
 *
 * In:   %i0 = r, %i1 = a (after the register-window save).
 * Out:  r[0..7] stored; the final restore writes 0 to the caller's %o0.
 *
 * For result column k the products a[i]*a[j] with i+j == k, i >= j are
 * accumulated into three words (roles of c_1,c_2,c_3 rotate per column):
 *   sqr_add_c   (i == j): the product is added once;
 *   sqr_add_c2  (i != j): the product is added twice.  The multiply is
 *               done once; the second addcc/addxcc/addx triple reuses
 *               t_1 (low word) and t_2 (high word read from %y).
 * The first product of a column re-initialises the new top accumulator
 * word with "addx %g0,%g0,cX" (zero plus carry).
 *
 * Each a[k] is loaded exactly once, before r[k] is stored.
 * The "!=" tags count off groups of four instructions.
 */
bn_sqr_comba4:
        save    %sp,FRAME_SIZE,%sp
        ld      a_0_,a_0
        umul    a_0,a_0,c_1     !sqr_add_c(a,0,c1,c2,c3);
        ld      a_1_,a_1        !=
        rd      %y,c_2
        st      c_1,[%i0]       !r[0]=c1;

        /* column 1: c_3 and c_1 are not live yet, so they start from %g0 */
        umul    a_0,a_1,t_1     !sqr_add_c2(a,1,0,c2,c3,c1);
        addcc   c_2,t_1,c_2     !=
        rd      %y,t_2
        addxcc  %g0,t_2,c_3
        addx    %g0,%g0,c_1
        ld      a_2_,a_2        !=
        addcc   c_2,t_1,c_2
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1
        st      c_2,[%i0+4]     !=!r[1]=c2;

        umul    a_2,a_0,t_1     !sqr_add_c2(a,2,0,c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2
        addxcc  c_1,t_2,c_1     !=
        addx    %g0,%g0,c_2
        addcc   c_3,t_1,c_3
        addxcc  c_1,t_2,c_1
        addx    c_2,%g0,c_2     !=
        ld      a_3_,a_3
        umul    a_1,a_1,t_1     !sqr_add_c(a,1,c3,c1,c2);
        addcc   c_3,t_1,c_3
        rd      %y,t_2          !=
        addxcc  c_1,t_2,c_1
        st      c_3,[%i0+8]     !r[2]=c3;
        addx    c_2,%g0,c_2

        umul    a_0,a_3,t_1     !=!sqr_add_c2(a,3,0,c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    %g0,%g0,c_3     !=
        addcc   c_1,t_1,c_1
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3
        umul    a_1,a_2,t_1     !=!sqr_add_c2(a,2,1,c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3     !=
        addcc   c_1,t_1,c_1
        addxcc  c_2,t_2,c_2
        addx    c_3,%g0,c_3
        st      c_1,[%i0+12]    !=!r[3]=c1;

        umul    a_3,a_1,t_1     !sqr_add_c2(a,3,1,c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3     !=
        addx    %g0,%g0,c_1
        addcc   c_2,t_1,c_2
        addxcc  c_3,t_2,c_3
        addx    c_1,%g0,c_1     !=
        umul    a_2,a_2,t_1     !sqr_add_c(a,2,c2,c3,c1);
        addcc   c_2,t_1,c_2
        rd      %y,t_2
        addxcc  c_3,t_2,c_3     !=
        addx    c_1,%g0,c_1
        st      c_2,[%i0+16]    !r[4]=c2;

        umul    a_2,a_3,t_1     !sqr_add_c2(a,3,2,c3,c1,c2);
        addcc   c_3,t_1,c_3     !=
        rd      %y,t_2
        addxcc  c_1,t_2,c_1
        addx    %g0,%g0,c_2
        addcc   c_3,t_1,c_3     !=
        addxcc  c_1,t_2,c_1
        st      c_3,[%i0+20]    !r[5]=c3;
        addx    c_2,%g0,c_2

        /* last column: only two result words remain, c_2 ends up as r[7] */
        umul    a_3,a_3,t_1     !=!sqr_add_c(a,3,c1,c2,c3);
        addcc   c_1,t_1,c_1
        rd      %y,t_2
        addxcc  c_2,t_2,c_2
        st      c_1,[%i0+24]    !=!r[6]=c1;
        st      c_2,[%i0+28]    !r[7]=c2;

        /* restore executes in the delay slot of ret and zeroes the caller's %o0 */
        ret
        restore %g0,%g0,%o0

.type   bn_sqr_comba4,2
.size   bn_sqr_comba4,(.-bn_sqr_comba4)
.align  32

Reply via email to