This patch provides a faster implementation of bn_mul_add_words for x86 assembly.
For example, RSA 1024-bit sign/s improved by about 4.7% (131.1 -> 137.3 in the figures below).
The patch is based on code written by Sean Stanek.

Output from "openssl speed rsa" on an 800 MHz AMD Athlon:

Before patch:
                  sign    verify    sign/s verify/s
rsa  512 bits   0.0017s   0.0001s    594.6   7578.4
rsa 1024 bits   0.0076s   0.0004s    131.1   2731.0
rsa 2048 bits   0.0427s   0.0012s     23.4    843.2
rsa 4096 bits   0.2759s   0.0041s      3.6    242.2

After patch:
                  sign    verify    sign/s verify/s
rsa  512 bits   0.0016s   0.0001s    615.5   7936.6
rsa 1024 bits   0.0073s   0.0003s    137.3   2891.8
rsa 2048 bits   0.0405s   0.0011s     24.7    893.0
rsa 4096 bits   0.2590s   0.0039s      3.9    257.6


Index: crypto/bn/asm/bn-586.pl
===================================================================
RCS file: /home/paul/openssl/rsync/cvs/openssl/crypto/bn/asm/bn-586.pl,v
retrieving revision 1.5
diff -u -r1.5 bn-586.pl
--- crypto/bn/asm/bn-586.pl     2000/12/06 16:30:23     1.5
+++ crypto/bn/asm/bn-586.pl     2001/08/24 21:01:45
@@ -22,78 +22,111 @@
        &function_begin($name,"");
 
        &comment("");
-       $Low="eax";
-       $High="edx";
-       $a="ebx";
-       $w="ebp";
-       $r="edi";
-       $c="esi";
-
-       &xor($c,$c);            # clear carry
-       &mov($r,&wparam(0));    #
-
-       &mov("ecx",&wparam(2)); #
-       &mov($a,&wparam(1));    #
-
-       &and("ecx",0xfffffff8); # num / 8
-       &mov($w,&wparam(3));    #
-
-       &push("ecx");           # Up the stack for a tmp variable
-
-       &jz(&label("maw_finish"));
-
-       &set_label("maw_loop",0);
-
-       &mov(&swtmp(0),"ecx");  #
-
-       for ($i=0; $i<32; $i+=4)
-               {
-               &comment("Round $i");
-
-                &mov("eax",&DWP($i,$a,"",0));  # *a
-               &mul($w);                       # *a * w
-               &add("eax",$c);         # L(t)+= *r
-                &mov($c,&DWP($i,$r,"",0));     # L(t)+= *r
-               &adc("edx",0);                  # H(t)+=carry
-                &add("eax",$c);                # L(t)+=c
-               &adc("edx",0);                  # H(t)+=carry
-                &mov(&DWP($i,$r,"",0),"eax");  # *r= L(t);
-               &mov($c,"edx");                 # c=  H(t);
-               }
-
-       &comment("");
-       &mov("ecx",&swtmp(0));  #
-       &add($a,32);
-       &add($r,32);
-       &sub("ecx",8);
-       &jnz(&label("maw_loop"));
-
-       &set_label("maw_finish",0);
-       &mov("ecx",&wparam(2)); # get num
-       &and("ecx",7);
-       &jnz(&label("maw_finish2"));    # helps branch prediction
-       &jmp(&label("maw_end"));
-
-       &set_label("maw_finish2",1);
-       for ($i=0; $i<7; $i++)
-               {
-               &comment("Tail Round $i");
-                &mov("eax",&DWP($i*4,$a,"",0));# *a
-               &mul($w);                       # *a * w
-               &add("eax",$c);                 # L(t)+=c
-                &mov($c,&DWP($i*4,$r,"",0));   # L(t)+= *r
-               &adc("edx",0);                  # H(t)+=carry
-                &add("eax",$c);
-               &adc("edx",0);                  # H(t)+=carry
-                &dec("ecx") if ($i != 7-1);
-               &mov(&DWP($i*4,$r,"",0),"eax"); # *r= L(t);
-                &mov($c,"edx");                        # c=  H(t);
-               &jz(&label("maw_end")) if ($i != 7-1);
-               }
-       &set_label("maw_end",0);
-       &mov("eax",$c);
+       $r = 0;
+       $a = 1;
+       $num = 2;
+       $w = 3;
+
+       &mov("ebx",&wparam($r));        # r
+       &mov("ebp",&wparam($a));        # a
+       &mov("ecx",&wparam($num));      # num
+
+       &shr("ecx",3);          # num / 8
+       &mov("edi",0);          # clear carry
+       &jz(&label("bn_mul_add_words_part"));
+
+       &set_label("bn_mul_add_words_full_loop",0);
+       &comment("8 dwords per loop (2 dwords per round * 4 rounds per loop)");
+
+       &push("ecx");
+       &mov("ecx",&wparam($w));        # w
+
+       for ($k=0; $k<4; $k++)
+       {
+           &comment("Round $k");
+           $i = $k * 8;
+           &mov("eax",&DWP($i,"ebp","",0));
+           &mov("esi",&DWP($i+4,"ebp","",0));
+           &mul("ecx");
+           &add("eax","edi");
+           &mov("edi","edx");
+           &adc("edi",0);
+           &add(&DWP($i,"ebx","",0),"eax");
+           &mov("eax","esi");
+           &adc("edi",0);
+           &mul("ecx");
+           &add("eax","edi");
+           &mov("edi","edx");
+           &adc("edi",0);
+           &add(&DWP($i+4,"ebx","",0),"eax");
+           &adc("edi",0);
+       }
+       
+       &pop("ecx");
+        &add("ebp",32);
+        &add("ebx",32);
+        &dec("ecx");
+       &jnz(&label("bn_mul_add_words_full_loop"));
+
+       &set_label("bn_mul_add_words_part",0);
+       &mov("esi",&wparam($num)); # num
+        &and("esi",7);
+        &mov("ecx",&wparam($w)); # w
+
+       &comment("do 0..7 more dwords");
+        &jmp(&DWP(&label("bn_mul_add_words_array"),"","esi",4));
+
+       &set_label("bn_mul_add_words_array",1);
+       &comment("array for quick jump for last 0..7 dwords");
+       &data_word(&label("bn_mul_add_words_end"));
+       for ($i=1; $i<=7; $i++)
+       {
+           &data_word(&label("bn_mul_add_words_part_$i"));
+       }
+
+       for ($j=7; $j>=1; $j--)
+       {
+           &set_label("bn_mul_add_words_part_$j",1);
+           for ($k=0; $k<($j>>1); $k++)
+           {
+               &comment("Part $j, Round $k");
+               $i = $k * 8;
+               &mov("eax",&DWP($i,"ebp","",0));
+               &mov("esi",&DWP($i+4,"ebp","",0));
+               &mul("ecx");
+               &add("eax","edi");
+               &mov("edi","edx");
+               &adc("edi",0);
+               &add(&DWP($i,"ebx","",0),"eax");
+               &mov("eax","esi");
+               &adc("edi",0);
+               &mul("ecx");
+               &add("eax","edi");
+               &mov("edi","edx");
+               &adc("edi",0);
+               &add(&DWP($i+4,"ebx","",0),"eax");
+               &adc("edi",0);
+           }
+
+           if (($j % 2) == 1)
+           {
+               &comment("Part $j, one more dword");
+               $i = $k * 8;
+               &mov("eax",&DWP($i,"ebp","",0));
+               &mul("ecx");
+               &add("eax","edi");
+               &mov("edi","edx");
+               &adc("edi",0);
+               &add(&DWP($i,"ebx","",0),"eax");
+               &adc("edi",0);
+           }
 
-       &pop("ecx");    # clear variable from
+           &jmp(&label("bn_mul_add_words_end")) if ($j != 1);
+       }
+
+       &set_label("bn_mul_add_words_end",0);
+
+       &mov("eax","edi");
 
        &function_end($name);
        }

______________________________________________________________________
OpenSSL Project                                 http://www.openssl.org
Development Mailing List                       [EMAIL PROTECTED]
Automated List Manager                           [EMAIL PROTECTED]

Reply via email to