This patch provides a faster implementation of bn_mul_add_words for x86 assembly. For example, RSA 1024-bit signing throughput (sign/s) improved by about 4.7%. This patch is based on code written by Sean Stanek.

Output from "openssl speed rsa" on an 800 MHz AMD Athlon:

Before patch:
                  sign    verify    sign/s verify/s
rsa  512 bits   0.0017s   0.0001s    594.6   7578.4
rsa 1024 bits   0.0076s   0.0004s    131.1   2731.0
rsa 2048 bits   0.0427s   0.0012s     23.4    843.2
rsa 4096 bits   0.2759s   0.0041s      3.6    242.2

After patch:
                  sign    verify    sign/s verify/s
rsa  512 bits   0.0016s   0.0001s    615.5   7936.6
rsa 1024 bits   0.0073s   0.0003s    137.3   2891.8
rsa 2048 bits   0.0405s   0.0011s     24.7    893.0
rsa 4096 bits   0.2590s   0.0039s      3.9    257.6

Index: crypto/bn/asm/bn-586.pl =================================================================== RCS file: /home/paul/openssl/rsync/cvs/openssl/crypto/bn/asm/bn-586.pl,v retrieving revision 1.5 diff -u -r1.5 bn-586.pl --- crypto/bn/asm/bn-586.pl 2000/12/06 16:30:23 1.5 +++ crypto/bn/asm/bn-586.pl 2001/08/24 21:01:45 @@ -22,78 +22,111 @@ &function_begin($name,""); &comment(""); - $Low="eax"; - $High="edx"; - $a="ebx"; - $w="ebp"; - $r="edi"; - $c="esi"; - - &xor($c,$c); # clear carry - &mov($r,&wparam(0)); # - - &mov("ecx",&wparam(2)); # - &mov($a,&wparam(1)); # - - &and("ecx",0xfffffff8); # num / 8 - &mov($w,&wparam(3)); # - - &push("ecx"); # Up the stack for a tmp variable - - &jz(&label("maw_finish")); - - &set_label("maw_loop",0); - - &mov(&swtmp(0),"ecx"); # - - for ($i=0; $i<32; $i+=4) - { - &comment("Round $i"); - - &mov("eax",&DWP($i,$a,"",0)); # *a - &mul($w); # *a * w - &add("eax",$c); # L(t)+= *r - &mov($c,&DWP($i,$r,"",0)); # L(t)+= *r - &adc("edx",0); # H(t)+=carry - &add("eax",$c); # L(t)+=c - &adc("edx",0); # H(t)+=carry - &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t); - &mov($c,"edx"); # c= H(t); - } - - &comment(""); - &mov("ecx",&swtmp(0)); # - &add($a,32); - &add($r,32); - &sub("ecx",8); - &jnz(&label("maw_loop")); - - &set_label("maw_finish",0); - &mov("ecx",&wparam(2)); # get num - &and("ecx",7); -
&jnz(&label("maw_finish2")); # helps branch prediction - &jmp(&label("maw_end")); - - &set_label("maw_finish2",1); - for ($i=0; $i<7; $i++) - { - &comment("Tail Round $i"); - &mov("eax",&DWP($i*4,$a,"",0));# *a - &mul($w); # *a * w - &add("eax",$c); # L(t)+=c - &mov($c,&DWP($i*4,$r,"",0)); # L(t)+= *r - &adc("edx",0); # H(t)+=carry - &add("eax",$c); - &adc("edx",0); # H(t)+=carry - &dec("ecx") if ($i != 7-1); - &mov(&DWP($i*4,$r,"",0),"eax"); # *r= L(t); - &mov($c,"edx"); # c= H(t); - &jz(&label("maw_end")) if ($i != 7-1); - } - &set_label("maw_end",0); - &mov("eax",$c); + $r = 0; + $a = 1; + $num = 2; + $w = 3; + + &mov("ebx",&wparam($r)); # r + &mov("ebp",&wparam($a)); # a + &mov("ecx",&wparam($num)); # num + + &shr("ecx",3); # num / 8 + &mov("edi",0); # clear carry + &jz(&label("bn_mul_add_words_part")); + + &set_label("bn_mul_add_words_full_loop",0); + &comment("8 dwords per loop (2 dwords per round * 4 rounds per loop)"); + + &push("ecx"); + &mov("ecx",&wparam($w)); # w + + for ($k=0; $k<4; $k++) + { + &comment("Round $k"); + $i = $k * 8; + &mov("eax",&DWP($i,"ebp","",0)); + &mov("esi",&DWP($i+4,"ebp","",0)); + &mul("ecx"); + &add("eax","edi"); + &mov("edi","edx"); + &adc("edi",0); + &add(&DWP($i,"ebx","",0),"eax"); + &mov("eax","esi"); + &adc("edi",0); + &mul("ecx"); + &add("eax","edi"); + &mov("edi","edx"); + &adc("edi",0); + &add(&DWP($i+4,"ebx","",0),"eax"); + &adc("edi",0); + } + + &pop("ecx"); + &add("ebp",32); + &add("ebx",32); + &dec("ecx"); + &jnz(&label("bn_mul_add_words_full_loop")); + + &set_label("bn_mul_add_words_part",0); + &mov("esi",&wparam($num)); # num + &and("esi",7); + &mov("ecx",&wparam($w)); # w + + &comment("do 0..7 more dwords"); + &jmp(&DWP(&label("bn_mul_add_words_array"),"","esi",4)); + + &set_label("bn_mul_add_words_array",1); + &comment("array for quick jump for last 0..7 dwords"); + &data_word(&label("bn_mul_add_words_end")); + for ($i=1; $i<=7; $i++) + { + &data_word(&label("bn_mul_add_words_part_$i")); + } + + for ($j=7; $j>=1; 
$j--) + { + &set_label("bn_mul_add_words_part_$j",1); + for ($k=0; $k<($j>>1); $k++) + { + &comment("Part $j, Round $k"); + $i = $k * 8; + &mov("eax",&DWP($i,"ebp","",0)); + &mov("esi",&DWP($i+4,"ebp","",0)); + &mul("ecx"); + &add("eax","edi"); + &mov("edi","edx"); + &adc("edi",0); + &add(&DWP($i,"ebx","",0),"eax"); + &mov("eax","esi"); + &adc("edi",0); + &mul("ecx"); + &add("eax","edi"); + &mov("edi","edx"); + &adc("edi",0); + &add(&DWP($i+4,"ebx","",0),"eax"); + &adc("edi",0); + } + + if (($j % 2) == 1) + { + &comment("Part $j, one more dword"); + $i = $k * 8; + &mov("eax",&DWP($i,"ebp","",0)); + &mul("ecx"); + &add("eax","edi"); + &mov("edi","edx"); + &adc("edi",0); + &add(&DWP($i,"ebx","",0),"eax"); + &adc("edi",0); + } - &pop("ecx"); # clear variable from + &jmp(&label("bn_mul_add_words_end")) if ($j != 1); + } + + &set_label("bn_mul_add_words_end",0); + + &mov("eax","edi"); &function_end($name); } ______________________________________________________________________ OpenSSL Project http://www.openssl.org Development Mailing List [EMAIL PROTECTED] Automated List Manager [EMAIL PROTECTED]