diff options
Diffstat (limited to 'openssl/crypto/bn/asm')
-rw-r--r-- | openssl/crypto/bn/asm/armv4-mont.pl | 1 | ||||
-rw-r--r-- | openssl/crypto/bn/asm/bn-586.pl | 203 | ||||
-rw-r--r-- | openssl/crypto/bn/asm/co-586.pl | 3 | ||||
-rw-r--r-- | openssl/crypto/bn/asm/mo-586.pl | 603 | ||||
-rw-r--r-- | openssl/crypto/bn/asm/ppc.pl | 233 | ||||
-rw-r--r-- | openssl/crypto/bn/asm/sparcv8plus.S | 15 | ||||
-rw-r--r-- | openssl/crypto/bn/asm/x86_64-gcc.c | 29 | ||||
-rw-r--r-- | openssl/crypto/bn/asm/x86_64-mont.pl | 136 |
8 files changed, 380 insertions, 843 deletions
diff --git a/openssl/crypto/bn/asm/armv4-mont.pl b/openssl/crypto/bn/asm/armv4-mont.pl index 05d5dc1a4..14e0d2d1d 100644 --- a/openssl/crypto/bn/asm/armv4-mont.pl +++ b/openssl/crypto/bn/asm/armv4-mont.pl @@ -193,6 +193,7 @@ bn_mul_mont: bx lr @ interoperable with Thumb ISA:-) .size bn_mul_mont,.-bn_mul_mont .asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 ___ $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 diff --git a/openssl/crypto/bn/asm/bn-586.pl b/openssl/crypto/bn/asm/bn-586.pl index 26c2685a7..332ef3e91 100644 --- a/openssl/crypto/bn/asm/bn-586.pl +++ b/openssl/crypto/bn/asm/bn-586.pl @@ -1,6 +1,7 @@ #!/usr/local/bin/perl -push(@INC,"perlasm","../../perlasm"); +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; &asm_init($ARGV[0],$0); @@ -24,38 +25,25 @@ sub bn_mul_add_words { local($name)=@_; - &function_begin($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":""); + &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":""); - &comment(""); - $Low="eax"; - $High="edx"; - $a="ebx"; - $w="ebp"; - $r="edi"; - $c="esi"; - - &xor($c,$c); # clear carry - &mov($r,&wparam(0)); # - - &mov("ecx",&wparam(2)); # - &mov($a,&wparam(1)); # - - &and("ecx",0xfffffff8); # num / 8 - &mov($w,&wparam(3)); # - - &push("ecx"); # Up the stack for a tmp variable - - &jz(&label("maw_finish")); + $r="eax"; + $a="edx"; + $c="ecx"; if ($sse2) { &picmeup("eax","OPENSSL_ia32cap_P"); &bt(&DWP(0,"eax"),26); - &jnc(&label("maw_loop")); + &jnc(&label("maw_non_sse2")); - &movd("mm0",$w); # mm0 = w + &mov($r,&wparam(0)); + &mov($a,&wparam(1)); + &mov($c,&wparam(2)); + &movd("mm0",&wparam(3)); # mm0 = w &pxor("mm1","mm1"); # mm1 = carry_in - - &set_label("maw_sse2_loop",0); + &jmp(&label("maw_sse2_entry")); + + &set_label("maw_sse2_unrolled",16); &movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0] &paddq("mm1","mm3"); # mm1 = carry_in + r[0] &movd("mm2",&DWP(0,$a,"",0)); # mm2 = a[0] @@ -112,42 +100,82 @@ sub bn_mul_add_words &psrlq("mm1",32); # mm1 = carry6 &paddq("mm1","mm3"); # mm1 = carry6 + r[7] + w*a[7] &movd(&DWP(28,$r,"",0),"mm1"); - &add($r,32); + &lea($r,&DWP(32,$r)); &psrlq("mm1",32); # mm1 = carry_out - &sub("ecx",8); + &sub($c,8); + &jz(&label("maw_sse2_exit")); + &set_label("maw_sse2_entry"); + &test($c,0xfffffff8); + &jnz(&label("maw_sse2_unrolled")); + + &set_label("maw_sse2_loop",4); + &movd("mm2",&DWP(0,$a)); # mm2 = a[i] + &movd("mm3",&DWP(0,$r)); # mm3 = r[i] + &pmuludq("mm2","mm0"); # a[i] *= w + &lea($a,&DWP(4,$a)); + &paddq("mm1","mm3"); # carry += r[i] + &paddq("mm1","mm2"); # carry += a[i]*w + &movd(&DWP(0,$r),"mm1"); # r[i] = carry_low + &sub($c,1); + &psrlq("mm1",32); # carry = carry_high + &lea($r,&DWP(4,$r)); &jnz(&label("maw_sse2_loop")); - - &movd($c,"mm1"); # c = carry_out + &set_label("maw_sse2_exit"); + &movd("eax","mm1"); # c = carry_out &emms(); + &ret(); - &jmp(&label("maw_finish")); + &set_label("maw_non_sse2",16); } - &set_label("maw_loop",0); + # function_begin prologue + &push("ebp"); + &push("ebx"); + &push("esi"); + &push("edi"); + + &comment(""); + $Low="eax"; + $High="edx"; + $a="ebx"; + $w="ebp"; + $r="edi"; + $c="esi"; + + &xor($c,$c); # clear carry + &mov($r,&wparam(0)); # + + &mov("ecx",&wparam(2)); # + &mov($a,&wparam(1)); # + + &and("ecx",0xfffffff8); # num / 8 + &mov($w,&wparam(3)); # - &mov(&swtmp(0),"ecx"); # + &push("ecx"); # Up the stack for a tmp variable + + &jz(&label("maw_finish")); + + &set_label("maw_loop",16); for ($i=0; $i<32; $i+=4) { &comment("Round $i"); - &mov("eax",&DWP($i,$a,"",0)); # *a + &mov("eax",&DWP($i,$a)); # *a &mul($w); # *a * w - &add("eax",$c); # L(t)+= *r - &mov($c,&DWP($i,$r,"",0)); # L(t)+= *r + &add("eax",$c); # L(t)+= c &adc("edx",0); # H(t)+=carry - &add("eax",$c); # L(t)+=c + &add("eax",&DWP($i,$r)); # L(t)+= *r &adc("edx",0); # H(t)+=carry - &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t); + &mov(&DWP($i,$r),"eax"); # *r= L(t); &mov($c,"edx"); # c= H(t); } &comment(""); - &mov("ecx",&swtmp(0)); # - &add($a,32); - &add($r,32); &sub("ecx",8); + &lea($a,&DWP(32,$a)); + &lea($r,&DWP(32,$r)); &jnz(&label("maw_loop")); &set_label("maw_finish",0); @@ -160,16 +188,15 @@ sub bn_mul_add_words for ($i=0; $i<7; $i++) { &comment("Tail Round $i"); - &mov("eax",&DWP($i*4,$a,"",0));# *a + &mov("eax",&DWP($i*4,$a)); # *a &mul($w); # *a * w &add("eax",$c); # L(t)+=c - &mov($c,&DWP($i*4,$r,"",0)); # L(t)+= *r &adc("edx",0); # H(t)+=carry - &add("eax",$c); + &add("eax",&DWP($i*4,$r)); # L(t)+= *r &adc("edx",0); # H(t)+=carry &dec("ecx") if ($i != 7-1); - &mov(&DWP($i*4,$r,"",0),"eax"); # *r= L(t); - &mov($c,"edx"); # c= H(t); + &mov(&DWP($i*4,$r),"eax"); # *r= L(t); + &mov($c,"edx"); # c= H(t); &jz(&label("maw_end")) if ($i != 7-1); } &set_label("maw_end",0); @@ -184,7 +211,45 @@ sub bn_mul_words { local($name)=@_; - &function_begin($name,""); + &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":""); + + $r="eax"; + $a="edx"; + $c="ecx"; + + if ($sse2) { + &picmeup("eax","OPENSSL_ia32cap_P"); + &bt(&DWP(0,"eax"),26); + &jnc(&label("mw_non_sse2")); + + &mov($r,&wparam(0)); + &mov($a,&wparam(1)); + &mov($c,&wparam(2)); + &movd("mm0",&wparam(3)); # mm0 = w + &pxor("mm1","mm1"); # mm1 = carry = 0 + + &set_label("mw_sse2_loop",16); + &movd("mm2",&DWP(0,$a)); # mm2 = a[i] + &pmuludq("mm2","mm0"); # a[i] *= w + &lea($a,&DWP(4,$a)); + &paddq("mm1","mm2"); # carry += a[i]*w + &movd(&DWP(0,$r),"mm1"); # r[i] = carry_low + &sub($c,1); + &psrlq("mm1",32); # carry = carry_high + &lea($r,&DWP(4,$r)); + &jnz(&label("mw_sse2_loop")); + + &movd("eax","mm1"); # return carry + &emms(); + &ret(); + &set_label("mw_non_sse2",16); + } + + # function_begin prologue + &push("ebp"); + &push("ebx"); + &push("esi"); + &push("edi"); &comment(""); $Low="eax"; @@ -257,7 +322,40 @@ sub bn_sqr_words { local($name)=@_; - &function_begin($name,""); + &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":""); + + $r="eax"; + $a="edx"; + $c="ecx"; + + if ($sse2) { + &picmeup("eax","OPENSSL_ia32cap_P"); + &bt(&DWP(0,"eax"),26); + &jnc(&label("sqr_non_sse2")); + + &mov($r,&wparam(0)); + &mov($a,&wparam(1)); + &mov($c,&wparam(2)); + + &set_label("sqr_sse2_loop",16); + &movd("mm0",&DWP(0,$a)); # mm0 = a[i] + &pmuludq("mm0","mm0"); # a[i] *= a[i] + &lea($a,&DWP(4,$a)); # a++ + &movq(&QWP(0,$r),"mm0"); # r[i] = a[i]*a[i] + &sub($c,1); + &lea($r,&DWP(8,$r)); # r += 2 + &jnz(&label("sqr_sse2_loop")); + + &emms(); + &ret(); + &set_label("sqr_non_sse2",16); + } + + # function_begin prologue + &push("ebp"); + &push("ebx"); + &push("esi"); + &push("edi"); &comment(""); $r="esi"; @@ -313,12 +411,13 @@ sub bn_div_words { local($name)=@_; - &function_begin($name,""); + &function_begin_B($name,""); &mov("edx",&wparam(0)); # &mov("eax",&wparam(1)); # - &mov("ebx",&wparam(2)); # - &div("ebx"); - &function_end($name); + &mov("ecx",&wparam(2)); # + &div("ecx"); + &ret(); + &function_end_B($name); } sub bn_add_words diff --git a/openssl/crypto/bn/asm/co-586.pl b/openssl/crypto/bn/asm/co-586.pl index 5d962cb95..57101a6bd 100644 --- a/openssl/crypto/bn/asm/co-586.pl +++ b/openssl/crypto/bn/asm/co-586.pl @@ -1,6 +1,7 @@ #!/usr/local/bin/perl -push(@INC,"perlasm","../../perlasm"); +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; &asm_init($ARGV[0],$0); diff --git a/openssl/crypto/bn/asm/mo-586.pl b/openssl/crypto/bn/asm/mo-586.pl deleted file mode 100644 index 098229309..000000000 --- a/openssl/crypto/bn/asm/mo-586.pl +++ /dev/null @@ -1,603 +0,0 @@ -#!/usr/bin/env perl - -# This is crypto/bn/asm/x86-mont.pl (with asciz from crypto/perlasm/x86asm.pl) -# from OpenSSL 0.9.9-dev - -sub ::asciz -{ my @str=unpack("C*",shift); - push @str,0; - while ($#str>15) { - &data_byte(@str[0..15]); - foreach (0..15) { shift @str; } - } - &data_byte(@str) if (@str); -} - -# ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== - -# October 2005 -# -# This is a "teaser" code, as it can be improved in several ways... -# First of all non-SSE2 path should be implemented (yes, for now it -# performs Montgomery multiplication/convolution only on SSE2-capable -# CPUs such as P4, others fall down to original code). Then inner loop -# can be unrolled and modulo-scheduled to improve ILP and possibly -# moved to 128-bit XMM register bank (though it would require input -# rearrangement and/or increase bus bandwidth utilization). Dedicated -# squaring procedure should give further performance improvement... -# Yet, for being draft, the code improves rsa512 *sign* benchmark by -# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-) - -# December 2006 -# -# Modulo-scheduling SSE2 loops results in further 15-20% improvement. -# Integer-only code [being equipped with dedicated squaring procedure] -# gives ~40% on rsa512 sign benchmark... - -push(@INC,"perlasm","../../perlasm"); -require "x86asm.pl"; - -&asm_init($ARGV[0],$0); - -$sse2=0; -for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } - -&external_label("OPENSSL_ia32cap_P") if ($sse2); - -&function_begin("bn_mul_mont"); - -$i="edx"; -$j="ecx"; -$ap="esi"; $tp="esi"; # overlapping variables!!! -$rp="edi"; $bp="edi"; # overlapping variables!!! -$np="ebp"; -$num="ebx"; - -$_num=&DWP(4*0,"esp"); # stack top layout -$_rp=&DWP(4*1,"esp"); -$_ap=&DWP(4*2,"esp"); -$_bp=&DWP(4*3,"esp"); -$_np=&DWP(4*4,"esp"); -$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp"); -$_sp=&DWP(4*6,"esp"); -$_bpend=&DWP(4*7,"esp"); -$frame=32; # size of above frame rounded up to 16n - - &xor ("eax","eax"); - &mov ("edi",&wparam(5)); # int num - &cmp ("edi",4); - &jl (&label("just_leave")); - - &lea ("esi",&wparam(0)); # put aside pointer to argument block - &lea ("edx",&wparam(1)); # load ap - &mov ("ebp","esp"); # saved stack pointer! - &add ("edi",2); # extra two words on top of tp - &neg ("edi"); - &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2)) - &neg ("edi"); - - # minimize cache contention by arraning 2K window between stack - # pointer and ap argument [np is also position sensitive vector, - # but it's assumed to be near ap, as it's allocated at ~same - # time]. - &mov ("eax","esp"); - &sub ("eax","edx"); - &and ("eax",2047); - &sub ("esp","eax"); # this aligns sp and ap modulo 2048 - - &xor ("edx","esp"); - &and ("edx",2048); - &xor ("edx",2048); - &sub ("esp","edx"); # this splits them apart modulo 4096 - - &and ("esp",-64); # align to cache line - - ################################# load argument block... - &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp - &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap - &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp - &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np - &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0 - #&mov ("edi",&DWP(5*4,"esi"));# int num - - &mov ("esi",&DWP(0,"esi")); # pull n0[0] - &mov ($_rp,"eax"); # ... save a copy of argument block - &mov ($_ap,"ebx"); - &mov ($_bp,"ecx"); - &mov ($_np,"edx"); - &mov ($_n0,"esi"); - &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling - #&mov ($_num,$num); # redundant as $num is not reused - &mov ($_sp,"ebp"); # saved stack pointer! - -if($sse2) { -$acc0="mm0"; # mmx register bank layout -$acc1="mm1"; -$car0="mm2"; -$car1="mm3"; -$mul0="mm4"; -$mul1="mm5"; -$temp="mm6"; -$mask="mm7"; - - &picmeup("eax","OPENSSL_ia32cap_P"); - &bt (&DWP(0,"eax"),26); - &jnc (&label("non_sse2")); - - &mov ("eax",-1); - &movd ($mask,"eax"); # mask 32 lower bits - - &mov ($ap,$_ap); # load input pointers - &mov ($bp,$_bp); - &mov ($np,$_np); - - &xor ($i,$i); # i=0 - &xor ($j,$j); # j=0 - - &movd ($mul0,&DWP(0,$bp)); # bp[0] - &movd ($mul1,&DWP(0,$ap)); # ap[0] - &movd ($car1,&DWP(0,$np)); # np[0] - - &pmuludq($mul1,$mul0); # ap[0]*bp[0] - &movq ($car0,$mul1); - &movq ($acc0,$mul1); # I wish movd worked for - &pand ($acc0,$mask); # inter-register transfers - - &pmuludq($mul1,$_n0q); # *=n0 - - &pmuludq($car1,$mul1); # "t[0]"*np[0]*n0 - &paddq ($car1,$acc0); - - &movd ($acc1,&DWP(4,$np)); # np[1] - &movd ($acc0,&DWP(4,$ap)); # ap[1] - - &psrlq ($car0,32); - &psrlq ($car1,32); - - &inc ($j); # j++ -&set_label("1st",16); - &pmuludq($acc0,$mul0); # ap[j]*bp[0] - &pmuludq($acc1,$mul1); # np[j]*m1 - &paddq ($car0,$acc0); # +=c0 - &paddq ($car1,$acc1); # +=c1 - - &movq ($acc0,$car0); - &pand ($acc0,$mask); - &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1] - &paddq ($car1,$acc0); # +=ap[j]*bp[0]; - &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1] - &psrlq ($car0,32); - &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]= - &psrlq ($car1,32); - - &lea ($j,&DWP(1,$j)); - &cmp ($j,$num); - &jl (&label("1st")); - - &pmuludq($acc0,$mul0); # ap[num-1]*bp[0] - &pmuludq($acc1,$mul1); # np[num-1]*m1 - &paddq ($car0,$acc0); # +=c0 - &paddq ($car1,$acc1); # +=c1 - - &movq ($acc0,$car0); - &pand ($acc0,$mask); - &paddq ($car1,$acc0); # +=ap[num-1]*bp[0]; - &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]= - - &psrlq ($car0,32); - &psrlq ($car1,32); - - &paddq ($car1,$car0); - &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1] - - &inc ($i); # i++ -&set_label("outer"); - &xor ($j,$j); # j=0 - - &movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i] - &movd ($mul1,&DWP(0,$ap)); # ap[0] - &movd ($temp,&DWP($frame,"esp")); # tp[0] - &movd ($car1,&DWP(0,$np)); # np[0] - &pmuludq($mul1,$mul0); # ap[0]*bp[i] - - &paddq ($mul1,$temp); # +=tp[0] - &movq ($acc0,$mul1); - &movq ($car0,$mul1); - &pand ($acc0,$mask); - - &pmuludq($mul1,$_n0q); # *=n0 - - &pmuludq($car1,$mul1); - &paddq ($car1,$acc0); - - &movd ($temp,&DWP($frame+4,"esp")); # tp[1] - &movd ($acc1,&DWP(4,$np)); # np[1] - &movd ($acc0,&DWP(4,$ap)); # ap[1] - - &psrlq ($car0,32); - &psrlq ($car1,32); - &paddq ($car0,$temp); # +=tp[1] - - &inc ($j); # j++ - &dec ($num); -&set_label("inner"); - &pmuludq($acc0,$mul0); # ap[j]*bp[i] - &pmuludq($acc1,$mul1); # np[j]*m1 - &paddq ($car0,$acc0); # +=c0 - &paddq ($car1,$acc1); # +=c1 - - &movq ($acc0,$car0); - &movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1] - &pand ($acc0,$mask); - &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1] - &paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j] - &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1] - &psrlq ($car0,32); - &movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]= - &psrlq ($car1,32); - &paddq ($car0,$temp); # +=tp[j+1] - - &dec ($num); - &lea ($j,&DWP(1,$j)); # j++ - &jnz (&label("inner")); - - &mov ($num,$j); - &pmuludq($acc0,$mul0); # ap[num-1]*bp[i] - &pmuludq($acc1,$mul1); # np[num-1]*m1 - &paddq ($car0,$acc0); # +=c0 - &paddq ($car1,$acc1); # +=c1 - - &movq ($acc0,$car0); - &pand ($acc0,$mask); - &paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1] - &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]= - &psrlq ($car0,32); - &psrlq ($car1,32); - - &movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num] - &paddq ($car1,$car0); - &paddq ($car1,$temp); - &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1] - - &lea ($i,&DWP(1,$i)); # i++ - &cmp ($i,$num); - &jle (&label("outer")); - - &emms (); # done with mmx bank - &jmp (&label("common_tail")); - -&set_label("non_sse2",16); -} - -if (0) { - &mov ("esp",$_sp); - &xor ("eax","eax"); # signal "not fast enough [yet]" - &jmp (&label("just_leave")); - # While the below code provides competitive performance for - # all key lengthes on modern Intel cores, it's still more - # than 10% slower for 4096-bit key elsewhere:-( "Competitive" - # means compared to the original integer-only assembler. - # 512-bit RSA sign is better by ~40%, but that's about all - # one can say about all CPUs... -} else { -$inp="esi"; # integer path uses these registers differently -$word="edi"; -$carry="ebp"; - - &mov ($inp,$_ap); - &lea ($carry,&DWP(1,$num)); - &mov ($word,$_bp); - &xor ($j,$j); # j=0 - &mov ("edx",$inp); - &and ($carry,1); # see if num is even - &sub ("edx",$word); # see if ap==bp - &lea ("eax",&DWP(4,$word,$num,4)); # &bp[num] - &or ($carry,"edx"); - &mov ($word,&DWP(0,$word)); # bp[0] - &jz (&label("bn_sqr_mont")); - &mov ($_bpend,"eax"); - &mov ("eax",&DWP(0,$inp)); - &xor ("edx","edx"); - -&set_label("mull",16); - &mov ($carry,"edx"); - &mul ($word); # ap[j]*bp[0] - &add ($carry,"eax"); - &lea ($j,&DWP(1,$j)); - &adc ("edx",0); - &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1] - &cmp ($j,$num); - &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= - &jl (&label("mull")); - - &mov ($carry,"edx"); - &mul ($word); # ap[num-1]*bp[0] - &mov ($word,$_n0); - &add ("eax",$carry); - &mov ($inp,$_np); - &adc ("edx",0); - &imul ($word,&DWP($frame,"esp")); # n0*tp[0] - - &mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]= - &xor ($j,$j); - &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]= - &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]= - - &mov ("eax",&DWP(0,$inp)); # np[0] - &mul ($word); # np[0]*m - &add ("eax",&DWP($frame,"esp")); # +=tp[0] - &mov ("eax",&DWP(4,$inp)); # np[1] - &adc ("edx",0); - &inc ($j); - - &jmp (&label("2ndmadd")); - -&set_label("1stmadd",16); - &mov ($carry,"edx"); - &mul ($word); # ap[j]*bp[i] - &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] - &lea ($j,&DWP(1,$j)); - &adc ("edx",0); - &add ($carry,"eax"); - &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1] - &adc ("edx",0); - &cmp ($j,$num); - &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= - &jl (&label("1stmadd")); - - &mov ($carry,"edx"); - &mul ($word); # ap[num-1]*bp[i] - &add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1] - &mov ($word,$_n0); - &adc ("edx",0); - &mov ($inp,$_np); - &add ($carry,"eax"); - &adc ("edx",0); - &imul ($word,&DWP($frame,"esp")); # n0*tp[0] - - &xor ($j,$j); - &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] - &mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]= - &adc ($j,0); - &mov ("eax",&DWP(0,$inp)); # np[0] - &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]= - &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]= - - &mul ($word); # np[0]*m - &add ("eax",&DWP($frame,"esp")); # +=tp[0] - &mov ("eax",&DWP(4,$inp)); # np[1] - &adc ("edx",0); - &mov ($j,1); - -&set_label("2ndmadd",16); - &mov ($carry,"edx"); - &mul ($word); # np[j]*m - &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] - &lea ($j,&DWP(1,$j)); - &adc ("edx",0); - &add ($carry,"eax"); - &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1] - &adc ("edx",0); - &cmp ($j,$num); - &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]= - &jl (&label("2ndmadd")); - - &mov ($carry,"edx"); - &mul ($word); # np[j]*m - &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1] - &adc ("edx",0); - &add ($carry,"eax"); - &adc ("edx",0); - &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]= - - &xor ("eax","eax"); - &mov ($j,$_bp); # &bp[i] - &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] - &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1] - &lea ($j,&DWP(4,$j)); - &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]= - &cmp ($j,$_bpend); - &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]= - &je (&label("common_tail")); - - &mov ($word,&DWP(0,$j)); # bp[i+1] - &mov ($inp,$_ap); - &mov ($_bp,$j); # &bp[++i] - &xor ($j,$j); - &xor ("edx","edx"); - &mov ("eax",&DWP(0,$inp)); - &jmp (&label("1stmadd")); - -&set_label("bn_sqr_mont",16); -$sbit=$num; - &mov ($_num,$num); - &mov ($_bp,$j); # i=0 - - &mov ("eax",$word); # ap[0] - &mul ($word); # ap[0]*ap[0] - &mov (&DWP($frame,"esp"),"eax"); # tp[0]= - &mov ($sbit,"edx"); - &shr ("edx",1); - &and ($sbit,1); - &inc ($j); -&set_label("sqr",16); - &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j] - &mov ($carry,"edx"); - &mul ($word); # ap[j]*ap[0] - &add ("eax",$carry); - &lea ($j,&DWP(1,$j)); - &adc ("edx",0); - &lea ($carry,&DWP(0,$sbit,"eax",2)); - &shr ("eax",31); - &cmp ($j,$_num); - &mov ($sbit,"eax"); - &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= - &jl (&label("sqr")); - - &mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1] - &mov ($carry,"edx"); - &mul ($word); # ap[num-1]*ap[0] - &add ("eax",$carry); - &mov ($word,$_n0); - &adc ("edx",0); - &mov ($inp,$_np); - &lea ($carry,&DWP(0,$sbit,"eax",2)); - &imul ($word,&DWP($frame,"esp")); # n0*tp[0] - &shr ("eax",31); - &mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]= - - &lea ($carry,&DWP(0,"eax","edx",2)); - &mov ("eax",&DWP(0,$inp)); # np[0] - &shr ("edx",31); - &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]= - &mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]= - - &mul ($word); # np[0]*m - &add ("eax",&DWP($frame,"esp")); # +=tp[0] - &mov ($num,$j); - &adc ("edx",0); - &mov ("eax",&DWP(4,$inp)); # np[1] - &mov ($j,1); - -&set_label("3rdmadd",16); - &mov ($carry,"edx"); - &mul ($word); # np[j]*m - &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] - &adc ("edx",0); - &add ($carry,"eax"); - &mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1] - &adc ("edx",0); - &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]= - - &mov ($carry,"edx"); - &mul ($word); # np[j+1]*m - &add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1] - &lea ($j,&DWP(2,$j)); - &adc ("edx",0); - &add ($carry,"eax"); - &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2] - &adc ("edx",0); - &cmp ($j,$num); - &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]= - &jl (&label("3rdmadd")); - - &mov ($carry,"edx"); - &mul ($word); # np[j]*m - &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1] - &adc ("edx",0); - &add ($carry,"eax"); - &adc ("edx",0); - &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]= - - &mov ($j,$_bp); # i - &xor ("eax","eax"); - &mov ($inp,$_ap); - &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] - &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1] - &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]= - &cmp ($j,$num); - &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]= - &je (&label("common_tail")); - - &mov ($word,&DWP(4,$inp,$j,4)); # ap[i] - &lea ($j,&DWP(1,$j)); - &mov ("eax",$word); - &mov ($_bp,$j); # ++i - &mul ($word); # ap[i]*ap[i] - &add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i] - &adc ("edx",0); - &mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]= - &xor ($carry,$carry); - &cmp ($j,$num); - &lea ($j,&DWP(1,$j)); - &je (&label("sqrlast")); - - &mov ($sbit,"edx"); # zaps $num - &shr ("edx",1); - &and ($sbit,1); -&set_label("sqradd",16); - &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j] - &mov ($carry,"edx"); - &mul ($word); # ap[j]*ap[i] - &add ("eax",$carry); - &lea ($carry,&DWP(0,"eax","eax")); - &adc ("edx",0); - &shr ("eax",31); - &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] - &lea ($j,&DWP(1,$j)); - &adc ("eax",0); - &add ($carry,$sbit); - &adc ("eax",0); - &cmp ($j,$_num); - &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= - &mov ($sbit,"eax"); - &jle (&label("sqradd")); - - &mov ($carry,"edx"); - &lea ("edx",&DWP(0,$sbit,"edx",2)); - &shr ($carry,31); -&set_label("sqrlast"); - &mov ($word,$_n0); - &mov ($inp,$_np); - &imul ($word,&DWP($frame,"esp")); # n0*tp[0] - - &add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num] - &mov ("eax",&DWP(0,$inp)); # np[0] - &adc ($carry,0); - &mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]= - &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]= - - &mul ($word); # np[0]*m - &add ("eax",&DWP($frame,"esp")); # +=tp[0] - &lea ($num,&DWP(-1,$j)); - &adc ("edx",0); - &mov ($j,1); - &mov ("eax",&DWP(4,$inp)); # np[1] - - &jmp (&label("3rdmadd")); -} - -&set_label("common_tail",16); - &mov ($np,$_np); # load modulus pointer - &mov ($rp,$_rp); # load result pointer - &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped] - - &mov ("eax",&DWP(0,$tp)); # tp[0] - &mov ($j,$num); # j=num-1 - &xor ($i,$i); # i=0 and clear CF! - -&set_label("sub",16); - &sbb ("eax",&DWP(0,$np,$i,4)); - &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i] - &dec ($j); # doesn't affect CF! - &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1] - &lea ($i,&DWP(1,$i)); # i++ - &jge (&label("sub")); - - &sbb ("eax",0); # handle upmost overflow bit - &and ($tp,"eax"); - ¬ ("eax"); - &mov ($np,$rp); - &and ($np,"eax"); - &or ($tp,$np); # tp=carry?tp:rp - -&set_label("copy",16); # copy or in-place refresh - &mov ("eax",&DWP(0,$tp,$num,4)); - &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i] - &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector - &dec ($num); - &jge (&label("copy")); - - &mov ("esp",$_sp); # pull saved stack pointer - &mov ("eax",1); -&set_label("just_leave"); -&function_end("bn_mul_mont"); - -&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>"); - -&asm_finish(); diff --git a/openssl/crypto/bn/asm/ppc.pl b/openssl/crypto/bn/asm/ppc.pl index 08e005347..37c65d351 100644 --- a/openssl/crypto/bn/asm/ppc.pl +++ b/openssl/crypto/bn/asm/ppc.pl @@ -100,9 +100,9 @@ # me a note at schari@us.ibm.com # -$opf = shift; +$flavour = shift; -if ($opf =~ /32\.s/) { +if ($flavour =~ /32/) { $BITS= 32; $BNSZ= $BITS/8; $ISA= "\"ppc\""; @@ -125,7 +125,7 @@ if ($opf =~ /32\.s/) { $INSR= "insrwi"; # insert right $ROTL= "rotlwi"; # rotate left by immediate $TR= "tw"; # conditional trap -} elsif ($opf =~ /64\.s/) { +} elsif ($flavour =~ /64/) { $BITS= 64; $BNSZ= $BITS/8; $ISA= "\"ppc64\""; @@ -149,93 +149,16 @@ if ($opf =~ /32\.s/) { $INSR= "insrdi"; # insert right $ROTL= "rotldi"; # rotate left by immediate $TR= "td"; # conditional trap -} else { die "nonsense $opf"; } +} else { die "nonsense $flavour"; } -( defined shift || open STDOUT,">$opf" ) || die "can't open $opf: $!"; +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; -# function entry points from the AIX code -# -# There are other, more elegant, ways to handle this. We (IBM) chose -# this approach as it plays well with scripts we run to 'namespace' -# OpenSSL .i.e. we add a prefix to all the public symbols so we can -# co-exist in the same process with other implementations of OpenSSL. -# 'cleverer' ways of doing these substitutions tend to hide data we -# need to be obvious. -# -my @items = ("bn_sqr_comba4", - "bn_sqr_comba8", - "bn_mul_comba4", - "bn_mul_comba8", - "bn_sub_words", - "bn_add_words", - "bn_div_words", - "bn_sqr_words", - "bn_mul_words", - "bn_mul_add_words"); +open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; -if ($opf =~ /linux/) { do_linux(); } -elsif ($opf =~ /aix/) { do_aix(); } -elsif ($opf =~ /osx/) { do_osx(); } -else { do_bsd(); } - -sub do_linux { - $d=&data(); - - if ($BITS==64) { - foreach $t (@items) { - $d =~ s/\.$t:/\ -\t.section\t".opd","aw"\ -\t.align\t3\ -\t.globl\t$t\ -$t:\ -\t.quad\t.$t,.TOC.\@tocbase,0\ -\t.size\t$t,24\ -\t.previous\n\ -\t.type\t.$t,\@function\ -\t.globl\t.$t\ -.$t:/g; - } - } - else { - foreach $t (@items) { - $d=~s/\.$t/$t/g; - } - } - # hide internal labels to avoid pollution of name table... - $d=~s/Lppcasm_/.Lppcasm_/gm; - print $d; -} - -sub do_aix { - # AIX assembler is smart enough to please the linker without - # making us do something special... - print &data(); -} - -# MacOSX 32 bit -sub do_osx { - $d=&data(); - # Change the bn symbol prefix from '.' to '_' - foreach $t (@items) { - $d=~s/\.$t/_$t/g; - } - # Change .machine to something OS X asm will accept - $d=~s/\.machine.*/.text/g; - $d=~s/\#/;/g; # change comment from '#' to ';' - print $d; -} - -# BSD (Untested) -sub do_bsd { - $d=&data(); - foreach $t (@items) { - $d=~s/\.$t/_$t/g; - } - print $d; -} - -sub data { - local($data)=<<EOF; +$data=<<EOF; #-------------------------------------------------------------------- # # @@ -297,33 +220,20 @@ sub data { # # Defines to be used in the assembly code. # -.set r0,0 # we use it as storage for value of 0 -.set SP,1 # preserved -.set RTOC,2 # preserved -.set r3,3 # 1st argument/return value -.set r4,4 # 2nd argument/volatile register -.set r5,5 # 3rd argument/volatile register -.set r6,6 # ... -.set r7,7 -.set r8,8 -.set r9,9 -.set r10,10 -.set r11,11 -.set r12,12 -.set r13,13 # not used, nor any other "below" it... - -.set BO_IF_NOT,4 -.set BO_IF,12 -.set BO_dCTR_NZERO,16 -.set BO_dCTR_ZERO,18 -.set BO_ALWAYS,20 -.set CR0_LT,0; -.set CR0_GT,1; -.set CR0_EQ,2 -.set CR1_FX,4; -.set CR1_FEX,5; -.set CR1_VX,6 -.set LR,8 +#.set r0,0 # we use it as storage for value of 0 +#.set SP,1 # preserved +#.set RTOC,2 # preserved +#.set r3,3 # 1st argument/return value +#.set r4,4 # 2nd argument/volatile register +#.set r5,5 # 3rd argument/volatile register +#.set r6,6 # ... +#.set r7,7 +#.set r8,8 +#.set r9,9 +#.set r10,10 +#.set r11,11 +#.set r12,12 +#.set r13,13 # not used, nor any other "below" it... # Declare function names to be global # NOTE: For gcc these names MUST be changed to remove @@ -344,7 +254,7 @@ sub data { # .text section - .machine $ISA + .machine "any" # # NOTE: The following label name should be changed to @@ -478,7 +388,7 @@ sub data { $ST r9,`6*$BNSZ`(r3) #r[6]=c1 $ST r10,`7*$BNSZ`(r3) #r[7]=c2 - bclr BO_ALWAYS,CR0_LT + blr .long 0x00000000 # @@ -903,7 +813,7 @@ sub data { $ST r9, `15*$BNSZ`(r3) #r[15]=c1; - bclr BO_ALWAYS,CR0_LT + blr .long 0x00000000 @@ -1055,7 +965,7 @@ sub data { $ST r10,`6*$BNSZ`(r3) #r[6]=c1 $ST r11,`7*$BNSZ`(r3) #r[7]=c2 - bclr BO_ALWAYS,CR0_LT + blr .long 0x00000000 # @@ -1591,7 +1501,7 @@ sub data { adde r10,r10,r9 $ST r12,`14*$BNSZ`(r3) #r[14]=c3; $ST r10,`15*$BNSZ`(r3) #r[15]=c1; - bclr BO_ALWAYS,CR0_LT + blr .long 0x00000000 # @@ -1623,7 +1533,7 @@ sub data { subfc. r7,r0,r6 # If r6 is 0 then result is 0. # if r6 > 0 then result !=0 # In either case carry bit is set. - bc BO_IF,CR0_EQ,Lppcasm_sub_adios + beq Lppcasm_sub_adios addi r4,r4,-$BNSZ addi r3,r3,-$BNSZ addi r5,r5,-$BNSZ @@ -1635,11 +1545,11 @@ Lppcasm_sub_mainloop: # if carry = 1 this is r7-r8. Else it # is r7-r8 -1 as we need. $STU r6,$BNSZ(r3) - bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_sub_mainloop + bdnz- Lppcasm_sub_mainloop Lppcasm_sub_adios: subfze r3,r0 # if carry bit is set then r3 = 0 else -1 andi. r3,r3,1 # keep only last bit. - bclr BO_ALWAYS,CR0_LT + blr .long 0x00000000 @@ -1670,7 +1580,7 @@ Lppcasm_sub_adios: # check for r6 = 0. Is this needed? # addic. r6,r6,0 #test r6 and clear carry bit. - bc BO_IF,CR0_EQ,Lppcasm_add_adios + beq Lppcasm_add_adios addi r4,r4,-$BNSZ addi r3,r3,-$BNSZ addi r5,r5,-$BNSZ @@ -1680,10 +1590,10 @@ Lppcasm_add_mainloop: $LDU r8,$BNSZ(r5) adde r8,r7,r8 $STU r8,$BNSZ(r3) - bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_add_mainloop + bdnz- Lppcasm_add_mainloop Lppcasm_add_adios: addze r3,r0 #return carry bit. - bclr BO_ALWAYS,CR0_LT + blr .long 0x00000000 # @@ -1707,24 +1617,24 @@ Lppcasm_add_adios: # r5 = d $UCMPI 0,r5,0 # compare r5 and 0 - bc BO_IF_NOT,CR0_EQ,Lppcasm_div1 # proceed if d!=0 + bne Lppcasm_div1 # proceed if d!=0 li r3,-1 # d=0 return -1 - bclr BO_ALWAYS,CR0_LT + blr Lppcasm_div1: xor r0,r0,r0 #r0=0 li r8,$BITS $CNTLZ. r7,r5 #r7 = num leading 0s in d. - bc BO_IF,CR0_EQ,Lppcasm_div2 #proceed if no leading zeros + beq Lppcasm_div2 #proceed if no leading zeros subf r8,r7,r8 #r8 = BN_num_bits_word(d) $SHR. r9,r3,r8 #are there any bits above r8'th? $TR 16,r9,r0 #if there're, signal to dump core... Lppcasm_div2: $UCMP 0,r3,r5 #h>=d? - bc BO_IF,CR0_LT,Lppcasm_div3 #goto Lppcasm_div3 if not + blt Lppcasm_div3 #goto Lppcasm_div3 if not subf r3,r5,r3 #h-=d ; Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i cmpi 0,0,r7,0 # is (i == 0)? - bc BO_IF,CR0_EQ,Lppcasm_div4 + beq Lppcasm_div4 $SHL r3,r3,r7 # h = (h<< i) $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i) $SHL r5,r5,r7 # d<<=i @@ -1741,7 +1651,7 @@ Lppcasm_divouterloop: $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4 # compute here for innerloop. $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh - bc BO_IF_NOT,CR0_EQ,Lppcasm_div5 # goto Lppcasm_div5 if not + bne Lppcasm_div5 # goto Lppcasm_div5 if not li r8,-1 $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l @@ -1762,9 +1672,9 @@ Lppcasm_divinnerloop: # the following 2 instructions do that $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4) or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4) - $UCMP 1,r6,r7 # compare (tl <= r7) - bc BO_IF_NOT,CR0_EQ,Lppcasm_divinnerexit - bc BO_IF_NOT,CR1_FEX,Lppcasm_divinnerexit + $UCMP cr1,r6,r7 # compare (tl <= r7) + bne Lppcasm_divinnerexit + ble cr1,Lppcasm_divinnerexit addi r8,r8,-1 #q-- subf r12,r9,r12 #th -=dh $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop. @@ -1773,14 +1683,14 @@ Lppcasm_divinnerloop: Lppcasm_divinnerexit: $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4) $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h; - $UCMP 1,r4,r11 # compare l and tl + $UCMP cr1,r4,r11 # compare l and tl add r12,r12,r10 # th+=t - bc BO_IF_NOT,CR1_FX,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7 + bge cr1,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7 addi r12,r12,1 # th++ Lppcasm_div7: subf r11,r11,r4 #r11=l-tl - $UCMP 1,r3,r12 #compare h and th - bc BO_IF_NOT,CR1_FX,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8 + $UCMP cr1,r3,r12 #compare h and th + bge cr1,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8 addi r8,r8,-1 # q-- add r3,r5,r3 # h+=d Lppcasm_div8: @@ -1791,12 +1701,12 @@ Lppcasm_div8: # the following 2 instructions will do this. $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2. $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3 - bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_div9#if (count==0) break ; + bdz Lppcasm_div9 #if (count==0) break ; $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4 b Lppcasm_divouterloop Lppcasm_div9: or r3,r8,r0 - bclr BO_ALWAYS,CR0_LT + blr .long 0x00000000 # @@ -1822,7 +1732,7 @@ Lppcasm_div9: # No unrolling done here. Not performance critical. addic. r5,r5,0 #test r5. - bc BO_IF,CR0_EQ,Lppcasm_sqr_adios + beq Lppcasm_sqr_adios addi r4,r4,-$BNSZ addi r3,r3,-$BNSZ mtctr r5 @@ -1833,9 +1743,9 @@ Lppcasm_sqr_mainloop: $UMULH r8,r6,r6 $STU r7,$BNSZ(r3) $STU r8,$BNSZ(r3) - bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_sqr_mainloop + bdnz- Lppcasm_sqr_mainloop Lppcasm_sqr_adios: - bclr BO_ALWAYS,CR0_LT + blr .long 0x00000000 @@ -1858,7 +1768,7 @@ Lppcasm_sqr_adios: xor r0,r0,r0 xor r12,r12,r12 # used for carry rlwinm. r7,r5,30,2,31 # num >> 2 - bc BO_IF,CR0_EQ,Lppcasm_mw_REM + beq Lppcasm_mw_REM mtctr r7 Lppcasm_mw_LOOP: #mul(rp[0],ap[0],w,c1); @@ -1896,11 +1806,11 @@ Lppcasm_mw_LOOP: addi r3,r3,`4*$BNSZ` addi r4,r4,`4*$BNSZ` - bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_mw_LOOP + bdnz- Lppcasm_mw_LOOP Lppcasm_mw_REM: andi. r5,r5,0x3 - bc BO_IF,CR0_EQ,Lppcasm_mw_OVER + beq Lppcasm_mw_OVER #mul(rp[0],ap[0],w,c1); $LD r8,`0*$BNSZ`(r4) $UMULL r9,r6,r8 @@ -1912,7 +1822,7 @@ Lppcasm_mw_REM: addi r5,r5,-1 cmpli 0,0,r5,0 - bc BO_IF,CR0_EQ,Lppcasm_mw_OVER + beq Lppcasm_mw_OVER #mul(rp[1],ap[1],w,c1); @@ -1926,7 +1836,7 @@ Lppcasm_mw_REM: addi r5,r5,-1 cmpli 0,0,r5,0 - bc BO_IF,CR0_EQ,Lppcasm_mw_OVER + beq Lppcasm_mw_OVER #mul_add(rp[2],ap[2],w,c1); $LD r8,`2*$BNSZ`(r4) @@ -1939,7 +1849,7 @@ Lppcasm_mw_REM: Lppcasm_mw_OVER: addi r3,r12,0 - bclr BO_ALWAYS,CR0_LT + blr .long 0x00000000 # @@ -1964,7 +1874,7 @@ Lppcasm_mw_OVER: xor r0,r0,r0 #r0 = 0 xor r12,r12,r12 #r12 = 0 . used for carry rlwinm. r7,r5,30,2,31 # num >> 2 - bc BO_IF,CR0_EQ,Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover + beq Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover mtctr r7 Lppcasm_maw_mainloop: #mul_add(rp[0],ap[0],w,c1); @@ -2017,11 +1927,11 @@ Lppcasm_maw_mainloop: $ST r11,`3*$BNSZ`(r3) addi r3,r3,`4*$BNSZ` addi r4,r4,`4*$BNSZ` - bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_maw_mainloop + bdnz- Lppcasm_maw_mainloop Lppcasm_maw_leftover: andi. r5,r5,0x3 - bc BO_IF,CR0_EQ,Lppcasm_maw_adios + beq Lppcasm_maw_adios addi r3,r3,-$BNSZ addi r4,r4,-$BNSZ #mul_add(rp[0],ap[0],w,c1); @@ -2036,7 +1946,7 @@ Lppcasm_maw_leftover: addze r12,r10 $ST r9,0(r3) - bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios + bdz Lppcasm_maw_adios #mul_add(rp[1],ap[1],w,c1); $LDU r8,$BNSZ(r4) $UMULL r9,r6,r8 @@ -2048,7 +1958,7 @@ Lppcasm_maw_leftover: addze r12,r10 $ST r9,0(r3) - bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios + bdz Lppcasm_maw_adios #mul_add(rp[2],ap[2],w,c1); $LDU r8,$BNSZ(r4) $UMULL r9,r6,r8 @@ -2062,17 +1972,10 @@ Lppcasm_maw_leftover: Lppcasm_maw_adios: addi r3,r12,0 - bclr BO_ALWAYS,CR0_LT + blr .long 0x00000000 .align 4 EOF - $data =~ s/\`([^\`]*)\`/eval $1/gem; - - # if some assembler chokes on some simplified mnemonic, - # this is the spot to fix it up, e.g.: - # GNU as doesn't seem to accept cmplw, 32-bit unsigned compare - $data =~ s/^(\s*)cmplw(\s+)([^,]+),(.*)/$1cmpl$2$3,0,$4/gm; - # assembler X doesn't accept li, load immediate value - #$data =~ s/^(\s*)li(\s+)([^,]+),(.*)/$1addi$2$3,0,$4/gm; - return($data); -} +$data =~ s/\`([^\`]*)\`/eval $1/gem; +print $data; +close STDOUT; diff --git a/openssl/crypto/bn/asm/sparcv8plus.S b/openssl/crypto/bn/asm/sparcv8plus.S index 8c56e2e7e..63de1860f 100644 --- a/openssl/crypto/bn/asm/sparcv8plus.S +++ b/openssl/crypto/bn/asm/sparcv8plus.S @@ -144,6 +144,19 @@ * } */ +#if defined(__SUNPRO_C) && defined(__sparcv9) + /* They've said -xarch=v9 at command line */ + .register %g2,#scratch + .register %g3,#scratch +# define FRAME_SIZE -192 +#elif defined(__GNUC__) && defined(__arch64__) + /* They've said -m64 at command line */ + .register %g2,#scratch + .register %g3,#scratch +# define FRAME_SIZE -192 +#else +# define FRAME_SIZE -96 +#endif /* * GNU assembler can't stand stuw:-( */ @@ -619,8 +632,6 @@ bn_sub_words: * Andy. */ -#define FRAME_SIZE -96 - /* * Here is register usage map for *all* routines below. */ diff --git a/openssl/crypto/bn/asm/x86_64-gcc.c b/openssl/crypto/bn/asm/x86_64-gcc.c index f13f52dd8..acb0b4011 100644 --- a/openssl/crypto/bn/asm/x86_64-gcc.c +++ b/openssl/crypto/bn/asm/x86_64-gcc.c @@ -1,4 +1,5 @@ -#ifdef __SUNPRO_C +#include "../bn_lcl.h" +#if !(defined(__GNUC__) && __GNUC__>=2) # include "../bn_asm.c" /* kind of dirty hack for Sun Studio */ #else /* @@ -54,7 +55,15 @@ * machine. */ +#ifdef _WIN64 +#define BN_ULONG unsigned long long +#else #define BN_ULONG unsigned long +#endif + +#undef mul +#undef mul_add +#undef sqr /* * "m"(a), "+m"(r) is the way to favor DirectPath µ-code; @@ -97,7 +106,7 @@ : "a"(a) \ : "cc"); -BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) +BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) { BN_ULONG c1=0; @@ -121,7 +130,7 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) return(c1); } -BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) +BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) { BN_ULONG c1=0; @@ -144,7 +153,7 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) return(c1); } -void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n) +void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) { if (n <= 0) return; @@ -175,14 +184,14 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) return ret; } -BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n) +BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n) { BN_ULONG ret=0,i=0; if (n <= 0) return 0; asm ( " subq %2,%2 \n" - ".align 16 \n" + ".p2align 4 \n" "1: movq (%4,%2,8),%0 \n" " adcq (%5,%2,8),%0 \n" " movq %0,(%3,%2,8) \n" @@ -198,14 +207,14 @@ BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n) } #ifndef SIMICS -BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n) +BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n) { BN_ULONG ret=0,i=0; if (n <= 0) return 0; asm ( " subq %2,%2 \n" - ".align 16 \n" + ".p2align 4 \n" "1: movq (%4,%2,8),%0 \n" " sbbq (%5,%2,8),%0 \n" " movq %0,(%3,%2,8) \n" @@ -485,7 +494,7 @@ void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) r[7]=c2; } -void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) +void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) { BN_ULONG t1,t2; BN_ULONG c1,c2,c3; @@ -561,7 +570,7 @@ void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) r[15]=c1; } -void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) +void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) { BN_ULONG t1,t2; BN_ULONG c1,c2,c3; diff --git a/openssl/crypto/bn/asm/x86_64-mont.pl b/openssl/crypto/bn/asm/x86_64-mont.pl index c43b69592..3b7a6f243 100644 --- a/openssl/crypto/bn/asm/x86_64-mont.pl +++ b/openssl/crypto/bn/asm/x86_64-mont.pl @@ -15,14 +15,18 @@ # respectful 50%. It remains to be seen if loop unrolling and # dedicated squaring routine can provide further improvement... -$output=shift; +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; -open STDOUT,"| $^X $xlate $output"; +open STDOUT,"| $^X $xlate $flavour $output"; # int bn_mul_mont( $rp="%rdi"; # BN_ULONG *rp, @@ -55,13 +59,14 @@ bn_mul_mont: push %r15 mov ${num}d,${num}d - lea 2($num),%rax - mov %rsp,%rbp - neg %rax - lea (%rsp,%rax,8),%rsp # tp=alloca(8*(num+2)) + lea 2($num),%r10 + mov %rsp,%r11 + neg %r10 + lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+2)) and \$-1024,%rsp # minimize TLB usage - mov %rbp,8(%rsp,$num,8) # tp[num+1]=%rsp + mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp +.Lprologue: mov %rdx,$bp # $bp reassigned, remember? mov ($n0),$n0 # pull n0[0] value @@ -197,18 +202,129 @@ bn_mul_mont: dec $j jge .Lcopy - mov 8(%rsp,$num,8),%rsp # restore %rsp + mov 8(%rsp,$num,8),%rsi # restore %rsp mov \$1,%rax + mov (%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lepilogue: + ret +.size bn_mul_mont,.-bn_mul_mont +.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>" +.align 16 +___ + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + lea .Lprologue(%rip),%r10 + cmp %r10,%rbx # context->Rip<.Lprologue + jb .Lin_prologue + + mov 152($context),%rax # pull context->Rsp + + lea .Lepilogue(%rip),%r10 + cmp %r10,%rbx # context->Rip>=.Lepilogue + jae .Lin_prologue + + mov 192($context),%r10 # pull $num + mov 8(%rax,%r10,8),%rax # pull saved stack pointer + lea 48(%rax),%rax + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + +.Lin_prologue: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx + pop %rdi + pop %rsi ret -.size bn_mul_mont,.-bn_mul_mont -.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>" +.size se_handler,.-se_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_bn_mul_mont + .rva .LSEH_end_bn_mul_mont + .rva .LSEH_info_bn_mul_mont + +.section .xdata +.align 8 +.LSEH_info_bn_mul_mont: + .byte 9,0,0,0 + .rva se_handler ___ +} print $code; close STDOUT; |