Diffstat (limited to 'openssl/crypto/bn/asm')
-rw-r--r--  openssl/crypto/bn/asm/armv4-mont.pl  |   1
-rw-r--r--  openssl/crypto/bn/asm/bn-586.pl      | 203
-rw-r--r--  openssl/crypto/bn/asm/co-586.pl      |   3
-rw-r--r--  openssl/crypto/bn/asm/mo-586.pl      | 603
-rw-r--r--  openssl/crypto/bn/asm/ppc.pl         | 233
-rw-r--r--  openssl/crypto/bn/asm/sparcv8plus.S  |  15
-rw-r--r--  openssl/crypto/bn/asm/x86_64-gcc.c   |  29
-rw-r--r--  openssl/crypto/bn/asm/x86_64-mont.pl | 136
8 files changed, 380 insertions, 843 deletions
diff --git a/openssl/crypto/bn/asm/armv4-mont.pl b/openssl/crypto/bn/asm/armv4-mont.pl
index 05d5dc1a4..14e0d2d1d 100644
--- a/openssl/crypto/bn/asm/armv4-mont.pl
+++ b/openssl/crypto/bn/asm/armv4-mont.pl
@@ -193,6 +193,7 @@ bn_mul_mont:
bx lr @ interoperable with Thumb ISA:-)
.size bn_mul_mont,.-bn_mul_mont
.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
___
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
diff --git a/openssl/crypto/bn/asm/bn-586.pl b/openssl/crypto/bn/asm/bn-586.pl
index 26c2685a7..332ef3e91 100644
--- a/openssl/crypto/bn/asm/bn-586.pl
+++ b/openssl/crypto/bn/asm/bn-586.pl
@@ -1,6 +1,7 @@
#!/usr/local/bin/perl
-push(@INC,"perlasm","../../perlasm");
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],$0);
@@ -24,38 +25,25 @@ sub bn_mul_add_words
{
local($name)=@_;
- &function_begin($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+ &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
- &comment("");
- $Low="eax";
- $High="edx";
- $a="ebx";
- $w="ebp";
- $r="edi";
- $c="esi";
-
- &xor($c,$c); # clear carry
- &mov($r,&wparam(0)); #
-
- &mov("ecx",&wparam(2)); #
- &mov($a,&wparam(1)); #
-
- &and("ecx",0xfffffff8); # num / 8
- &mov($w,&wparam(3)); #
-
- &push("ecx"); # Up the stack for a tmp variable
-
- &jz(&label("maw_finish"));
+ $r="eax";
+ $a="edx";
+ $c="ecx";
if ($sse2) {
&picmeup("eax","OPENSSL_ia32cap_P");
&bt(&DWP(0,"eax"),26);
- &jnc(&label("maw_loop"));
+ &jnc(&label("maw_non_sse2"));
- &movd("mm0",$w); # mm0 = w
+ &mov($r,&wparam(0));
+ &mov($a,&wparam(1));
+ &mov($c,&wparam(2));
+ &movd("mm0",&wparam(3)); # mm0 = w
&pxor("mm1","mm1"); # mm1 = carry_in
-
- &set_label("maw_sse2_loop",0);
+ &jmp(&label("maw_sse2_entry"));
+
+ &set_label("maw_sse2_unrolled",16);
&movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0]
&paddq("mm1","mm3"); # mm1 = carry_in + r[0]
&movd("mm2",&DWP(0,$a,"",0)); # mm2 = a[0]
@@ -112,42 +100,82 @@ sub bn_mul_add_words
&psrlq("mm1",32); # mm1 = carry6
&paddq("mm1","mm3"); # mm1 = carry6 + r[7] + w*a[7]
&movd(&DWP(28,$r,"",0),"mm1");
- &add($r,32);
+ &lea($r,&DWP(32,$r));
&psrlq("mm1",32); # mm1 = carry_out
- &sub("ecx",8);
+ &sub($c,8);
+ &jz(&label("maw_sse2_exit"));
+ &set_label("maw_sse2_entry");
+ &test($c,0xfffffff8);
+ &jnz(&label("maw_sse2_unrolled"));
+
+ &set_label("maw_sse2_loop",4);
+ &movd("mm2",&DWP(0,$a)); # mm2 = a[i]
+ &movd("mm3",&DWP(0,$r)); # mm3 = r[i]
+ &pmuludq("mm2","mm0"); # a[i] *= w
+ &lea($a,&DWP(4,$a));
+ &paddq("mm1","mm3"); # carry += r[i]
+ &paddq("mm1","mm2"); # carry += a[i]*w
+ &movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
+ &sub($c,1);
+ &psrlq("mm1",32); # carry = carry_high
+ &lea($r,&DWP(4,$r));
&jnz(&label("maw_sse2_loop"));
-
- &movd($c,"mm1"); # c = carry_out
+ &set_label("maw_sse2_exit");
+ &movd("eax","mm1"); # c = carry_out
&emms();
+ &ret();
- &jmp(&label("maw_finish"));
+ &set_label("maw_non_sse2",16);
}
- &set_label("maw_loop",0);
+ # function_begin prologue
+ &push("ebp");
+ &push("ebx");
+ &push("esi");
+ &push("edi");
+
+ &comment("");
+ $Low="eax";
+ $High="edx";
+ $a="ebx";
+ $w="ebp";
+ $r="edi";
+ $c="esi";
+
+ &xor($c,$c); # clear carry
+ &mov($r,&wparam(0)); #
+
+ &mov("ecx",&wparam(2)); #
+ &mov($a,&wparam(1)); #
+
+ &and("ecx",0xfffffff8); # num / 8
+ &mov($w,&wparam(3)); #
- &mov(&swtmp(0),"ecx"); #
+ &push("ecx"); # Up the stack for a tmp variable
+
+ &jz(&label("maw_finish"));
+
+ &set_label("maw_loop",16);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
- &mov("eax",&DWP($i,$a,"",0)); # *a
+ &mov("eax",&DWP($i,$a)); # *a
&mul($w); # *a * w
- &add("eax",$c); # L(t)+= *r
- &mov($c,&DWP($i,$r,"",0)); # L(t)+= *r
+ &add("eax",$c); # L(t)+= c
&adc("edx",0); # H(t)+=carry
- &add("eax",$c); # L(t)+=c
+ &add("eax",&DWP($i,$r)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
- &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
+ &mov(&DWP($i,$r),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
}
&comment("");
- &mov("ecx",&swtmp(0)); #
- &add($a,32);
- &add($r,32);
&sub("ecx",8);
+ &lea($a,&DWP(32,$a));
+ &lea($r,&DWP(32,$r));
&jnz(&label("maw_loop"));
&set_label("maw_finish",0);
@@ -160,16 +188,15 @@ sub bn_mul_add_words
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
- &mov("eax",&DWP($i*4,$a,"",0));# *a
+ &mov("eax",&DWP($i*4,$a)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
- &mov($c,&DWP($i*4,$r,"",0)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
- &add("eax",$c);
+ &add("eax",&DWP($i*4,$r)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
&dec("ecx") if ($i != 7-1);
- &mov(&DWP($i*4,$r,"",0),"eax"); # *r= L(t);
- &mov($c,"edx"); # c= H(t);
+ &mov(&DWP($i*4,$r),"eax"); # *r= L(t);
+ &mov($c,"edx"); # c= H(t);
&jz(&label("maw_end")) if ($i != 7-1);
}
&set_label("maw_end",0);
@@ -184,7 +211,45 @@ sub bn_mul_words
{
local($name)=@_;
- &function_begin($name,"");
+ &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+
+ $r="eax";
+ $a="edx";
+ $c="ecx";
+
+ if ($sse2) {
+ &picmeup("eax","OPENSSL_ia32cap_P");
+ &bt(&DWP(0,"eax"),26);
+ &jnc(&label("mw_non_sse2"));
+
+ &mov($r,&wparam(0));
+ &mov($a,&wparam(1));
+ &mov($c,&wparam(2));
+ &movd("mm0",&wparam(3)); # mm0 = w
+ &pxor("mm1","mm1"); # mm1 = carry = 0
+
+ &set_label("mw_sse2_loop",16);
+ &movd("mm2",&DWP(0,$a)); # mm2 = a[i]
+ &pmuludq("mm2","mm0"); # a[i] *= w
+ &lea($a,&DWP(4,$a));
+ &paddq("mm1","mm2"); # carry += a[i]*w
+ &movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
+ &sub($c,1);
+ &psrlq("mm1",32); # carry = carry_high
+ &lea($r,&DWP(4,$r));
+ &jnz(&label("mw_sse2_loop"));
+
+ &movd("eax","mm1"); # return carry
+ &emms();
+ &ret();
+ &set_label("mw_non_sse2",16);
+ }
+
+ # function_begin prologue
+ &push("ebp");
+ &push("ebx");
+ &push("esi");
+ &push("edi");
&comment("");
$Low="eax";
@@ -257,7 +322,40 @@ sub bn_sqr_words
{
local($name)=@_;
- &function_begin($name,"");
+ &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+
+ $r="eax";
+ $a="edx";
+ $c="ecx";
+
+ if ($sse2) {
+ &picmeup("eax","OPENSSL_ia32cap_P");
+ &bt(&DWP(0,"eax"),26);
+ &jnc(&label("sqr_non_sse2"));
+
+ &mov($r,&wparam(0));
+ &mov($a,&wparam(1));
+ &mov($c,&wparam(2));
+
+ &set_label("sqr_sse2_loop",16);
+ &movd("mm0",&DWP(0,$a)); # mm0 = a[i]
+ &pmuludq("mm0","mm0"); # a[i] *= a[i]
+ &lea($a,&DWP(4,$a)); # a++
+ &movq(&QWP(0,$r),"mm0"); # r[i] = a[i]*a[i]
+ &sub($c,1);
+ &lea($r,&DWP(8,$r)); # r += 2
+ &jnz(&label("sqr_sse2_loop"));
+
+ &emms();
+ &ret();
+ &set_label("sqr_non_sse2",16);
+ }
+
+ # function_begin prologue
+ &push("ebp");
+ &push("ebx");
+ &push("esi");
+ &push("edi");
&comment("");
$r="esi";
@@ -313,12 +411,13 @@ sub bn_div_words
{
local($name)=@_;
- &function_begin($name,"");
+ &function_begin_B($name,"");
&mov("edx",&wparam(0)); #
&mov("eax",&wparam(1)); #
- &mov("ebx",&wparam(2)); #
- &div("ebx");
- &function_end($name);
+ &mov("ecx",&wparam(2)); #
+ &div("ecx");
+ &ret();
+ &function_end_B($name);
}
sub bn_add_words
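
For reference, the SSE2 path added above (pmuludq/paddq with the running carry kept in mm1) performs the same word-level multiply-accumulate as the C sketch below; 32-bit limbs are assumed and the function name is illustrative, not taken from the patch.

    #include <stdint.h>

    /* r[i] = low 32 bits of (r[i] + a[i]*w + carry), carry = high 32 bits;
     * the final carry is returned, as the closing movd to eax above does. */
    uint32_t mul_add_words_sketch(uint32_t *r, const uint32_t *a,
                                  int num, uint32_t w)
    {
        uint64_t carry = 0;                 /* lives in mm1 in the MMX code */
        for (int i = 0; i < num; i++) {
            carry += (uint64_t)a[i] * w;    /* pmuludq + paddq */
            carry += r[i];                  /* "carry += r[i]" */
            r[i]   = (uint32_t)carry;       /* store the low word */
            carry >>= 32;                   /* psrlq mm1,32 */
        }
        return (uint32_t)carry;
    }

bn_mul_words follows the same pattern without the r[i] addend, and bn_sqr_words stores the full two-word product a[i]*a[i] instead.
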
diff --git a/openssl/crypto/bn/asm/co-586.pl b/openssl/crypto/bn/asm/co-586.pl
index 5d962cb95..57101a6bd 100644
--- a/openssl/crypto/bn/asm/co-586.pl
+++ b/openssl/crypto/bn/asm/co-586.pl
@@ -1,6 +1,7 @@
#!/usr/local/bin/perl
-push(@INC,"perlasm","../../perlasm");
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],$0);
diff --git a/openssl/crypto/bn/asm/mo-586.pl b/openssl/crypto/bn/asm/mo-586.pl
deleted file mode 100644
index 098229309..000000000
--- a/openssl/crypto/bn/asm/mo-586.pl
+++ /dev/null
@@ -1,603 +0,0 @@
-#!/usr/bin/env perl
-
-# This is crypto/bn/asm/x86-mont.pl (with asciz from crypto/perlasm/x86asm.pl)
-# from OpenSSL 0.9.9-dev
-
-sub ::asciz
-{ my @str=unpack("C*",shift);
- push @str,0;
- while ($#str>15) {
- &data_byte(@str[0..15]);
- foreach (0..15) { shift @str; }
- }
- &data_byte(@str) if (@str);
-}
-
-# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# October 2005
-#
-# This is a "teaser" code, as it can be improved in several ways...
-# First of all non-SSE2 path should be implemented (yes, for now it
-# performs Montgomery multiplication/convolution only on SSE2-capable
-# CPUs such as P4, others fall down to original code). Then inner loop
-# can be unrolled and modulo-scheduled to improve ILP and possibly
-# moved to 128-bit XMM register bank (though it would require input
-# rearrangement and/or increase bus bandwidth utilization). Dedicated
-# squaring procedure should give further performance improvement...
-# Yet, for being draft, the code improves rsa512 *sign* benchmark by
-# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
-
-# December 2006
-#
-# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
-# Integer-only code [being equipped with dedicated squaring procedure]
-# gives ~40% on rsa512 sign benchmark...
-
-push(@INC,"perlasm","../../perlasm");
-require "x86asm.pl";
-
-&asm_init($ARGV[0],$0);
-
-$sse2=0;
-for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
-
-&external_label("OPENSSL_ia32cap_P") if ($sse2);
-
-&function_begin("bn_mul_mont");
-
-$i="edx";
-$j="ecx";
-$ap="esi"; $tp="esi"; # overlapping variables!!!
-$rp="edi"; $bp="edi"; # overlapping variables!!!
-$np="ebp";
-$num="ebx";
-
-$_num=&DWP(4*0,"esp"); # stack top layout
-$_rp=&DWP(4*1,"esp");
-$_ap=&DWP(4*2,"esp");
-$_bp=&DWP(4*3,"esp");
-$_np=&DWP(4*4,"esp");
-$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
-$_sp=&DWP(4*6,"esp");
-$_bpend=&DWP(4*7,"esp");
-$frame=32; # size of above frame rounded up to 16n
-
- &xor ("eax","eax");
- &mov ("edi",&wparam(5)); # int num
- &cmp ("edi",4);
- &jl (&label("just_leave"));
-
- &lea ("esi",&wparam(0)); # put aside pointer to argument block
- &lea ("edx",&wparam(1)); # load ap
- &mov ("ebp","esp"); # saved stack pointer!
- &add ("edi",2); # extra two words on top of tp
- &neg ("edi");
- &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))
- &neg ("edi");
-
- # minimize cache contention by arraning 2K window between stack
- # pointer and ap argument [np is also position sensitive vector,
- # but it's assumed to be near ap, as it's allocated at ~same
- # time].
- &mov ("eax","esp");
- &sub ("eax","edx");
- &and ("eax",2047);
- &sub ("esp","eax"); # this aligns sp and ap modulo 2048
-
- &xor ("edx","esp");
- &and ("edx",2048);
- &xor ("edx",2048);
- &sub ("esp","edx"); # this splits them apart modulo 4096
-
- &and ("esp",-64); # align to cache line
-
- ################################# load argument block...
- &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
- &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
- &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
- &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
- &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
- #&mov ("edi",&DWP(5*4,"esi"));# int num
-
- &mov ("esi",&DWP(0,"esi")); # pull n0[0]
- &mov ($_rp,"eax"); # ... save a copy of argument block
- &mov ($_ap,"ebx");
- &mov ($_bp,"ecx");
- &mov ($_np,"edx");
- &mov ($_n0,"esi");
- &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
- #&mov ($_num,$num); # redundant as $num is not reused
- &mov ($_sp,"ebp"); # saved stack pointer!
-
-if($sse2) {
-$acc0="mm0"; # mmx register bank layout
-$acc1="mm1";
-$car0="mm2";
-$car1="mm3";
-$mul0="mm4";
-$mul1="mm5";
-$temp="mm6";
-$mask="mm7";
-
- &picmeup("eax","OPENSSL_ia32cap_P");
- &bt (&DWP(0,"eax"),26);
- &jnc (&label("non_sse2"));
-
- &mov ("eax",-1);
- &movd ($mask,"eax"); # mask 32 lower bits
-
- &mov ($ap,$_ap); # load input pointers
- &mov ($bp,$_bp);
- &mov ($np,$_np);
-
- &xor ($i,$i); # i=0
- &xor ($j,$j); # j=0
-
- &movd ($mul0,&DWP(0,$bp)); # bp[0]
- &movd ($mul1,&DWP(0,$ap)); # ap[0]
- &movd ($car1,&DWP(0,$np)); # np[0]
-
- &pmuludq($mul1,$mul0); # ap[0]*bp[0]
- &movq ($car0,$mul1);
- &movq ($acc0,$mul1); # I wish movd worked for
- &pand ($acc0,$mask); # inter-register transfers
-
- &pmuludq($mul1,$_n0q); # *=n0
-
- &pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
- &paddq ($car1,$acc0);
-
- &movd ($acc1,&DWP(4,$np)); # np[1]
- &movd ($acc0,&DWP(4,$ap)); # ap[1]
-
- &psrlq ($car0,32);
- &psrlq ($car1,32);
-
- &inc ($j); # j++
-&set_label("1st",16);
- &pmuludq($acc0,$mul0); # ap[j]*bp[0]
- &pmuludq($acc1,$mul1); # np[j]*m1
- &paddq ($car0,$acc0); # +=c0
- &paddq ($car1,$acc1); # +=c1
-
- &movq ($acc0,$car0);
- &pand ($acc0,$mask);
- &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
- &paddq ($car1,$acc0); # +=ap[j]*bp[0];
- &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
- &psrlq ($car0,32);
- &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]=
- &psrlq ($car1,32);
-
- &lea ($j,&DWP(1,$j));
- &cmp ($j,$num);
- &jl (&label("1st"));
-
- &pmuludq($acc0,$mul0); # ap[num-1]*bp[0]
- &pmuludq($acc1,$mul1); # np[num-1]*m1
- &paddq ($car0,$acc0); # +=c0
- &paddq ($car1,$acc1); # +=c1
-
- &movq ($acc0,$car0);
- &pand ($acc0,$mask);
- &paddq ($car1,$acc0); # +=ap[num-1]*bp[0];
- &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
-
- &psrlq ($car0,32);
- &psrlq ($car1,32);
-
- &paddq ($car1,$car0);
- &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
-
- &inc ($i); # i++
-&set_label("outer");
- &xor ($j,$j); # j=0
-
- &movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i]
- &movd ($mul1,&DWP(0,$ap)); # ap[0]
- &movd ($temp,&DWP($frame,"esp")); # tp[0]
- &movd ($car1,&DWP(0,$np)); # np[0]
- &pmuludq($mul1,$mul0); # ap[0]*bp[i]
-
- &paddq ($mul1,$temp); # +=tp[0]
- &movq ($acc0,$mul1);
- &movq ($car0,$mul1);
- &pand ($acc0,$mask);
-
- &pmuludq($mul1,$_n0q); # *=n0
-
- &pmuludq($car1,$mul1);
- &paddq ($car1,$acc0);
-
- &movd ($temp,&DWP($frame+4,"esp")); # tp[1]
- &movd ($acc1,&DWP(4,$np)); # np[1]
- &movd ($acc0,&DWP(4,$ap)); # ap[1]
-
- &psrlq ($car0,32);
- &psrlq ($car1,32);
- &paddq ($car0,$temp); # +=tp[1]
-
- &inc ($j); # j++
- &dec ($num);
-&set_label("inner");
- &pmuludq($acc0,$mul0); # ap[j]*bp[i]
- &pmuludq($acc1,$mul1); # np[j]*m1
- &paddq ($car0,$acc0); # +=c0
- &paddq ($car1,$acc1); # +=c1
-
- &movq ($acc0,$car0);
- &movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
- &pand ($acc0,$mask);
- &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
- &paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j]
- &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
- &psrlq ($car0,32);
- &movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
- &psrlq ($car1,32);
- &paddq ($car0,$temp); # +=tp[j+1]
-
- &dec ($num);
- &lea ($j,&DWP(1,$j)); # j++
- &jnz (&label("inner"));
-
- &mov ($num,$j);
- &pmuludq($acc0,$mul0); # ap[num-1]*bp[i]
- &pmuludq($acc1,$mul1); # np[num-1]*m1
- &paddq ($car0,$acc0); # +=c0
- &paddq ($car1,$acc1); # +=c1
-
- &movq ($acc0,$car0);
- &pand ($acc0,$mask);
- &paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1]
- &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
- &psrlq ($car0,32);
- &psrlq ($car1,32);
-
- &movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
- &paddq ($car1,$car0);
- &paddq ($car1,$temp);
- &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
-
- &lea ($i,&DWP(1,$i)); # i++
- &cmp ($i,$num);
- &jle (&label("outer"));
-
- &emms (); # done with mmx bank
- &jmp (&label("common_tail"));
-
-&set_label("non_sse2",16);
-}
-
-if (0) {
- &mov ("esp",$_sp);
- &xor ("eax","eax"); # signal "not fast enough [yet]"
- &jmp (&label("just_leave"));
- # While the below code provides competitive performance for
- # all key lengthes on modern Intel cores, it's still more
- # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
- # means compared to the original integer-only assembler.
- # 512-bit RSA sign is better by ~40%, but that's about all
- # one can say about all CPUs...
-} else {
-$inp="esi"; # integer path uses these registers differently
-$word="edi";
-$carry="ebp";
-
- &mov ($inp,$_ap);
- &lea ($carry,&DWP(1,$num));
- &mov ($word,$_bp);
- &xor ($j,$j); # j=0
- &mov ("edx",$inp);
- &and ($carry,1); # see if num is even
- &sub ("edx",$word); # see if ap==bp
- &lea ("eax",&DWP(4,$word,$num,4)); # &bp[num]
- &or ($carry,"edx");
- &mov ($word,&DWP(0,$word)); # bp[0]
- &jz (&label("bn_sqr_mont"));
- &mov ($_bpend,"eax");
- &mov ("eax",&DWP(0,$inp));
- &xor ("edx","edx");
-
-&set_label("mull",16);
- &mov ($carry,"edx");
- &mul ($word); # ap[j]*bp[0]
- &add ($carry,"eax");
- &lea ($j,&DWP(1,$j));
- &adc ("edx",0);
- &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
- &cmp ($j,$num);
- &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
- &jl (&label("mull"));
-
- &mov ($carry,"edx");
- &mul ($word); # ap[num-1]*bp[0]
- &mov ($word,$_n0);
- &add ("eax",$carry);
- &mov ($inp,$_np);
- &adc ("edx",0);
- &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
-
- &mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]=
- &xor ($j,$j);
- &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
- &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
-
- &mov ("eax",&DWP(0,$inp)); # np[0]
- &mul ($word); # np[0]*m
- &add ("eax",&DWP($frame,"esp")); # +=tp[0]
- &mov ("eax",&DWP(4,$inp)); # np[1]
- &adc ("edx",0);
- &inc ($j);
-
- &jmp (&label("2ndmadd"));
-
-&set_label("1stmadd",16);
- &mov ($carry,"edx");
- &mul ($word); # ap[j]*bp[i]
- &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
- &lea ($j,&DWP(1,$j));
- &adc ("edx",0);
- &add ($carry,"eax");
- &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
- &adc ("edx",0);
- &cmp ($j,$num);
- &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
- &jl (&label("1stmadd"));
-
- &mov ($carry,"edx");
- &mul ($word); # ap[num-1]*bp[i]
- &add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1]
- &mov ($word,$_n0);
- &adc ("edx",0);
- &mov ($inp,$_np);
- &add ($carry,"eax");
- &adc ("edx",0);
- &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
-
- &xor ($j,$j);
- &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
- &mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
- &adc ($j,0);
- &mov ("eax",&DWP(0,$inp)); # np[0]
- &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
- &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
-
- &mul ($word); # np[0]*m
- &add ("eax",&DWP($frame,"esp")); # +=tp[0]
- &mov ("eax",&DWP(4,$inp)); # np[1]
- &adc ("edx",0);
- &mov ($j,1);
-
-&set_label("2ndmadd",16);
- &mov ($carry,"edx");
- &mul ($word); # np[j]*m
- &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
- &lea ($j,&DWP(1,$j));
- &adc ("edx",0);
- &add ($carry,"eax");
- &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1]
- &adc ("edx",0);
- &cmp ($j,$num);
- &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
- &jl (&label("2ndmadd"));
-
- &mov ($carry,"edx");
- &mul ($word); # np[j]*m
- &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
- &adc ("edx",0);
- &add ($carry,"eax");
- &adc ("edx",0);
- &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
-
- &xor ("eax","eax");
- &mov ($j,$_bp); # &bp[i]
- &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
- &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
- &lea ($j,&DWP(4,$j));
- &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
- &cmp ($j,$_bpend);
- &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
- &je (&label("common_tail"));
-
- &mov ($word,&DWP(0,$j)); # bp[i+1]
- &mov ($inp,$_ap);
- &mov ($_bp,$j); # &bp[++i]
- &xor ($j,$j);
- &xor ("edx","edx");
- &mov ("eax",&DWP(0,$inp));
- &jmp (&label("1stmadd"));
-
-&set_label("bn_sqr_mont",16);
-$sbit=$num;
- &mov ($_num,$num);
- &mov ($_bp,$j); # i=0
-
- &mov ("eax",$word); # ap[0]
- &mul ($word); # ap[0]*ap[0]
- &mov (&DWP($frame,"esp"),"eax"); # tp[0]=
- &mov ($sbit,"edx");
- &shr ("edx",1);
- &and ($sbit,1);
- &inc ($j);
-&set_label("sqr",16);
- &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
- &mov ($carry,"edx");
- &mul ($word); # ap[j]*ap[0]
- &add ("eax",$carry);
- &lea ($j,&DWP(1,$j));
- &adc ("edx",0);
- &lea ($carry,&DWP(0,$sbit,"eax",2));
- &shr ("eax",31);
- &cmp ($j,$_num);
- &mov ($sbit,"eax");
- &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
- &jl (&label("sqr"));
-
- &mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1]
- &mov ($carry,"edx");
- &mul ($word); # ap[num-1]*ap[0]
- &add ("eax",$carry);
- &mov ($word,$_n0);
- &adc ("edx",0);
- &mov ($inp,$_np);
- &lea ($carry,&DWP(0,$sbit,"eax",2));
- &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
- &shr ("eax",31);
- &mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]=
-
- &lea ($carry,&DWP(0,"eax","edx",2));
- &mov ("eax",&DWP(0,$inp)); # np[0]
- &shr ("edx",31);
- &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
- &mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]=
-
- &mul ($word); # np[0]*m
- &add ("eax",&DWP($frame,"esp")); # +=tp[0]
- &mov ($num,$j);
- &adc ("edx",0);
- &mov ("eax",&DWP(4,$inp)); # np[1]
- &mov ($j,1);
-
-&set_label("3rdmadd",16);
- &mov ($carry,"edx");
- &mul ($word); # np[j]*m
- &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
- &adc ("edx",0);
- &add ($carry,"eax");
- &mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1]
- &adc ("edx",0);
- &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=
-
- &mov ($carry,"edx");
- &mul ($word); # np[j+1]*m
- &add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
- &lea ($j,&DWP(2,$j));
- &adc ("edx",0);
- &add ($carry,"eax");
- &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2]
- &adc ("edx",0);
- &cmp ($j,$num);
- &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
- &jl (&label("3rdmadd"));
-
- &mov ($carry,"edx");
- &mul ($word); # np[j]*m
- &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
- &adc ("edx",0);
- &add ($carry,"eax");
- &adc ("edx",0);
- &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
-
- &mov ($j,$_bp); # i
- &xor ("eax","eax");
- &mov ($inp,$_ap);
- &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
- &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
- &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
- &cmp ($j,$num);
- &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
- &je (&label("common_tail"));
-
- &mov ($word,&DWP(4,$inp,$j,4)); # ap[i]
- &lea ($j,&DWP(1,$j));
- &mov ("eax",$word);
- &mov ($_bp,$j); # ++i
- &mul ($word); # ap[i]*ap[i]
- &add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i]
- &adc ("edx",0);
- &mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]=
- &xor ($carry,$carry);
- &cmp ($j,$num);
- &lea ($j,&DWP(1,$j));
- &je (&label("sqrlast"));
-
- &mov ($sbit,"edx"); # zaps $num
- &shr ("edx",1);
- &and ($sbit,1);
-&set_label("sqradd",16);
- &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
- &mov ($carry,"edx");
- &mul ($word); # ap[j]*ap[i]
- &add ("eax",$carry);
- &lea ($carry,&DWP(0,"eax","eax"));
- &adc ("edx",0);
- &shr ("eax",31);
- &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
- &lea ($j,&DWP(1,$j));
- &adc ("eax",0);
- &add ($carry,$sbit);
- &adc ("eax",0);
- &cmp ($j,$_num);
- &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
- &mov ($sbit,"eax");
- &jle (&label("sqradd"));
-
- &mov ($carry,"edx");
- &lea ("edx",&DWP(0,$sbit,"edx",2));
- &shr ($carry,31);
-&set_label("sqrlast");
- &mov ($word,$_n0);
- &mov ($inp,$_np);
- &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
-
- &add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num]
- &mov ("eax",&DWP(0,$inp)); # np[0]
- &adc ($carry,0);
- &mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]=
- &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=
-
- &mul ($word); # np[0]*m
- &add ("eax",&DWP($frame,"esp")); # +=tp[0]
- &lea ($num,&DWP(-1,$j));
- &adc ("edx",0);
- &mov ($j,1);
- &mov ("eax",&DWP(4,$inp)); # np[1]
-
- &jmp (&label("3rdmadd"));
-}
-
-&set_label("common_tail",16);
- &mov ($np,$_np); # load modulus pointer
- &mov ($rp,$_rp); # load result pointer
- &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
-
- &mov ("eax",&DWP(0,$tp)); # tp[0]
- &mov ($j,$num); # j=num-1
- &xor ($i,$i); # i=0 and clear CF!
-
-&set_label("sub",16);
- &sbb ("eax",&DWP(0,$np,$i,4));
- &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
- &dec ($j); # doesn't affect CF!
- &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
- &lea ($i,&DWP(1,$i)); # i++
- &jge (&label("sub"));
-
- &sbb ("eax",0); # handle upmost overflow bit
- &and ($tp,"eax");
- &not ("eax");
- &mov ($np,$rp);
- &and ($np,"eax");
- &or ($tp,$np); # tp=carry?tp:rp
-
-&set_label("copy",16); # copy or in-place refresh
- &mov ("eax",&DWP(0,$tp,$num,4));
- &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i]
- &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
- &dec ($num);
- &jge (&label("copy"));
-
- &mov ("esp",$_sp); # pull saved stack pointer
- &mov ("eax",1);
-&set_label("just_leave");
-&function_end("bn_mul_mont");
-
-&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
-
-&asm_finish();
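
The file removed above implemented bn_mul_mont for x86; what it (and the x86_64 version further down) computes is ordinary word-level Montgomery multiplication, rp = ap*bp*R^-1 mod np with R = 2^(32*num). A minimal CIOS-style reference sketch follows, assuming 32-bit limbs and n0 = -np[0]^-1 mod 2^32 as in the deleted code; names and types are illustrative.

    #include <stdint.h>
    #include <string.h>

    void mont_mul_sketch(uint32_t *rp, const uint32_t *ap, const uint32_t *bp,
                         const uint32_t *np, uint32_t n0, int num)
    {
        uint32_t tp[num + 2];                   /* scratch vector, like tp[] above */
        memset(tp, 0, (num + 2) * sizeof(uint32_t));

        for (int i = 0; i < num; i++) {
            uint64_t c = 0;                     /* the "1st"/"1stmadd"/"inner" loops */
            for (int j = 0; j < num; j++) {
                c += (uint64_t)ap[j] * bp[i] + tp[j];
                tp[j] = (uint32_t)c;
                c >>= 32;
            }
            c += tp[num];
            tp[num]     = (uint32_t)c;
            tp[num + 1] = (uint32_t)(c >> 32);

            uint32_t m = tp[0] * n0;            /* the "n0*tp[0]" imul above */
            c = ((uint64_t)m * np[0] + tp[0]) >> 32;   /* low word becomes zero */
            for (int j = 1; j < num; j++) {     /* the "2ndmadd"/"3rdmadd" loops */
                c += (uint64_t)m * np[j] + tp[j];
                tp[j - 1] = (uint32_t)c;
                c >>= 32;
            }
            c += tp[num];
            tp[num - 1] = (uint32_t)c;
            tp[num]     = tp[num + 1] + (uint32_t)(c >> 32);
        }

        /* "common_tail": result is tp - np unless that borrows, else tp itself */
        uint64_t borrow = 0;
        for (int i = 0; i < num; i++) {
            uint64_t d = (uint64_t)tp[i] - np[i] - borrow;
            rp[i]  = (uint32_t)d;
            borrow = d >> 63;
        }
        if (borrow > tp[num])                   /* tp < np: keep tp unchanged */
            memcpy(rp, tp, num * sizeof(uint32_t));
    }

The deleted x86 code additionally special-cases squaring (ap == bp) and performs the final selection branch-free with and/not/or masks, but the arithmetic is the same.
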
diff --git a/openssl/crypto/bn/asm/ppc.pl b/openssl/crypto/bn/asm/ppc.pl
index 08e005347..37c65d351 100644
--- a/openssl/crypto/bn/asm/ppc.pl
+++ b/openssl/crypto/bn/asm/ppc.pl
@@ -100,9 +100,9 @@
# me a note at schari@us.ibm.com
#
-$opf = shift;
+$flavour = shift;
-if ($opf =~ /32\.s/) {
+if ($flavour =~ /32/) {
$BITS= 32;
$BNSZ= $BITS/8;
$ISA= "\"ppc\"";
@@ -125,7 +125,7 @@ if ($opf =~ /32\.s/) {
$INSR= "insrwi"; # insert right
$ROTL= "rotlwi"; # rotate left by immediate
$TR= "tw"; # conditional trap
-} elsif ($opf =~ /64\.s/) {
+} elsif ($flavour =~ /64/) {
$BITS= 64;
$BNSZ= $BITS/8;
$ISA= "\"ppc64\"";
@@ -149,93 +149,16 @@ if ($opf =~ /32\.s/) {
$INSR= "insrdi"; # insert right
$ROTL= "rotldi"; # rotate left by immediate
$TR= "td"; # conditional trap
-} else { die "nonsense $opf"; }
+} else { die "nonsense $flavour"; }
-( defined shift || open STDOUT,">$opf" ) || die "can't open $opf: $!";
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
-# function entry points from the AIX code
-#
-# There are other, more elegant, ways to handle this. We (IBM) chose
-# this approach as it plays well with scripts we run to 'namespace'
-# OpenSSL .i.e. we add a prefix to all the public symbols so we can
-# co-exist in the same process with other implementations of OpenSSL.
-# 'cleverer' ways of doing these substitutions tend to hide data we
-# need to be obvious.
-#
-my @items = ("bn_sqr_comba4",
- "bn_sqr_comba8",
- "bn_mul_comba4",
- "bn_mul_comba8",
- "bn_sub_words",
- "bn_add_words",
- "bn_div_words",
- "bn_sqr_words",
- "bn_mul_words",
- "bn_mul_add_words");
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
-if ($opf =~ /linux/) { do_linux(); }
-elsif ($opf =~ /aix/) { do_aix(); }
-elsif ($opf =~ /osx/) { do_osx(); }
-else { do_bsd(); }
-
-sub do_linux {
- $d=&data();
-
- if ($BITS==64) {
- foreach $t (@items) {
- $d =~ s/\.$t:/\
-\t.section\t".opd","aw"\
-\t.align\t3\
-\t.globl\t$t\
-$t:\
-\t.quad\t.$t,.TOC.\@tocbase,0\
-\t.size\t$t,24\
-\t.previous\n\
-\t.type\t.$t,\@function\
-\t.globl\t.$t\
-.$t:/g;
- }
- }
- else {
- foreach $t (@items) {
- $d=~s/\.$t/$t/g;
- }
- }
- # hide internal labels to avoid pollution of name table...
- $d=~s/Lppcasm_/.Lppcasm_/gm;
- print $d;
-}
-
-sub do_aix {
- # AIX assembler is smart enough to please the linker without
- # making us do something special...
- print &data();
-}
-
-# MacOSX 32 bit
-sub do_osx {
- $d=&data();
- # Change the bn symbol prefix from '.' to '_'
- foreach $t (@items) {
- $d=~s/\.$t/_$t/g;
- }
- # Change .machine to something OS X asm will accept
- $d=~s/\.machine.*/.text/g;
- $d=~s/\#/;/g; # change comment from '#' to ';'
- print $d;
-}
-
-# BSD (Untested)
-sub do_bsd {
- $d=&data();
- foreach $t (@items) {
- $d=~s/\.$t/_$t/g;
- }
- print $d;
-}
-
-sub data {
- local($data)=<<EOF;
+$data=<<EOF;
#--------------------------------------------------------------------
#
#
@@ -297,33 +220,20 @@ sub data {
#
# Defines to be used in the assembly code.
#
-.set r0,0 # we use it as storage for value of 0
-.set SP,1 # preserved
-.set RTOC,2 # preserved
-.set r3,3 # 1st argument/return value
-.set r4,4 # 2nd argument/volatile register
-.set r5,5 # 3rd argument/volatile register
-.set r6,6 # ...
-.set r7,7
-.set r8,8
-.set r9,9
-.set r10,10
-.set r11,11
-.set r12,12
-.set r13,13 # not used, nor any other "below" it...
-
-.set BO_IF_NOT,4
-.set BO_IF,12
-.set BO_dCTR_NZERO,16
-.set BO_dCTR_ZERO,18
-.set BO_ALWAYS,20
-.set CR0_LT,0;
-.set CR0_GT,1;
-.set CR0_EQ,2
-.set CR1_FX,4;
-.set CR1_FEX,5;
-.set CR1_VX,6
-.set LR,8
+#.set r0,0 # we use it as storage for value of 0
+#.set SP,1 # preserved
+#.set RTOC,2 # preserved
+#.set r3,3 # 1st argument/return value
+#.set r4,4 # 2nd argument/volatile register
+#.set r5,5 # 3rd argument/volatile register
+#.set r6,6 # ...
+#.set r7,7
+#.set r8,8
+#.set r9,9
+#.set r10,10
+#.set r11,11
+#.set r12,12
+#.set r13,13 # not used, nor any other "below" it...
# Declare function names to be global
# NOTE: For gcc these names MUST be changed to remove
@@ -344,7 +254,7 @@ sub data {
# .text section
- .machine $ISA
+ .machine "any"
#
# NOTE: The following label name should be changed to
@@ -478,7 +388,7 @@ sub data {
$ST r9,`6*$BNSZ`(r3) #r[6]=c1
$ST r10,`7*$BNSZ`(r3) #r[7]=c2
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
#
@@ -903,7 +813,7 @@ sub data {
$ST r9, `15*$BNSZ`(r3) #r[15]=c1;
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
@@ -1055,7 +965,7 @@ sub data {
$ST r10,`6*$BNSZ`(r3) #r[6]=c1
$ST r11,`7*$BNSZ`(r3) #r[7]=c2
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
#
@@ -1591,7 +1501,7 @@ sub data {
adde r10,r10,r9
$ST r12,`14*$BNSZ`(r3) #r[14]=c3;
$ST r10,`15*$BNSZ`(r3) #r[15]=c1;
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
#
@@ -1623,7 +1533,7 @@ sub data {
subfc. r7,r0,r6 # If r6 is 0 then result is 0.
# if r6 > 0 then result !=0
# In either case carry bit is set.
- bc BO_IF,CR0_EQ,Lppcasm_sub_adios
+ beq Lppcasm_sub_adios
addi r4,r4,-$BNSZ
addi r3,r3,-$BNSZ
addi r5,r5,-$BNSZ
@@ -1635,11 +1545,11 @@ Lppcasm_sub_mainloop:
# if carry = 1 this is r7-r8. Else it
# is r7-r8 -1 as we need.
$STU r6,$BNSZ(r3)
- bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_sub_mainloop
+ bdnz- Lppcasm_sub_mainloop
Lppcasm_sub_adios:
subfze r3,r0 # if carry bit is set then r3 = 0 else -1
andi. r3,r3,1 # keep only last bit.
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
@@ -1670,7 +1580,7 @@ Lppcasm_sub_adios:
# check for r6 = 0. Is this needed?
#
addic. r6,r6,0 #test r6 and clear carry bit.
- bc BO_IF,CR0_EQ,Lppcasm_add_adios
+ beq Lppcasm_add_adios
addi r4,r4,-$BNSZ
addi r3,r3,-$BNSZ
addi r5,r5,-$BNSZ
@@ -1680,10 +1590,10 @@ Lppcasm_add_mainloop:
$LDU r8,$BNSZ(r5)
adde r8,r7,r8
$STU r8,$BNSZ(r3)
- bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_add_mainloop
+ bdnz- Lppcasm_add_mainloop
Lppcasm_add_adios:
addze r3,r0 #return carry bit.
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
#
@@ -1707,24 +1617,24 @@ Lppcasm_add_adios:
# r5 = d
$UCMPI 0,r5,0 # compare r5 and 0
- bc BO_IF_NOT,CR0_EQ,Lppcasm_div1 # proceed if d!=0
+ bne Lppcasm_div1 # proceed if d!=0
li r3,-1 # d=0 return -1
- bclr BO_ALWAYS,CR0_LT
+ blr
Lppcasm_div1:
xor r0,r0,r0 #r0=0
li r8,$BITS
$CNTLZ. r7,r5 #r7 = num leading 0s in d.
- bc BO_IF,CR0_EQ,Lppcasm_div2 #proceed if no leading zeros
+ beq Lppcasm_div2 #proceed if no leading zeros
subf r8,r7,r8 #r8 = BN_num_bits_word(d)
$SHR. r9,r3,r8 #are there any bits above r8'th?
$TR 16,r9,r0 #if there're, signal to dump core...
Lppcasm_div2:
$UCMP 0,r3,r5 #h>=d?
- bc BO_IF,CR0_LT,Lppcasm_div3 #goto Lppcasm_div3 if not
+ blt Lppcasm_div3 #goto Lppcasm_div3 if not
subf r3,r5,r3 #h-=d ;
Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i
cmpi 0,0,r7,0 # is (i == 0)?
- bc BO_IF,CR0_EQ,Lppcasm_div4
+ beq Lppcasm_div4
$SHL r3,r3,r7 # h = (h<< i)
$SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i)
$SHL r5,r5,r7 # d<<=i
@@ -1741,7 +1651,7 @@ Lppcasm_divouterloop:
$SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4
# compute here for innerloop.
$UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh
- bc BO_IF_NOT,CR0_EQ,Lppcasm_div5 # goto Lppcasm_div5 if not
+ bne Lppcasm_div5 # goto Lppcasm_div5 if not
li r8,-1
$CLRU r8,r8,`$BITS/2` #q = BN_MASK2l
@@ -1762,9 +1672,9 @@ Lppcasm_divinnerloop:
# the following 2 instructions do that
$SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4)
or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4)
- $UCMP 1,r6,r7 # compare (tl <= r7)
- bc BO_IF_NOT,CR0_EQ,Lppcasm_divinnerexit
- bc BO_IF_NOT,CR1_FEX,Lppcasm_divinnerexit
+ $UCMP cr1,r6,r7 # compare (tl <= r7)
+ bne Lppcasm_divinnerexit
+ ble cr1,Lppcasm_divinnerexit
addi r8,r8,-1 #q--
subf r12,r9,r12 #th -=dh
$CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop.
@@ -1773,14 +1683,14 @@ Lppcasm_divinnerloop:
Lppcasm_divinnerexit:
$SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4)
$SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h;
- $UCMP 1,r4,r11 # compare l and tl
+ $UCMP cr1,r4,r11 # compare l and tl
add r12,r12,r10 # th+=t
- bc BO_IF_NOT,CR1_FX,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7
+ bge cr1,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7
addi r12,r12,1 # th++
Lppcasm_div7:
subf r11,r11,r4 #r11=l-tl
- $UCMP 1,r3,r12 #compare h and th
- bc BO_IF_NOT,CR1_FX,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8
+ $UCMP cr1,r3,r12 #compare h and th
+ bge cr1,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8
addi r8,r8,-1 # q--
add r3,r5,r3 # h+=d
Lppcasm_div8:
@@ -1791,12 +1701,12 @@ Lppcasm_div8:
# the following 2 instructions will do this.
$INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2.
$ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3
- bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_div9#if (count==0) break ;
+ bdz Lppcasm_div9 #if (count==0) break ;
$SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4
b Lppcasm_divouterloop
Lppcasm_div9:
or r3,r8,r0
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
#
@@ -1822,7 +1732,7 @@ Lppcasm_div9:
# No unrolling done here. Not performance critical.
addic. r5,r5,0 #test r5.
- bc BO_IF,CR0_EQ,Lppcasm_sqr_adios
+ beq Lppcasm_sqr_adios
addi r4,r4,-$BNSZ
addi r3,r3,-$BNSZ
mtctr r5
@@ -1833,9 +1743,9 @@ Lppcasm_sqr_mainloop:
$UMULH r8,r6,r6
$STU r7,$BNSZ(r3)
$STU r8,$BNSZ(r3)
- bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_sqr_mainloop
+ bdnz- Lppcasm_sqr_mainloop
Lppcasm_sqr_adios:
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
@@ -1858,7 +1768,7 @@ Lppcasm_sqr_adios:
xor r0,r0,r0
xor r12,r12,r12 # used for carry
rlwinm. r7,r5,30,2,31 # num >> 2
- bc BO_IF,CR0_EQ,Lppcasm_mw_REM
+ beq Lppcasm_mw_REM
mtctr r7
Lppcasm_mw_LOOP:
#mul(rp[0],ap[0],w,c1);
@@ -1896,11 +1806,11 @@ Lppcasm_mw_LOOP:
addi r3,r3,`4*$BNSZ`
addi r4,r4,`4*$BNSZ`
- bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_mw_LOOP
+ bdnz- Lppcasm_mw_LOOP
Lppcasm_mw_REM:
andi. r5,r5,0x3
- bc BO_IF,CR0_EQ,Lppcasm_mw_OVER
+ beq Lppcasm_mw_OVER
#mul(rp[0],ap[0],w,c1);
$LD r8,`0*$BNSZ`(r4)
$UMULL r9,r6,r8
@@ -1912,7 +1822,7 @@ Lppcasm_mw_REM:
addi r5,r5,-1
cmpli 0,0,r5,0
- bc BO_IF,CR0_EQ,Lppcasm_mw_OVER
+ beq Lppcasm_mw_OVER
#mul(rp[1],ap[1],w,c1);
@@ -1926,7 +1836,7 @@ Lppcasm_mw_REM:
addi r5,r5,-1
cmpli 0,0,r5,0
- bc BO_IF,CR0_EQ,Lppcasm_mw_OVER
+ beq Lppcasm_mw_OVER
#mul_add(rp[2],ap[2],w,c1);
$LD r8,`2*$BNSZ`(r4)
@@ -1939,7 +1849,7 @@ Lppcasm_mw_REM:
Lppcasm_mw_OVER:
addi r3,r12,0
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
#
@@ -1964,7 +1874,7 @@ Lppcasm_mw_OVER:
xor r0,r0,r0 #r0 = 0
xor r12,r12,r12 #r12 = 0 . used for carry
rlwinm. r7,r5,30,2,31 # num >> 2
- bc BO_IF,CR0_EQ,Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover
+ beq Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover
mtctr r7
Lppcasm_maw_mainloop:
#mul_add(rp[0],ap[0],w,c1);
@@ -2017,11 +1927,11 @@ Lppcasm_maw_mainloop:
$ST r11,`3*$BNSZ`(r3)
addi r3,r3,`4*$BNSZ`
addi r4,r4,`4*$BNSZ`
- bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_maw_mainloop
+ bdnz- Lppcasm_maw_mainloop
Lppcasm_maw_leftover:
andi. r5,r5,0x3
- bc BO_IF,CR0_EQ,Lppcasm_maw_adios
+ beq Lppcasm_maw_adios
addi r3,r3,-$BNSZ
addi r4,r4,-$BNSZ
#mul_add(rp[0],ap[0],w,c1);
@@ -2036,7 +1946,7 @@ Lppcasm_maw_leftover:
addze r12,r10
$ST r9,0(r3)
- bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios
+ bdz Lppcasm_maw_adios
#mul_add(rp[1],ap[1],w,c1);
$LDU r8,$BNSZ(r4)
$UMULL r9,r6,r8
@@ -2048,7 +1958,7 @@ Lppcasm_maw_leftover:
addze r12,r10
$ST r9,0(r3)
- bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios
+ bdz Lppcasm_maw_adios
#mul_add(rp[2],ap[2],w,c1);
$LDU r8,$BNSZ(r4)
$UMULL r9,r6,r8
@@ -2062,17 +1972,10 @@ Lppcasm_maw_leftover:
Lppcasm_maw_adios:
addi r3,r12,0
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
.align 4
EOF
- $data =~ s/\`([^\`]*)\`/eval $1/gem;
-
- # if some assembler chokes on some simplified mnemonic,
- # this is the spot to fix it up, e.g.:
- # GNU as doesn't seem to accept cmplw, 32-bit unsigned compare
- $data =~ s/^(\s*)cmplw(\s+)([^,]+),(.*)/$1cmpl$2$3,0,$4/gm;
- # assembler X doesn't accept li, load immediate value
- #$data =~ s/^(\s*)li(\s+)([^,]+),(.*)/$1addi$2$3,0,$4/gm;
- return($data);
-}
+$data =~ s/\`([^\`]*)\`/eval $1/gem;
+print $data;
+close STDOUT;
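
The branch rewrites in this file are mechanical: bclr BO_ALWAYS,CR0_LT becomes blr, bc BO_IF,CR0_EQ,label becomes beq label, and the bc BO_dCTR_* forms become bdnz/bdz, which ppc-xlate.pl and current assemblers accept directly. The Lppcasm_div* loops touched above implement bn_div_words by long division in BN_BITS4-sized halves; the value the routine returns is simply the double-word-by-single-word quotient, as in the sketch below (32-bit words and the usual caller precondition h < d assumed; the name is illustrative).

    #include <stdint.h>

    /* Quotient of the double word (h:l) divided by d; the assembly works in
     * half-word steps because there is no 2W-by-W divide instruction. */
    uint32_t div_words_sketch(uint32_t h, uint32_t l, uint32_t d)
    {
        if (d == 0)
            return (uint32_t)-1;        /* the "li r3,-1" path above */
        uint64_t n = ((uint64_t)h << 32) | l;
        return (uint32_t)(n / d);
    }
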
diff --git a/openssl/crypto/bn/asm/sparcv8plus.S b/openssl/crypto/bn/asm/sparcv8plus.S
index 8c56e2e7e..63de1860f 100644
--- a/openssl/crypto/bn/asm/sparcv8plus.S
+++ b/openssl/crypto/bn/asm/sparcv8plus.S
@@ -144,6 +144,19 @@
* }
*/
+#if defined(__SUNPRO_C) && defined(__sparcv9)
+ /* They've said -xarch=v9 at command line */
+ .register %g2,#scratch
+ .register %g3,#scratch
+# define FRAME_SIZE -192
+#elif defined(__GNUC__) && defined(__arch64__)
+ /* They've said -m64 at command line */
+ .register %g2,#scratch
+ .register %g3,#scratch
+# define FRAME_SIZE -192
+#else
+# define FRAME_SIZE -96
+#endif
/*
* GNU assembler can't stand stuw:-(
*/
@@ -619,8 +632,6 @@ bn_sub_words:
* Andy.
*/
-#define FRAME_SIZE -96
-
/*
* Here is register usage map for *all* routines below.
*/
diff --git a/openssl/crypto/bn/asm/x86_64-gcc.c b/openssl/crypto/bn/asm/x86_64-gcc.c
index f13f52dd8..acb0b4011 100644
--- a/openssl/crypto/bn/asm/x86_64-gcc.c
+++ b/openssl/crypto/bn/asm/x86_64-gcc.c
@@ -1,4 +1,5 @@
-#ifdef __SUNPRO_C
+#include "../bn_lcl.h"
+#if !(defined(__GNUC__) && __GNUC__>=2)
# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
#else
/*
@@ -54,7 +55,15 @@
* machine.
*/
+#ifdef _WIN64
+#define BN_ULONG unsigned long long
+#else
#define BN_ULONG unsigned long
+#endif
+
+#undef mul
+#undef mul_add
+#undef sqr
/*
* "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
@@ -97,7 +106,7 @@
: "a"(a) \
: "cc");
-BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
BN_ULONG c1=0;
@@ -121,7 +130,7 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
return(c1);
}
-BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
BN_ULONG c1=0;
@@ -144,7 +153,7 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
return(c1);
}
-void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
+void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
if (n <= 0) return;
@@ -175,14 +184,14 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
return ret;
}
-BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
+BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
{ BN_ULONG ret=0,i=0;
if (n <= 0) return 0;
asm (
" subq %2,%2 \n"
- ".align 16 \n"
+ ".p2align 4 \n"
"1: movq (%4,%2,8),%0 \n"
" adcq (%5,%2,8),%0 \n"
" movq %0,(%3,%2,8) \n"
@@ -198,14 +207,14 @@ BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
}
#ifndef SIMICS
-BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
+BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
{ BN_ULONG ret=0,i=0;
if (n <= 0) return 0;
asm (
" subq %2,%2 \n"
- ".align 16 \n"
+ ".p2align 4 \n"
"1: movq (%4,%2,8),%0 \n"
" sbbq (%5,%2,8),%0 \n"
" movq %0,(%3,%2,8) \n"
@@ -485,7 +494,7 @@ void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
r[7]=c2;
}
-void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
BN_ULONG t1,t2;
BN_ULONG c1,c2,c3;
@@ -561,7 +570,7 @@ void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
r[15]=c1;
}
-void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
BN_ULONG t1,t2;
BN_ULONG c1,c2,c3;
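
The inline-asm loops whose prototypes gain const above (the adcq/sbbq chains in bn_add_words and bn_sub_words) are plain word-wise addition and subtraction with carry propagation; a C sketch of the add case, assuming 64-bit BN_ULONG and an illustrative name:

    #include <stdint.h>

    /* rp = ap + bp word by word, chaining the carry as the adcq loop above
     * does; the final carry is the return value. */
    uint64_t add_words_sketch(uint64_t *rp, const uint64_t *ap,
                              const uint64_t *bp, int n)
    {
        uint64_t carry = 0;
        for (int i = 0; i < n; i++) {
            uint64_t t = ap[i] + carry;
            carry = (t < carry);        /* carry out of ap[i] + carry_in */
            rp[i] = t + bp[i];
            carry += (rp[i] < t);       /* carry out of adding bp[i] */
        }
        return carry;
    }

bn_sub_words is the mirror image with borrows, and the .align 16 to .p2align 4 change in both loops is notational: .p2align 4 requests the same 16-byte alignment in a form more assemblers interpret consistently.
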
diff --git a/openssl/crypto/bn/asm/x86_64-mont.pl b/openssl/crypto/bn/asm/x86_64-mont.pl
index c43b69592..3b7a6f243 100644
--- a/openssl/crypto/bn/asm/x86_64-mont.pl
+++ b/openssl/crypto/bn/asm/x86_64-mont.pl
@@ -15,14 +15,18 @@
# respectful 50%. It remains to be seen if loop unrolling and
# dedicated squaring routine can provide further improvement...
-$output=shift;
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open STDOUT,"| $^X $xlate $output";
+open STDOUT,"| $^X $xlate $flavour $output";
# int bn_mul_mont(
$rp="%rdi"; # BN_ULONG *rp,
@@ -55,13 +59,14 @@ bn_mul_mont:
push %r15
mov ${num}d,${num}d
- lea 2($num),%rax
- mov %rsp,%rbp
- neg %rax
- lea (%rsp,%rax,8),%rsp # tp=alloca(8*(num+2))
+ lea 2($num),%r10
+ mov %rsp,%r11
+ neg %r10
+ lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+2))
and \$-1024,%rsp # minimize TLB usage
- mov %rbp,8(%rsp,$num,8) # tp[num+1]=%rsp
+ mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
+.Lprologue:
mov %rdx,$bp # $bp reassigned, remember?
mov ($n0),$n0 # pull n0[0] value
@@ -197,18 +202,129 @@ bn_mul_mont:
dec $j
jge .Lcopy
- mov 8(%rsp,$num,8),%rsp # restore %rsp
+ mov 8(%rsp,$num,8),%rsi # restore %rsp
mov \$1,%rax
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue:
+ ret
+.size bn_mul_mont,.-bn_mul_mont
+.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 16
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lprologue(%rip),%r10
+ cmp %r10,%rbx # context->Rip<.Lprologue
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea .Lepilogue(%rip),%r10
+ cmp %r10,%rbx # context->Rip>=.Lepilogue
+ jae .Lin_prologue
+
+ mov 192($context),%r10 # pull $num
+ mov 8(%rax,%r10,8),%rax # pull saved stack pointer
+ lea 48(%rax),%rax
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lin_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
+ pop %rdi
+ pop %rsi
ret
-.size bn_mul_mont,.-bn_mul_mont
-.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_bn_mul_mont
+ .rva .LSEH_end_bn_mul_mont
+ .rva .LSEH_info_bn_mul_mont
+
+.section .xdata
+.align 8
+.LSEH_info_bn_mul_mont:
+ .byte 9,0,0,0
+ .rva se_handler
___
+}
print $code;
close STDOUT;
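
Both the new epilogue and the Win64 se_handler rely on the convention set up in the prologue above: after the six register pushes, the current %rsp is stored into the word just past the num-entry tp vector (the "tp[num+1]=%rsp" store), and it is read back either at "mov 8(%rsp,$num,8),%rsi" or, during unwinding, via "mov 8(%rax,%r10,8),%rax". A tiny C model of that lookup, with illustrative names:

    #include <stdint.h>

    /* tp points at the alloca'd scratch vector; tp[num+1] holds the stack
     * pointer captured after the register pushes and before the alloca. */
    static inline uint64_t *saved_stack_pointer(const uint64_t *tp, int num)
    {
        return (uint64_t *)tp[num + 1]; /* 8(%rsp,$num,8) in the assembly */
    }
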