8 files changed, 380 insertions, 843 deletions
diff --git a/openssl/crypto/bn/asm/armv4-mont.pl b/openssl/crypto/bn/asm/armv4-mont.pl
index 05d5dc1a4..14e0d2d1d 100644
--- a/openssl/crypto/bn/asm/armv4-mont.pl
+++ b/openssl/crypto/bn/asm/armv4-mont.pl
@@ -193,6 +193,7 @@ bn_mul_mont:
 	bx	lr			@ interoperable with Thumb ISA:-)
 .size	bn_mul_mont,.-bn_mul_mont
 .asciz	"Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
+.align	2
 ___
 
 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
diff --git a/openssl/crypto/bn/asm/bn-586.pl b/openssl/crypto/bn/asm/bn-586.pl
index 26c2685a7..332ef3e91 100644
--- a/openssl/crypto/bn/asm/bn-586.pl
+++ b/openssl/crypto/bn/asm/bn-586.pl
@@ -1,6 +1,7 @@
 #!/usr/local/bin/perl
 
-push(@INC,"perlasm","../../perlasm");
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 
 &asm_init($ARGV[0],$0);
@@ -24,38 +25,25 @@ sub bn_mul_add_words
 	{
 	local($name)=@_;
 
-	&function_begin($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
 
-	&comment("");
-	$Low="eax";
-	$High="edx";
-	$a="ebx";
-	$w="ebp";
-	$r="edi";
-	$c="esi";
-
-	&xor($c,$c);		# clear carry
-	&mov($r,&wparam(0));	#
-
-	&mov("ecx",&wparam(2));	#
-	&mov($a,&wparam(1));	#
-
-	&and("ecx",0xfffffff8);	# num / 8
-	&mov($w,&wparam(3));	#
-
-	&push("ecx");		# Up the stack for a tmp variable
-
-	&jz(&label("maw_finish"));
+	$r="eax";
+	$a="edx";
+	$c="ecx";
 
 	if ($sse2) {
 		&picmeup("eax","OPENSSL_ia32cap_P");
 		&bt(&DWP(0,"eax"),26);
-		&jnc(&label("maw_loop"));
+		&jnc(&label("maw_non_sse2"));
 
-		&movd("mm0",$w);		# mm0 = w
+		&mov($r,&wparam(0));
+		&mov($a,&wparam(1));
+		&mov($c,&wparam(2));
+		&movd("mm0",&wparam(3));	# mm0 = w
 		&pxor("mm1","mm1");		# mm1 = carry_in
-
-		&set_label("maw_sse2_loop",0);
+		&jmp(&label("maw_sse2_entry"));
+		
+	&set_label("maw_sse2_unrolled",16);
 		&movd("mm3",&DWP(0,$r,"",0));	# mm3 = r[0]
 		&paddq("mm1","mm3");		# mm1 = carry_in + r[0]
 		&movd("mm2",&DWP(0,$a,"",0));	# mm2 = a[0]
@@ -112,42 +100,82 @@ sub bn_mul_add_words
 		&psrlq("mm1",32);		# mm1 = carry6
 		&paddq("mm1","mm3");		# mm1 = carry6 + r[7] + w*a[7]
 		&movd(&DWP(28,$r,"",0),"mm1");
-		&add($r,32);
+		&lea($r,&DWP(32,$r));
 		&psrlq("mm1",32);		# mm1 = carry_out
 
-		&sub("ecx",8);
+		&sub($c,8);
+		&jz(&label("maw_sse2_exit"));	# count reached zero after an 8-word pass
+	&set_label("maw_sse2_entry");
+		&test($c,0xfffffff8);		# any bit above the low three set in count?
+		&jnz(&label("maw_sse2_unrolled"));	# yes: take the unrolled 8-word body
+
+	&set_label("maw_sse2_loop",4);		# remaining words, one per iteration
+		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
+		&movd("mm3",&DWP(0,$r));	# mm3 = r[i]
+		&pmuludq("mm2","mm0");		# a[i] *= w
+		&lea($a,&DWP(4,$a));
+		&paddq("mm1","mm3");		# carry += r[i]
+		&paddq("mm1","mm2");		# carry += a[i]*w
+		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
+		&sub($c,1);
+		&psrlq("mm1",32);		# carry = carry_high
+		&lea($r,&DWP(4,$r));
 		&jnz(&label("maw_sse2_loop"));
-
-		&movd($c,"mm1");		# c = carry_out
+	&set_label("maw_sse2_exit");
+		&movd("eax","mm1");		# eax = carry_out (low 32 bits of mm1)
 		&emms();
+		&ret();
 
-		&jmp(&label("maw_finish"));
+	&set_label("maw_non_sse2",16);
 	}
 
-	&set_label("maw_loop",0);
+	# function_begin prologue
+	&push("ebp");
+	&push("ebx");
+	&push("esi");
+	&push("edi");
+
+	&comment("");
+	$Low="eax";
+	$High="edx";
+	$a="ebx";
+	$w="ebp";
+	$r="edi";
+	$c="esi";
+
+	&xor($c,$c);		# clear carry
+	&mov($r,&wparam(0));	#
+
+	&mov("ecx",&wparam(2));	#
+	&mov($a,&wparam(1));	#
+
+	&and("ecx",0xfffffff8);	# num / 8
+	&mov($w,&wparam(3));	#
 
-	&mov(&swtmp(0),"ecx");	#
+	&push("ecx");		# Up the stack for a tmp variable
+
+	&jz(&label("maw_finish"));
+
+	&set_label("maw_loop",16);
 
 	for ($i=0; $i<32; $i+=4)
 		{
 		&comment("Round $i");
 
-		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
+		 &mov("eax",&DWP($i,$a)); 	# *a
 		&mul($w);			# *a * w
-		&add("eax",$c);		# L(t)+= *r
-		 &mov($c,&DWP($i,$r,"",0));	# L(t)+= *r
+		&add("eax",$c);			# L(t)+= c
 		&adc("edx",0);			# H(t)+=carry
-		 &add("eax",$c);		# L(t)+=c
+		 &add("eax",&DWP($i,$r));	# L(t)+= *r
 		&adc("edx",0);			# H(t)+=carry
-		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);
+		 &mov(&DWP($i,$r),"eax");	# *r= L(t);
 		&mov($c,"edx");			# c=  H(t);
 		}
 
 	&comment("");
-	&mov("ecx",&swtmp(0));	#
-	&add($a,32);
-	&add($r,32);
 	&sub("ecx",8);
+	&lea($a,&DWP(32,$a));
+	&lea($r,&DWP(32,$r));
 	&jnz(&label("maw_loop"));
 
 	&set_label("maw_finish",0);
@@ -160,16 +188,15 @@ sub bn_mul_add_words
 	for ($i=0; $i<7; $i++)
 		{
 		&comment("Tail Round $i");
-		 &mov("eax",&DWP($i*4,$a,"",0));# *a
+		 &mov("eax",&DWP($i*4,$a));	# *a
 		&mul($w);			# *a * w
 		&add("eax",$c);			# L(t)+=c
-		 &mov($c,&DWP($i*4,$r,"",0));	# L(t)+= *r
 		&adc("edx",0);			# H(t)+=carry
-		 &add("eax",$c);
+		 &add("eax",&DWP($i*4,$r));	# L(t)+= *r
 		&adc("edx",0);			# H(t)+=carry
 		 &dec("ecx") if ($i != 7-1);
-		&mov(&DWP($i*4,$r,"",0),"eax");	# *r= L(t);
-		 &mov($c,"edx");			# c=  H(t);
+		&mov(&DWP($i*4,$r),"eax");	# *r= L(t);
+		 &mov($c,"edx");		# c=  H(t);
 		&jz(&label("maw_end")) if ($i != 7-1);
 		}
 	&set_label("maw_end",0);
@@ -184,7 +211,45 @@ sub bn_mul_words
 	{
 	local($name)=@_;
 
-	&function_begin($name,"");
+	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+
+	$r="eax";
+	$a="edx";
+	$c="ecx";
+
+	if ($sse2) {
+		&picmeup("eax","OPENSSL_ia32cap_P");
+		&bt(&DWP(0,"eax"),26);
+		&jnc(&label("mw_non_sse2"));
+
+		&mov($r,&wparam(0));
+		&mov($a,&wparam(1));
+		&mov($c,&wparam(2));
+		&movd("mm0",&wparam(3));	# mm0 = w
+		&pxor("mm1","mm1");		# mm1 = carry = 0
+
+	&set_label("mw_sse2_loop",16);
+		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
+		&pmuludq("mm2","mm0");		# a[i] *= w
+		&lea($a,&DWP(4,$a));
+		&paddq("mm1","mm2");		# carry += a[i]*w
+		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
+		&sub($c,1);
+		&psrlq("mm1",32);		# carry = carry_high
+		&lea($r,&DWP(4,$r));
+		&jnz(&label("mw_sse2_loop"));
+
+		&movd("eax","mm1");		# return carry
+		&emms();
+		&ret();
+	&set_label("mw_non_sse2",16);
+	}
+
+	# function_begin prologue
+	&push("ebp");
+	&push("ebx");
+	&push("esi");
+	&push("edi");
 
 	&comment("");
 	$Low="eax";
@@ -257,7 +322,40 @@ sub bn_sqr_words
 	{
 	local($name)=@_;
 
-	&function_begin($name,"");
+	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+
+	$r="eax";
+	$a="edx";
+	$c="ecx";
+
+	if ($sse2) {
+		&picmeup("eax","OPENSSL_ia32cap_P");
+		&bt(&DWP(0,"eax"),26);
+		&jnc(&label("sqr_non_sse2"));
+
+		&mov($r,&wparam(0));
+		&mov($a,&wparam(1));
+		&mov($c,&wparam(2));
+
+	&set_label("sqr_sse2_loop",16);
+		&movd("mm0",&DWP(0,$a));	# mm0 = a[i]
+		&pmuludq("mm0","mm0");		# a[i] *= a[i]
+		&lea($a,&DWP(4,$a));		# a++
+		&movq(&QWP(0,$r),"mm0");	# r[i] = a[i]*a[i]
+		&sub($c,1);
+		&lea($r,&DWP(8,$r));		# r += 2
+		&jnz(&label("sqr_sse2_loop"));
+
+		&emms();
+		&ret();
+	&set_label("sqr_non_sse2",16);
+	}
+
+	# function_begin prologue
+	&push("ebp");
+	&push("ebx");
+	&push("esi");
+	&push("edi");
 
 	&comment("");
 	$r="esi";
@@ -313,12 +411,13 @@ sub bn_div_words
 	{
 	local($name)=@_;
 
-	&function_begin($name,"");
+	&function_begin_B($name,"");
 	&mov("edx",&wparam(0));	#
 	&mov("eax",&wparam(1));	#
-	&mov("ebx",&wparam(2));	#
-	&div("ebx");
-	&function_end($name);
+	&mov("ecx",&wparam(2));	#
+	&div("ecx");
+	&ret();
+	&function_end_B($name);
 	}
 
 sub bn_add_words
diff --git a/openssl/crypto/bn/asm/co-586.pl b/openssl/crypto/bn/asm/co-586.pl
index 5d962cb95..57101a6bd 100644
--- a/openssl/crypto/bn/asm/co-586.pl
+++ b/openssl/crypto/bn/asm/co-586.pl
@@ -1,6 +1,7 @@
 #!/usr/local/bin/perl
 
-push(@INC,"perlasm","../../perlasm");
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 
 &asm_init($ARGV[0],$0);
diff --git a/openssl/crypto/bn/asm/mo-586.pl b/openssl/crypto/bn/asm/mo-586.pl
deleted file mode 100644
index 098229309..000000000
--- a/openssl/crypto/bn/asm/mo-586.pl
+++ /dev/null
@@ -1,603 +0,0 @@
-#!/usr/bin/env perl
-
-# This is crypto/bn/asm/x86-mont.pl (with asciz from crypto/perlasm/x86asm.pl)
-# from OpenSSL 0.9.9-dev 
-
-sub ::asciz
-{ my @str=unpack("C*",shift);
-    push @str,0;
-    while ($#str>15) {
-	&data_byte(@str[0..15]);
-	foreach (0..15) { shift @str; }
-    }
-    &data_byte(@str) if (@str);
-}
-
-# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# October 2005
-#
-# This is a "teaser" code, as it can be improved in several ways...
-# First of all non-SSE2 path should be implemented (yes, for now it
-# performs Montgomery multiplication/convolution only on SSE2-capable
-# CPUs such as P4, others fall down to original code). Then inner loop
-# can be unrolled and modulo-scheduled to improve ILP and possibly
-# moved to 128-bit XMM register bank (though it would require input
-# rearrangement and/or increase bus bandwidth utilization). Dedicated
-# squaring procedure should give further performance improvement...
-# Yet, for being draft, the code improves rsa512 *sign* benchmark by
-# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
-
-# December 2006
-#
-# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
-# Integer-only code [being equipped with dedicated squaring procedure]
-# gives ~40% on rsa512 sign benchmark...
-
-push(@INC,"perlasm","../../perlasm");
-require "x86asm.pl";
-
-&asm_init($ARGV[0],$0);
-
-$sse2=0;
-for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
-
-&external_label("OPENSSL_ia32cap_P") if ($sse2);
-
-&function_begin("bn_mul_mont");
-
-$i="edx";
-$j="ecx";
-$ap="esi";	$tp="esi";		# overlapping variables!!!
-$rp="edi";	$bp="edi";		# overlapping variables!!!
-$np="ebp";
-$num="ebx";
-
-$_num=&DWP(4*0,"esp");			# stack top layout
-$_rp=&DWP(4*1,"esp");
-$_ap=&DWP(4*2,"esp");
-$_bp=&DWP(4*3,"esp");
-$_np=&DWP(4*4,"esp");
-$_n0=&DWP(4*5,"esp");	$_n0q=&QWP(4*5,"esp");
-$_sp=&DWP(4*6,"esp");
-$_bpend=&DWP(4*7,"esp");
-$frame=32;				# size of above frame rounded up to 16n
-
-	&xor	("eax","eax");
-	&mov	("edi",&wparam(5));	# int num
-	&cmp	("edi",4);
-	&jl	(&label("just_leave"));
-
-	&lea	("esi",&wparam(0));	# put aside pointer to argument block
-	&lea	("edx",&wparam(1));	# load ap
-	&mov	("ebp","esp");		# saved stack pointer!
-	&add	("edi",2);		# extra two words on top of tp
-	&neg	("edi");
-	&lea	("esp",&DWP(-$frame,"esp","edi",4));	# alloca($frame+4*(num+2))
-	&neg	("edi");
-
-	# minimize cache contention by arraning 2K window between stack
-	# pointer and ap argument [np is also position sensitive vector,
-	# but it's assumed to be near ap, as it's allocated at ~same
-	# time].
-	&mov	("eax","esp");
-	&sub	("eax","edx");
-	&and	("eax",2047);
-	&sub	("esp","eax");		# this aligns sp and ap modulo 2048
-
-	&xor	("edx","esp");
-	&and	("edx",2048);
-	&xor	("edx",2048);
-	&sub	("esp","edx");		# this splits them apart modulo 4096
-
-	&and	("esp",-64);		# align to cache line
-
-	################################# load argument block...
-	&mov	("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
-	&mov	("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
-	&mov	("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
-	&mov	("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
-	&mov	("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
-	#&mov	("edi",&DWP(5*4,"esi"));# int num
-
-	&mov	("esi",&DWP(0,"esi"));	# pull n0[0]
-	&mov	($_rp,"eax");		# ... save a copy of argument block
-	&mov	($_ap,"ebx");
-	&mov	($_bp,"ecx");
-	&mov	($_np,"edx");
-	&mov	($_n0,"esi");
-	&lea	($num,&DWP(-3,"edi"));	# num=num-1 to assist modulo-scheduling
-	#&mov	($_num,$num);		# redundant as $num is not reused
-	&mov	($_sp,"ebp");		# saved stack pointer!
-
-if($sse2) {
-$acc0="mm0";	# mmx register bank layout
-$acc1="mm1";
-$car0="mm2";
-$car1="mm3";
-$mul0="mm4";
-$mul1="mm5";
-$temp="mm6";
-$mask="mm7";
-
-	&picmeup("eax","OPENSSL_ia32cap_P");
-	&bt	(&DWP(0,"eax"),26);
-	&jnc	(&label("non_sse2"));
-
-	&mov	("eax",-1);
-	&movd	($mask,"eax");		# mask 32 lower bits
-
-	&mov	($ap,$_ap);		# load input pointers
-	&mov	($bp,$_bp);
-	&mov	($np,$_np);
-
-	&xor	($i,$i);		# i=0
-	&xor	($j,$j);		# j=0
-
-	&movd	($mul0,&DWP(0,$bp));		# bp[0]
-	&movd	($mul1,&DWP(0,$ap));		# ap[0]
-	&movd	($car1,&DWP(0,$np));		# np[0]
-
-	&pmuludq($mul1,$mul0);			# ap[0]*bp[0]
-	&movq	($car0,$mul1);
-	&movq	($acc0,$mul1);			# I wish movd worked for
-	&pand	($acc0,$mask);			# inter-register transfers
-
-	&pmuludq($mul1,$_n0q);			# *=n0
-
-	&pmuludq($car1,$mul1);			# "t[0]"*np[0]*n0
-	&paddq	($car1,$acc0);
-
-	&movd	($acc1,&DWP(4,$np));		# np[1]
-	&movd	($acc0,&DWP(4,$ap));		# ap[1]
-
-	&psrlq	($car0,32);
-	&psrlq	($car1,32);
-
-	&inc	($j);				# j++
-&set_label("1st",16);
-	&pmuludq($acc0,$mul0);			# ap[j]*bp[0]
-	&pmuludq($acc1,$mul1);			# np[j]*m1
-	&paddq	($car0,$acc0);			# +=c0
-	&paddq	($car1,$acc1);			# +=c1
-
-	&movq	($acc0,$car0);
-	&pand	($acc0,$mask);
-	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
-	&paddq	($car1,$acc0);			# +=ap[j]*bp[0];
-	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
-	&psrlq	($car0,32);
-	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[j-1]=
-	&psrlq	($car1,32);
-
-	&lea	($j,&DWP(1,$j));
-	&cmp	($j,$num);
-	&jl	(&label("1st"));
-
-	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[0]
-	&pmuludq($acc1,$mul1);			# np[num-1]*m1
-	&paddq	($car0,$acc0);			# +=c0
-	&paddq	($car1,$acc1);			# +=c1
-
-	&movq	($acc0,$car0);
-	&pand	($acc0,$mask);
-	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[0];
-	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
-
-	&psrlq	($car0,32);
-	&psrlq	($car1,32);
-
-	&paddq	($car1,$car0);
-	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
-
-	&inc	($i);				# i++
-&set_label("outer");
-	&xor	($j,$j);			# j=0
-
-	&movd	($mul0,&DWP(0,$bp,$i,4));	# bp[i]
-	&movd	($mul1,&DWP(0,$ap));		# ap[0]
-	&movd	($temp,&DWP($frame,"esp"));	# tp[0]
-	&movd	($car1,&DWP(0,$np));		# np[0]
-	&pmuludq($mul1,$mul0);			# ap[0]*bp[i]
-
-	&paddq	($mul1,$temp);			# +=tp[0]
-	&movq	($acc0,$mul1);
-	&movq	($car0,$mul1);
-	&pand	($acc0,$mask);
-
-	&pmuludq($mul1,$_n0q);			# *=n0
-
-	&pmuludq($car1,$mul1);
-	&paddq	($car1,$acc0);
-
-	&movd	($temp,&DWP($frame+4,"esp"));	# tp[1]
-	&movd	($acc1,&DWP(4,$np));		# np[1]
-	&movd	($acc0,&DWP(4,$ap));		# ap[1]
-
-	&psrlq	($car0,32);
-	&psrlq	($car1,32);
-	&paddq	($car0,$temp);			# +=tp[1]
-
-	&inc	($j);				# j++
-	&dec	($num);
-&set_label("inner");
-	&pmuludq($acc0,$mul0);			# ap[j]*bp[i]
-	&pmuludq($acc1,$mul1);			# np[j]*m1
-	&paddq	($car0,$acc0);			# +=c0
-	&paddq	($car1,$acc1);			# +=c1
-
-	&movq	($acc0,$car0);
-	&movd	($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
-	&pand	($acc0,$mask);
-	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
-	&paddq	($car1,$acc0);			# +=ap[j]*bp[i]+tp[j]
-	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
-	&psrlq	($car0,32);
-	&movd	(&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
-	&psrlq	($car1,32);
-	&paddq	($car0,$temp);			# +=tp[j+1]
-
-	&dec	($num);
-	&lea	($j,&DWP(1,$j));		# j++
-	&jnz	(&label("inner"));
-
-	&mov	($num,$j);
-	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[i]
-	&pmuludq($acc1,$mul1);			# np[num-1]*m1
-	&paddq	($car0,$acc0);			# +=c0
-	&paddq	($car1,$acc1);			# +=c1
-
-	&movq	($acc0,$car0);
-	&pand	($acc0,$mask);
-	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[i]+tp[num-1]
-	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
-	&psrlq	($car0,32);
-	&psrlq	($car1,32);
-
-	&movd	($temp,&DWP($frame+4,"esp",$num,4));	# += tp[num]
-	&paddq	($car1,$car0);
-	&paddq	($car1,$temp);
-	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
-
-	&lea	($i,&DWP(1,$i));		# i++
-	&cmp	($i,$num);
-	&jle	(&label("outer"));
-
-	&emms	();				# done with mmx bank
-	&jmp	(&label("common_tail"));
-
-&set_label("non_sse2",16);
-}
-
-if (0) {
-	&mov	("esp",$_sp);
-	&xor	("eax","eax");	# signal "not fast enough [yet]"
-	&jmp	(&label("just_leave"));
-	# While the below code provides competitive performance for
-	# all key lengthes on modern Intel cores, it's still more
-	# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
-	# means compared to the original integer-only assembler.
-	# 512-bit RSA sign is better by ~40%, but that's about all
-	# one can say about all CPUs...
-} else {
-$inp="esi";	# integer path uses these registers differently
-$word="edi";
-$carry="ebp";
-
-	&mov	($inp,$_ap);
-	&lea	($carry,&DWP(1,$num));
-	&mov	($word,$_bp);
-	&xor	($j,$j);				# j=0
-	&mov	("edx",$inp);
-	&and	($carry,1);				# see if num is even
-	&sub	("edx",$word);				# see if ap==bp
-	&lea	("eax",&DWP(4,$word,$num,4));		# &bp[num]
-	&or	($carry,"edx");
-	&mov	($word,&DWP(0,$word));			# bp[0]
-	&jz	(&label("bn_sqr_mont"));
-	&mov	($_bpend,"eax");
-	&mov	("eax",&DWP(0,$inp));
-	&xor	("edx","edx");
-
-&set_label("mull",16);
-	&mov	($carry,"edx");
-	&mul	($word);				# ap[j]*bp[0]
-	&add	($carry,"eax");
-	&lea	($j,&DWP(1,$j));
-	&adc	("edx",0);
-	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
-	&cmp	($j,$num);
-	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
-	&jl	(&label("mull"));
-
-	&mov	($carry,"edx");
-	&mul	($word);				# ap[num-1]*bp[0]
-	 &mov	($word,$_n0);
-	&add	("eax",$carry);
-	 &mov	($inp,$_np);
-	&adc	("edx",0);
-	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
-
-	&mov	(&DWP($frame,"esp",$num,4),"eax");	# tp[num-1]=
-	&xor	($j,$j);
-	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
-	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
-
-	&mov	("eax",&DWP(0,$inp));			# np[0]
-	&mul	($word);				# np[0]*m
-	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
-	&mov	("eax",&DWP(4,$inp));			# np[1]
-	&adc	("edx",0);
-	&inc	($j);
-
-	&jmp	(&label("2ndmadd"));
-
-&set_label("1stmadd",16);
-	&mov	($carry,"edx");
-	&mul	($word);				# ap[j]*bp[i]
-	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
-	&lea	($j,&DWP(1,$j));
-	&adc	("edx",0);
-	&add	($carry,"eax");
-	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
-	&adc	("edx",0);
-	&cmp	($j,$num);
-	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
-	&jl	(&label("1stmadd"));
-
-	&mov	($carry,"edx");
-	&mul	($word);				# ap[num-1]*bp[i]
-	&add	("eax",&DWP($frame,"esp",$num,4));	# +=tp[num-1]
-	 &mov	($word,$_n0);
-	&adc	("edx",0);
-	 &mov	($inp,$_np);
-	&add	($carry,"eax");
-	&adc	("edx",0);
-	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
-
-	&xor	($j,$j);
-	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
-	&mov	(&DWP($frame,"esp",$num,4),$carry);	# tp[num-1]=
-	&adc	($j,0);
-	 &mov	("eax",&DWP(0,$inp));			# np[0]
-	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
-	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
-
-	&mul	($word);				# np[0]*m
-	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
-	&mov	("eax",&DWP(4,$inp));			# np[1]
-	&adc	("edx",0);
-	&mov	($j,1);
-
-&set_label("2ndmadd",16);
-	&mov	($carry,"edx");
-	&mul	($word);				# np[j]*m
-	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
-	&lea	($j,&DWP(1,$j));
-	&adc	("edx",0);
-	&add	($carry,"eax");
-	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+1]
-	&adc	("edx",0);
-	&cmp	($j,$num);
-	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j-1]=
-	&jl	(&label("2ndmadd"));
-
-	&mov	($carry,"edx");
-	&mul	($word);				# np[j]*m
-	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
-	&adc	("edx",0);
-	&add	($carry,"eax");
-	&adc	("edx",0);
-	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
-
-	&xor	("eax","eax");
-	 &mov	($j,$_bp);				# &bp[i]
-	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
-	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
-	 &lea	($j,&DWP(4,$j));
-	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
-	 &cmp	($j,$_bpend);
-	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
-	&je	(&label("common_tail"));
-
-	&mov	($word,&DWP(0,$j));			# bp[i+1]
-	&mov	($inp,$_ap);
-	&mov	($_bp,$j);				# &bp[++i]
-	&xor	($j,$j);
-	&xor	("edx","edx");
-	&mov	("eax",&DWP(0,$inp));
-	&jmp	(&label("1stmadd"));
-
-&set_label("bn_sqr_mont",16);
-$sbit=$num;
-	&mov	($_num,$num);
-	&mov	($_bp,$j);				# i=0
-
-	&mov	("eax",$word);				# ap[0]
-	&mul	($word);				# ap[0]*ap[0]
-	&mov	(&DWP($frame,"esp"),"eax");		# tp[0]=
-	&mov	($sbit,"edx");
-	&shr	("edx",1);
-	&and	($sbit,1);
-	&inc	($j);
-&set_label("sqr",16);
-	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
-	&mov	($carry,"edx");
-	&mul	($word);				# ap[j]*ap[0]
-	&add	("eax",$carry);
-	&lea	($j,&DWP(1,$j));
-	&adc	("edx",0);
-	&lea	($carry,&DWP(0,$sbit,"eax",2));
-	&shr	("eax",31);
-	&cmp	($j,$_num);
-	&mov	($sbit,"eax");
-	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
-	&jl	(&label("sqr"));
-
-	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[num-1]
-	&mov	($carry,"edx");
-	&mul	($word);				# ap[num-1]*ap[0]
-	&add	("eax",$carry);
-	 &mov	($word,$_n0);
-	&adc	("edx",0);
-	 &mov	($inp,$_np);
-	&lea	($carry,&DWP(0,$sbit,"eax",2));
-	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
-	&shr	("eax",31);
-	&mov	(&DWP($frame,"esp",$j,4),$carry);	# tp[num-1]=
-
-	&lea	($carry,&DWP(0,"eax","edx",2));
-	 &mov	("eax",&DWP(0,$inp));			# np[0]
-	&shr	("edx",31);
-	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num]=
-	&mov	(&DWP($frame+8,"esp",$j,4),"edx");	# tp[num+1]=
-
-	&mul	($word);				# np[0]*m
-	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
-	&mov	($num,$j);
-	&adc	("edx",0);
-	&mov	("eax",&DWP(4,$inp));			# np[1]
-	&mov	($j,1);
-
-&set_label("3rdmadd",16);
-	&mov	($carry,"edx");
-	&mul	($word);				# np[j]*m
-	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
-	&adc	("edx",0);
-	&add	($carry,"eax");
-	&mov	("eax",&DWP(4,$inp,$j,4));		# np[j+1]
-	&adc	("edx",0);
-	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j-1]=
-
-	&mov	($carry,"edx");
-	&mul	($word);				# np[j+1]*m
-	&add	($carry,&DWP($frame+4,"esp",$j,4));	# +=tp[j+1]
-	&lea	($j,&DWP(2,$j));
-	&adc	("edx",0);
-	&add	($carry,"eax");
-	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+2]
-	&adc	("edx",0);
-	&cmp	($j,$num);
-	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j]=
-	&jl	(&label("3rdmadd"));
-
-	&mov	($carry,"edx");
-	&mul	($word);				# np[j]*m
-	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
-	&adc	("edx",0);
-	&add	($carry,"eax");
-	&adc	("edx",0);
-	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
-
-	&mov	($j,$_bp);				# i
-	&xor	("eax","eax");
-	&mov	($inp,$_ap);
-	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
-	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
-	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
-	&cmp	($j,$num);
-	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
-	&je	(&label("common_tail"));
-
-	&mov	($word,&DWP(4,$inp,$j,4));		# ap[i]
-	&lea	($j,&DWP(1,$j));
-	&mov	("eax",$word);
-	&mov	($_bp,$j);				# ++i
-	&mul	($word);				# ap[i]*ap[i]
-	&add	("eax",&DWP($frame,"esp",$j,4));	# +=tp[i]
-	&adc	("edx",0);
-	&mov	(&DWP($frame,"esp",$j,4),"eax");	# tp[i]=
-	&xor	($carry,$carry);
-	&cmp	($j,$num);
-	&lea	($j,&DWP(1,$j));
-	&je	(&label("sqrlast"));
-
-	&mov	($sbit,"edx");				# zaps $num
-	&shr	("edx",1);
-	&and	($sbit,1);
-&set_label("sqradd",16);
-	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
-	&mov	($carry,"edx");
-	&mul	($word);				# ap[j]*ap[i]
-	&add	("eax",$carry);
-	&lea	($carry,&DWP(0,"eax","eax"));
-	&adc	("edx",0);
-	&shr	("eax",31);
-	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
-	&lea	($j,&DWP(1,$j));
-	&adc	("eax",0);
-	&add	($carry,$sbit);
-	&adc	("eax",0);
-	&cmp	($j,$_num);
-	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
-	&mov	($sbit,"eax");
-	&jle	(&label("sqradd"));
-
-	&mov	($carry,"edx");
-	&lea	("edx",&DWP(0,$sbit,"edx",2));
-	&shr	($carry,31);
-&set_label("sqrlast");
-	&mov	($word,$_n0);
-	&mov	($inp,$_np);
-	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
-
-	&add	("edx",&DWP($frame,"esp",$j,4));	# +=tp[num]
-	&mov	("eax",&DWP(0,$inp));			# np[0]
-	&adc	($carry,0);
-	&mov	(&DWP($frame,"esp",$j,4),"edx");	# tp[num]=
-	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num+1]=
-
-	&mul	($word);				# np[0]*m
-	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
-	&lea	($num,&DWP(-1,$j));
-	&adc	("edx",0);
-	&mov	($j,1);
-	&mov	("eax",&DWP(4,$inp));			# np[1]
-
-	&jmp	(&label("3rdmadd"));
-}
-
-&set_label("common_tail",16);
-	&mov	($np,$_np);			# load modulus pointer
-	&mov	($rp,$_rp);			# load result pointer
-	&lea	($tp,&DWP($frame,"esp"));	# [$ap and $bp are zapped]
-
-	&mov	("eax",&DWP(0,$tp));		# tp[0]
-	&mov	($j,$num);			# j=num-1
-	&xor	($i,$i);			# i=0 and clear CF!
-
-&set_label("sub",16);
-	&sbb	("eax",&DWP(0,$np,$i,4));
-	&mov	(&DWP(0,$rp,$i,4),"eax");	# rp[i]=tp[i]-np[i]
-	&dec	($j);				# doesn't affect CF!
-	&mov	("eax",&DWP(4,$tp,$i,4));	# tp[i+1]
-	&lea	($i,&DWP(1,$i));		# i++
-	&jge	(&label("sub"));
-
-	&sbb	("eax",0);			# handle upmost overflow bit
-	&and	($tp,"eax");
-	&not	("eax");
-	&mov	($np,$rp);
-	&and	($np,"eax");
-	&or	($tp,$np);			# tp=carry?tp:rp
-
-&set_label("copy",16);				# copy or in-place refresh
-	&mov	("eax",&DWP(0,$tp,$num,4));
-	&mov	(&DWP(0,$rp,$num,4),"eax");	# rp[i]=tp[i]
-	&mov	(&DWP($frame,"esp",$num,4),$j);	# zap temporary vector
-	&dec	($num);
-	&jge	(&label("copy"));
-
-	&mov	("esp",$_sp);		# pull saved stack pointer
-	&mov	("eax",1);
-&set_label("just_leave");
-&function_end("bn_mul_mont");
-
-&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
-
-&asm_finish();
diff --git a/openssl/crypto/bn/asm/ppc.pl b/openssl/crypto/bn/asm/ppc.pl
index 08e005347..37c65d351 100644
--- a/openssl/crypto/bn/asm/ppc.pl
+++ b/openssl/crypto/bn/asm/ppc.pl
@@ -100,9 +100,9 @@
 #	me a note at schari@us.ibm.com
 #
 
-$opf = shift;
+$flavour = shift;
 
-if ($opf =~ /32\.s/) {
+if ($flavour =~ /32/) {
 	$BITS=	32;
 	$BNSZ=	$BITS/8;
 	$ISA=	"\"ppc\"";
@@ -125,7 +125,7 @@ if ($opf =~ /32\.s/) {
 	$INSR=	"insrwi";	# insert right
 	$ROTL=	"rotlwi";	# rotate left by immediate
 	$TR=	"tw";		# conditional trap
-} elsif ($opf =~ /64\.s/) {
+} elsif ($flavour =~ /64/) {
 	$BITS=	64;
 	$BNSZ=	$BITS/8;
 	$ISA=	"\"ppc64\"";
@@ -149,93 +149,16 @@ if ($opf =~ /32\.s/) {
 	$INSR=	"insrdi";	# insert right 
 	$ROTL=	"rotldi";	# rotate left by immediate
 	$TR=	"td";		# conditional trap
-} else { die "nonsense $opf"; }
+} else { die "nonsense $flavour"; }
 
-( defined shift || open STDOUT,">$opf" ) || die "can't open $opf: $!";
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
 
-# function entry points from the AIX code
-#
-# There are other, more elegant, ways to handle this. We (IBM) chose
-# this approach as it plays well with scripts we run to 'namespace'
-# OpenSSL .i.e. we add a prefix to all the public symbols so we can
-# co-exist in the same process with other implementations of OpenSSL.
-# 'cleverer' ways of doing these substitutions tend to hide data we
-# need to be obvious.
-#
-my @items = ("bn_sqr_comba4",
-	     "bn_sqr_comba8",
-	     "bn_mul_comba4",
-	     "bn_mul_comba8",
-	     "bn_sub_words",
-	     "bn_add_words",
-	     "bn_div_words",
-	     "bn_sqr_words",
-	     "bn_mul_words",
-	     "bn_mul_add_words");
+open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";	# 'or', not '||': '||' binds to the string operand, so die could never fire
 
-if    ($opf =~ /linux/)	{  do_linux();	}
-elsif ($opf =~ /aix/)	{  do_aix();	}
-elsif ($opf =~ /osx/)	{  do_osx();	}
-else			{  do_bsd();	}
-
-sub do_linux {
-    $d=&data();
-
-    if ($BITS==64) {
-      foreach $t (@items) {
-        $d =~ s/\.$t:/\
-\t.section\t".opd","aw"\
-\t.align\t3\
-\t.globl\t$t\
-$t:\
-\t.quad\t.$t,.TOC.\@tocbase,0\
-\t.size\t$t,24\
-\t.previous\n\
-\t.type\t.$t,\@function\
-\t.globl\t.$t\
-.$t:/g;
-      }
-    }
-    else {
-      foreach $t (@items) {
-        $d=~s/\.$t/$t/g;
-      }
-    }
-    # hide internal labels to avoid pollution of name table...
-    $d=~s/Lppcasm_/.Lppcasm_/gm;
-    print $d;
-}
-
-sub do_aix {
-    # AIX assembler is smart enough to please the linker without
-    # making us do something special...
-    print &data();
-}
-
-# MacOSX 32 bit
-sub do_osx {
-    $d=&data();
-    # Change the bn symbol prefix from '.' to '_'
-    foreach $t (@items) {
-      $d=~s/\.$t/_$t/g;
-    }
-    # Change .machine to something OS X asm will accept
-    $d=~s/\.machine.*/.text/g;
-    $d=~s/\#/;/g; # change comment from '#' to ';'
-    print $d;
-}
-
-# BSD (Untested)
-sub do_bsd {
-    $d=&data();
-    foreach $t (@items) {
-      $d=~s/\.$t/_$t/g;
-    }
-    print $d;
-}
-
-sub data {
-	local($data)=<<EOF;
+$data=<<EOF;
 #--------------------------------------------------------------------
 #
 #
@@ -297,33 +220,20 @@ sub data {
 #
 #	Defines to be used in the assembly code.
 #	
-.set r0,0	# we use it as storage for value of 0
-.set SP,1	# preserved
-.set RTOC,2	# preserved 
-.set r3,3	# 1st argument/return value
-.set r4,4	# 2nd argument/volatile register
-.set r5,5	# 3rd argument/volatile register
-.set r6,6	# ...
-.set r7,7
-.set r8,8
-.set r9,9
-.set r10,10
-.set r11,11
-.set r12,12
-.set r13,13	# not used, nor any other "below" it...
-
-.set BO_IF_NOT,4
-.set BO_IF,12
-.set BO_dCTR_NZERO,16
-.set BO_dCTR_ZERO,18
-.set BO_ALWAYS,20
-.set CR0_LT,0;
-.set CR0_GT,1;
-.set CR0_EQ,2
-.set CR1_FX,4;
-.set CR1_FEX,5;
-.set CR1_VX,6
-.set LR,8
+#.set r0,0	# we use it as storage for value of 0
+#.set SP,1	# preserved
+#.set RTOC,2	# preserved 
+#.set r3,3	# 1st argument/return value
+#.set r4,4	# 2nd argument/volatile register
+#.set r5,5	# 3rd argument/volatile register
+#.set r6,6	# ...
+#.set r7,7
+#.set r8,8
+#.set r9,9
+#.set r10,10
+#.set r11,11
+#.set r12,12
+#.set r13,13	# not used, nor any other "below" it...
 
 #	Declare function names to be global
 #	NOTE:	For gcc these names MUST be changed to remove
@@ -344,7 +254,7 @@ sub data {
 	
 # .text section
 	
-	.machine	$ISA
+	.machine	"any"
 
 #
 #	NOTE:	The following label name should be changed to
@@ -478,7 +388,7 @@ sub data {
 
 	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1
 	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2
-	bclr	BO_ALWAYS,CR0_LT
+	blr
 	.long	0x00000000
 
 #
@@ -903,7 +813,7 @@ sub data {
 	$ST		r9, `15*$BNSZ`(r3)	#r[15]=c1;
 
 
-	bclr	BO_ALWAYS,CR0_LT
+	blr
 
 	.long	0x00000000
 
@@ -1055,7 +965,7 @@ sub data {
 
 	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1
 	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2
-	bclr	BO_ALWAYS,CR0_LT
+	blr
 	.long	0x00000000
 
 #
@@ -1591,7 +1501,7 @@ sub data {
 	adde	r10,r10,r9
 	$ST	r12,`14*$BNSZ`(r3)	#r[14]=c3;
 	$ST	r10,`15*$BNSZ`(r3)	#r[15]=c1;
-	bclr	BO_ALWAYS,CR0_LT
+	blr
 	.long	0x00000000
 
 #
@@ -1623,7 +1533,7 @@ sub data {
 	subfc.	r7,r0,r6        # If r6 is 0 then result is 0.
 				# if r6 > 0 then result !=0
 				# In either case carry bit is set.
-	bc	BO_IF,CR0_EQ,Lppcasm_sub_adios
+	beq	Lppcasm_sub_adios
 	addi	r4,r4,-$BNSZ
 	addi	r3,r3,-$BNSZ
 	addi	r5,r5,-$BNSZ
@@ -1635,11 +1545,11 @@ Lppcasm_sub_mainloop:
 				# if carry = 1 this is r7-r8. Else it
 				# is r7-r8 -1 as we need.
 	$STU	r6,$BNSZ(r3)
-	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_sub_mainloop
+	bdnz-	Lppcasm_sub_mainloop
 Lppcasm_sub_adios:	
 	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
 	andi.	r3,r3,1         # keep only last bit.
-	bclr	BO_ALWAYS,CR0_LT
+	blr
 	.long	0x00000000
 
 
@@ -1670,7 +1580,7 @@ Lppcasm_sub_adios:
 #	check for r6 = 0. Is this needed?
 #
 	addic.	r6,r6,0		#test r6 and clear carry bit.
-	bc	BO_IF,CR0_EQ,Lppcasm_add_adios
+	beq	Lppcasm_add_adios
 	addi	r4,r4,-$BNSZ
 	addi	r3,r3,-$BNSZ
 	addi	r5,r5,-$BNSZ
@@ -1680,10 +1590,10 @@ Lppcasm_add_mainloop:
 	$LDU	r8,$BNSZ(r5)
 	adde	r8,r7,r8
 	$STU	r8,$BNSZ(r3)
-	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_add_mainloop
+	bdnz-	Lppcasm_add_mainloop
 Lppcasm_add_adios:	
 	addze	r3,r0			#return carry bit.
-	bclr	BO_ALWAYS,CR0_LT
+	blr
 	.long	0x00000000
 
 #
@@ -1707,24 +1617,24 @@ Lppcasm_add_adios:
 #	r5 = d
 	
 	$UCMPI	0,r5,0			# compare r5 and 0
-	bc	BO_IF_NOT,CR0_EQ,Lppcasm_div1	# proceed if d!=0
+	bne	Lppcasm_div1		# proceed if d!=0
 	li	r3,-1			# d=0 return -1
-	bclr	BO_ALWAYS,CR0_LT	
+	blr
 Lppcasm_div1:
 	xor	r0,r0,r0		#r0=0
 	li	r8,$BITS
 	$CNTLZ.	r7,r5			#r7 = num leading 0s in d.
-	bc	BO_IF,CR0_EQ,Lppcasm_div2	#proceed if no leading zeros
+	beq	Lppcasm_div2		#proceed if no leading zeros
 	subf	r8,r7,r8		#r8 = BN_num_bits_word(d)
 	$SHR.	r9,r3,r8		#are there any bits above r8'th?
 	$TR	16,r9,r0		#if there're, signal to dump core...
 Lppcasm_div2:
 	$UCMP	0,r3,r5			#h>=d?
-	bc	BO_IF,CR0_LT,Lppcasm_div3	#goto Lppcasm_div3 if not
+	blt	Lppcasm_div3		#goto Lppcasm_div3 if not
 	subf	r3,r5,r3		#h-=d ; 
 Lppcasm_div3:				#r7 = BN_BITS2-i. so r7=i
 	cmpi	0,0,r7,0		# is (i == 0)?
-	bc	BO_IF,CR0_EQ,Lppcasm_div4
+	beq	Lppcasm_div4
 	$SHL	r3,r3,r7		# h = (h<< i)
 	$SHR	r8,r4,r8		# r8 = (l >> BN_BITS2 -i)
 	$SHL	r5,r5,r7		# d<<=i
@@ -1741,7 +1651,7 @@ Lppcasm_divouterloop:
 	$SHRI	r11,r4,`$BITS/2`	#r11= (l&BN_MASK2h)>>BN_BITS4
 					# compute here for innerloop.
 	$UCMP	0,r8,r9			# is (h>>BN_BITS4)==dh
-	bc	BO_IF_NOT,CR0_EQ,Lppcasm_div5	# goto Lppcasm_div5 if not
+	bne	Lppcasm_div5		# goto Lppcasm_div5 if not
 
 	li	r8,-1
 	$CLRU	r8,r8,`$BITS/2`		#q = BN_MASK2l 
@@ -1762,9 +1672,9 @@ Lppcasm_divinnerloop:
 					# the following 2 instructions do that
 	$SHLI	r7,r10,`$BITS/2`	# r7 = (t<<BN_BITS4)
 	or	r7,r7,r11		# r7|=((l&BN_MASK2h)>>BN_BITS4)
-	$UCMP	1,r6,r7			# compare (tl <= r7)
-	bc	BO_IF_NOT,CR0_EQ,Lppcasm_divinnerexit
-	bc	BO_IF_NOT,CR1_FEX,Lppcasm_divinnerexit
+	$UCMP	cr1,r6,r7		# compare (tl <= r7)
+	bne	Lppcasm_divinnerexit
+	ble	cr1,Lppcasm_divinnerexit
 	addi	r8,r8,-1		#q--
 	subf	r12,r9,r12		#th -=dh
 	$CLRU	r10,r5,`$BITS/2`	#r10=dl. t is no longer needed in loop.
@@ -1773,14 +1683,14 @@ Lppcasm_divinnerloop:
 Lppcasm_divinnerexit:
 	$SHRI	r10,r6,`$BITS/2`	#t=(tl>>BN_BITS4)
 	$SHLI	r11,r6,`$BITS/2`	#tl=(tl<<BN_BITS4)&BN_MASK2h;
-	$UCMP	1,r4,r11		# compare l and tl
+	$UCMP	cr1,r4,r11		# compare l and tl
 	add	r12,r12,r10		# th+=t
-	bc	BO_IF_NOT,CR1_FX,Lppcasm_div7  # if (l>=tl) goto Lppcasm_div7
+	bge	cr1,Lppcasm_div7	# if (l>=tl) goto Lppcasm_div7
 	addi	r12,r12,1		# th++
 Lppcasm_div7:
 	subf	r11,r11,r4		#r11=l-tl
-	$UCMP	1,r3,r12		#compare h and th
-	bc	BO_IF_NOT,CR1_FX,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
+	$UCMP	cr1,r3,r12		#compare h and th
+	bge	cr1,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
 	addi	r8,r8,-1		# q--
 	add	r3,r5,r3		# h+=d
 Lppcasm_div8:
@@ -1791,12 +1701,12 @@ Lppcasm_div8:
 					# the following 2 instructions will do this.
 	$INSR	r11,r12,`$BITS/2`,`$BITS/2`	# r11 is the value we want rotated $BITS/2.
 	$ROTL	r3,r11,`$BITS/2`	# rotate by $BITS/2 and store in r3
-	bc	BO_dCTR_ZERO,CR0_EQ,Lppcasm_div9#if (count==0) break ;
+	bdz	Lppcasm_div9		#if (count==0) break ;
 	$SHLI	r0,r8,`$BITS/2`		#ret =q<<BN_BITS4
 	b	Lppcasm_divouterloop
 Lppcasm_div9:
 	or	r3,r8,r0
-	bclr	BO_ALWAYS,CR0_LT
+	blr
 	.long	0x00000000
 
 #
@@ -1822,7 +1732,7 @@ Lppcasm_div9:
 #	No unrolling done here. Not performance critical.
 
 	addic.	r5,r5,0			#test r5.
-	bc	BO_IF,CR0_EQ,Lppcasm_sqr_adios
+	beq	Lppcasm_sqr_adios
 	addi	r4,r4,-$BNSZ
 	addi	r3,r3,-$BNSZ
 	mtctr	r5
@@ -1833,9 +1743,9 @@ Lppcasm_sqr_mainloop:
 	$UMULH  r8,r6,r6
 	$STU	r7,$BNSZ(r3)
 	$STU	r8,$BNSZ(r3)
-	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_sqr_mainloop
+	bdnz-	Lppcasm_sqr_mainloop
 Lppcasm_sqr_adios:	
-	bclr	BO_ALWAYS,CR0_LT
+	blr
 	.long	0x00000000
 
 
@@ -1858,7 +1768,7 @@ Lppcasm_sqr_adios:
 	xor	r0,r0,r0
 	xor	r12,r12,r12		# used for carry
 	rlwinm.	r7,r5,30,2,31		# num >> 2
-	bc	BO_IF,CR0_EQ,Lppcasm_mw_REM
+	beq	Lppcasm_mw_REM
 	mtctr	r7
 Lppcasm_mw_LOOP:	
 					#mul(rp[0],ap[0],w,c1);
@@ -1896,11 +1806,11 @@ Lppcasm_mw_LOOP:
 	
 	addi	r3,r3,`4*$BNSZ`
 	addi	r4,r4,`4*$BNSZ`
-	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_mw_LOOP
+	bdnz-	Lppcasm_mw_LOOP
 
 Lppcasm_mw_REM:
 	andi.	r5,r5,0x3
-	bc	BO_IF,CR0_EQ,Lppcasm_mw_OVER
+	beq	Lppcasm_mw_OVER
 					#mul(rp[0],ap[0],w,c1);
 	$LD	r8,`0*$BNSZ`(r4)
 	$UMULL	r9,r6,r8
@@ -1912,7 +1822,7 @@ Lppcasm_mw_REM:
 	
 	addi	r5,r5,-1
 	cmpli	0,0,r5,0
-	bc	BO_IF,CR0_EQ,Lppcasm_mw_OVER
+	beq	Lppcasm_mw_OVER
 
 	
 					#mul(rp[1],ap[1],w,c1);
@@ -1926,7 +1836,7 @@ Lppcasm_mw_REM:
 	
 	addi	r5,r5,-1
 	cmpli	0,0,r5,0
-	bc	BO_IF,CR0_EQ,Lppcasm_mw_OVER
+	beq	Lppcasm_mw_OVER
 	
 					#mul_add(rp[2],ap[2],w,c1);
 	$LD	r8,`2*$BNSZ`(r4)
@@ -1939,7 +1849,7 @@ Lppcasm_mw_REM:
 		
 Lppcasm_mw_OVER:	
 	addi	r3,r12,0
-	bclr	BO_ALWAYS,CR0_LT
+	blr
 	.long	0x00000000
 
 #
@@ -1964,7 +1874,7 @@ Lppcasm_mw_OVER:
 	xor	r0,r0,r0		#r0 = 0
 	xor	r12,r12,r12  		#r12 = 0 . used for carry		
 	rlwinm.	r7,r5,30,2,31		# num >> 2
-	bc	BO_IF,CR0_EQ,Lppcasm_maw_leftover	# if (num < 4) go LPPCASM_maw_leftover
+	beq	Lppcasm_maw_leftover	# if (num < 4) go LPPCASM_maw_leftover
 	mtctr	r7
 Lppcasm_maw_mainloop:	
 					#mul_add(rp[0],ap[0],w,c1);
@@ -2017,11 +1927,11 @@ Lppcasm_maw_mainloop:
 	$ST	r11,`3*$BNSZ`(r3)
 	addi	r3,r3,`4*$BNSZ`
 	addi	r4,r4,`4*$BNSZ`
-	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_maw_mainloop
+	bdnz-	Lppcasm_maw_mainloop
 	
 Lppcasm_maw_leftover:
 	andi.	r5,r5,0x3
-	bc	BO_IF,CR0_EQ,Lppcasm_maw_adios
+	beq	Lppcasm_maw_adios
 	addi	r3,r3,-$BNSZ
 	addi	r4,r4,-$BNSZ
 					#mul_add(rp[0],ap[0],w,c1);
@@ -2036,7 +1946,7 @@ Lppcasm_maw_leftover:
 	addze	r12,r10
 	$ST	r9,0(r3)
 	
-	bc	BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios
+	bdz	Lppcasm_maw_adios
 					#mul_add(rp[1],ap[1],w,c1);
 	$LDU	r8,$BNSZ(r4)	
 	$UMULL	r9,r6,r8
@@ -2048,7 +1958,7 @@ Lppcasm_maw_leftover:
 	addze	r12,r10
 	$ST	r9,0(r3)
 	
-	bc	BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios
+	bdz	Lppcasm_maw_adios
 					#mul_add(rp[2],ap[2],w,c1);
 	$LDU	r8,$BNSZ(r4)
 	$UMULL	r9,r6,r8
@@ -2062,17 +1972,10 @@ Lppcasm_maw_leftover:
 		
 Lppcasm_maw_adios:	
 	addi	r3,r12,0
-	bclr	BO_ALWAYS,CR0_LT
+	blr
 	.long	0x00000000
 	.align	4
 EOF
-	$data =~ s/\`([^\`]*)\`/eval $1/gem;
-
-	# if some assembler chokes on some simplified mnemonic,
-	# this is the spot to fix it up, e.g.:
-	# GNU as doesn't seem to accept cmplw, 32-bit unsigned compare
-	$data =~ s/^(\s*)cmplw(\s+)([^,]+),(.*)/$1cmpl$2$3,0,$4/gm;
-	# assembler X doesn't accept li, load immediate value
-	#$data =~ s/^(\s*)li(\s+)([^,]+),(.*)/$1addi$2$3,0,$4/gm;
-	return($data);
-}
+$data =~ s/\`([^\`]*)\`/eval $1/gem;
+print $data;
+close STDOUT;
diff --git a/openssl/crypto/bn/asm/sparcv8plus.S b/openssl/crypto/bn/asm/sparcv8plus.S
index 8c56e2e7e..63de1860f 100644
--- a/openssl/crypto/bn/asm/sparcv8plus.S
+++ b/openssl/crypto/bn/asm/sparcv8plus.S
@@ -144,6 +144,19 @@
  *	    }
  */
 
+#if defined(__SUNPRO_C) && defined(__sparcv9)
+  /* They've said -xarch=v9 at command line */
+  .register	%g2,#scratch
+  .register	%g3,#scratch
+# define	FRAME_SIZE	-192
+#elif defined(__GNUC__) && defined(__arch64__)
+  /* They've said -m64 at command line */
+  .register	%g2,#scratch
+  .register	%g3,#scratch
+# define	FRAME_SIZE	-192
+#else 
+# define	FRAME_SIZE	-96
+#endif 
 /*
  * GNU assembler can't stand stuw:-(
  */
@@ -619,8 +632,6 @@ bn_sub_words:
  *							Andy.
  */
 
-#define FRAME_SIZE	-96
-
 /*
  * Here is register usage map for *all* routines below.
  */
diff --git a/openssl/crypto/bn/asm/x86_64-gcc.c b/openssl/crypto/bn/asm/x86_64-gcc.c
index f13f52dd8..acb0b4011 100644
--- a/openssl/crypto/bn/asm/x86_64-gcc.c
+++ b/openssl/crypto/bn/asm/x86_64-gcc.c
@@ -1,4 +1,5 @@
-#ifdef __SUNPRO_C
+#include "../bn_lcl.h"
+#if !(defined(__GNUC__) && __GNUC__>=2)
 # include "../bn_asm.c"	/* kind of dirty hack for Sun Studio */
 #else
 /*
@@ -54,7 +55,15 @@
  *    machine.
  */
 
+#ifdef _WIN64
+#define BN_ULONG unsigned long long
+#else
 #define BN_ULONG unsigned long
+#endif
+
+#undef mul
+#undef mul_add
+#undef sqr
 
 /*
  * "m"(a), "+m"(r)	is the way to favor DirectPath µ-code;
@@ -97,7 +106,7 @@
 		: "a"(a)		\
 		: "cc");
 
-BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
 	{
 	BN_ULONG c1=0;
 
@@ -121,7 +130,7 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
 	return(c1);
 	} 
 
-BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
 	{
 	BN_ULONG c1=0;
 
@@ -144,7 +153,7 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
 	return(c1);
 	} 
 
-void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
+void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
         {
 	if (n <= 0) return;
 
@@ -175,14 +184,14 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 	return ret;
 }
 
-BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
+BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
 { BN_ULONG ret=0,i=0;
 
 	if (n <= 0) return 0;
 
 	asm (
 	"	subq	%2,%2		\n"
-	".align 16			\n"
+	".p2align 4			\n"
 	"1:	movq	(%4,%2,8),%0	\n"
 	"	adcq	(%5,%2,8),%0	\n"
 	"	movq	%0,(%3,%2,8)	\n"
@@ -198,14 +207,14 @@ BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
 }
 
 #ifndef SIMICS
-BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
+BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
 { BN_ULONG ret=0,i=0;
 
 	if (n <= 0) return 0;
 
 	asm (
 	"	subq	%2,%2		\n"
-	".align 16			\n"
+	".p2align 4			\n"
 	"1:	movq	(%4,%2,8),%0	\n"
 	"	sbbq	(%5,%2,8),%0	\n"
 	"	movq	%0,(%3,%2,8)	\n"
@@ -485,7 +494,7 @@ void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 	r[7]=c2;
 	}
 
-void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
 	{
 	BN_ULONG t1,t2;
 	BN_ULONG c1,c2,c3;
@@ -561,7 +570,7 @@ void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
 	r[15]=c1;
 	}
 
-void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
 	{
 	BN_ULONG t1,t2;
 	BN_ULONG c1,c2,c3;
diff --git a/openssl/crypto/bn/asm/x86_64-mont.pl b/openssl/crypto/bn/asm/x86_64-mont.pl
index c43b69592..3b7a6f243 100644
--- a/openssl/crypto/bn/asm/x86_64-mont.pl
+++ b/openssl/crypto/bn/asm/x86_64-mont.pl
@@ -15,14 +15,18 @@
 # respectful 50%. It remains to be seen if loop unrolling and
 # dedicated squaring routine can provide further improvement...
 
-$output=shift;
+$flavour = shift;	# first argument: assembler flavour (may be omitted)
+$output  = shift;	# second argument: output file name
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }	# a first argument containing a dot is taken as the output file
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);	# gates emission of the SEH handler and .pdata/.xdata below
 
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";
 
-open STDOUT,"| $^X $xlate $output";
+open STDOUT,"| $^X $xlate $flavour $output";
 
 # int bn_mul_mont(
 $rp="%rdi";	# BN_ULONG *rp,
@@ -55,13 +59,14 @@ bn_mul_mont:
 	push	%r15
 
 	mov	${num}d,${num}d
-	lea	2($num),%rax
-	mov	%rsp,%rbp
-	neg	%rax
-	lea	(%rsp,%rax,8),%rsp	# tp=alloca(8*(num+2))
+	lea	2($num),%r10
+	mov	%rsp,%r11
+	neg	%r10
+	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+2))
 	and	\$-1024,%rsp		# minimize TLB usage
 
-	mov	%rbp,8(%rsp,$num,8)	# tp[num+1]=%rsp
+	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
+.Lprologue:
 	mov	%rdx,$bp		# $bp reassigned, remember?
 
 	mov	($n0),$n0		# pull n0[0] value
@@ -197,18 +202,129 @@ bn_mul_mont:
 	dec	$j
 	jge	.Lcopy
 
-	mov	8(%rsp,$num,8),%rsp	# restore %rsp
+	mov	8(%rsp,$num,8),%rsi	# %rsi = stack pointer saved in tp[num+1]
 	mov	\$1,%rax
+	mov	(%rsi),%r15		# reload saved registers relative to %rsi
+	mov	8(%rsi),%r14
+	mov	16(%rsi),%r13
+	mov	24(%rsi),%r12
+	mov	32(%rsi),%rbp
+	mov	40(%rsi),%rbx
+	lea	48(%rsi),%rsp		# restore %rsp, stepping over the 6 saved registers
+.Lepilogue:
+	ret
+.size	bn_mul_mont,.-bn_mul_mont
+.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align	16
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	se_handler,\@abi-omnipotent
+.align	16
+se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp		# 32 bytes of shadow space + stack slots for arg5..arg8 of the call below
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	lea	.Lprologue(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<.Lprologue
+	jb	.Lin_prologue
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	lea	.Lepilogue(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
+	jae	.Lin_prologue		# registers and %rsp were already restored before .Lepilogue
+
+	mov	192($context),%r10	# pull $num
+	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer, i.e. tp[num+1]
+	lea	48(%rax),%rax		# step over the 6 saved registers, as the bn_mul_mont epilogue does
+
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r12
+	mov	-32(%rax),%r13
+	mov	-40(%rax),%r14
+	mov	-48(%rax),%r15
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+.Lin_prologue:
+	mov	8(%rax),%rdi		# NOTE(review): presumably %rdi/%rsi as spilled by the xlate-generated Win64 entry code - confirm
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords, count for rep movsq
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
 	pop	%r15
 	pop	%r14
 	pop	%r13
 	pop	%r12
 	pop	%rbp
 	pop	%rbx
+	pop	%rdi
+	pop	%rsi
 	ret
-.size	bn_mul_mont,.-bn_mul_mont
-.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.size	se_handler,.-se_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_bn_mul_mont
+	.rva	.LSEH_end_bn_mul_mont
+	.rva	.LSEH_info_bn_mul_mont
+
+.section	.xdata
+.align	8
+.LSEH_info_bn_mul_mont:
+	.byte	9,0,0,0
+	.rva	se_handler
 ___
+}
 
 print $code;
 close STDOUT;