From 462f18c7b25fe3e467f837647d07ab0a78aa8d2b Mon Sep 17 00:00:00 2001
From: marha <marha@users.sourceforge.net>
Date: Sun, 22 Feb 2015 21:39:56 +0100
Subject: Merged origin/release (checked in because wanted to merge new stuff)

---
 openssl/crypto/aes/asm/aes-586.pl | 283 +++++++++++++++++++-------------------
 1 file changed, 145 insertions(+), 138 deletions(-)

(limited to 'openssl/crypto/aes/asm/aes-586.pl')

diff --git a/openssl/crypto/aes/asm/aes-586.pl b/openssl/crypto/aes/asm/aes-586.pl
index 687ed811b..451d0e0ed 100644
--- a/openssl/crypto/aes/asm/aes-586.pl
+++ b/openssl/crypto/aes/asm/aes-586.pl
@@ -39,7 +39,7 @@
 # but exhibits up to 10% improvement on other cores.
 #
 # Second version is "monolithic" replacement for aes_core.c, which in
-# addition to AES_[de|en]crypt implements private_AES_set_[de|en]cryption_key.
+# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
 # This made it possible to implement little-endian variant of the
 # algorithm without modifying the base C code. Motivating factor for
 # the undertaken effort was that it appeared that in tight IA-32
@@ -103,11 +103,12 @@
 # byte for 128-bit key.
 #
 #		ECB encrypt	ECB decrypt	CBC large chunk
-# P4		56[60]		84[100]		23
-# AMD K8	48[44]		70[79]		18
-# PIII		41[50]		61[91]		24
-# Core 2	32[38]		45[70]		18.5
-# Pentium	120		160		77
+# P4		52[54]		83[95]		23
+# AMD K8	46[41]		66[70]		18
+# PIII		41[50]		60[77]		24
+# Core 2	31[36]		45[64]		18.5
+# Atom		76[100]		96[138]		60
+# Pentium	115		150		77
 #
 # Version 4.1 switches to compact S-box even in key schedule setup.
 #
@@ -242,7 +243,7 @@ $vertical_spin=0;	# shift "verticaly" defaults to 0, because of
 
 sub encvert()
 { my ($te,@s) = @_;
-  my $v0 = $acc, $v1 = $key;
+  my ($v0,$v1) = ($acc,$key);
 
 	&mov	($v0,$s[3]);				# copy s3
 	&mov	(&DWP(4,"esp"),$s[2]);			# save s2
@@ -299,7 +300,7 @@ sub encvert()
 # Another experimental routine, which features "horizontal spin," but
 # eliminates one reference to stack. Strangely enough runs slower...
 sub enchoriz()
-{ my $v0 = $key, $v1 = $acc;
+{ my ($v0,$v1) = ($key,$acc);
 
 	&movz	($v0,&LB($s0));			#  3, 2, 1, 0*
 	&rotr	($s2,8);			#  8,11,10, 9
@@ -427,7 +428,7 @@ sub sse_encbody()
 ######################################################################
 
 sub enccompact()
-{ my $Fn = mov;
+{ my $Fn = \&mov;
   while ($#_>5) { pop(@_); $Fn=sub{}; }
   my ($i,$te,@s)=@_;
   my $tmp = $key;
@@ -476,24 +477,25 @@ sub enctransform()
   my $tmp = $tbl;
   my $r2  = $key ;
 
-	&mov	($acc,$s[$i]);
-	&and	($acc,0x80808080);
-	&mov	($tmp,$acc);
-	&shr	($tmp,7);
+	&and	($tmp,$s[$i]);
 	&lea	($r2,&DWP(0,$s[$i],$s[$i]));
-	&sub	($acc,$tmp);
+	&mov	($acc,$tmp);
+	&shr	($tmp,7);
 	&and	($r2,0xfefefefe);
-	&and	($acc,0x1b1b1b1b);
+	&sub	($acc,$tmp);
 	&mov	($tmp,$s[$i]);
+	&and	($acc,0x1b1b1b1b);
+	&rotr	($tmp,16);
 	&xor	($acc,$r2);	# r2
+	&mov	($r2,$s[$i]);
 
 	&xor	($s[$i],$acc);	# r0 ^ r2
+	&rotr	($r2,16+8);
+	&xor	($acc,$tmp);
 	&rotl	($s[$i],24);
-	&xor	($s[$i],$acc)	# ROTATE(r2^r0,24) ^ r2
-	&rotr	($tmp,16);
-	&xor	($s[$i],$tmp);
-	&rotr	($tmp,8);
-	&xor	($s[$i],$tmp);
+	&xor	($acc,$r2);
+	&mov	($tmp,0x80808080)	if ($i!=1);
+	&xor	($s[$i],$acc);	# ROTATE(r2^r0,24) ^ r2
 }
 
 &function_begin_B("_x86_AES_encrypt_compact");
@@ -526,6 +528,7 @@ sub enctransform()
 		&enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
 		&enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
 		&enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
+		&mov	($tbl,0x80808080);
 		&enctransform(2);
 		&enctransform(3);
 		&enctransform(0);
@@ -607,82 +610,84 @@ sub sse_enccompact()
 	&pshufw	("mm5","mm4",0x0d);		# 15,14,11,10
 	&movd	("eax","mm1");			#  5, 4, 1, 0
 	&movd	("ebx","mm5");			# 15,14,11,10
+	&mov	($__key,$key);
 
 	&movz	($acc,&LB("eax"));		#  0
-	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  0
-	&pshufw	("mm2","mm0",0x0d);		#  7, 6, 3, 2
 	&movz	("edx",&HB("eax"));		#  1
+	&pshufw	("mm2","mm0",0x0d);		#  7, 6, 3, 2
+	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  0
+	&movz	($key,&LB("ebx"));		# 10
 	&movz	("edx",&BP(-128,$tbl,"edx",1));	#  1
-	&shl	("edx",8);			#  1
 	&shr	("eax",16);			#  5, 4
+	&shl	("edx",8);			#  1
 
-	&movz	($acc,&LB("ebx"));		# 10
-	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 10
+	&movz	($acc,&BP(-128,$tbl,$key,1));	# 10
+	&movz	($key,&HB("ebx"));		# 11
 	&shl	($acc,16);			# 10
-	&or	("ecx",$acc);			# 10
 	&pshufw	("mm6","mm4",0x08);		# 13,12, 9, 8
-	&movz	($acc,&HB("ebx"));		# 11
-	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 11
+	&or	("ecx",$acc);			# 10
+	&movz	($acc,&BP(-128,$tbl,$key,1));	# 11
+	&movz	($key,&HB("eax"));		#  5
 	&shl	($acc,24);			# 11
-	&or	("edx",$acc);			# 11
 	&shr	("ebx",16);			# 15,14
+	&or	("edx",$acc);			# 11
 
-	&movz	($acc,&HB("eax"));		#  5
-	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  5
+	&movz	($acc,&BP(-128,$tbl,$key,1));	#  5
+	&movz	($key,&HB("ebx"));		# 15
 	&shl	($acc,8);			#  5
 	&or	("ecx",$acc);			#  5
-	&movz	($acc,&HB("ebx"));		# 15
-	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 15
+	&movz	($acc,&BP(-128,$tbl,$key,1));	# 15
+	&movz	($key,&LB("eax"));		#  4
 	&shl	($acc,24);			# 15
 	&or	("ecx",$acc);			# 15
-	&movd	("mm0","ecx");			# t[0] collected
 
-	&movz	($acc,&LB("eax"));		#  4
-	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  4
+	&movz	($acc,&BP(-128,$tbl,$key,1));	#  4
+	&movz	($key,&LB("ebx"));		# 14
 	&movd	("eax","mm2");			#  7, 6, 3, 2
-	&movz	($acc,&LB("ebx"));		# 14
-	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 14
-	&shl	($acc,16);			# 14
+	&movd	("mm0","ecx");			# t[0] collected
+	&movz	("ecx",&BP(-128,$tbl,$key,1));	# 14
+	&movz	($key,&HB("eax"));		#  3
+	&shl	("ecx",16);			# 14
+	&movd	("ebx","mm6");			# 13,12, 9, 8
 	&or	("ecx",$acc);			# 14
 
-	&movd	("ebx","mm6");			# 13,12, 9, 8
-	&movz	($acc,&HB("eax"));		#  3
-	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  3
+	&movz	($acc,&BP(-128,$tbl,$key,1));	#  3
+	&movz	($key,&HB("ebx"));		#  9
 	&shl	($acc,24);			#  3
 	&or	("ecx",$acc);			#  3
-	&movz	($acc,&HB("ebx"));		#  9
-	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  9
+	&movz	($acc,&BP(-128,$tbl,$key,1));	#  9
+	&movz	($key,&LB("ebx"));		#  8
 	&shl	($acc,8);			#  9
+	&shr	("ebx",16);			# 13,12
 	&or	("ecx",$acc);			#  9
-	&movd	("mm1","ecx");			# t[1] collected
 
-	&movz	($acc,&LB("ebx"));		#  8
-	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  8
-	&shr	("ebx",16);			# 13,12
-	&movz	($acc,&LB("eax"));		#  2
-	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  2
-	&shl	($acc,16);			#  2
-	&or	("ecx",$acc);			#  2
+	&movz	($acc,&BP(-128,$tbl,$key,1));	#  8
+	&movz	($key,&LB("eax"));		#  2
 	&shr	("eax",16);			#  7, 6
+	&movd	("mm1","ecx");			# t[1] collected
+	&movz	("ecx",&BP(-128,$tbl,$key,1));	#  2
+	&movz	($key,&HB("eax"));		#  7
+	&shl	("ecx",16);			#  2
+	&and	("eax",0xff);			#  6
+	&or	("ecx",$acc);			#  2
 
 	&punpckldq	("mm0","mm1");		# t[0,1] collected
 
-	&movz	($acc,&HB("eax"));		#  7
-	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  7
+	&movz	($acc,&BP(-128,$tbl,$key,1));	#  7
+	&movz	($key,&HB("ebx"));		# 13
 	&shl	($acc,24);			#  7
-	&or	("ecx",$acc);			#  7
-	&and	("eax",0xff);			#  6
+	&and	("ebx",0xff);			# 12
 	&movz	("eax",&BP(-128,$tbl,"eax",1));	#  6
+	&or	("ecx",$acc);			#  7
 	&shl	("eax",16);			#  6
+	&movz	($acc,&BP(-128,$tbl,$key,1));	# 13
 	&or	("edx","eax");			#  6
-	&movz	($acc,&HB("ebx"));		# 13
-	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 13
 	&shl	($acc,8);			# 13
-	&or	("ecx",$acc);			# 13
-	&movd	("mm4","ecx");			# t[2] collected
-	&and	("ebx",0xff);			# 12
 	&movz	("ebx",&BP(-128,$tbl,"ebx",1));	# 12
+	&or	("ecx",$acc);			# 13
 	&or	("edx","ebx");			# 12
+	&mov	($key,$__key);
+	&movd	("mm4","ecx");			# t[2] collected
 	&movd	("mm5","edx");			# t[3] collected
 
 	&punpckldq	("mm4","mm5");		# t[2,3] collected
@@ -1222,7 +1227,7 @@ sub enclast()
 ######################################################################
 
 sub deccompact()
-{ my $Fn = mov;
+{ my $Fn = \&mov;
   while ($#_>5) { pop(@_); $Fn=sub{}; }
   my ($i,$td,@s)=@_;
   my $tmp = $key;
@@ -1270,30 +1275,30 @@ sub dectransform()
   my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1);
   my $tp8 = $tbl;
 
-	&mov	($acc,$s[$i]);
-	&and	($acc,0x80808080);
-	&mov	($tmp,$acc);
+	&mov	($tmp,0x80808080);
+	&and	($tmp,$s[$i]);
+	&mov	($acc,$tmp);
 	&shr	($tmp,7);
 	&lea	($tp2,&DWP(0,$s[$i],$s[$i]));
 	&sub	($acc,$tmp);
 	&and	($tp2,0xfefefefe);
 	&and	($acc,0x1b1b1b1b);
-	&xor	($acc,$tp2);
-	&mov	($tp2,$acc);
+	&xor	($tp2,$acc);
+	&mov	($tmp,0x80808080);
 
-	&and	($acc,0x80808080);
-	&mov	($tmp,$acc);
+	&and	($tmp,$tp2);
+	&mov	($acc,$tmp);
 	&shr	($tmp,7);
 	&lea	($tp4,&DWP(0,$tp2,$tp2));
 	&sub	($acc,$tmp);
 	&and	($tp4,0xfefefefe);
 	&and	($acc,0x1b1b1b1b);
 	 &xor	($tp2,$s[$i]);	# tp2^tp1
-	&xor	($acc,$tp4);
-	&mov	($tp4,$acc);
+	&xor	($tp4,$acc);
+	&mov	($tmp,0x80808080);
 
-	&and	($acc,0x80808080);
-	&mov	($tmp,$acc);
+	&and	($tmp,$tp4);
+	&mov	($acc,$tmp);
 	&shr	($tmp,7);
 	&lea	($tp8,&DWP(0,$tp4,$tp4));
 	&sub	($acc,$tmp);
@@ -1305,13 +1310,13 @@ sub dectransform()
 
 	&xor	($s[$i],$tp2);
 	&xor	($tp2,$tp8);
-	&rotl	($tp2,24);
 	&xor	($s[$i],$tp4);
 	&xor	($tp4,$tp8);
-	&rotl	($tp4,16);
+	&rotl	($tp2,24);
 	&xor	($s[$i],$tp8);	# ^= tp8^(tp4^tp1)^(tp2^tp1)
-	&rotl	($tp8,8);
+	&rotl	($tp4,16);
 	&xor	($s[$i],$tp2);	# ^= ROTATE(tp8^tp2^tp1,24)
+	&rotl	($tp8,8);
 	&xor	($s[$i],$tp4);	# ^= ROTATE(tp8^tp4^tp1,16)
 	 &mov	($s[0],$__s0)			if($i==2); #prefetch $s0
 	 &mov	($s[1],$__s1)			if($i==3); #prefetch $s1
@@ -1389,85 +1394,87 @@ sub dectransform()
 sub sse_deccompact()
 {
 	&pshufw	("mm1","mm0",0x0c);		#  7, 6, 1, 0
+	&pshufw	("mm5","mm4",0x09);		# 13,12,11,10
 	&movd	("eax","mm1");			#  7, 6, 1, 0
+	&movd	("ebx","mm5");			# 13,12,11,10
+	&mov	($__key,$key);
 
-	&pshufw	("mm5","mm4",0x09);		# 13,12,11,10
 	&movz	($acc,&LB("eax"));		#  0
-	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  0
-	&movd	("ebx","mm5");			# 13,12,11,10
 	&movz	("edx",&HB("eax"));		#  1
+	&pshufw	("mm2","mm0",0x06);		#  3, 2, 5, 4
+	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  0
+	&movz	($key,&LB("ebx"));		# 10
 	&movz	("edx",&BP(-128,$tbl,"edx",1));	#  1
+	&shr	("eax",16);			#  7, 6
 	&shl	("edx",8);			#  1
 
-	&pshufw	("mm2","mm0",0x06);		#  3, 2, 5, 4
-	&movz	($acc,&LB("ebx"));		# 10
-	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 10
+	&movz	($acc,&BP(-128,$tbl,$key,1));	# 10
+	&movz	($key,&HB("ebx"));		# 11
 	&shl	($acc,16);			# 10
+	&pshufw	("mm6","mm4",0x03);		# 9, 8,15,14
 	&or	("ecx",$acc);			# 10
-	&shr	("eax",16);			#  7, 6
-	&movz	($acc,&HB("ebx"));		# 11
-	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 11
+	&movz	($acc,&BP(-128,$tbl,$key,1));	# 11
+	&movz	($key,&HB("eax"));		#  7
 	&shl	($acc,24);			# 11
-	&or	("edx",$acc);			# 11
 	&shr	("ebx",16);			# 13,12
+	&or	("edx",$acc);			# 11
 
-	&pshufw	("mm6","mm4",0x03);		# 9, 8,15,14
-	&movz	($acc,&HB("eax"));		#  7
-	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  7
+	&movz	($acc,&BP(-128,$tbl,$key,1));	#  7
+	&movz	($key,&HB("ebx"));		# 13
 	&shl	($acc,24);			#  7
 	&or	("ecx",$acc);			#  7
-	&movz	($acc,&HB("ebx"));		# 13
-	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 13
+	&movz	($acc,&BP(-128,$tbl,$key,1));	# 13
+	&movz	($key,&LB("eax"));		#  6
 	&shl	($acc,8);			# 13
+	&movd	("eax","mm2");			#  3, 2, 5, 4
 	&or	("ecx",$acc);			# 13
-	&movd	("mm0","ecx");			# t[0] collected
 
-	&movz	($acc,&LB("eax"));		#  6
-	&movd	("eax","mm2");			#  3, 2, 5, 4
-	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  6
-	&shl	("ecx",16);			#  6
-	&movz	($acc,&LB("ebx"));		# 12
+	&movz	($acc,&BP(-128,$tbl,$key,1));	#  6
+	&movz	($key,&LB("ebx"));		# 12
+	&shl	($acc,16);			#  6
 	&movd	("ebx","mm6");			#  9, 8,15,14
-	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 12
+	&movd	("mm0","ecx");			# t[0] collected
+	&movz	("ecx",&BP(-128,$tbl,$key,1));	# 12
+	&movz	($key,&LB("eax"));		#  4
 	&or	("ecx",$acc);			# 12
 
-	&movz	($acc,&LB("eax"));		#  4
-	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  4
+	&movz	($acc,&BP(-128,$tbl,$key,1));	#  4
+	&movz	($key,&LB("ebx"));		# 14
 	&or	("edx",$acc);			#  4
-	&movz	($acc,&LB("ebx"));		# 14
-	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 14
+	&movz	($acc,&BP(-128,$tbl,$key,1));	# 14
+	&movz	($key,&HB("eax"));		#  5
 	&shl	($acc,16);			# 14
+	&shr	("eax",16);			#  3, 2
 	&or	("edx",$acc);			# 14
-	&movd	("mm1","edx");			# t[1] collected
 
-	&movz	($acc,&HB("eax"));		#  5
-	&movz	("edx",&BP(-128,$tbl,$acc,1));	#  5
-	&shl	("edx",8);			#  5
-	&movz	($acc,&HB("ebx"));		# 15
-	&shr	("eax",16);			#  3, 2
-	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 15
-	&shl	($acc,24);			# 15
-	&or	("edx",$acc);			# 15
+	&movz	($acc,&BP(-128,$tbl,$key,1));	#  5
+	&movz	($key,&HB("ebx"));		# 15
 	&shr	("ebx",16);			#  9, 8
+	&shl	($acc,8);			#  5
+	&movd	("mm1","edx");			# t[1] collected
+	&movz	("edx",&BP(-128,$tbl,$key,1));	# 15
+	&movz	($key,&HB("ebx"));		#  9
+	&shl	("edx",24);			# 15
+	&and	("ebx",0xff);			#  8
+	&or	("edx",$acc);			# 15
 
 	&punpckldq	("mm0","mm1");		# t[0,1] collected
 
-	&movz	($acc,&HB("ebx"));		#  9
-	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  9
+	&movz	($acc,&BP(-128,$tbl,$key,1));	#  9
+	&movz	($key,&LB("eax"));		#  2
 	&shl	($acc,8);			#  9
-	&or	("ecx",$acc);			#  9
-	&and	("ebx",0xff);			#  8
+	&movz	("eax",&HB("eax"));		#  3
 	&movz	("ebx",&BP(-128,$tbl,"ebx",1));	#  8
+	&or	("ecx",$acc);			#  9
+	&movz	($acc,&BP(-128,$tbl,$key,1));	#  2
 	&or	("edx","ebx");			#  8
-	&movz	($acc,&LB("eax"));		#  2
-	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  2
 	&shl	($acc,16);			#  2
-	&or	("edx",$acc);			#  2
-	&movd	("mm4","edx");			# t[2] collected
-	&movz	("eax",&HB("eax"));		#  3
 	&movz	("eax",&BP(-128,$tbl,"eax",1));	#  3
+	&or	("edx",$acc);			#  2
 	&shl	("eax",24);			#  3
 	&or	("ecx","eax");			#  3
+	&mov	($key,$__key);
+	&movd	("mm4","edx");			# t[2] collected
 	&movd	("mm5","ecx");			# t[3] collected
 
 	&punpckldq	("mm4","mm5");		# t[2,3] collected
@@ -2181,8 +2188,8 @@ my $mark=&DWP(76+240,"esp");	# copy of aes_key->rounds
 	&mov	("ecx",240/4);
 	&xor	("eax","eax");
 	&align	(4);
-	&data_word(0xABF3F689);	# rep stosd
-	&set_label("skip_ezero")
+	&data_word(0xABF3F689);		# rep stosd
+	&set_label("skip_ezero");
 	&mov	("esp",$_esp);
 	&popf	();
     &set_label("drop_out");
@@ -2301,8 +2308,8 @@ my $mark=&DWP(76+240,"esp");	# copy of aes_key->rounds
 	&mov	("ecx",240/4);
 	&xor	("eax","eax");
 	&align	(4);
-	&data_word(0xABF3F689);	# rep stosd
-	&set_label("skip_dzero")
+	&data_word(0xABF3F689);		# rep stosd
+	&set_label("skip_dzero");
 	&mov	("esp",$_esp);
 	&popf	();
 	&function_end_A();
@@ -2865,32 +2872,32 @@ sub deckey()
 { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
   my $tmp = $tbl;
 
-	&mov	($acc,$tp1);
-	&and	($acc,0x80808080);
-	&mov	($tmp,$acc);
-	&shr	($tmp,7);
+	&mov	($tmp,0x80808080);
+	&and	($tmp,$tp1);
 	&lea	($tp2,&DWP(0,$tp1,$tp1));
+	&mov	($acc,$tmp);
+	&shr	($tmp,7);
 	&sub	($acc,$tmp);
 	&and	($tp2,0xfefefefe);
 	&and	($acc,0x1b1b1b1b);
-	&xor	($acc,$tp2);
-	&mov	($tp2,$acc);
+	&xor	($tp2,$acc);
+	&mov	($tmp,0x80808080);
 
-	&and	($acc,0x80808080);
-	&mov	($tmp,$acc);
-	&shr	($tmp,7);
+	&and	($tmp,$tp2);
 	&lea	($tp4,&DWP(0,$tp2,$tp2));
+	&mov	($acc,$tmp);
+	&shr	($tmp,7);
 	&sub	($acc,$tmp);
 	&and	($tp4,0xfefefefe);
 	&and	($acc,0x1b1b1b1b);
 	 &xor	($tp2,$tp1);	# tp2^tp1
-	&xor	($acc,$tp4);
-	&mov	($tp4,$acc);
+	&xor	($tp4,$acc);
+	&mov	($tmp,0x80808080);
 
-	&and	($acc,0x80808080);
-	&mov	($tmp,$acc);
-	&shr	($tmp,7);
+	&and	($tmp,$tp4);
 	&lea	($tp8,&DWP(0,$tp4,$tp4));
+	&mov	($acc,$tmp);
+	&shr	($tmp,7);
 	 &xor	($tp4,$tp1);	# tp4^tp1
 	&sub	($acc,$tmp);
 	&and	($tp8,0xfefefefe);
-- 
cgit v1.2.3