From c9aad1ae6227c434d480d1d3aa8eae3c3c910c18 Mon Sep 17 00:00:00 2001 From: marha Date: Sun, 22 Feb 2015 14:43:31 +0100 Subject: Upgraded to openssl-1.0.2 --- openssl/crypto/aes/asm/aes-586.pl | 283 +-- openssl/crypto/aes/asm/aes-armv4.pl | 139 +- openssl/crypto/aes/asm/aes-mips.pl | 815 ++++++-- openssl/crypto/aes/asm/aes-ppc.pl | 113 +- openssl/crypto/aes/asm/aes-x86_64.pl | 250 ++- openssl/crypto/aes/asm/aesni-mb-x86_64.pl | 1395 ++++++++++++++ openssl/crypto/aes/asm/aesni-sha1-x86_64.pl | 1121 +++++++++-- openssl/crypto/aes/asm/aesni-sha256-x86_64.pl | 1708 +++++++++++++++++ openssl/crypto/aes/asm/aesni-x86.pl | 331 ++-- openssl/crypto/aes/asm/aesni-x86_64.pl | 2138 ++++++++++++--------- openssl/crypto/aes/asm/aesp8-ppc.pl | 1942 +++++++++++++++++++ openssl/crypto/aes/asm/aest4-sparcv9.pl | 919 +++++++++ openssl/crypto/aes/asm/aesv8-armx.pl | 962 ++++++++++ openssl/crypto/aes/asm/bsaes-armv7.pl | 2469 +++++++++++++++++++++++++ openssl/crypto/aes/asm/bsaes-x86_64.pl | 74 +- openssl/crypto/aes/asm/vpaes-ppc.pl | 1512 +++++++++++++++ openssl/crypto/aes/asm/vpaes-x86.pl | 110 +- openssl/crypto/aes/asm/vpaes-x86_64.pl | 100 +- 18 files changed, 14649 insertions(+), 1732 deletions(-) create mode 100755 openssl/crypto/aes/asm/aesni-mb-x86_64.pl create mode 100755 openssl/crypto/aes/asm/aesni-sha256-x86_64.pl create mode 100755 openssl/crypto/aes/asm/aesp8-ppc.pl create mode 100755 openssl/crypto/aes/asm/aest4-sparcv9.pl create mode 100755 openssl/crypto/aes/asm/aesv8-armx.pl create mode 100755 openssl/crypto/aes/asm/bsaes-armv7.pl create mode 100755 openssl/crypto/aes/asm/vpaes-ppc.pl (limited to 'openssl/crypto/aes/asm') diff --git a/openssl/crypto/aes/asm/aes-586.pl b/openssl/crypto/aes/asm/aes-586.pl index 687ed811b..451d0e0ed 100644 --- a/openssl/crypto/aes/asm/aes-586.pl +++ b/openssl/crypto/aes/asm/aes-586.pl @@ -39,7 +39,7 @@ # but exhibits up to 10% improvement on other cores. # # Second version is "monolithic" replacement for aes_core.c, which in -# addition to AES_[de|en]crypt implements private_AES_set_[de|en]cryption_key. +# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key. # This made it possible to implement little-endian variant of the # algorithm without modifying the base C code. Motivating factor for # the undertaken effort was that it appeared that in tight IA-32 @@ -103,11 +103,12 @@ # byte for 128-bit key. # # ECB encrypt ECB decrypt CBC large chunk -# P4 56[60] 84[100] 23 -# AMD K8 48[44] 70[79] 18 -# PIII 41[50] 61[91] 24 -# Core 2 32[38] 45[70] 18.5 -# Pentium 120 160 77 +# P4 52[54] 83[95] 23 +# AMD K8 46[41] 66[70] 18 +# PIII 41[50] 60[77] 24 +# Core 2 31[36] 45[64] 18.5 +# Atom 76[100] 96[138] 60 +# Pentium 115 150 77 # # Version 4.1 switches to compact S-box even in key schedule setup. # @@ -242,7 +243,7 @@ $vertical_spin=0; # shift "verticaly" defaults to 0, because of sub encvert() { my ($te,@s) = @_; - my $v0 = $acc, $v1 = $key; + my ($v0,$v1) = ($acc,$key); &mov ($v0,$s[3]); # copy s3 &mov (&DWP(4,"esp"),$s[2]); # save s2 @@ -299,7 +300,7 @@ sub encvert() # Another experimental routine, which features "horizontal spin," but # eliminates one reference to stack. Strangely enough runs slower... sub enchoriz() -{ my $v0 = $key, $v1 = $acc; +{ my ($v0,$v1) = ($key,$acc); &movz ($v0,&LB($s0)); # 3, 2, 1, 0* &rotr ($s2,8); # 8,11,10, 9 @@ -427,7 +428,7 @@ sub sse_encbody() ###################################################################### sub enccompact() -{ my $Fn = mov; +{ my $Fn = \&mov; while ($#_>5) { pop(@_); $Fn=sub{}; } my ($i,$te,@s)=@_; my $tmp = $key; @@ -476,24 +477,25 @@ sub enctransform() my $tmp = $tbl; my $r2 = $key ; - &mov ($acc,$s[$i]); - &and ($acc,0x80808080); - &mov ($tmp,$acc); - &shr ($tmp,7); + &and ($tmp,$s[$i]); &lea ($r2,&DWP(0,$s[$i],$s[$i])); - &sub ($acc,$tmp); + &mov ($acc,$tmp); + &shr ($tmp,7); &and ($r2,0xfefefefe); - &and ($acc,0x1b1b1b1b); + &sub ($acc,$tmp); &mov ($tmp,$s[$i]); + &and ($acc,0x1b1b1b1b); + &rotr ($tmp,16); &xor ($acc,$r2); # r2 + &mov ($r2,$s[$i]); &xor ($s[$i],$acc); # r0 ^ r2 + &rotr ($r2,16+8); + &xor ($acc,$tmp); &rotl ($s[$i],24); - &xor ($s[$i],$acc) # ROTATE(r2^r0,24) ^ r2 - &rotr ($tmp,16); - &xor ($s[$i],$tmp); - &rotr ($tmp,8); - &xor ($s[$i],$tmp); + &xor ($acc,$r2); + &mov ($tmp,0x80808080) if ($i!=1); + &xor ($s[$i],$acc); # ROTATE(r2^r0,24) ^ r2 } &function_begin_B("_x86_AES_encrypt_compact"); @@ -526,6 +528,7 @@ sub enctransform() &enccompact(1,$tbl,$s1,$s2,$s3,$s0,1); &enccompact(2,$tbl,$s2,$s3,$s0,$s1,1); &enccompact(3,$tbl,$s3,$s0,$s1,$s2,1); + &mov ($tbl,0x80808080); &enctransform(2); &enctransform(3); &enctransform(0); @@ -607,82 +610,84 @@ sub sse_enccompact() &pshufw ("mm5","mm4",0x0d); # 15,14,11,10 &movd ("eax","mm1"); # 5, 4, 1, 0 &movd ("ebx","mm5"); # 15,14,11,10 + &mov ($__key,$key); &movz ($acc,&LB("eax")); # 0 - &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 - &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2 &movz ("edx",&HB("eax")); # 1 + &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2 + &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 + &movz ($key,&LB("ebx")); # 10 &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1 - &shl ("edx",8); # 1 &shr ("eax",16); # 5, 4 + &shl ("edx",8); # 1 - &movz ($acc,&LB("ebx")); # 10 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 10 + &movz ($key,&HB("ebx")); # 11 &shl ($acc,16); # 10 - &or ("ecx",$acc); # 10 &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8 - &movz ($acc,&HB("ebx")); # 11 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11 + &or ("ecx",$acc); # 10 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 11 + &movz ($key,&HB("eax")); # 5 &shl ($acc,24); # 11 - &or ("edx",$acc); # 11 &shr ("ebx",16); # 15,14 + &or ("edx",$acc); # 11 - &movz ($acc,&HB("eax")); # 5 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 5 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 5 + &movz ($key,&HB("ebx")); # 15 &shl ($acc,8); # 5 &or ("ecx",$acc); # 5 - &movz ($acc,&HB("ebx")); # 15 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 15 + &movz ($key,&LB("eax")); # 4 &shl ($acc,24); # 15 &or ("ecx",$acc); # 15 - &movd ("mm0","ecx"); # t[0] collected - &movz ($acc,&LB("eax")); # 4 - &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 4 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 4 + &movz ($key,&LB("ebx")); # 14 &movd ("eax","mm2"); # 7, 6, 3, 2 - &movz ($acc,&LB("ebx")); # 14 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14 - &shl ($acc,16); # 14 + &movd ("mm0","ecx"); # t[0] collected + &movz ("ecx",&BP(-128,$tbl,$key,1)); # 14 + &movz ($key,&HB("eax")); # 3 + &shl ("ecx",16); # 14 + &movd ("ebx","mm6"); # 13,12, 9, 8 &or ("ecx",$acc); # 14 - &movd ("ebx","mm6"); # 13,12, 9, 8 - &movz ($acc,&HB("eax")); # 3 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 3 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 3 + &movz ($key,&HB("ebx")); # 9 &shl ($acc,24); # 3 &or ("ecx",$acc); # 3 - &movz ($acc,&HB("ebx")); # 9 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 9 + &movz ($key,&LB("ebx")); # 8 &shl ($acc,8); # 9 + &shr ("ebx",16); # 13,12 &or ("ecx",$acc); # 9 - &movd ("mm1","ecx"); # t[1] collected - &movz ($acc,&LB("ebx")); # 8 - &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 8 - &shr ("ebx",16); # 13,12 - &movz ($acc,&LB("eax")); # 2 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2 - &shl ($acc,16); # 2 - &or ("ecx",$acc); # 2 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 8 + &movz ($key,&LB("eax")); # 2 &shr ("eax",16); # 7, 6 + &movd ("mm1","ecx"); # t[1] collected + &movz ("ecx",&BP(-128,$tbl,$key,1)); # 2 + &movz ($key,&HB("eax")); # 7 + &shl ("ecx",16); # 2 + &and ("eax",0xff); # 6 + &or ("ecx",$acc); # 2 &punpckldq ("mm0","mm1"); # t[0,1] collected - &movz ($acc,&HB("eax")); # 7 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 7 + &movz ($key,&HB("ebx")); # 13 &shl ($acc,24); # 7 - &or ("ecx",$acc); # 7 - &and ("eax",0xff); # 6 + &and ("ebx",0xff); # 12 &movz ("eax",&BP(-128,$tbl,"eax",1)); # 6 + &or ("ecx",$acc); # 7 &shl ("eax",16); # 6 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 13 &or ("edx","eax"); # 6 - &movz ($acc,&HB("ebx")); # 13 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13 &shl ($acc,8); # 13 - &or ("ecx",$acc); # 13 - &movd ("mm4","ecx"); # t[2] collected - &and ("ebx",0xff); # 12 &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12 + &or ("ecx",$acc); # 13 &or ("edx","ebx"); # 12 + &mov ($key,$__key); + &movd ("mm4","ecx"); # t[2] collected &movd ("mm5","edx"); # t[3] collected &punpckldq ("mm4","mm5"); # t[2,3] collected @@ -1222,7 +1227,7 @@ sub enclast() ###################################################################### sub deccompact() -{ my $Fn = mov; +{ my $Fn = \&mov; while ($#_>5) { pop(@_); $Fn=sub{}; } my ($i,$td,@s)=@_; my $tmp = $key; @@ -1270,30 +1275,30 @@ sub dectransform() my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1); my $tp8 = $tbl; - &mov ($acc,$s[$i]); - &and ($acc,0x80808080); - &mov ($tmp,$acc); + &mov ($tmp,0x80808080); + &and ($tmp,$s[$i]); + &mov ($acc,$tmp); &shr ($tmp,7); &lea ($tp2,&DWP(0,$s[$i],$s[$i])); &sub ($acc,$tmp); &and ($tp2,0xfefefefe); &and ($acc,0x1b1b1b1b); - &xor ($acc,$tp2); - &mov ($tp2,$acc); + &xor ($tp2,$acc); + &mov ($tmp,0x80808080); - &and ($acc,0x80808080); - &mov ($tmp,$acc); + &and ($tmp,$tp2); + &mov ($acc,$tmp); &shr ($tmp,7); &lea ($tp4,&DWP(0,$tp2,$tp2)); &sub ($acc,$tmp); &and ($tp4,0xfefefefe); &and ($acc,0x1b1b1b1b); &xor ($tp2,$s[$i]); # tp2^tp1 - &xor ($acc,$tp4); - &mov ($tp4,$acc); + &xor ($tp4,$acc); + &mov ($tmp,0x80808080); - &and ($acc,0x80808080); - &mov ($tmp,$acc); + &and ($tmp,$tp4); + &mov ($acc,$tmp); &shr ($tmp,7); &lea ($tp8,&DWP(0,$tp4,$tp4)); &sub ($acc,$tmp); @@ -1305,13 +1310,13 @@ sub dectransform() &xor ($s[$i],$tp2); &xor ($tp2,$tp8); - &rotl ($tp2,24); &xor ($s[$i],$tp4); &xor ($tp4,$tp8); - &rotl ($tp4,16); + &rotl ($tp2,24); &xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1) - &rotl ($tp8,8); + &rotl ($tp4,16); &xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24) + &rotl ($tp8,8); &xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16) &mov ($s[0],$__s0) if($i==2); #prefetch $s0 &mov ($s[1],$__s1) if($i==3); #prefetch $s1 @@ -1389,85 +1394,87 @@ sub dectransform() sub sse_deccompact() { &pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0 + &pshufw ("mm5","mm4",0x09); # 13,12,11,10 &movd ("eax","mm1"); # 7, 6, 1, 0 + &movd ("ebx","mm5"); # 13,12,11,10 + &mov ($__key,$key); - &pshufw ("mm5","mm4",0x09); # 13,12,11,10 &movz ($acc,&LB("eax")); # 0 - &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 - &movd ("ebx","mm5"); # 13,12,11,10 &movz ("edx",&HB("eax")); # 1 + &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4 + &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 + &movz ($key,&LB("ebx")); # 10 &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1 + &shr ("eax",16); # 7, 6 &shl ("edx",8); # 1 - &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4 - &movz ($acc,&LB("ebx")); # 10 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 10 + &movz ($key,&HB("ebx")); # 11 &shl ($acc,16); # 10 + &pshufw ("mm6","mm4",0x03); # 9, 8,15,14 &or ("ecx",$acc); # 10 - &shr ("eax",16); # 7, 6 - &movz ($acc,&HB("ebx")); # 11 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 11 + &movz ($key,&HB("eax")); # 7 &shl ($acc,24); # 11 - &or ("edx",$acc); # 11 &shr ("ebx",16); # 13,12 + &or ("edx",$acc); # 11 - &pshufw ("mm6","mm4",0x03); # 9, 8,15,14 - &movz ($acc,&HB("eax")); # 7 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 7 + &movz ($key,&HB("ebx")); # 13 &shl ($acc,24); # 7 &or ("ecx",$acc); # 7 - &movz ($acc,&HB("ebx")); # 13 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 13 + &movz ($key,&LB("eax")); # 6 &shl ($acc,8); # 13 + &movd ("eax","mm2"); # 3, 2, 5, 4 &or ("ecx",$acc); # 13 - &movd ("mm0","ecx"); # t[0] collected - &movz ($acc,&LB("eax")); # 6 - &movd ("eax","mm2"); # 3, 2, 5, 4 - &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 6 - &shl ("ecx",16); # 6 - &movz ($acc,&LB("ebx")); # 12 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 6 + &movz ($key,&LB("ebx")); # 12 + &shl ($acc,16); # 6 &movd ("ebx","mm6"); # 9, 8,15,14 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 12 + &movd ("mm0","ecx"); # t[0] collected + &movz ("ecx",&BP(-128,$tbl,$key,1)); # 12 + &movz ($key,&LB("eax")); # 4 &or ("ecx",$acc); # 12 - &movz ($acc,&LB("eax")); # 4 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 4 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 4 + &movz ($key,&LB("ebx")); # 14 &or ("edx",$acc); # 4 - &movz ($acc,&LB("ebx")); # 14 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 14 + &movz ($key,&HB("eax")); # 5 &shl ($acc,16); # 14 + &shr ("eax",16); # 3, 2 &or ("edx",$acc); # 14 - &movd ("mm1","edx"); # t[1] collected - &movz ($acc,&HB("eax")); # 5 - &movz ("edx",&BP(-128,$tbl,$acc,1)); # 5 - &shl ("edx",8); # 5 - &movz ($acc,&HB("ebx")); # 15 - &shr ("eax",16); # 3, 2 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15 - &shl ($acc,24); # 15 - &or ("edx",$acc); # 15 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 5 + &movz ($key,&HB("ebx")); # 15 &shr ("ebx",16); # 9, 8 + &shl ($acc,8); # 5 + &movd ("mm1","edx"); # t[1] collected + &movz ("edx",&BP(-128,$tbl,$key,1)); # 15 + &movz ($key,&HB("ebx")); # 9 + &shl ("edx",24); # 15 + &and ("ebx",0xff); # 8 + &or ("edx",$acc); # 15 &punpckldq ("mm0","mm1"); # t[0,1] collected - &movz ($acc,&HB("ebx")); # 9 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 9 + &movz ($key,&LB("eax")); # 2 &shl ($acc,8); # 9 - &or ("ecx",$acc); # 9 - &and ("ebx",0xff); # 8 + &movz ("eax",&HB("eax")); # 3 &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8 + &or ("ecx",$acc); # 9 + &movz ($acc,&BP(-128,$tbl,$key,1)); # 2 &or ("edx","ebx"); # 8 - &movz ($acc,&LB("eax")); # 2 - &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2 &shl ($acc,16); # 2 - &or ("edx",$acc); # 2 - &movd ("mm4","edx"); # t[2] collected - &movz ("eax",&HB("eax")); # 3 &movz ("eax",&BP(-128,$tbl,"eax",1)); # 3 + &or ("edx",$acc); # 2 &shl ("eax",24); # 3 &or ("ecx","eax"); # 3 + &mov ($key,$__key); + &movd ("mm4","edx"); # t[2] collected &movd ("mm5","ecx"); # t[3] collected &punpckldq ("mm4","mm5"); # t[2,3] collected @@ -2181,8 +2188,8 @@ my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds &mov ("ecx",240/4); &xor ("eax","eax"); &align (4); - &data_word(0xABF3F689); # rep stosd - &set_label("skip_ezero") + &data_word(0xABF3F689); # rep stosd + &set_label("skip_ezero"); &mov ("esp",$_esp); &popf (); &set_label("drop_out"); @@ -2301,8 +2308,8 @@ my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds &mov ("ecx",240/4); &xor ("eax","eax"); &align (4); - &data_word(0xABF3F689); # rep stosd - &set_label("skip_dzero") + &data_word(0xABF3F689); # rep stosd + &set_label("skip_dzero"); &mov ("esp",$_esp); &popf (); &function_end_A(); @@ -2865,32 +2872,32 @@ sub deckey() { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_; my $tmp = $tbl; - &mov ($acc,$tp1); - &and ($acc,0x80808080); - &mov ($tmp,$acc); - &shr ($tmp,7); + &mov ($tmp,0x80808080); + &and ($tmp,$tp1); &lea ($tp2,&DWP(0,$tp1,$tp1)); + &mov ($acc,$tmp); + &shr ($tmp,7); &sub ($acc,$tmp); &and ($tp2,0xfefefefe); &and ($acc,0x1b1b1b1b); - &xor ($acc,$tp2); - &mov ($tp2,$acc); + &xor ($tp2,$acc); + &mov ($tmp,0x80808080); - &and ($acc,0x80808080); - &mov ($tmp,$acc); - &shr ($tmp,7); + &and ($tmp,$tp2); &lea ($tp4,&DWP(0,$tp2,$tp2)); + &mov ($acc,$tmp); + &shr ($tmp,7); &sub ($acc,$tmp); &and ($tp4,0xfefefefe); &and ($acc,0x1b1b1b1b); &xor ($tp2,$tp1); # tp2^tp1 - &xor ($acc,$tp4); - &mov ($tp4,$acc); + &xor ($tp4,$acc); + &mov ($tmp,0x80808080); - &and ($acc,0x80808080); - &mov ($tmp,$acc); - &shr ($tmp,7); + &and ($tmp,$tp4); &lea ($tp8,&DWP(0,$tp4,$tp4)); + &mov ($acc,$tmp); + &shr ($tmp,7); &xor ($tp4,$tp1); # tp4^tp1 &sub ($acc,$tmp); &and ($tp8,0xfefefefe); diff --git a/openssl/crypto/aes/asm/aes-armv4.pl b/openssl/crypto/aes/asm/aes-armv4.pl index 86b86c4a0..4f8917089 100644 --- a/openssl/crypto/aes/asm/aes-armv4.pl +++ b/openssl/crypto/aes/asm/aes-armv4.pl @@ -1,7 +1,7 @@ #!/usr/bin/env perl # ==================================================================== -# Written by Andy Polyakov for the OpenSSL +# Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -51,9 +51,23 @@ $key="r11"; $rounds="r12"; $code=<<___; -#include "arm_arch.h" +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +#endif + .text +#if __ARM_ARCH__<7 +.code 32 +#else +.syntax unified +# ifdef __thumb2__ +.thumb +# else .code 32 +# endif +#endif .type AES_Te,%object .align 5 @@ -167,7 +181,11 @@ AES_Te: .type AES_encrypt,%function .align 5 AES_encrypt: +#if __ARM_ARCH__<7 sub r3,pc,#8 @ AES_encrypt +#else + adr r3,AES_encrypt +#endif stmdb sp!,{r1,r4-r12,lr} mov $rounds,r0 @ inp mov $key,r2 @@ -409,11 +427,21 @@ _armv4_AES_encrypt: .align 5 private_AES_set_encrypt_key: _armv4_AES_set_encrypt_key: +#if __ARM_ARCH__<7 sub r3,pc,#8 @ AES_set_encrypt_key +#else + adr r3,private_AES_set_encrypt_key +#endif teq r0,#0 +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif moveq r0,#-1 beq .Labrt teq r2,#0 +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif moveq r0,#-1 beq .Labrt @@ -422,6 +450,9 @@ _armv4_AES_set_encrypt_key: teq r1,#192 beq .Lok teq r1,#256 +#if __ARM_ARCH__>=7 + itt ne @ Thumb2 thing, sanity check in ARM +#endif movne r0,#-1 bne .Labrt @@ -576,6 +607,9 @@ _armv4_AES_set_encrypt_key: str $s2,[$key,#-16] subs $rounds,$rounds,#1 str $s3,[$key,#-12] +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif subeq r2,$key,#216 beq .Ldone @@ -645,6 +679,9 @@ _armv4_AES_set_encrypt_key: str $s2,[$key,#-24] subs $rounds,$rounds,#1 str $s3,[$key,#-20] +#if __ARM_ARCH__>=7 + itt eq @ Thumb2 thing, sanity check in ARM +#endif subeq r2,$key,#256 beq .Ldone @@ -674,11 +711,17 @@ _armv4_AES_set_encrypt_key: str $i3,[$key,#-4] b .L256_loop +.align 2 .Ldone: mov r0,#0 ldmia sp!,{r4-r12,lr} -.Labrt: tst lr,#1 +.Labrt: +#if __ARM_ARCH__>=5 + ret @ bx lr +#else + tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) +#endif .size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key .global private_AES_set_decrypt_key @@ -688,34 +731,57 @@ private_AES_set_decrypt_key: str lr,[sp,#-4]! @ push lr bl _armv4_AES_set_encrypt_key teq r0,#0 - ldrne lr,[sp],#4 @ pop lr + ldr lr,[sp],#4 @ pop lr bne .Labrt - stmdb sp!,{r4-r12} + mov r0,r2 @ AES_set_encrypt_key preserves r2, + mov r1,r2 @ which is AES_KEY *key + b _armv4_AES_set_enc2dec_key +.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key - ldr $rounds,[r2,#240] @ AES_set_encrypt_key preserves r2, - mov $key,r2 @ which is AES_KEY *key - mov $i1,r2 - add $i2,r2,$rounds,lsl#4 +@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out) +.global AES_set_enc2dec_key +.type AES_set_enc2dec_key,%function +.align 5 +AES_set_enc2dec_key: +_armv4_AES_set_enc2dec_key: + stmdb sp!,{r4-r12,lr} + + ldr $rounds,[r0,#240] + mov $i1,r0 @ input + add $i2,r0,$rounds,lsl#4 + mov $key,r1 @ ouput + add $tbl,r1,$rounds,lsl#4 + str $rounds,[r1,#240] + +.Linv: ldr $s0,[$i1],#16 + ldr $s1,[$i1,#-12] + ldr $s2,[$i1,#-8] + ldr $s3,[$i1,#-4] + ldr $t1,[$i2],#-16 + ldr $t2,[$i2,#16+4] + ldr $t3,[$i2,#16+8] + ldr $i3,[$i2,#16+12] + str $s0,[$tbl],#-16 + str $s1,[$tbl,#16+4] + str $s2,[$tbl,#16+8] + str $s3,[$tbl,#16+12] + str $t1,[$key],#16 + str $t2,[$key,#-12] + str $t3,[$key,#-8] + str $i3,[$key,#-4] + teq $i1,$i2 + bne .Linv -.Linv: ldr $s0,[$i1] + ldr $s0,[$i1] ldr $s1,[$i1,#4] ldr $s2,[$i1,#8] ldr $s3,[$i1,#12] - ldr $t1,[$i2] - ldr $t2,[$i2,#4] - ldr $t3,[$i2,#8] - ldr $i3,[$i2,#12] - str $s0,[$i2],#-16 - str $s1,[$i2,#16+4] - str $s2,[$i2,#16+8] - str $s3,[$i2,#16+12] - str $t1,[$i1],#16 - str $t2,[$i1,#-12] - str $t3,[$i1,#-8] - str $i3,[$i1,#-4] - teq $i1,$i2 - bne .Linv + str $s0,[$key] + str $s1,[$key,#4] + str $s2,[$key,#8] + str $s3,[$key,#12] + sub $key,$key,$rounds,lsl#3 ___ $mask80=$i1; $mask1b=$i2; @@ -773,7 +839,7 @@ $code.=<<___; moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) #endif -.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key +.size AES_set_enc2dec_key,.-AES_set_enc2dec_key .type AES_Td,%object .align 5 @@ -883,7 +949,11 @@ AES_Td: .type AES_decrypt,%function .align 5 AES_decrypt: +#if __ARM_ARCH__<7 sub r3,pc,#8 @ AES_decrypt +#else + adr r3,AES_decrypt +#endif stmdb sp!,{r1,r4-r12,lr} mov $rounds,r0 @ inp mov $key,r2 @@ -1080,8 +1150,9 @@ _armv4_AES_decrypt: ldrb $t3,[$tbl,$i3] @ Td4[s0>>0] and $i3,lr,$s1,lsr#8 + add $s1,$tbl,$s1,lsr#24 ldrb $i1,[$tbl,$i1] @ Td4[s1>>0] - ldrb $s1,[$tbl,$s1,lsr#24] @ Td4[s1>>24] + ldrb $s1,[$s1] @ Td4[s1>>24] ldrb $i2,[$tbl,$i2] @ Td4[s1>>16] eor $s0,$i1,$s0,lsl#24 ldrb $i3,[$tbl,$i3] @ Td4[s1>>8] @@ -1094,7 +1165,8 @@ _armv4_AES_decrypt: ldrb $i2,[$tbl,$i2] @ Td4[s2>>0] and $i3,lr,$s2,lsr#16 - ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24] + add $s2,$tbl,$s2,lsr#24 + ldrb $s2,[$s2] @ Td4[s2>>24] eor $s0,$s0,$i1,lsl#8 ldrb $i3,[$tbl,$i3] @ Td4[s2>>16] eor $s1,$i2,$s1,lsl#16 @@ -1106,8 +1178,9 @@ _armv4_AES_decrypt: ldrb $i2,[$tbl,$i2] @ Td4[s3>>8] and $i3,lr,$s3 @ i2 + add $s3,$tbl,$s3,lsr#24 ldrb $i3,[$tbl,$i3] @ Td4[s3>>0] - ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24] + ldrb $s3,[$s3] @ Td4[s3>>24] eor $s0,$s0,$i1,lsl#16 ldr $i1,[$key,#0] eor $s1,$s1,$i2,lsl#8 @@ -1130,5 +1203,15 @@ _armv4_AES_decrypt: ___ $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 +$code =~ s/\bret\b/bx\tlr/gm; + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + print $code; close STDOUT; # enforce flush diff --git a/openssl/crypto/aes/asm/aes-mips.pl b/openssl/crypto/aes/asm/aes-mips.pl index e52395421..4de3ee26b 100644 --- a/openssl/crypto/aes/asm/aes-mips.pl +++ b/openssl/crypto/aes/asm/aes-mips.pl @@ -20,6 +20,13 @@ # thing about this module is its endian neutrality, which means that # it processes data without ever changing byte order... +# September 2012 +# +# Add MIPS32R2 (~10% less instructions) and SmartMIPS ASE (further +# ~25% less instructions) code. Note that there is no run-time switch, +# instead, code path is chosen upon pre-process time, pass -mips32r2 +# or/and -msmartmips. + ###################################################################### # There is a number of MIPS ABI in use, O32 and N32/64 are most # widely used. Then there is a new contender: NUBI. It appears that if @@ -47,11 +54,12 @@ # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); # -$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 +$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64 if ($flavour =~ /64|n32/i) { $PTR_ADD="dadd"; # incidentally works even on n32 $PTR_SUB="dsub"; # incidentally works even on n32 + $PTR_INS="dins"; $REG_S="sd"; $REG_L="ld"; $PTR_SLL="dsll"; # incidentally works even on n32 @@ -59,6 +67,7 @@ if ($flavour =~ /64|n32/i) { } else { $PTR_ADD="add"; $PTR_SUB="sub"; + $PTR_INS="ins"; $REG_S="sw"; $REG_L="lw"; $PTR_SLL="sll"; @@ -70,7 +79,7 @@ $pf = ($flavour =~ /nubi/i) ? $t0 : $t2; # ###################################################################### -$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0; +$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0 if ($ENV{CC}); for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } open STDOUT,">$output"; @@ -89,7 +98,11 @@ $code.=<<___; # include #endif -#if !defined(__vxworks) || defined(__pic__) +#if defined(__mips_smartmips) && !defined(_MIPS_ARCH_MIPS32R2) +#define _MIPS_ARCH_MIPS32R2 +#endif + +#if !defined(__mips_eabi) && (!defined(__vxworks) || defined(__pic__)) .option pic2 #endif .set noat @@ -125,6 +138,89 @@ _mips_AES_encrypt: xor $s3,$t3 sub $cnt,1 +#if defined(__mips_smartmips) + ext $i0,$s1,16,8 +.Loop_enc: + ext $i1,$s2,16,8 + ext $i2,$s3,16,8 + ext $i3,$s0,16,8 + lwxs $t0,$i0($Tbl) # Te1[s1>>16] + ext $i0,$s2,8,8 + lwxs $t1,$i1($Tbl) # Te1[s2>>16] + ext $i1,$s3,8,8 + lwxs $t2,$i2($Tbl) # Te1[s3>>16] + ext $i2,$s0,8,8 + lwxs $t3,$i3($Tbl) # Te1[s0>>16] + ext $i3,$s1,8,8 + + lwxs $t4,$i0($Tbl) # Te2[s2>>8] + ext $i0,$s3,0,8 + lwxs $t5,$i1($Tbl) # Te2[s3>>8] + ext $i1,$s0,0,8 + lwxs $t6,$i2($Tbl) # Te2[s0>>8] + ext $i2,$s1,0,8 + lwxs $t7,$i3($Tbl) # Te2[s1>>8] + ext $i3,$s2,0,8 + + lwxs $t8,$i0($Tbl) # Te3[s3] + ext $i0,$s0,24,8 + lwxs $t9,$i1($Tbl) # Te3[s0] + ext $i1,$s1,24,8 + lwxs $t10,$i2($Tbl) # Te3[s1] + ext $i2,$s2,24,8 + lwxs $t11,$i3($Tbl) # Te3[s2] + ext $i3,$s3,24,8 + + rotr $t0,$t0,8 + rotr $t1,$t1,8 + rotr $t2,$t2,8 + rotr $t3,$t3,8 + + rotr $t4,$t4,16 + rotr $t5,$t5,16 + rotr $t6,$t6,16 + rotr $t7,$t7,16 + + xor $t0,$t4 + lwxs $t4,$i0($Tbl) # Te0[s0>>24] + xor $t1,$t5 + lwxs $t5,$i1($Tbl) # Te0[s1>>24] + xor $t2,$t6 + lwxs $t6,$i2($Tbl) # Te0[s2>>24] + xor $t3,$t7 + lwxs $t7,$i3($Tbl) # Te0[s3>>24] + + rotr $t8,$t8,24 + lw $s0,0($key0) + rotr $t9,$t9,24 + lw $s1,4($key0) + rotr $t10,$t10,24 + lw $s2,8($key0) + rotr $t11,$t11,24 + lw $s3,12($key0) + + xor $t0,$t8 + xor $t1,$t9 + xor $t2,$t10 + xor $t3,$t11 + + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + + sub $cnt,1 + $PTR_ADD $key0,16 + xor $s0,$t0 + xor $s1,$t1 + xor $s2,$t2 + xor $s3,$t3 + .set noreorder + bnez $cnt,.Loop_enc + ext $i0,$s1,16,8 + + _xtr $i0,$s1,16-2 +#else _xtr $i0,$s1,16-2 .Loop_enc: _xtr $i1,$s2,16-2 @@ -138,19 +234,29 @@ _mips_AES_encrypt: $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + lw $t0,0($i0) # Te1[s1>>16] + _xtr $i0,$s2,8-2 + lw $t1,0($i1) # Te1[s2>>16] + _xtr $i1,$s3,8-2 + lw $t2,0($i2) # Te1[s3>>16] + _xtr $i2,$s0,8-2 + lw $t3,0($i3) # Te1[s0>>16] + _xtr $i3,$s1,8-2 +#else lwl $t0,3($i0) # Te1[s1>>16] lwl $t1,3($i1) # Te1[s2>>16] lwl $t2,3($i2) # Te1[s3>>16] lwl $t3,3($i3) # Te1[s0>>16] lwr $t0,2($i0) # Te1[s1>>16] - lwr $t1,2($i1) # Te1[s2>>16] - lwr $t2,2($i2) # Te1[s3>>16] - lwr $t3,2($i3) # Te1[s0>>16] - _xtr $i0,$s2,8-2 + lwr $t1,2($i1) # Te1[s2>>16] _xtr $i1,$s3,8-2 + lwr $t2,2($i2) # Te1[s3>>16] _xtr $i2,$s0,8-2 + lwr $t3,2($i3) # Te1[s0>>16] _xtr $i3,$s1,8-2 +#endif and $i0,0x3fc and $i1,0x3fc and $i2,0x3fc @@ -159,19 +265,88 @@ _mips_AES_encrypt: $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + rotr $t0,$t0,8 + rotr $t1,$t1,8 + rotr $t2,$t2,8 + rotr $t3,$t3,8 +# if defined(_MIPSEL) + lw $t4,0($i0) # Te2[s2>>8] + _xtr $i0,$s3,0-2 + lw $t5,0($i1) # Te2[s3>>8] + _xtr $i1,$s0,0-2 + lw $t6,0($i2) # Te2[s0>>8] + _xtr $i2,$s1,0-2 + lw $t7,0($i3) # Te2[s1>>8] + _xtr $i3,$s2,0-2 + + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lw $t8,0($i0) # Te3[s3] + $PTR_INS $i0,$s0,2,8 + lw $t9,0($i1) # Te3[s0] + $PTR_INS $i1,$s1,2,8 + lw $t10,0($i2) # Te3[s1] + $PTR_INS $i2,$s2,2,8 + lw $t11,0($i3) # Te3[s2] + $PTR_INS $i3,$s3,2,8 +# else + lw $t4,0($i0) # Te2[s2>>8] + $PTR_INS $i0,$s3,2,8 + lw $t5,0($i1) # Te2[s3>>8] + $PTR_INS $i1,$s0,2,8 + lw $t6,0($i2) # Te2[s0>>8] + $PTR_INS $i2,$s1,2,8 + lw $t7,0($i3) # Te2[s1>>8] + $PTR_INS $i3,$s2,2,8 + + lw $t8,0($i0) # Te3[s3] + _xtr $i0,$s0,24-2 + lw $t9,0($i1) # Te3[s0] + _xtr $i1,$s1,24-2 + lw $t10,0($i2) # Te3[s1] + _xtr $i2,$s2,24-2 + lw $t11,0($i3) # Te3[s2] + _xtr $i3,$s3,24-2 + + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl +# endif + rotr $t4,$t4,16 + rotr $t5,$t5,16 + rotr $t6,$t6,16 + rotr $t7,$t7,16 + + rotr $t8,$t8,24 + rotr $t9,$t9,24 + rotr $t10,$t10,24 + rotr $t11,$t11,24 +#else lwl $t4,2($i0) # Te2[s2>>8] lwl $t5,2($i1) # Te2[s3>>8] lwl $t6,2($i2) # Te2[s0>>8] lwl $t7,2($i3) # Te2[s1>>8] lwr $t4,1($i0) # Te2[s2>>8] - lwr $t5,1($i1) # Te2[s3>>8] - lwr $t6,1($i2) # Te2[s0>>8] - lwr $t7,1($i3) # Te2[s1>>8] - _xtr $i0,$s3,0-2 + lwr $t5,1($i1) # Te2[s3>>8] _xtr $i1,$s0,0-2 + lwr $t6,1($i2) # Te2[s0>>8] _xtr $i2,$s1,0-2 + lwr $t7,1($i3) # Te2[s1>>8] _xtr $i3,$s2,0-2 + and $i0,0x3fc and $i1,0x3fc and $i2,0x3fc @@ -185,14 +360,14 @@ _mips_AES_encrypt: lwl $t10,1($i2) # Te3[s1] lwl $t11,1($i3) # Te3[s2] lwr $t8,0($i0) # Te3[s3] - lwr $t9,0($i1) # Te3[s0] - lwr $t10,0($i2) # Te3[s1] - lwr $t11,0($i3) # Te3[s2] - _xtr $i0,$s0,24-2 + lwr $t9,0($i1) # Te3[s0] _xtr $i1,$s1,24-2 + lwr $t10,0($i2) # Te3[s1] _xtr $i2,$s2,24-2 + lwr $t11,0($i3) # Te3[s2] _xtr $i3,$s3,24-2 + and $i0,0x3fc and $i1,0x3fc and $i2,0x3fc @@ -201,24 +376,24 @@ _mips_AES_encrypt: $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl +#endif xor $t0,$t4 - xor $t1,$t5 - xor $t2,$t6 - xor $t3,$t7 lw $t4,0($i0) # Te0[s0>>24] + xor $t1,$t5 lw $t5,0($i1) # Te0[s1>>24] + xor $t2,$t6 lw $t6,0($i2) # Te0[s2>>24] + xor $t3,$t7 lw $t7,0($i3) # Te0[s3>>24] - lw $s0,0($key0) - lw $s1,4($key0) - lw $s2,8($key0) - lw $s3,12($key0) - xor $t0,$t8 + lw $s0,0($key0) xor $t1,$t9 + lw $s1,4($key0) xor $t2,$t10 + lw $s2,8($key0) xor $t3,$t11 + lw $s3,12($key0) xor $t0,$t4 xor $t1,$t5 @@ -234,6 +409,7 @@ _mips_AES_encrypt: .set noreorder bnez $cnt,.Loop_enc _xtr $i0,$s1,16-2 +#endif .set reorder _xtr $i1,$s2,16-2 @@ -248,14 +424,14 @@ _mips_AES_encrypt: $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl lbu $t0,2($i0) # Te4[s1>>16] - lbu $t1,2($i1) # Te4[s2>>16] - lbu $t2,2($i2) # Te4[s3>>16] - lbu $t3,2($i3) # Te4[s0>>16] - _xtr $i0,$s2,8-2 + lbu $t1,2($i1) # Te4[s2>>16] _xtr $i1,$s3,8-2 + lbu $t2,2($i2) # Te4[s3>>16] _xtr $i2,$s0,8-2 + lbu $t3,2($i3) # Te4[s0>>16] _xtr $i3,$s1,8-2 + and $i0,0x3fc and $i1,0x3fc and $i2,0x3fc @@ -264,15 +440,44 @@ _mips_AES_encrypt: $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) +# if defined(_MIPSEL) lbu $t4,2($i0) # Te4[s2>>8] + $PTR_INS $i0,$s0,2,8 lbu $t5,2($i1) # Te4[s3>>8] + $PTR_INS $i1,$s1,2,8 lbu $t6,2($i2) # Te4[s0>>8] + $PTR_INS $i2,$s2,2,8 lbu $t7,2($i3) # Te4[s1>>8] + $PTR_INS $i3,$s3,2,8 + lbu $t8,2($i0) # Te4[s0>>24] + _xtr $i0,$s3,0-2 + lbu $t9,2($i1) # Te4[s1>>24] + _xtr $i1,$s0,0-2 + lbu $t10,2($i2) # Te4[s2>>24] + _xtr $i2,$s1,0-2 + lbu $t11,2($i3) # Te4[s3>>24] + _xtr $i3,$s2,0-2 + + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl +# else + lbu $t4,2($i0) # Te4[s2>>8] _xtr $i0,$s0,24-2 + lbu $t5,2($i1) # Te4[s3>>8] _xtr $i1,$s1,24-2 + lbu $t6,2($i2) # Te4[s0>>8] _xtr $i2,$s2,24-2 + lbu $t7,2($i3) # Te4[s1>>8] _xtr $i3,$s3,24-2 + and $i0,0x3fc and $i1,0x3fc and $i2,0x3fc @@ -282,18 +487,76 @@ _mips_AES_encrypt: $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl lbu $t8,2($i0) # Te4[s0>>24] + $PTR_INS $i0,$s3,2,8 lbu $t9,2($i1) # Te4[s1>>24] + $PTR_INS $i1,$s0,2,8 lbu $t10,2($i2) # Te4[s2>>24] + $PTR_INS $i2,$s1,2,8 lbu $t11,2($i3) # Te4[s3>>24] + $PTR_INS $i3,$s2,2,8 +# endif + _ins $t0,16 + _ins $t1,16 + _ins $t2,16 + _ins $t3,16 + _ins2 $t0,$t4,8 + lbu $t4,2($i0) # Te4[s3] + _ins2 $t1,$t5,8 + lbu $t5,2($i1) # Te4[s0] + _ins2 $t2,$t6,8 + lbu $t6,2($i2) # Te4[s1] + _ins2 $t3,$t7,8 + lbu $t7,2($i3) # Te4[s2] + + _ins2 $t0,$t8,24 + lw $s0,0($key0) + _ins2 $t1,$t9,24 + lw $s1,4($key0) + _ins2 $t2,$t10,24 + lw $s2,8($key0) + _ins2 $t3,$t11,24 + lw $s3,12($key0) + + _ins2 $t0,$t4,0 + _ins2 $t1,$t5,0 + _ins2 $t2,$t6,0 + _ins2 $t3,$t7,0 +#else + lbu $t4,2($i0) # Te4[s2>>8] + _xtr $i0,$s0,24-2 + lbu $t5,2($i1) # Te4[s3>>8] + _xtr $i1,$s1,24-2 + lbu $t6,2($i2) # Te4[s0>>8] + _xtr $i2,$s2,24-2 + lbu $t7,2($i3) # Te4[s1>>8] + _xtr $i3,$s3,24-2 + + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t8,2($i0) # Te4[s0>>24] _xtr $i0,$s3,0-2 + lbu $t9,2($i1) # Te4[s1>>24] _xtr $i1,$s0,0-2 + lbu $t10,2($i2) # Te4[s2>>24] _xtr $i2,$s1,0-2 + lbu $t11,2($i3) # Te4[s3>>24] _xtr $i3,$s2,0-2 + and $i0,0x3fc and $i1,0x3fc and $i2,0x3fc and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl _ins $t0,16 _ins $t1,16 @@ -306,27 +569,21 @@ _mips_AES_encrypt: _ins $t7,8 xor $t0,$t4 - xor $t1,$t5 - xor $t2,$t6 - xor $t3,$t7 - - $PTR_ADD $i0,$Tbl - $PTR_ADD $i1,$Tbl - $PTR_ADD $i2,$Tbl - $PTR_ADD $i3,$Tbl lbu $t4,2($i0) # Te4[s3] + xor $t1,$t5 lbu $t5,2($i1) # Te4[s0] + xor $t2,$t6 lbu $t6,2($i2) # Te4[s1] + xor $t3,$t7 lbu $t7,2($i3) # Te4[s2] _ins $t8,24 - _ins $t9,24 - _ins $t10,24 - _ins $t11,24 - lw $s0,0($key0) + _ins $t9,24 lw $s1,4($key0) + _ins $t10,24 lw $s2,8($key0) + _ins $t11,24 lw $s3,12($key0) xor $t0,$t8 @@ -343,7 +600,7 @@ _mips_AES_encrypt: xor $t1,$t5 xor $t2,$t6 xor $t3,$t7 - +#endif xor $s0,$t0 xor $s1,$t1 xor $s2,$t2 @@ -455,6 +712,89 @@ _mips_AES_decrypt: xor $s3,$t3 sub $cnt,1 +#if defined(__mips_smartmips) + ext $i0,$s3,16,8 +.Loop_dec: + ext $i1,$s0,16,8 + ext $i2,$s1,16,8 + ext $i3,$s2,16,8 + lwxs $t0,$i0($Tbl) # Td1[s3>>16] + ext $i0,$s2,8,8 + lwxs $t1,$i1($Tbl) # Td1[s0>>16] + ext $i1,$s3,8,8 + lwxs $t2,$i2($Tbl) # Td1[s1>>16] + ext $i2,$s0,8,8 + lwxs $t3,$i3($Tbl) # Td1[s2>>16] + ext $i3,$s1,8,8 + + lwxs $t4,$i0($Tbl) # Td2[s2>>8] + ext $i0,$s1,0,8 + lwxs $t5,$i1($Tbl) # Td2[s3>>8] + ext $i1,$s2,0,8 + lwxs $t6,$i2($Tbl) # Td2[s0>>8] + ext $i2,$s3,0,8 + lwxs $t7,$i3($Tbl) # Td2[s1>>8] + ext $i3,$s0,0,8 + + lwxs $t8,$i0($Tbl) # Td3[s1] + ext $i0,$s0,24,8 + lwxs $t9,$i1($Tbl) # Td3[s2] + ext $i1,$s1,24,8 + lwxs $t10,$i2($Tbl) # Td3[s3] + ext $i2,$s2,24,8 + lwxs $t11,$i3($Tbl) # Td3[s0] + ext $i3,$s3,24,8 + + rotr $t0,$t0,8 + rotr $t1,$t1,8 + rotr $t2,$t2,8 + rotr $t3,$t3,8 + + rotr $t4,$t4,16 + rotr $t5,$t5,16 + rotr $t6,$t6,16 + rotr $t7,$t7,16 + + xor $t0,$t4 + lwxs $t4,$i0($Tbl) # Td0[s0>>24] + xor $t1,$t5 + lwxs $t5,$i1($Tbl) # Td0[s1>>24] + xor $t2,$t6 + lwxs $t6,$i2($Tbl) # Td0[s2>>24] + xor $t3,$t7 + lwxs $t7,$i3($Tbl) # Td0[s3>>24] + + rotr $t8,$t8,24 + lw $s0,0($key0) + rotr $t9,$t9,24 + lw $s1,4($key0) + rotr $t10,$t10,24 + lw $s2,8($key0) + rotr $t11,$t11,24 + lw $s3,12($key0) + + xor $t0,$t8 + xor $t1,$t9 + xor $t2,$t10 + xor $t3,$t11 + + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + + sub $cnt,1 + $PTR_ADD $key0,16 + xor $s0,$t0 + xor $s1,$t1 + xor $s2,$t2 + xor $s3,$t3 + .set noreorder + bnez $cnt,.Loop_dec + ext $i0,$s3,16,8 + + _xtr $i0,$s3,16-2 +#else _xtr $i0,$s3,16-2 .Loop_dec: _xtr $i1,$s0,16-2 @@ -468,19 +808,88 @@ _mips_AES_decrypt: $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + lw $t0,0($i0) # Td1[s3>>16] + _xtr $i0,$s2,8-2 + lw $t1,0($i1) # Td1[s0>>16] + _xtr $i1,$s3,8-2 + lw $t2,0($i2) # Td1[s1>>16] + _xtr $i2,$s0,8-2 + lw $t3,0($i3) # Td1[s2>>16] + _xtr $i3,$s1,8-2 +#else lwl $t0,3($i0) # Td1[s3>>16] lwl $t1,3($i1) # Td1[s0>>16] lwl $t2,3($i2) # Td1[s1>>16] lwl $t3,3($i3) # Td1[s2>>16] lwr $t0,2($i0) # Td1[s3>>16] - lwr $t1,2($i1) # Td1[s0>>16] - lwr $t2,2($i2) # Td1[s1>>16] - lwr $t3,2($i3) # Td1[s2>>16] - _xtr $i0,$s2,8-2 + lwr $t1,2($i1) # Td1[s0>>16] _xtr $i1,$s3,8-2 + lwr $t2,2($i2) # Td1[s1>>16] _xtr $i2,$s0,8-2 + lwr $t3,2($i3) # Td1[s2>>16] _xtr $i3,$s1,8-2 +#endif + + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + rotr $t0,$t0,8 + rotr $t1,$t1,8 + rotr $t2,$t2,8 + rotr $t3,$t3,8 +# if defined(_MIPSEL) + lw $t4,0($i0) # Td2[s2>>8] + _xtr $i0,$s1,0-2 + lw $t5,0($i1) # Td2[s3>>8] + _xtr $i1,$s2,0-2 + lw $t6,0($i2) # Td2[s0>>8] + _xtr $i2,$s3,0-2 + lw $t7,0($i3) # Td2[s1>>8] + _xtr $i3,$s0,0-2 + + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lw $t8,0($i0) # Td3[s1] + $PTR_INS $i0,$s0,2,8 + lw $t9,0($i1) # Td3[s2] + $PTR_INS $i1,$s1,2,8 + lw $t10,0($i2) # Td3[s3] + $PTR_INS $i2,$s2,2,8 + lw $t11,0($i3) # Td3[s0] + $PTR_INS $i3,$s3,2,8 +#else + lw $t4,0($i0) # Td2[s2>>8] + $PTR_INS $i0,$s1,2,8 + lw $t5,0($i1) # Td2[s3>>8] + $PTR_INS $i1,$s2,2,8 + lw $t6,0($i2) # Td2[s0>>8] + $PTR_INS $i2,$s3,2,8 + lw $t7,0($i3) # Td2[s1>>8] + $PTR_INS $i3,$s0,2,8 + + lw $t8,0($i0) # Td3[s1] + _xtr $i0,$s0,24-2 + lw $t9,0($i1) # Td3[s2] + _xtr $i1,$s1,24-2 + lw $t10,0($i2) # Td3[s3] + _xtr $i2,$s2,24-2 + lw $t11,0($i3) # Td3[s0] + _xtr $i3,$s3,24-2 + and $i0,0x3fc and $i1,0x3fc and $i2,0x3fc @@ -489,19 +898,30 @@ _mips_AES_decrypt: $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl +#endif + rotr $t4,$t4,16 + rotr $t5,$t5,16 + rotr $t6,$t6,16 + rotr $t7,$t7,16 + + rotr $t8,$t8,24 + rotr $t9,$t9,24 + rotr $t10,$t10,24 + rotr $t11,$t11,24 +#else lwl $t4,2($i0) # Td2[s2>>8] lwl $t5,2($i1) # Td2[s3>>8] lwl $t6,2($i2) # Td2[s0>>8] lwl $t7,2($i3) # Td2[s1>>8] lwr $t4,1($i0) # Td2[s2>>8] - lwr $t5,1($i1) # Td2[s3>>8] - lwr $t6,1($i2) # Td2[s0>>8] - lwr $t7,1($i3) # Td2[s1>>8] - _xtr $i0,$s1,0-2 + lwr $t5,1($i1) # Td2[s3>>8] _xtr $i1,$s2,0-2 + lwr $t6,1($i2) # Td2[s0>>8] _xtr $i2,$s3,0-2 + lwr $t7,1($i3) # Td2[s1>>8] _xtr $i3,$s0,0-2 + and $i0,0x3fc and $i1,0x3fc and $i2,0x3fc @@ -515,14 +935,14 @@ _mips_AES_decrypt: lwl $t10,1($i2) # Td3[s3] lwl $t11,1($i3) # Td3[s0] lwr $t8,0($i0) # Td3[s1] - lwr $t9,0($i1) # Td3[s2] - lwr $t10,0($i2) # Td3[s3] - lwr $t11,0($i3) # Td3[s0] - _xtr $i0,$s0,24-2 + lwr $t9,0($i1) # Td3[s2] _xtr $i1,$s1,24-2 + lwr $t10,0($i2) # Td3[s3] _xtr $i2,$s2,24-2 + lwr $t11,0($i3) # Td3[s0] _xtr $i3,$s3,24-2 + and $i0,0x3fc and $i1,0x3fc and $i2,0x3fc @@ -531,27 +951,25 @@ _mips_AES_decrypt: $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl +#endif xor $t0,$t4 - xor $t1,$t5 - xor $t2,$t6 - xor $t3,$t7 - - lw $t4,0($i0) # Td0[s0>>24] + xor $t1,$t5 lw $t5,0($i1) # Td0[s1>>24] + xor $t2,$t6 lw $t6,0($i2) # Td0[s2>>24] + xor $t3,$t7 lw $t7,0($i3) # Td0[s3>>24] - lw $s0,0($key0) - lw $s1,4($key0) - lw $s2,8($key0) - lw $s3,12($key0) - xor $t0,$t8 + lw $s0,0($key0) xor $t1,$t9 + lw $s1,4($key0) xor $t2,$t10 + lw $s2,8($key0) xor $t3,$t11 + lw $s3,12($key0) xor $t0,$t4 xor $t1,$t5 @@ -567,38 +985,39 @@ _mips_AES_decrypt: .set noreorder bnez $cnt,.Loop_dec _xtr $i0,$s3,16-2 +#endif .set reorder lw $t4,1024($Tbl) # prefetch Td4 - lw $t5,1024+32($Tbl) - lw $t6,1024+64($Tbl) - lw $t7,1024+96($Tbl) - lw $t8,1024+128($Tbl) - lw $t9,1024+160($Tbl) - lw $t10,1024+192($Tbl) - lw $t11,1024+224($Tbl) - _xtr $i0,$s3,16 + lw $t5,1024+32($Tbl) _xtr $i1,$s0,16 + lw $t6,1024+64($Tbl) _xtr $i2,$s1,16 + lw $t7,1024+96($Tbl) _xtr $i3,$s2,16 + lw $t8,1024+128($Tbl) and $i0,0xff + lw $t9,1024+160($Tbl) and $i1,0xff + lw $t10,1024+192($Tbl) and $i2,0xff + lw $t11,1024+224($Tbl) and $i3,0xff + $PTR_ADD $i0,$Tbl $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl lbu $t0,1024($i0) # Td4[s3>>16] - lbu $t1,1024($i1) # Td4[s0>>16] - lbu $t2,1024($i2) # Td4[s1>>16] - lbu $t3,1024($i3) # Td4[s2>>16] - _xtr $i0,$s2,8 + lbu $t1,1024($i1) # Td4[s0>>16] _xtr $i1,$s3,8 + lbu $t2,1024($i2) # Td4[s1>>16] _xtr $i2,$s0,8 + lbu $t3,1024($i3) # Td4[s2>>16] _xtr $i3,$s1,8 + and $i0,0xff and $i1,0xff and $i2,0xff @@ -607,29 +1026,108 @@ _mips_AES_decrypt: $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) +# if defined(_MIPSEL) lbu $t4,1024($i0) # Td4[s2>>8] + $PTR_INS $i0,$s0,0,8 lbu $t5,1024($i1) # Td4[s3>>8] + $PTR_INS $i1,$s1,0,8 lbu $t6,1024($i2) # Td4[s0>>8] + $PTR_INS $i2,$s2,0,8 lbu $t7,1024($i3) # Td4[s1>>8] + $PTR_INS $i3,$s3,0,8 + lbu $t8,1024($i0) # Td4[s0>>24] + _xtr $i0,$s1,0 + lbu $t9,1024($i1) # Td4[s1>>24] + _xtr $i1,$s2,0 + lbu $t10,1024($i2) # Td4[s2>>24] + _xtr $i2,$s3,0 + lbu $t11,1024($i3) # Td4[s3>>24] + _xtr $i3,$s0,0 + + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl +# else + lbu $t4,1024($i0) # Td4[s2>>8] _xtr $i0,$s0,24 + lbu $t5,1024($i1) # Td4[s3>>8] _xtr $i1,$s1,24 + lbu $t6,1024($i2) # Td4[s0>>8] _xtr $i2,$s2,24 + lbu $t7,1024($i3) # Td4[s1>>8] _xtr $i3,$s3,24 + $PTR_ADD $i0,$Tbl $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl lbu $t8,1024($i0) # Td4[s0>>24] + $PTR_INS $i0,$s1,0,8 lbu $t9,1024($i1) # Td4[s1>>24] + $PTR_INS $i1,$s2,0,8 lbu $t10,1024($i2) # Td4[s2>>24] + $PTR_INS $i2,$s3,0,8 lbu $t11,1024($i3) # Td4[s3>>24] + $PTR_INS $i3,$s0,0,8 +# endif + _ins $t0,16 + _ins $t1,16 + _ins $t2,16 + _ins $t3,16 + + _ins2 $t0,$t4,8 + lbu $t4,1024($i0) # Td4[s1] + _ins2 $t1,$t5,8 + lbu $t5,1024($i1) # Td4[s2] + _ins2 $t2,$t6,8 + lbu $t6,1024($i2) # Td4[s3] + _ins2 $t3,$t7,8 + lbu $t7,1024($i3) # Td4[s0] + + _ins2 $t0,$t8,24 + lw $s0,0($key0) + _ins2 $t1,$t9,24 + lw $s1,4($key0) + _ins2 $t2,$t10,24 + lw $s2,8($key0) + _ins2 $t3,$t11,24 + lw $s3,12($key0) + _ins2 $t0,$t4,0 + _ins2 $t1,$t5,0 + _ins2 $t2,$t6,0 + _ins2 $t3,$t7,0 +#else + lbu $t4,1024($i0) # Td4[s2>>8] + _xtr $i0,$s0,24 + lbu $t5,1024($i1) # Td4[s3>>8] + _xtr $i1,$s1,24 + lbu $t6,1024($i2) # Td4[s0>>8] + _xtr $i2,$s2,24 + lbu $t7,1024($i3) # Td4[s1>>8] + _xtr $i3,$s3,24 + + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t8,1024($i0) # Td4[s0>>24] _xtr $i0,$s1,0 + lbu $t9,1024($i1) # Td4[s1>>24] _xtr $i1,$s2,0 + lbu $t10,1024($i2) # Td4[s2>>24] _xtr $i2,$s3,0 + lbu $t11,1024($i3) # Td4[s3>>24] _xtr $i3,$s0,0 + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + _ins $t0,16 _ins $t1,16 _ins $t2,16 @@ -641,44 +1139,38 @@ _mips_AES_decrypt: _ins $t7,8 xor $t0,$t4 - xor $t1,$t5 - xor $t2,$t6 - xor $t3,$t7 - - $PTR_ADD $i0,$Tbl - $PTR_ADD $i1,$Tbl - $PTR_ADD $i2,$Tbl - $PTR_ADD $i3,$Tbl lbu $t4,1024($i0) # Td4[s1] + xor $t1,$t5 lbu $t5,1024($i1) # Td4[s2] + xor $t2,$t6 lbu $t6,1024($i2) # Td4[s3] + xor $t3,$t7 lbu $t7,1024($i3) # Td4[s0] _ins $t8,24 - _ins $t9,24 - _ins $t10,24 - _ins $t11,24 - lw $s0,0($key0) + _ins $t9,24 lw $s1,4($key0) + _ins $t10,24 lw $s2,8($key0) + _ins $t11,24 lw $s3,12($key0) - _ins $t4,0 - _ins $t5,0 - _ins $t6,0 - _ins $t7,0 - - xor $t0,$t8 xor $t1,$t9 xor $t2,$t10 xor $t3,$t11 + _ins $t4,0 + _ins $t5,0 + _ins $t6,0 + _ins $t7,0 + xor $t0,$t4 xor $t1,$t5 xor $t2,$t6 xor $t3,$t7 +#endif xor $s0,$t0 xor $s1,$t1 @@ -791,7 +1283,7 @@ _mips_AES_set_encrypt_key: beqz $inp,.Lekey_done li $t0,-1 beqz $key,.Lekey_done - $PTR_ADD $rcon,$Tbl,1024+256 + $PTR_ADD $rcon,$Tbl,256 .set reorder lwl $rk0,0+$MSB($inp) # load 128 bits @@ -843,10 +1335,10 @@ _mips_AES_set_encrypt_key: $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl - lbu $i0,1024($i0) - lbu $i1,1024($i1) - lbu $i2,1024($i2) - lbu $i3,1024($i3) + lbu $i0,0($i0) + lbu $i1,0($i1) + lbu $i2,0($i2) + lbu $i3,0($i3) sw $rk0,0($key) sw $rk1,4($key) @@ -898,10 +1390,10 @@ _mips_AES_set_encrypt_key: $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl - lbu $i0,1024($i0) - lbu $i1,1024($i1) - lbu $i2,1024($i2) - lbu $i3,1024($i3) + lbu $i0,0($i0) + lbu $i1,0($i1) + lbu $i2,0($i2) + lbu $i3,0($i3) sw $rk0,0($key) sw $rk1,4($key) @@ -957,10 +1449,10 @@ _mips_AES_set_encrypt_key: $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl - lbu $i0,1024($i0) - lbu $i1,1024($i1) - lbu $i2,1024($i2) - lbu $i3,1024($i3) + lbu $i0,0($i0) + lbu $i1,0($i1) + lbu $i2,0($i2) + lbu $i3,0($i3) sw $rk0,0($key) sw $rk1,4($key) @@ -999,10 +1491,10 @@ _mips_AES_set_encrypt_key: $PTR_ADD $i1,$Tbl $PTR_ADD $i2,$Tbl $PTR_ADD $i3,$Tbl - lbu $i0,1024($i0) - lbu $i1,1024($i1) - lbu $i2,1024($i2) - lbu $i3,1024($i3) + lbu $i0,0($i0) + lbu $i1,0($i1) + lbu $i2,0($i2) + lbu $i3,0($i3) sll $i0,24 sll $i1,16 sll $i2,8 @@ -1064,7 +1556,7 @@ $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification ___ $code.=<<___; .set reorder - la $Tbl,AES_Te # PIC-ified 'load address' + la $Tbl,AES_Te4 # PIC-ified 'load address' bal _mips_AES_set_encrypt_key @@ -1119,7 +1611,7 @@ $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification ___ $code.=<<___; .set reorder - la $Tbl,AES_Te # PIC-ified 'load address' + la $Tbl,AES_Te4 # PIC-ified 'load address' bal _mips_AES_set_encrypt_key @@ -1190,6 +1682,16 @@ $code.=<<___; xor $tpb,$tp9,$tp2 xor $tpd,$tp9,$tp4 +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + rotr $tp1,$tpd,16 + xor $tpe,$tp2 + rotr $tp2,$tp9,8 + xor $tpe,$tp1 + rotr $tp4,$tpb,24 + xor $tpe,$tp2 + lw $tp1,4($key) # modulo-scheduled + xor $tpe,$tp4 +#else _ror $tp1,$tpd,16 xor $tpe,$tp2 _ror $tp2,$tpd,-16 @@ -1204,6 +1706,7 @@ $code.=<<___; xor $tpe,$tp1 lw $tp1,4($key) # modulo-scheduled xor $tpe,$tp2 +#endif sub $cnt,1 sw $tpe,0($key) $PTR_ADD $key,4 @@ -1234,7 +1737,7 @@ ___ # Tables are kept in endian-neutral manner $code.=<<___; .rdata -.align 6 +.align 10 AES_Te: .byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 # Te0 .byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d @@ -1365,46 +1868,6 @@ AES_Te: .byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc .byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a -.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 # Te4 -.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 -.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 -.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 -.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc -.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 -.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a -.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 -.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 -.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 -.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b -.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf -.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 -.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 -.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 -.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 -.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 -.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 -.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 -.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb -.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c -.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 -.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 -.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 -.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 -.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a -.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e -.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e -.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 -.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf -.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 -.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 - -.byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 # rcon -.byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00 -.byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00 -.byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 -.byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00 - -.align 6 AES_Td: .byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 # Td0 .byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96 @@ -1567,6 +2030,46 @@ AES_Td: .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d + +AES_Te4: +.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 # Te4 +.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 +.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 +.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 +.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc +.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 +.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a +.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 +.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 +.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 +.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b +.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf +.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 +.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 +.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 +.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 +.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 +.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 +.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 +.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb +.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c +.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 +.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 +.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 +.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 +.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a +.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e +.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e +.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 +.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf +.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 +.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 + +.byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 # rcon +.byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00 +.byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00 +.byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 +.byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00 ___ foreach (split("\n",$code)) { @@ -1583,6 +2086,9 @@ foreach (split("\n",$code)) { s/_ins\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/ sprintf("sll\t$1,$2,%d",$big_endian ? eval($3) : eval("24-$3"))/e or + s/_ins2\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/ + sprintf("ins\t$1,$2,%d,8",$big_endian ? eval($3) + : eval("24-$3"))/e or s/_ror\s+(\$[0-9]+),(\$[0-9]+),(\-?[0-9]+)/ sprintf("srl\t$1,$2,%d",$big_endian ? eval($3) : eval("$3*-1"))/e or @@ -1605,6 +2111,11 @@ foreach (split("\n",$code)) { sprintf("$1%d($3)",eval("$2-$2%4+($2%4+1)&3"))/e; } + if (!$big_endian) { + s/(rotr\s+\$[0-9]+,\$[0-9]+),([0-9]+)/sprintf("$1,%d",32-$2)/e; + s/(ext\s+\$[0-9]+,\$[0-9]+),([0-9]+),8/sprintf("$1,%d,8",24-$2)/e; + } + print $_,"\n"; } diff --git a/openssl/crypto/aes/asm/aes-ppc.pl b/openssl/crypto/aes/asm/aes-ppc.pl index 7c52cbe5f..7a99fc3d0 100644 --- a/openssl/crypto/aes/asm/aes-ppc.pl +++ b/openssl/crypto/aes/asm/aes-ppc.pl @@ -45,6 +45,8 @@ if ($flavour =~ /64/) { $PUSH ="stw"; } else { die "nonsense $flavour"; } +$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0; + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or @@ -68,7 +70,7 @@ $key="r5"; $Tbl0="r3"; $Tbl1="r6"; $Tbl2="r7"; -$Tbl3="r2"; +$Tbl3=$out; # stay away from "r2"; $out is offloaded to stack $s0="r8"; $s1="r9"; @@ -76,7 +78,7 @@ $s2="r10"; $s3="r11"; $t0="r12"; -$t1="r13"; +$t1="r0"; # stay away from "r13"; $t2="r14"; $t3="r15"; @@ -100,9 +102,6 @@ $acc13="r29"; $acc14="r30"; $acc15="r31"; -# stay away from TLS pointer -if ($SIZE_T==8) { die if ($t1 ne "r13"); $t1="r0"; } -else { die if ($Tbl3 ne "r2"); $Tbl3=$t0; $t0="r0"; } $mask80=$Tbl2; $mask1b=$Tbl3; @@ -337,8 +336,7 @@ $code.=<<___; $STU $sp,-$FRAME($sp) mflr r0 - $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) - $PUSH r13,`$FRAME-$SIZE_T*19`($sp) + $PUSH $out,`$FRAME-$SIZE_T*19`($sp) $PUSH r14,`$FRAME-$SIZE_T*18`($sp) $PUSH r15,`$FRAME-$SIZE_T*17`($sp) $PUSH r16,`$FRAME-$SIZE_T*16`($sp) @@ -365,16 +363,61 @@ $code.=<<___; bne Lenc_unaligned Lenc_unaligned_ok: +___ +$code.=<<___ if (!$LITTLE_ENDIAN); lwz $s0,0($inp) lwz $s1,4($inp) lwz $s2,8($inp) lwz $s3,12($inp) +___ +$code.=<<___ if ($LITTLE_ENDIAN); + lwz $t0,0($inp) + lwz $t1,4($inp) + lwz $t2,8($inp) + lwz $t3,12($inp) + rotlwi $s0,$t0,8 + rotlwi $s1,$t1,8 + rotlwi $s2,$t2,8 + rotlwi $s3,$t3,8 + rlwimi $s0,$t0,24,0,7 + rlwimi $s1,$t1,24,0,7 + rlwimi $s2,$t2,24,0,7 + rlwimi $s3,$t3,24,0,7 + rlwimi $s0,$t0,24,16,23 + rlwimi $s1,$t1,24,16,23 + rlwimi $s2,$t2,24,16,23 + rlwimi $s3,$t3,24,16,23 +___ +$code.=<<___; bl LAES_Te bl Lppc_AES_encrypt_compact + $POP $out,`$FRAME-$SIZE_T*19`($sp) +___ +$code.=<<___ if ($LITTLE_ENDIAN); + rotlwi $t0,$s0,8 + rotlwi $t1,$s1,8 + rotlwi $t2,$s2,8 + rotlwi $t3,$s3,8 + rlwimi $t0,$s0,24,0,7 + rlwimi $t1,$s1,24,0,7 + rlwimi $t2,$s2,24,0,7 + rlwimi $t3,$s3,24,0,7 + rlwimi $t0,$s0,24,16,23 + rlwimi $t1,$s1,24,16,23 + rlwimi $t2,$s2,24,16,23 + rlwimi $t3,$s3,24,16,23 + stw $t0,0($out) + stw $t1,4($out) + stw $t2,8($out) + stw $t3,12($out) +___ +$code.=<<___ if (!$LITTLE_ENDIAN); stw $s0,0($out) stw $s1,4($out) stw $s2,8($out) stw $s3,12($out) +___ +$code.=<<___; b Lenc_done Lenc_unaligned: @@ -417,6 +460,7 @@ Lenc_xpage: bl LAES_Te bl Lppc_AES_encrypt_compact + $POP $out,`$FRAME-$SIZE_T*19`($sp) extrwi $acc00,$s0,8,0 extrwi $acc01,$s0,8,8 @@ -449,8 +493,6 @@ Lenc_xpage: Lenc_done: $POP r0,`$FRAME+$LRSAVE`($sp) - $POP $toc,`$FRAME-$SIZE_T*20`($sp) - $POP r13,`$FRAME-$SIZE_T*19`($sp) $POP r14,`$FRAME-$SIZE_T*18`($sp) $POP r15,`$FRAME-$SIZE_T*17`($sp) $POP r16,`$FRAME-$SIZE_T*16`($sp) @@ -764,6 +806,7 @@ Lenc_compact_done: blr .long 0 .byte 0,12,0x14,0,0,0,0,0 +.size .AES_encrypt,.-.AES_encrypt .globl .AES_decrypt .align 7 @@ -771,8 +814,7 @@ Lenc_compact_done: $STU $sp,-$FRAME($sp) mflr r0 - $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) - $PUSH r13,`$FRAME-$SIZE_T*19`($sp) + $PUSH $out,`$FRAME-$SIZE_T*19`($sp) $PUSH r14,`$FRAME-$SIZE_T*18`($sp) $PUSH r15,`$FRAME-$SIZE_T*17`($sp) $PUSH r16,`$FRAME-$SIZE_T*16`($sp) @@ -799,16 +841,61 @@ Lenc_compact_done: bne Ldec_unaligned Ldec_unaligned_ok: +___ +$code.=<<___ if (!$LITTLE_ENDIAN); lwz $s0,0($inp) lwz $s1,4($inp) lwz $s2,8($inp) lwz $s3,12($inp) +___ +$code.=<<___ if ($LITTLE_ENDIAN); + lwz $t0,0($inp) + lwz $t1,4($inp) + lwz $t2,8($inp) + lwz $t3,12($inp) + rotlwi $s0,$t0,8 + rotlwi $s1,$t1,8 + rotlwi $s2,$t2,8 + rotlwi $s3,$t3,8 + rlwimi $s0,$t0,24,0,7 + rlwimi $s1,$t1,24,0,7 + rlwimi $s2,$t2,24,0,7 + rlwimi $s3,$t3,24,0,7 + rlwimi $s0,$t0,24,16,23 + rlwimi $s1,$t1,24,16,23 + rlwimi $s2,$t2,24,16,23 + rlwimi $s3,$t3,24,16,23 +___ +$code.=<<___; bl LAES_Td bl Lppc_AES_decrypt_compact + $POP $out,`$FRAME-$SIZE_T*19`($sp) +___ +$code.=<<___ if ($LITTLE_ENDIAN); + rotlwi $t0,$s0,8 + rotlwi $t1,$s1,8 + rotlwi $t2,$s2,8 + rotlwi $t3,$s3,8 + rlwimi $t0,$s0,24,0,7 + rlwimi $t1,$s1,24,0,7 + rlwimi $t2,$s2,24,0,7 + rlwimi $t3,$s3,24,0,7 + rlwimi $t0,$s0,24,16,23 + rlwimi $t1,$s1,24,16,23 + rlwimi $t2,$s2,24,16,23 + rlwimi $t3,$s3,24,16,23 + stw $t0,0($out) + stw $t1,4($out) + stw $t2,8($out) + stw $t3,12($out) +___ +$code.=<<___ if (!$LITTLE_ENDIAN); stw $s0,0($out) stw $s1,4($out) stw $s2,8($out) stw $s3,12($out) +___ +$code.=<<___; b Ldec_done Ldec_unaligned: @@ -851,6 +938,7 @@ Ldec_xpage: bl LAES_Td bl Lppc_AES_decrypt_compact + $POP $out,`$FRAME-$SIZE_T*19`($sp) extrwi $acc00,$s0,8,0 extrwi $acc01,$s0,8,8 @@ -883,8 +971,6 @@ Ldec_xpage: Ldec_done: $POP r0,`$FRAME+$LRSAVE`($sp) - $POP $toc,`$FRAME-$SIZE_T*20`($sp) - $POP r13,`$FRAME-$SIZE_T*19`($sp) $POP r14,`$FRAME-$SIZE_T*18`($sp) $POP r15,`$FRAME-$SIZE_T*17`($sp) $POP r16,`$FRAME-$SIZE_T*16`($sp) @@ -1355,6 +1441,7 @@ Ldec_compact_done: blr .long 0 .byte 0,12,0x14,0,0,0,0,0 +.size .AES_decrypt,.-.AES_decrypt .asciz "AES for PPC, CRYPTOGAMS by " .align 7 diff --git a/openssl/crypto/aes/asm/aes-x86_64.pl b/openssl/crypto/aes/asm/aes-x86_64.pl index 34cbb5d84..47f416375 100644 --- a/openssl/crypto/aes/asm/aes-x86_64.pl +++ b/openssl/crypto/aes/asm/aes-x86_64.pl @@ -19,9 +19,10 @@ # Performance in number of cycles per processed byte for 128-bit key: # # ECB encrypt ECB decrypt CBC large chunk -# AMD64 33 41 13.0 -# EM64T 38 59 18.6(*) -# Core 2 30 43 14.5(*) +# AMD64 33 43 13.0 +# EM64T 38 56 18.6(*) +# Core 2 30 42 14.5(*) +# Atom 65 86 32.1(*) # # (*) with hyper-threading off @@ -366,68 +367,66 @@ $code.=<<___; movzb `&lo("$s0")`,$t0 movzb `&lo("$s1")`,$t1 movzb `&lo("$s2")`,$t2 - movzb ($sbox,$t0,1),$t0 - movzb ($sbox,$t1,1),$t1 - movzb ($sbox,$t2,1),$t2 - movzb `&lo("$s3")`,$t3 movzb `&hi("$s1")`,$acc0 movzb `&hi("$s2")`,$acc1 + shr \$16,$s2 + movzb `&hi("$s3")`,$acc2 + movzb ($sbox,$t0,1),$t0 + movzb ($sbox,$t1,1),$t1 + movzb ($sbox,$t2,1),$t2 movzb ($sbox,$t3,1),$t3 - movzb ($sbox,$acc0,1),$t4 #$t0 - movzb ($sbox,$acc1,1),$t5 #$t1 - movzb `&hi("$s3")`,$acc2 + movzb ($sbox,$acc0,1),$t4 #$t0 movzb `&hi("$s0")`,$acc0 - shr \$16,$s2 + movzb ($sbox,$acc1,1),$t5 #$t1 + movzb `&lo("$s2")`,$acc1 movzb ($sbox,$acc2,1),$acc2 #$t2 movzb ($sbox,$acc0,1),$acc0 #$t3 - shr \$16,$s3 - movzb `&lo("$s2")`,$acc1 shl \$8,$t4 + shr \$16,$s3 shl \$8,$t5 - movzb ($sbox,$acc1,1),$acc1 #$t0 xor $t4,$t0 - xor $t5,$t1 - - movzb `&lo("$s3")`,$t4 shr \$16,$s0 + movzb `&lo("$s3")`,$t4 shr \$16,$s1 - movzb `&lo("$s0")`,$t5 + xor $t5,$t1 shl \$8,$acc2 - shl \$8,$acc0 - movzb ($sbox,$t4,1),$t4 #$t1 - movzb ($sbox,$t5,1),$t5 #$t2 + movzb `&lo("$s0")`,$t5 + movzb ($sbox,$acc1,1),$acc1 #$t0 xor $acc2,$t2 - xor $acc0,$t3 + shl \$8,$acc0 movzb `&lo("$s1")`,$acc2 - movzb `&hi("$s3")`,$acc0 shl \$16,$acc1 - movzb ($sbox,$acc2,1),$acc2 #$t3 - movzb ($sbox,$acc0,1),$acc0 #$t0 + xor $acc0,$t3 + movzb ($sbox,$t4,1),$t4 #$t1 + movzb `&hi("$s3")`,$acc0 + movzb ($sbox,$t5,1),$t5 #$t2 xor $acc1,$t0 - movzb `&hi("$s0")`,$acc1 shr \$8,$s2 + movzb `&hi("$s0")`,$acc1 + shl \$16,$t4 shr \$8,$s1 + shl \$16,$t5 + xor $t4,$t1 + movzb ($sbox,$acc2,1),$acc2 #$t3 + movzb ($sbox,$acc0,1),$acc0 #$t0 movzb ($sbox,$acc1,1),$acc1 #$t1 movzb ($sbox,$s2,1),$s3 #$t3 movzb ($sbox,$s1,1),$s2 #$t2 - shl \$16,$t4 - shl \$16,$t5 + shl \$16,$acc2 - xor $t4,$t1 xor $t5,$t2 - xor $acc2,$t3 - shl \$24,$acc0 + xor $acc2,$t3 shl \$24,$acc1 - shl \$24,$s3 xor $acc0,$t0 - shl \$24,$s2 + shl \$24,$s3 xor $acc1,$t1 + shl \$24,$s2 mov $t0,$s0 mov $t1,$s1 xor $t2,$s2 @@ -466,12 +465,12 @@ sub enctransform() { my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d"); $code.=<<___; - mov $s0,$acc0 - mov $s1,$acc1 - and \$0x80808080,$acc0 - and \$0x80808080,$acc1 - mov $acc0,$t0 - mov $acc1,$t1 + mov \$0x80808080,$t0 + mov \$0x80808080,$t1 + and $s0,$t0 + and $s1,$t1 + mov $t0,$acc0 + mov $t1,$acc1 shr \$7,$t0 lea ($s0,$s0),$r20 shr \$7,$t1 @@ -489,25 +488,25 @@ $code.=<<___; xor $r20,$s0 xor $r21,$s1 - mov $s2,$acc0 - mov $s3,$acc1 + mov \$0x80808080,$t2 rol \$24,$s0 + mov \$0x80808080,$t3 rol \$24,$s1 - and \$0x80808080,$acc0 - and \$0x80808080,$acc1 + and $s2,$t2 + and $s3,$t3 xor $r20,$s0 xor $r21,$s1 - mov $acc0,$t2 - mov $acc1,$t3 + mov $t2,$acc0 ror \$16,$t0 + mov $t3,$acc1 ror \$16,$t1 - shr \$7,$t2 lea ($s2,$s2),$r20 + shr \$7,$t2 xor $t0,$s0 - xor $t1,$s1 shr \$7,$t3 - lea ($s3,$s3),$r21 + xor $t1,$s1 ror \$8,$t0 + lea ($s3,$s3),$r21 ror \$8,$t1 sub $t2,$acc0 sub $t3,$acc1 @@ -523,23 +522,23 @@ $code.=<<___; xor $acc0,$r20 xor $acc1,$r21 + ror \$16,$t2 xor $r20,$s2 + ror \$16,$t3 xor $r21,$s3 rol \$24,$s2 + mov 0($sbox),$acc0 # prefetch Te4 rol \$24,$s3 xor $r20,$s2 - xor $r21,$s3 - mov 0($sbox),$acc0 # prefetch Te4 - ror \$16,$t2 - ror \$16,$t3 mov 64($sbox),$acc1 - xor $t2,$s2 - xor $t3,$s3 + xor $r21,$s3 mov 128($sbox),$r20 + xor $t2,$s2 ror \$8,$t2 + xor $t3,$s3 ror \$8,$t3 - mov 192($sbox),$r21 xor $t2,$s2 + mov 192($sbox),$r21 xor $t3,$s3 ___ } @@ -936,70 +935,69 @@ $code.=<<___; movzb `&lo("$s0")`,$t0 movzb `&lo("$s1")`,$t1 movzb `&lo("$s2")`,$t2 - movzb ($sbox,$t0,1),$t0 - movzb ($sbox,$t1,1),$t1 - movzb ($sbox,$t2,1),$t2 - movzb `&lo("$s3")`,$t3 movzb `&hi("$s3")`,$acc0 movzb `&hi("$s0")`,$acc1 + shr \$16,$s3 + movzb `&hi("$s1")`,$acc2 + movzb ($sbox,$t0,1),$t0 + movzb ($sbox,$t1,1),$t1 + movzb ($sbox,$t2,1),$t2 movzb ($sbox,$t3,1),$t3 - movzb ($sbox,$acc0,1),$t4 #$t0 - movzb ($sbox,$acc1,1),$t5 #$t1 - movzb `&hi("$s1")`,$acc2 + movzb ($sbox,$acc0,1),$t4 #$t0 movzb `&hi("$s2")`,$acc0 - shr \$16,$s2 + movzb ($sbox,$acc1,1),$t5 #$t1 movzb ($sbox,$acc2,1),$acc2 #$t2 movzb ($sbox,$acc0,1),$acc0 #$t3 - shr \$16,$s3 - movzb `&lo("$s2")`,$acc1 - shl \$8,$t4 + shr \$16,$s2 shl \$8,$t5 - movzb ($sbox,$acc1,1),$acc1 #$t0 - xor $t4,$t0 - xor $t5,$t1 - - movzb `&lo("$s3")`,$t4 + shl \$8,$t4 + movzb `&lo("$s2")`,$acc1 shr \$16,$s0 + xor $t4,$t0 shr \$16,$s1 - movzb `&lo("$s0")`,$t5 + movzb `&lo("$s3")`,$t4 + shl \$8,$acc2 + xor $t5,$t1 shl \$8,$acc0 - movzb ($sbox,$t4,1),$t4 #$t1 - movzb ($sbox,$t5,1),$t5 #$t2 + movzb `&lo("$s0")`,$t5 + movzb ($sbox,$acc1,1),$acc1 #$t0 xor $acc2,$t2 - xor $acc0,$t3 - movzb `&lo("$s1")`,$acc2 - movzb `&hi("$s1")`,$acc0 + shl \$16,$acc1 + xor $acc0,$t3 + movzb ($sbox,$t4,1),$t4 #$t1 + movzb `&hi("$s1")`,$acc0 movzb ($sbox,$acc2,1),$acc2 #$t3 - movzb ($sbox,$acc0,1),$acc0 #$t0 xor $acc1,$t0 - + movzb ($sbox,$t5,1),$t5 #$t2 movzb `&hi("$s2")`,$acc1 + + shl \$16,$acc2 shl \$16,$t4 shl \$16,$t5 - movzb ($sbox,$acc1,1),$s1 #$t1 + xor $acc2,$t3 + movzb `&hi("$s3")`,$acc2 xor $t4,$t1 + shr \$8,$s0 xor $t5,$t2 - movzb `&hi("$s3")`,$acc1 - shr \$8,$s0 - shl \$16,$acc2 - movzb ($sbox,$acc1,1),$s2 #$t2 + movzb ($sbox,$acc0,1),$acc0 #$t0 + movzb ($sbox,$acc1,1),$s1 #$t1 + movzb ($sbox,$acc2,1),$s2 #$t2 movzb ($sbox,$s0,1),$s3 #$t3 - xor $acc2,$t3 + mov $t0,$s0 shl \$24,$acc0 shl \$24,$s1 shl \$24,$s2 - xor $acc0,$t0 + xor $acc0,$s0 shl \$24,$s3 xor $t1,$s1 - mov $t0,$s0 xor $t2,$s2 xor $t3,$s3 ___ @@ -1014,12 +1012,12 @@ sub dectransform() my $prefetch = shift; $code.=<<___; - mov $tp10,$acc0 - mov $tp18,$acc8 - and $mask80,$acc0 - and $mask80,$acc8 - mov $acc0,$tp40 - mov $acc8,$tp48 + mov $mask80,$tp40 + mov $mask80,$tp48 + and $tp10,$tp40 + and $tp18,$tp48 + mov $tp40,$acc0 + mov $tp48,$acc8 shr \$7,$tp40 lea ($tp10,$tp10),$tp20 shr \$7,$tp48 @@ -1030,15 +1028,15 @@ $code.=<<___; and $maskfe,$tp28 and $mask1b,$acc0 and $mask1b,$acc8 - xor $tp20,$acc0 - xor $tp28,$acc8 - mov $acc0,$tp20 - mov $acc8,$tp28 - - and $mask80,$acc0 - and $mask80,$acc8 - mov $acc0,$tp80 - mov $acc8,$tp88 + xor $acc0,$tp20 + xor $acc8,$tp28 + mov $mask80,$tp80 + mov $mask80,$tp88 + + and $tp20,$tp80 + and $tp28,$tp88 + mov $tp80,$acc0 + mov $tp88,$acc8 shr \$7,$tp80 lea ($tp20,$tp20),$tp40 shr \$7,$tp88 @@ -1049,15 +1047,15 @@ $code.=<<___; and $maskfe,$tp48 and $mask1b,$acc0 and $mask1b,$acc8 - xor $tp40,$acc0 - xor $tp48,$acc8 - mov $acc0,$tp40 - mov $acc8,$tp48 - - and $mask80,$acc0 - and $mask80,$acc8 - mov $acc0,$tp80 - mov $acc8,$tp88 + xor $acc0,$tp40 + xor $acc8,$tp48 + mov $mask80,$tp80 + mov $mask80,$tp88 + + and $tp40,$tp80 + and $tp48,$tp88 + mov $tp80,$acc0 + mov $tp88,$acc8 shr \$7,$tp80 xor $tp10,$tp20 # tp2^=tp1 shr \$7,$tp88 @@ -1082,51 +1080,51 @@ $code.=<<___; mov $tp10,$acc0 mov $tp18,$acc8 xor $tp80,$tp40 # tp4^tp1^=tp8 - xor $tp88,$tp48 # tp4^tp1^=tp8 shr \$32,$acc0 + xor $tp88,$tp48 # tp4^tp1^=tp8 shr \$32,$acc8 xor $tp20,$tp80 # tp8^=tp8^tp2^tp1=tp2^tp1 - xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1 rol \$8,`&LO("$tp10")` # ROTATE(tp1^tp8,8) + xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1 rol \$8,`&LO("$tp18")` # ROTATE(tp1^tp8,8) xor $tp40,$tp80 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2 + rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8) xor $tp48,$tp88 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2 - rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8) rol \$8,`&LO("$acc8")` # ROTATE(tp1^tp8,8) xor `&LO("$tp80")`,`&LO("$tp10")` - xor `&LO("$tp88")`,`&LO("$tp18")` shr \$32,$tp80 + xor `&LO("$tp88")`,`&LO("$tp18")` shr \$32,$tp88 xor `&LO("$tp80")`,`&LO("$acc0")` xor `&LO("$tp88")`,`&LO("$acc8")` mov $tp20,$tp80 - mov $tp28,$tp88 - shr \$32,$tp80 - shr \$32,$tp88 rol \$24,`&LO("$tp20")` # ROTATE(tp2^tp1^tp8,24) + mov $tp28,$tp88 rol \$24,`&LO("$tp28")` # ROTATE(tp2^tp1^tp8,24) - rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24) - rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24) + shr \$32,$tp80 xor `&LO("$tp20")`,`&LO("$tp10")` + shr \$32,$tp88 xor `&LO("$tp28")`,`&LO("$tp18")` + rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24) mov $tp40,$tp20 + rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24) mov $tp48,$tp28 + shr \$32,$tp20 xor `&LO("$tp80")`,`&LO("$acc0")` + shr \$32,$tp28 xor `&LO("$tp88")`,`&LO("$acc8")` `"mov 0($sbox),$mask80" if ($prefetch)` - shr \$32,$tp20 - shr \$32,$tp28 - `"mov 64($sbox),$maskfe" if ($prefetch)` rol \$16,`&LO("$tp40")` # ROTATE(tp4^tp1^tp8,16) + `"mov 64($sbox),$maskfe" if ($prefetch)` rol \$16,`&LO("$tp48")` # ROTATE(tp4^tp1^tp8,16) `"mov 128($sbox),$mask1b" if ($prefetch)` rol \$16,`&LO("$tp20")` # ROTATE(tp4^tp1^tp8,16) - rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16) `"mov 192($sbox),$tp80" if ($prefetch)` xor `&LO("$tp40")`,`&LO("$tp10")` + rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16) xor `&LO("$tp48")`,`&LO("$tp18")` `"mov 256($sbox),$tp88" if ($prefetch)` xor `&LO("$tp20")`,`&LO("$acc0")` @@ -1302,10 +1300,6 @@ private_AES_set_encrypt_key: call _x86_64_AES_set_encrypt_key - mov 8(%rsp),%r15 - mov 16(%rsp),%r14 - mov 24(%rsp),%r13 - mov 32(%rsp),%r12 mov 40(%rsp),%rbp mov 48(%rsp),%rbx add \$56,%rsp diff --git a/openssl/crypto/aes/asm/aesni-mb-x86_64.pl b/openssl/crypto/aes/asm/aesni-mb-x86_64.pl new file mode 100755 index 000000000..33b1aed3c --- /dev/null +++ b/openssl/crypto/aes/asm/aesni-mb-x86_64.pl @@ -0,0 +1,1395 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# Multi-buffer AES-NI procedures process several independent buffers +# in parallel by interleaving independent instructions. +# +# Cycles per byte for interleave factor 4: +# +# asymptotic measured +# --------------------------- +# Westmere 5.00/4=1.25 5.13/4=1.28 +# Atom 15.0/4=3.75 ?15.7/4=3.93 +# Sandy Bridge 5.06/4=1.27 5.18/4=1.29 +# Ivy Bridge 5.06/4=1.27 5.14/4=1.29 +# Haswell 4.44/4=1.11 4.44/4=1.11 +# Bulldozer 5.75/4=1.44 5.76/4=1.44 +# +# Cycles per byte for interleave factor 8 (not implemented for +# pre-AVX processors, where higher interleave factor incidentally +# doesn't result in improvement): +# +# asymptotic measured +# --------------------------- +# Sandy Bridge 5.06/8=0.64 7.10/8=0.89(*) +# Ivy Bridge 5.06/8=0.64 7.14/8=0.89(*) +# Haswell 5.00/8=0.63 5.00/8=0.63 +# Bulldozer 5.75/8=0.72 5.77/8=0.72 +# +# (*) Sandy/Ivy Bridge are known to handle high interleave factors +# suboptimally; + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +$avx=0; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); +} + +if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); +} + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +# void aesni_multi_cbc_encrypt ( +# struct { void *inp,*out; int blocks; double iv[2]; } inp[8]; +# const AES_KEY *key, +# int num); /* 1 or 2 */ +# +$inp="%rdi"; # 1st arg +$key="%rsi"; # 2nd arg +$num="%edx"; + +@inptr=map("%r$_",(8..11)); +@outptr=map("%r$_",(12..15)); + +($rndkey0,$rndkey1)=("%xmm0","%xmm1"); +@out=map("%xmm$_",(2..5)); +@inp=map("%xmm$_",(6..9)); +($counters,$mask,$zero)=map("%xmm$_",(10..12)); + +($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx"); + +$code.=<<___; +.text + +.extern OPENSSL_ia32cap_P + +.globl aesni_multi_cbc_encrypt +.type aesni_multi_cbc_encrypt,\@function,3 +.align 32 +aesni_multi_cbc_encrypt: +___ +$code.=<<___ if ($avx); + cmp \$2,$num + jb .Lenc_non_avx + mov OPENSSL_ia32cap_P+4(%rip),%ecx + test \$`1<<28`,%ecx # AVX bit + jnz _avx_cbc_enc_shortcut + jmp .Lenc_non_avx +.align 16 +.Lenc_non_avx: +___ +$code.=<<___; + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,0x60(%rsp) + movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler + movaps %xmm14,-0x58(%rax) + movaps %xmm15,-0x48(%rax) +___ +$code.=<<___; + # stack layout + # + # +0 output sink + # +16 input sink [original %rsp and $num] + # +32 counters + + sub \$48,%rsp + and \$-64,%rsp + mov %rax,16(%rsp) # original %rsp + +.Lenc4x_body: + movdqu ($key),$zero # 0-round key + lea 0x78($key),$key # size optimization + lea 40*2($inp),$inp + +.Lenc4x_loop_grande: + mov $num,24(%rsp) # original $num + xor $num,$num +___ +for($i=0;$i<4;$i++) { + $code.=<<___; + mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks + mov `40*$i+0-40*2`($inp),@inptr[$i] + cmp $num,$one + mov `40*$i+8-40*2`($inp),@outptr[$i] + cmovg $one,$num # find maximum + test $one,$one + movdqu `40*$i+24-40*2`($inp),@out[$i] # load IV + mov $one,`32+4*$i`(%rsp) # initialize counters + cmovle %rsp,@inptr[$i] # cancel input +___ +} +$code.=<<___; + test $num,$num + jz .Lenc4x_done + + movups 0x10-0x78($key),$rndkey1 + pxor $zero,@out[0] + movups 0x20-0x78($key),$rndkey0 + pxor $zero,@out[1] + mov 0xf0-0x78($key),$rounds + pxor $zero,@out[2] + movdqu (@inptr[0]),@inp[0] # load inputs + pxor $zero,@out[3] + movdqu (@inptr[1]),@inp[1] + pxor @inp[0],@out[0] + movdqu (@inptr[2]),@inp[2] + pxor @inp[1],@out[1] + movdqu (@inptr[3]),@inp[3] + pxor @inp[2],@out[2] + pxor @inp[3],@out[3] + movdqa 32(%rsp),$counters # load counters + xor $offset,$offset + jmp .Loop_enc4x + +.align 32 +.Loop_enc4x: + add \$16,$offset + lea 16(%rsp),$sink # sink pointer + mov \$1,$one # constant of 1 + sub $offset,$sink + + aesenc $rndkey1,@out[0] + prefetcht0 31(@inptr[0],$offset) # prefetch input + prefetcht0 31(@inptr[1],$offset) + aesenc $rndkey1,@out[1] + prefetcht0 31(@inptr[2],$offset) + prefetcht0 31(@inptr[2],$offset) + aesenc $rndkey1,@out[2] + aesenc $rndkey1,@out[3] + movups 0x30-0x78($key),$rndkey1 +___ +for($i=0;$i<4;$i++) { +my $rndkey = ($i&1) ? $rndkey1 : $rndkey0; +$code.=<<___; + cmp `32+4*$i`(%rsp),$one + aesenc $rndkey,@out[0] + aesenc $rndkey,@out[1] + aesenc $rndkey,@out[2] + cmovge $sink,@inptr[$i] # cancel input + cmovg $sink,@outptr[$i] # sink output + aesenc $rndkey,@out[3] + movups `0x40+16*$i-0x78`($key),$rndkey +___ +} +$code.=<<___; + movdqa $counters,$mask + aesenc $rndkey0,@out[0] + prefetcht0 15(@outptr[0],$offset) # prefetch output + prefetcht0 15(@outptr[1],$offset) + aesenc $rndkey0,@out[1] + prefetcht0 15(@outptr[2],$offset) + prefetcht0 15(@outptr[3],$offset) + aesenc $rndkey0,@out[2] + aesenc $rndkey0,@out[3] + movups 0x80-0x78($key),$rndkey0 + pxor $zero,$zero + + aesenc $rndkey1,@out[0] + pcmpgtd $zero,$mask + movdqu -0x78($key),$zero # reload 0-round key + aesenc $rndkey1,@out[1] + paddd $mask,$counters # decrement counters + movdqa $counters,32(%rsp) # update counters + aesenc $rndkey1,@out[2] + aesenc $rndkey1,@out[3] + movups 0x90-0x78($key),$rndkey1 + + cmp \$11,$rounds + + aesenc $rndkey0,@out[0] + aesenc $rndkey0,@out[1] + aesenc $rndkey0,@out[2] + aesenc $rndkey0,@out[3] + movups 0xa0-0x78($key),$rndkey0 + + jb .Lenc4x_tail + + aesenc $rndkey1,@out[0] + aesenc $rndkey1,@out[1] + aesenc $rndkey1,@out[2] + aesenc $rndkey1,@out[3] + movups 0xb0-0x78($key),$rndkey1 + + aesenc $rndkey0,@out[0] + aesenc $rndkey0,@out[1] + aesenc $rndkey0,@out[2] + aesenc $rndkey0,@out[3] + movups 0xc0-0x78($key),$rndkey0 + + je .Lenc4x_tail + + aesenc $rndkey1,@out[0] + aesenc $rndkey1,@out[1] + aesenc $rndkey1,@out[2] + aesenc $rndkey1,@out[3] + movups 0xd0-0x78($key),$rndkey1 + + aesenc $rndkey0,@out[0] + aesenc $rndkey0,@out[1] + aesenc $rndkey0,@out[2] + aesenc $rndkey0,@out[3] + movups 0xe0-0x78($key),$rndkey0 + jmp .Lenc4x_tail + +.align 32 +.Lenc4x_tail: + aesenc $rndkey1,@out[0] + aesenc $rndkey1,@out[1] + aesenc $rndkey1,@out[2] + aesenc $rndkey1,@out[3] + movdqu (@inptr[0],$offset),@inp[0] + movdqu 0x10-0x78($key),$rndkey1 + + aesenclast $rndkey0,@out[0] + movdqu (@inptr[1],$offset),@inp[1] + pxor $zero,@inp[0] + aesenclast $rndkey0,@out[1] + movdqu (@inptr[2],$offset),@inp[2] + pxor $zero,@inp[1] + aesenclast $rndkey0,@out[2] + movdqu (@inptr[3],$offset),@inp[3] + pxor $zero,@inp[2] + aesenclast $rndkey0,@out[3] + movdqu 0x20-0x78($key),$rndkey0 + pxor $zero,@inp[3] + + movups @out[0],-16(@outptr[0],$offset) + pxor @inp[0],@out[0] + movups @out[1],-16(@outptr[1],$offset) + pxor @inp[1],@out[1] + movups @out[2],-16(@outptr[2],$offset) + pxor @inp[2],@out[2] + movups @out[3],-16(@outptr[3],$offset) + pxor @inp[3],@out[3] + + dec $num + jnz .Loop_enc4x + + mov 16(%rsp),%rax # original %rsp + mov 24(%rsp),$num + + #pxor @inp[0],@out[0] + #pxor @inp[1],@out[1] + #movdqu @out[0],`40*0+24-40*2`($inp) # output iv FIX ME! + #pxor @inp[2],@out[2] + #movdqu @out[1],`40*1+24-40*2`($inp) + #pxor @inp[3],@out[3] + #movdqu @out[2],`40*2+24-40*2`($inp) # won't fix, let caller + #movdqu @out[3],`40*3+24-40*2`($inp) # figure this out... + + lea `40*4`($inp),$inp + dec $num + jnz .Lenc4x_loop_grande + +.Lenc4x_done: +___ +$code.=<<___ if ($win64); + movaps -0xd8(%rax),%xmm6 + movaps -0xc8(%rax),%xmm7 + movaps -0xb8(%rax),%xmm8 + movaps -0xa8(%rax),%xmm9 + movaps -0x98(%rax),%xmm10 + movaps -0x88(%rax),%xmm11 + movaps -0x78(%rax),%xmm12 + #movaps -0x68(%rax),%xmm13 + #movaps -0x58(%rax),%xmm14 + #movaps -0x48(%rax),%xmm15 +___ +$code.=<<___; + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp +.Lenc4x_epilogue: + ret +.size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt + +.globl aesni_multi_cbc_decrypt +.type aesni_multi_cbc_decrypt,\@function,3 +.align 32 +aesni_multi_cbc_decrypt: +___ +$code.=<<___ if ($avx); + cmp \$2,$num + jb .Ldec_non_avx + mov OPENSSL_ia32cap_P+4(%rip),%ecx + test \$`1<<28`,%ecx # AVX bit + jnz _avx_cbc_dec_shortcut + jmp .Ldec_non_avx +.align 16 +.Ldec_non_avx: +___ +$code.=<<___; + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,0x60(%rsp) + movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler + movaps %xmm14,-0x58(%rax) + movaps %xmm15,-0x48(%rax) +___ +$code.=<<___; + # stack layout + # + # +0 output sink + # +16 input sink [original %rsp and $num] + # +32 counters + + sub \$48,%rsp + and \$-64,%rsp + mov %rax,16(%rsp) # original %rsp + +.Ldec4x_body: + movdqu ($key),$zero # 0-round key + lea 0x78($key),$key # size optimization + lea 40*2($inp),$inp + +.Ldec4x_loop_grande: + mov $num,24(%rsp) # original $num + xor $num,$num +___ +for($i=0;$i<4;$i++) { + $code.=<<___; + mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks + mov `40*$i+0-40*2`($inp),@inptr[$i] + cmp $num,$one + mov `40*$i+8-40*2`($inp),@outptr[$i] + cmovg $one,$num # find maximum + test $one,$one + movdqu `40*$i+24-40*2`($inp),@inp[$i] # load IV + mov $one,`32+4*$i`(%rsp) # initialize counters + cmovle %rsp,@inptr[$i] # cancel input +___ +} +$code.=<<___; + test $num,$num + jz .Ldec4x_done + + movups 0x10-0x78($key),$rndkey1 + movups 0x20-0x78($key),$rndkey0 + mov 0xf0-0x78($key),$rounds + movdqu (@inptr[0]),@out[0] # load inputs + movdqu (@inptr[1]),@out[1] + pxor $zero,@out[0] + movdqu (@inptr[2]),@out[2] + pxor $zero,@out[1] + movdqu (@inptr[3]),@out[3] + pxor $zero,@out[2] + pxor $zero,@out[3] + movdqa 32(%rsp),$counters # load counters + xor $offset,$offset + jmp .Loop_dec4x + +.align 32 +.Loop_dec4x: + add \$16,$offset + lea 16(%rsp),$sink # sink pointer + mov \$1,$one # constant of 1 + sub $offset,$sink + + aesdec $rndkey1,@out[0] + prefetcht0 31(@inptr[0],$offset) # prefetch input + prefetcht0 31(@inptr[1],$offset) + aesdec $rndkey1,@out[1] + prefetcht0 31(@inptr[2],$offset) + prefetcht0 31(@inptr[3],$offset) + aesdec $rndkey1,@out[2] + aesdec $rndkey1,@out[3] + movups 0x30-0x78($key),$rndkey1 +___ +for($i=0;$i<4;$i++) { +my $rndkey = ($i&1) ? $rndkey1 : $rndkey0; +$code.=<<___; + cmp `32+4*$i`(%rsp),$one + aesdec $rndkey,@out[0] + aesdec $rndkey,@out[1] + aesdec $rndkey,@out[2] + cmovge $sink,@inptr[$i] # cancel input + cmovg $sink,@outptr[$i] # sink output + aesdec $rndkey,@out[3] + movups `0x40+16*$i-0x78`($key),$rndkey +___ +} +$code.=<<___; + movdqa $counters,$mask + aesdec $rndkey0,@out[0] + prefetcht0 15(@outptr[0],$offset) # prefetch output + prefetcht0 15(@outptr[1],$offset) + aesdec $rndkey0,@out[1] + prefetcht0 15(@outptr[2],$offset) + prefetcht0 15(@outptr[3],$offset) + aesdec $rndkey0,@out[2] + aesdec $rndkey0,@out[3] + movups 0x80-0x78($key),$rndkey0 + pxor $zero,$zero + + aesdec $rndkey1,@out[0] + pcmpgtd $zero,$mask + movdqu -0x78($key),$zero # reload 0-round key + aesdec $rndkey1,@out[1] + paddd $mask,$counters # decrement counters + movdqa $counters,32(%rsp) # update counters + aesdec $rndkey1,@out[2] + aesdec $rndkey1,@out[3] + movups 0x90-0x78($key),$rndkey1 + + cmp \$11,$rounds + + aesdec $rndkey0,@out[0] + aesdec $rndkey0,@out[1] + aesdec $rndkey0,@out[2] + aesdec $rndkey0,@out[3] + movups 0xa0-0x78($key),$rndkey0 + + jb .Ldec4x_tail + + aesdec $rndkey1,@out[0] + aesdec $rndkey1,@out[1] + aesdec $rndkey1,@out[2] + aesdec $rndkey1,@out[3] + movups 0xb0-0x78($key),$rndkey1 + + aesdec $rndkey0,@out[0] + aesdec $rndkey0,@out[1] + aesdec $rndkey0,@out[2] + aesdec $rndkey0,@out[3] + movups 0xc0-0x78($key),$rndkey0 + + je .Ldec4x_tail + + aesdec $rndkey1,@out[0] + aesdec $rndkey1,@out[1] + aesdec $rndkey1,@out[2] + aesdec $rndkey1,@out[3] + movups 0xd0-0x78($key),$rndkey1 + + aesdec $rndkey0,@out[0] + aesdec $rndkey0,@out[1] + aesdec $rndkey0,@out[2] + aesdec $rndkey0,@out[3] + movups 0xe0-0x78($key),$rndkey0 + jmp .Ldec4x_tail + +.align 32 +.Ldec4x_tail: + aesdec $rndkey1,@out[0] + aesdec $rndkey1,@out[1] + aesdec $rndkey1,@out[2] + pxor $rndkey0,@inp[0] + pxor $rndkey0,@inp[1] + aesdec $rndkey1,@out[3] + movdqu 0x10-0x78($key),$rndkey1 + pxor $rndkey0,@inp[2] + pxor $rndkey0,@inp[3] + movdqu 0x20-0x78($key),$rndkey0 + + aesdeclast @inp[0],@out[0] + aesdeclast @inp[1],@out[1] + movdqu -16(@inptr[0],$offset),@inp[0] # load next IV + movdqu -16(@inptr[1],$offset),@inp[1] + aesdeclast @inp[2],@out[2] + aesdeclast @inp[3],@out[3] + movdqu -16(@inptr[2],$offset),@inp[2] + movdqu -16(@inptr[3],$offset),@inp[3] + + movups @out[0],-16(@outptr[0],$offset) + movdqu (@inptr[0],$offset),@out[0] + movups @out[1],-16(@outptr[1],$offset) + movdqu (@inptr[1],$offset),@out[1] + pxor $zero,@out[0] + movups @out[2],-16(@outptr[2],$offset) + movdqu (@inptr[2],$offset),@out[2] + pxor $zero,@out[1] + movups @out[3],-16(@outptr[3],$offset) + movdqu (@inptr[3],$offset),@out[3] + pxor $zero,@out[2] + pxor $zero,@out[3] + + dec $num + jnz .Loop_dec4x + + mov 16(%rsp),%rax # original %rsp + mov 24(%rsp),$num + + lea `40*4`($inp),$inp + dec $num + jnz .Ldec4x_loop_grande + +.Ldec4x_done: +___ +$code.=<<___ if ($win64); + movaps -0xd8(%rax),%xmm6 + movaps -0xc8(%rax),%xmm7 + movaps -0xb8(%rax),%xmm8 + movaps -0xa8(%rax),%xmm9 + movaps -0x98(%rax),%xmm10 + movaps -0x88(%rax),%xmm11 + movaps -0x78(%rax),%xmm12 + #movaps -0x68(%rax),%xmm13 + #movaps -0x58(%rax),%xmm14 + #movaps -0x48(%rax),%xmm15 +___ +$code.=<<___; + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp +.Ldec4x_epilogue: + ret +.size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt +___ + + if ($avx) {{{ +my @ptr=map("%r$_",(8..15)); +my $offload=$sink; + +my @out=map("%xmm$_",(2..9)); +my @inp=map("%xmm$_",(10..13)); +my ($counters,$zero)=("%xmm14","%xmm15"); + +$code.=<<___; +.type aesni_multi_cbc_encrypt_avx,\@function,3 +.align 32 +aesni_multi_cbc_encrypt_avx: +_avx_cbc_enc_shortcut: + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,-0x78(%rax) + movaps %xmm13,-0x68(%rax) + movaps %xmm14,-0x58(%rax) + movaps %xmm15,-0x48(%rax) +___ +$code.=<<___; + # stack layout + # + # +0 output sink + # +16 input sink [original %rsp and $num] + # +32 counters + # +64 distances between inputs and outputs + # +128 off-load area for @inp[0..3] + + sub \$192,%rsp + and \$-128,%rsp + mov %rax,16(%rsp) # original %rsp + +.Lenc8x_body: + vzeroupper + vmovdqu ($key),$zero # 0-round key + lea 0x78($key),$key # size optimization + lea 40*4($inp),$inp + shr \$1,$num + +.Lenc8x_loop_grande: + #mov $num,24(%rsp) # original $num + xor $num,$num +___ +for($i=0;$i<8;$i++) { + my $temp = $i ? $offload : $offset; + $code.=<<___; + mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks + mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer + cmp $num,$one + mov `40*$i+8-40*4`($inp),$temp # output pointer + cmovg $one,$num # find maximum + test $one,$one + vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV + mov $one,`32+4*$i`(%rsp) # initialize counters + cmovle %rsp,@ptr[$i] # cancel input + sub @ptr[$i],$temp # distance between input and output + mov $temp,`64+8*$i`(%rsp) # initialize distances +___ +} +$code.=<<___; + test $num,$num + jz .Lenc8x_done + + vmovups 0x10-0x78($key),$rndkey1 + vmovups 0x20-0x78($key),$rndkey0 + mov 0xf0-0x78($key),$rounds + + vpxor (@ptr[0]),$zero,@inp[0] # load inputs and xor with 0-round + lea 128(%rsp),$offload # offload area + vpxor (@ptr[1]),$zero,@inp[1] + vpxor (@ptr[2]),$zero,@inp[2] + vpxor (@ptr[3]),$zero,@inp[3] + vpxor @inp[0],@out[0],@out[0] + vpxor (@ptr[4]),$zero,@inp[0] + vpxor @inp[1],@out[1],@out[1] + vpxor (@ptr[5]),$zero,@inp[1] + vpxor @inp[2],@out[2],@out[2] + vpxor (@ptr[6]),$zero,@inp[2] + vpxor @inp[3],@out[3],@out[3] + vpxor (@ptr[7]),$zero,@inp[3] + vpxor @inp[0],@out[4],@out[4] + mov \$1,$one # constant of 1 + vpxor @inp[1],@out[5],@out[5] + vpxor @inp[2],@out[6],@out[6] + vpxor @inp[3],@out[7],@out[7] + jmp .Loop_enc8x + +.align 32 +.Loop_enc8x: +___ +for($i=0;$i<8;$i++) { +my $rndkey=($i&1)?$rndkey0:$rndkey1; +$code.=<<___; + vaesenc $rndkey,@out[0],@out[0] + cmp 32+4*$i(%rsp),$one +___ +$code.=<<___ if ($i); + mov 64+8*$i(%rsp),$offset +___ +$code.=<<___; + vaesenc $rndkey,@out[1],@out[1] + prefetcht0 31(@ptr[$i]) # prefetch input + vaesenc $rndkey,@out[2],@out[2] +___ +$code.=<<___ if ($i>1); + prefetcht0 15(@ptr[$i-2]) # prefetch output +___ +$code.=<<___; + vaesenc $rndkey,@out[3],@out[3] + lea (@ptr[$i],$offset),$offset + cmovge %rsp,@ptr[$i] # cancel input + vaesenc $rndkey,@out[4],@out[4] + cmovg %rsp,$offset # sink output + vaesenc $rndkey,@out[5],@out[5] + sub @ptr[$i],$offset + vaesenc $rndkey,@out[6],@out[6] + vpxor 16(@ptr[$i]),$zero,@inp[$i%4] # load input and xor with 0-round + mov $offset,64+8*$i(%rsp) + vaesenc $rndkey,@out[7],@out[7] + vmovups `16*(3+$i)-0x78`($key),$rndkey + lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output +___ +$code.=<<___ if ($i<4) + vmovdqu @inp[$i%4],`16*$i`($offload) # off-load +___ +} +$code.=<<___; + vmovdqu 32(%rsp),$counters + prefetcht0 15(@ptr[$i-2]) # prefetch output + prefetcht0 15(@ptr[$i-1]) + cmp \$11,$rounds + jb .Lenc8x_tail + + vaesenc $rndkey1,@out[0],@out[0] + vaesenc $rndkey1,@out[1],@out[1] + vaesenc $rndkey1,@out[2],@out[2] + vaesenc $rndkey1,@out[3],@out[3] + vaesenc $rndkey1,@out[4],@out[4] + vaesenc $rndkey1,@out[5],@out[5] + vaesenc $rndkey1,@out[6],@out[6] + vaesenc $rndkey1,@out[7],@out[7] + vmovups 0xb0-0x78($key),$rndkey1 + + vaesenc $rndkey0,@out[0],@out[0] + vaesenc $rndkey0,@out[1],@out[1] + vaesenc $rndkey0,@out[2],@out[2] + vaesenc $rndkey0,@out[3],@out[3] + vaesenc $rndkey0,@out[4],@out[4] + vaesenc $rndkey0,@out[5],@out[5] + vaesenc $rndkey0,@out[6],@out[6] + vaesenc $rndkey0,@out[7],@out[7] + vmovups 0xc0-0x78($key),$rndkey0 + je .Lenc8x_tail + + vaesenc $rndkey1,@out[0],@out[0] + vaesenc $rndkey1,@out[1],@out[1] + vaesenc $rndkey1,@out[2],@out[2] + vaesenc $rndkey1,@out[3],@out[3] + vaesenc $rndkey1,@out[4],@out[4] + vaesenc $rndkey1,@out[5],@out[5] + vaesenc $rndkey1,@out[6],@out[6] + vaesenc $rndkey1,@out[7],@out[7] + vmovups 0xd0-0x78($key),$rndkey1 + + vaesenc $rndkey0,@out[0],@out[0] + vaesenc $rndkey0,@out[1],@out[1] + vaesenc $rndkey0,@out[2],@out[2] + vaesenc $rndkey0,@out[3],@out[3] + vaesenc $rndkey0,@out[4],@out[4] + vaesenc $rndkey0,@out[5],@out[5] + vaesenc $rndkey0,@out[6],@out[6] + vaesenc $rndkey0,@out[7],@out[7] + vmovups 0xe0-0x78($key),$rndkey0 + +.Lenc8x_tail: + vaesenc $rndkey1,@out[0],@out[0] + vpxor $zero,$zero,$zero + vaesenc $rndkey1,@out[1],@out[1] + vaesenc $rndkey1,@out[2],@out[2] + vpcmpgtd $zero,$counters,$zero + vaesenc $rndkey1,@out[3],@out[3] + vaesenc $rndkey1,@out[4],@out[4] + vpaddd $counters,$zero,$zero # decrement counters + vmovdqu 48(%rsp),$counters + vaesenc $rndkey1,@out[5],@out[5] + mov 64(%rsp),$offset # pre-load 1st offset + vaesenc $rndkey1,@out[6],@out[6] + vaesenc $rndkey1,@out[7],@out[7] + vmovups 0x10-0x78($key),$rndkey1 + + vaesenclast $rndkey0,@out[0],@out[0] + vmovdqa $zero,32(%rsp) # update counters + vpxor $zero,$zero,$zero + vaesenclast $rndkey0,@out[1],@out[1] + vaesenclast $rndkey0,@out[2],@out[2] + vpcmpgtd $zero,$counters,$zero + vaesenclast $rndkey0,@out[3],@out[3] + vaesenclast $rndkey0,@out[4],@out[4] + vpaddd $zero,$counters,$counters # decrement counters + vmovdqu -0x78($key),$zero # 0-round + vaesenclast $rndkey0,@out[5],@out[5] + vaesenclast $rndkey0,@out[6],@out[6] + vmovdqa $counters,48(%rsp) # update counters + vaesenclast $rndkey0,@out[7],@out[7] + vmovups 0x20-0x78($key),$rndkey0 + + vmovups @out[0],-16(@ptr[0]) # write output + sub $offset,@ptr[0] # switch to input + vpxor 0x00($offload),@out[0],@out[0] + vmovups @out[1],-16(@ptr[1]) + sub `64+1*8`(%rsp),@ptr[1] + vpxor 0x10($offload),@out[1],@out[1] + vmovups @out[2],-16(@ptr[2]) + sub `64+2*8`(%rsp),@ptr[2] + vpxor 0x20($offload),@out[2],@out[2] + vmovups @out[3],-16(@ptr[3]) + sub `64+3*8`(%rsp),@ptr[3] + vpxor 0x30($offload),@out[3],@out[3] + vmovups @out[4],-16(@ptr[4]) + sub `64+4*8`(%rsp),@ptr[4] + vpxor @inp[0],@out[4],@out[4] + vmovups @out[5],-16(@ptr[5]) + sub `64+5*8`(%rsp),@ptr[5] + vpxor @inp[1],@out[5],@out[5] + vmovups @out[6],-16(@ptr[6]) + sub `64+6*8`(%rsp),@ptr[6] + vpxor @inp[2],@out[6],@out[6] + vmovups @out[7],-16(@ptr[7]) + sub `64+7*8`(%rsp),@ptr[7] + vpxor @inp[3],@out[7],@out[7] + + dec $num + jnz .Loop_enc8x + + mov 16(%rsp),%rax # original %rsp + #mov 24(%rsp),$num + #lea `40*8`($inp),$inp + #dec $num + #jnz .Lenc8x_loop_grande + +.Lenc8x_done: + vzeroupper +___ +$code.=<<___ if ($win64); + movaps -0xd8(%rax),%xmm6 + movaps -0xc8(%rax),%xmm7 + movaps -0xb8(%rax),%xmm8 + movaps -0xa8(%rax),%xmm9 + movaps -0x98(%rax),%xmm10 + movaps -0x88(%rax),%xmm11 + movaps -0x78(%rax),%xmm12 + movaps -0x68(%rax),%xmm13 + movaps -0x58(%rax),%xmm14 + movaps -0x48(%rax),%xmm15 +___ +$code.=<<___; + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp +.Lenc8x_epilogue: + ret +.size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx + +.type aesni_multi_cbc_decrypt_avx,\@function,3 +.align 32 +aesni_multi_cbc_decrypt_avx: +_avx_cbc_dec_shortcut: + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,-0x78(%rax) + movaps %xmm13,-0x68(%rax) + movaps %xmm14,-0x58(%rax) + movaps %xmm15,-0x48(%rax) +___ +$code.=<<___; + # stack layout + # + # +0 output sink + # +16 input sink [original %rsp and $num] + # +32 counters + # +64 distances between inputs and outputs + # +128 off-load area for @inp[0..3] + # +192 IV/input offload + + sub \$256,%rsp + and \$-256,%rsp + sub \$192,%rsp + mov %rax,16(%rsp) # original %rsp + +.Ldec8x_body: + vzeroupper + vmovdqu ($key),$zero # 0-round key + lea 0x78($key),$key # size optimization + lea 40*4($inp),$inp + shr \$1,$num + +.Ldec8x_loop_grande: + #mov $num,24(%rsp) # original $num + xor $num,$num +___ +for($i=0;$i<8;$i++) { + my $temp = $i ? $offload : $offset; + $code.=<<___; + mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks + mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer + cmp $num,$one + mov `40*$i+8-40*4`($inp),$temp # output pointer + cmovg $one,$num # find maximum + test $one,$one + vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV + mov $one,`32+4*$i`(%rsp) # initialize counters + cmovle %rsp,@ptr[$i] # cancel input + sub @ptr[$i],$temp # distance between input and output + mov $temp,`64+8*$i`(%rsp) # initialize distances + vmovdqu @out[$i],`192+16*$i`(%rsp) # offload IV +___ +} +$code.=<<___; + test $num,$num + jz .Ldec8x_done + + vmovups 0x10-0x78($key),$rndkey1 + vmovups 0x20-0x78($key),$rndkey0 + mov 0xf0-0x78($key),$rounds + lea 192+128(%rsp),$offload # offload area + + vmovdqu (@ptr[0]),@out[0] # load inputs + vmovdqu (@ptr[1]),@out[1] + vmovdqu (@ptr[2]),@out[2] + vmovdqu (@ptr[3]),@out[3] + vmovdqu (@ptr[4]),@out[4] + vmovdqu (@ptr[5]),@out[5] + vmovdqu (@ptr[6]),@out[6] + vmovdqu (@ptr[7]),@out[7] + vmovdqu @out[0],0x00($offload) # offload inputs + vpxor $zero,@out[0],@out[0] # xor inputs with 0-round + vmovdqu @out[1],0x10($offload) + vpxor $zero,@out[1],@out[1] + vmovdqu @out[2],0x20($offload) + vpxor $zero,@out[2],@out[2] + vmovdqu @out[3],0x30($offload) + vpxor $zero,@out[3],@out[3] + vmovdqu @out[4],0x40($offload) + vpxor $zero,@out[4],@out[4] + vmovdqu @out[5],0x50($offload) + vpxor $zero,@out[5],@out[5] + vmovdqu @out[6],0x60($offload) + vpxor $zero,@out[6],@out[6] + vmovdqu @out[7],0x70($offload) + vpxor $zero,@out[7],@out[7] + xor \$0x80,$offload + mov \$1,$one # constant of 1 + jmp .Loop_dec8x + +.align 32 +.Loop_dec8x: +___ +for($i=0;$i<8;$i++) { +my $rndkey=($i&1)?$rndkey0:$rndkey1; +$code.=<<___; + vaesdec $rndkey,@out[0],@out[0] + cmp 32+4*$i(%rsp),$one +___ +$code.=<<___ if ($i); + mov 64+8*$i(%rsp),$offset +___ +$code.=<<___; + vaesdec $rndkey,@out[1],@out[1] + prefetcht0 31(@ptr[$i]) # prefetch input + vaesdec $rndkey,@out[2],@out[2] +___ +$code.=<<___ if ($i>1); + prefetcht0 15(@ptr[$i-2]) # prefetch output +___ +$code.=<<___; + vaesdec $rndkey,@out[3],@out[3] + lea (@ptr[$i],$offset),$offset + cmovge %rsp,@ptr[$i] # cancel input + vaesdec $rndkey,@out[4],@out[4] + cmovg %rsp,$offset # sink output + vaesdec $rndkey,@out[5],@out[5] + sub @ptr[$i],$offset + vaesdec $rndkey,@out[6],@out[6] + vmovdqu 16(@ptr[$i]),@inp[$i%4] # load input + mov $offset,64+8*$i(%rsp) + vaesdec $rndkey,@out[7],@out[7] + vmovups `16*(3+$i)-0x78`($key),$rndkey + lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output +___ +$code.=<<___ if ($i<4); + vmovdqu @inp[$i%4],`128+16*$i`(%rsp) # off-load +___ +} +$code.=<<___; + vmovdqu 32(%rsp),$counters + prefetcht0 15(@ptr[$i-2]) # prefetch output + prefetcht0 15(@ptr[$i-1]) + cmp \$11,$rounds + jb .Ldec8x_tail + + vaesdec $rndkey1,@out[0],@out[0] + vaesdec $rndkey1,@out[1],@out[1] + vaesdec $rndkey1,@out[2],@out[2] + vaesdec $rndkey1,@out[3],@out[3] + vaesdec $rndkey1,@out[4],@out[4] + vaesdec $rndkey1,@out[5],@out[5] + vaesdec $rndkey1,@out[6],@out[6] + vaesdec $rndkey1,@out[7],@out[7] + vmovups 0xb0-0x78($key),$rndkey1 + + vaesdec $rndkey0,@out[0],@out[0] + vaesdec $rndkey0,@out[1],@out[1] + vaesdec $rndkey0,@out[2],@out[2] + vaesdec $rndkey0,@out[3],@out[3] + vaesdec $rndkey0,@out[4],@out[4] + vaesdec $rndkey0,@out[5],@out[5] + vaesdec $rndkey0,@out[6],@out[6] + vaesdec $rndkey0,@out[7],@out[7] + vmovups 0xc0-0x78($key),$rndkey0 + je .Ldec8x_tail + + vaesdec $rndkey1,@out[0],@out[0] + vaesdec $rndkey1,@out[1],@out[1] + vaesdec $rndkey1,@out[2],@out[2] + vaesdec $rndkey1,@out[3],@out[3] + vaesdec $rndkey1,@out[4],@out[4] + vaesdec $rndkey1,@out[5],@out[5] + vaesdec $rndkey1,@out[6],@out[6] + vaesdec $rndkey1,@out[7],@out[7] + vmovups 0xd0-0x78($key),$rndkey1 + + vaesdec $rndkey0,@out[0],@out[0] + vaesdec $rndkey0,@out[1],@out[1] + vaesdec $rndkey0,@out[2],@out[2] + vaesdec $rndkey0,@out[3],@out[3] + vaesdec $rndkey0,@out[4],@out[4] + vaesdec $rndkey0,@out[5],@out[5] + vaesdec $rndkey0,@out[6],@out[6] + vaesdec $rndkey0,@out[7],@out[7] + vmovups 0xe0-0x78($key),$rndkey0 + +.Ldec8x_tail: + vaesdec $rndkey1,@out[0],@out[0] + vpxor $zero,$zero,$zero + vaesdec $rndkey1,@out[1],@out[1] + vaesdec $rndkey1,@out[2],@out[2] + vpcmpgtd $zero,$counters,$zero + vaesdec $rndkey1,@out[3],@out[3] + vaesdec $rndkey1,@out[4],@out[4] + vpaddd $counters,$zero,$zero # decrement counters + vmovdqu 48(%rsp),$counters + vaesdec $rndkey1,@out[5],@out[5] + mov 64(%rsp),$offset # pre-load 1st offset + vaesdec $rndkey1,@out[6],@out[6] + vaesdec $rndkey1,@out[7],@out[7] + vmovups 0x10-0x78($key),$rndkey1 + + vaesdeclast $rndkey0,@out[0],@out[0] + vmovdqa $zero,32(%rsp) # update counters + vpxor $zero,$zero,$zero + vaesdeclast $rndkey0,@out[1],@out[1] + vpxor 0x00($offload),@out[0],@out[0] # xor with IV + vaesdeclast $rndkey0,@out[2],@out[2] + vpxor 0x10($offload),@out[1],@out[1] + vpcmpgtd $zero,$counters,$zero + vaesdeclast $rndkey0,@out[3],@out[3] + vpxor 0x20($offload),@out[2],@out[2] + vaesdeclast $rndkey0,@out[4],@out[4] + vpxor 0x30($offload),@out[3],@out[3] + vpaddd $zero,$counters,$counters # decrement counters + vmovdqu -0x78($key),$zero # 0-round + vaesdeclast $rndkey0,@out[5],@out[5] + vpxor 0x40($offload),@out[4],@out[4] + vaesdeclast $rndkey0,@out[6],@out[6] + vpxor 0x50($offload),@out[5],@out[5] + vmovdqa $counters,48(%rsp) # update counters + vaesdeclast $rndkey0,@out[7],@out[7] + vpxor 0x60($offload),@out[6],@out[6] + vmovups 0x20-0x78($key),$rndkey0 + + vmovups @out[0],-16(@ptr[0]) # write output + sub $offset,@ptr[0] # switch to input + vmovdqu 128+0(%rsp),@out[0] + vpxor 0x70($offload),@out[7],@out[7] + vmovups @out[1],-16(@ptr[1]) + sub `64+1*8`(%rsp),@ptr[1] + vmovdqu @out[0],0x00($offload) + vpxor $zero,@out[0],@out[0] + vmovdqu 128+16(%rsp),@out[1] + vmovups @out[2],-16(@ptr[2]) + sub `64+2*8`(%rsp),@ptr[2] + vmovdqu @out[1],0x10($offload) + vpxor $zero,@out[1],@out[1] + vmovdqu 128+32(%rsp),@out[2] + vmovups @out[3],-16(@ptr[3]) + sub `64+3*8`(%rsp),@ptr[3] + vmovdqu @out[2],0x20($offload) + vpxor $zero,@out[2],@out[2] + vmovdqu 128+48(%rsp),@out[3] + vmovups @out[4],-16(@ptr[4]) + sub `64+4*8`(%rsp),@ptr[4] + vmovdqu @out[3],0x30($offload) + vpxor $zero,@out[3],@out[3] + vmovdqu @inp[0],0x40($offload) + vpxor @inp[0],$zero,@out[4] + vmovups @out[5],-16(@ptr[5]) + sub `64+5*8`(%rsp),@ptr[5] + vmovdqu @inp[1],0x50($offload) + vpxor @inp[1],$zero,@out[5] + vmovups @out[6],-16(@ptr[6]) + sub `64+6*8`(%rsp),@ptr[6] + vmovdqu @inp[2],0x60($offload) + vpxor @inp[2],$zero,@out[6] + vmovups @out[7],-16(@ptr[7]) + sub `64+7*8`(%rsp),@ptr[7] + vmovdqu @inp[3],0x70($offload) + vpxor @inp[3],$zero,@out[7] + + xor \$128,$offload + dec $num + jnz .Loop_dec8x + + mov 16(%rsp),%rax # original %rsp + #mov 24(%rsp),$num + #lea `40*8`($inp),$inp + #dec $num + #jnz .Ldec8x_loop_grande + +.Ldec8x_done: + vzeroupper +___ +$code.=<<___ if ($win64); + movaps -0xd8(%rax),%xmm6 + movaps -0xc8(%rax),%xmm7 + movaps -0xb8(%rax),%xmm8 + movaps -0xa8(%rax),%xmm9 + movaps -0x98(%rax),%xmm10 + movaps -0x88(%rax),%xmm11 + movaps -0x78(%rax),%xmm12 + movaps -0x68(%rax),%xmm13 + movaps -0x58(%rax),%xmm14 + movaps -0x48(%rax),%xmm15 +___ +$code.=<<___; + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp +.Ldec8x_epilogue: + ret +.size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx +___ + }}} + +if ($win64) { +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<.Lprologue + jb .Lin_prologue + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=.Lepilogue + jae .Lin_prologue + + mov 16(%rax),%rax # pull saved stack pointer + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore cotnext->R12 + mov %r13,224($context) # restore cotnext->R13 + mov %r14,232($context) # restore cotnext->R14 + mov %r15,240($context) # restore cotnext->R15 + + lea -56-10*16(%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx + .long 0xa548f3fc # cld; rep movsq + +.Lin_prologue: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_aesni_multi_cbc_encrypt + .rva .LSEH_end_aesni_multi_cbc_encrypt + .rva .LSEH_info_aesni_multi_cbc_encrypt + .rva .LSEH_begin_aesni_multi_cbc_decrypt + .rva .LSEH_end_aesni_multi_cbc_decrypt + .rva .LSEH_info_aesni_multi_cbc_decrypt +___ +$code.=<<___ if ($avx); + .rva .LSEH_begin_aesni_multi_cbc_encrypt_avx + .rva .LSEH_end_aesni_multi_cbc_encrypt_avx + .rva .LSEH_info_aesni_multi_cbc_encrypt_avx + .rva .LSEH_begin_aesni_multi_cbc_decrypt_avx + .rva .LSEH_end_aesni_multi_cbc_decrypt_avx + .rva .LSEH_info_aesni_multi_cbc_decrypt_avx +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_aesni_multi_cbc_encrypt: + .byte 9,0,0,0 + .rva se_handler + .rva .Lenc4x_body,.Lenc4x_epilogue # HandlerData[] +.LSEH_info_aesni_multi_cbc_decrypt: + .byte 9,0,0,0 + .rva se_handler + .rva .Ldec4x_body,.Ldec4x_epilogue # HandlerData[] +___ +$code.=<<___ if ($avx); +.LSEH_info_aesni_multi_cbc_encrypt_avx: + .byte 9,0,0,0 + .rva se_handler + .rva .Lenc8x_body,.Lenc8x_epilogue # HandlerData[] +.LSEH_info_aesni_multi_cbc_decrypt_avx: + .byte 9,0,0,0 + .rva se_handler + .rva .Ldec8x_body,.Ldec8x_epilogue # HandlerData[] +___ +} +#################################################################### + +sub rex { + local *opcode=shift; + my ($dst,$src)=@_; + my $rex=0; + + $rex|=0x04 if($dst>=8); + $rex|=0x01 if($src>=8); + push @opcode,$rex|0x40 if($rex); +} + +sub aesni { + my $line=shift; + my @opcode=(0x66); + + if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + rex(\@opcode,$4,$3); + push @opcode,0x0f,0x3a,0xdf; + push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M + my $c=$2; + push @opcode,$c=~/^0/?oct($c):$c; + return ".byte\t".join(',',@opcode); + } + elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my %opcodelet = ( + "aesimc" => 0xdb, + "aesenc" => 0xdc, "aesenclast" => 0xdd, + "aesdec" => 0xde, "aesdeclast" => 0xdf + ); + return undef if (!defined($opcodelet{$1})); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x38,$opcodelet{$1}; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + return ".byte\t".join(',',@opcode); + } + elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) { + my %opcodelet = ( + "aesenc" => 0xdc, "aesenclast" => 0xdd, + "aesdec" => 0xde, "aesdeclast" => 0xdf + ); + return undef if (!defined($opcodelet{$1})); + my $off = $2; + push @opcode,0x44 if ($3>=8); + push @opcode,0x0f,0x38,$opcodelet{$1}; + push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M + push @opcode,($off=~/^0/?oct($off):$off)&0xff; + return ".byte\t".join(',',@opcode); + } + return $line; +} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; +$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; + +print $code; +close STDOUT; diff --git a/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl b/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl index 3c8f6c19e..97992adca 100644 --- a/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl +++ b/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl @@ -21,16 +21,25 @@ # subroutine: # # AES-128-CBC +SHA1 stitch gain -# Westmere 3.77[+5.6] 9.37 6.65 +41% -# Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%) +# Westmere 3.77[+5.3] 9.07 6.55 +38% +# Sandy Bridge 5.05[+5.0(6.1)] 10.06(11.15) 5.98(7.05) +68%(+58%) +# Ivy Bridge 5.05[+4.6] 9.65 5.54 +74% +# Haswell 4.43[+3.6(4.2)] 8.00(8.58) 4.55(5.21) +75%(+65%) +# Bulldozer 5.77[+6.0] 11.72 6.37 +84% # # AES-192-CBC -# Westmere 4.51 10.11 6.97 +45% -# Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%) +# Westmere 4.51 9.81 6.80 +44% +# Sandy Bridge 6.05 11.06(12.15) 6.11(7.19) +81%(+69%) +# Ivy Bridge 6.05 10.65 6.07 +75% +# Haswell 5.29 8.86(9.44) 5.32(5.32) +67%(+77%) +# Bulldozer 6.89 12.84 6.96 +84% # # AES-256-CBC -# Westmere 5.25 10.85 7.25 +50% -# Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%) +# Westmere 5.25 10.55 7.21 +46% +# Sandy Bridge 7.05 12.06(13.15) 7.12(7.72) +69%(+70%) +# Ivy Bridge 7.05 11.65 7.12 +64% +# Haswell 6.19 9.76(10.34) 6.21(6.25) +57%(+65%) +# Bulldozer 8.00 13.95 8.25 +69% # # (*) There are two code paths: SSSE3 and AVX. See sha1-568.pl for # background information. Above numbers in parentheses are SSSE3 @@ -45,8 +54,25 @@ # standalone AESNI-CBC decrypt: # # AES-128-CBC AES-192-CBC AES-256-CBC -# Westmere 1.31 1.55 1.80 -# Sandy Bridge 0.93 1.06 1.22 +# Westmere 1.25 1.50 1.75 +# Sandy Bridge 0.74 0.91 1.09 +# Ivy Bridge 0.74 0.90 1.11 +# Haswell 0.63 0.76 0.88 +# Bulldozer 0.70 0.85 0.99 + +# And indeed: +# +# AES-256-CBC +SHA1 stitch gain +# Westmere 1.75 7.20 6.68 +7.8% +# Sandy Bridge 1.09 6.09(7.22) 5.82(6.95) +4.6%(+3.9%) +# Ivy Bridge 1.11 5.70 5.45 +4.6% +# Haswell 0.88 4.45(5.00) 4.39(4.69) +1.4%(*)(+6.6%) +# Bulldozer 0.99 6.95 5.95 +17%(**) +# +# (*) Tiny improvement coefficient on Haswell is because we compare +# AVX1 stitch to sum with AVX2 SHA1. +# (**) Execution is fully dominated by integer code sequence and +# SIMD still hardly shows [in single-process benchmark;-] $flavour = shift; $output = shift; @@ -68,6 +94,11 @@ $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./ && $1>=10); +$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/ && $2>=3.0); + +$shaext=1; ### set to zero if compiling for 1.0.1 + +$stitched_decrypt=0; open OUT,"| \"$^X\" $xlate $flavour $output"; *STDOUT=*OUT; @@ -86,11 +117,15 @@ $code.=<<___; .globl aesni_cbc_sha1_enc .type aesni_cbc_sha1_enc,\@abi-omnipotent -.align 16 +.align 32 aesni_cbc_sha1_enc: # caller should check for SSSE3 and AES-NI bits mov OPENSSL_ia32cap_P+0(%rip),%r10d - mov OPENSSL_ia32cap_P+4(%rip),%r11d + mov OPENSSL_ia32cap_P+4(%rip),%r11 +___ +$code.=<<___ if ($shaext); + bt \$61,%r11 # check SHA bit + jc aesni_cbc_sha1_enc_shaext ___ $code.=<<___ if ($avx); and \$`1<<28`,%r11d # mask AVX bit @@ -112,10 +147,21 @@ my @X=map("%xmm$_",(4..7,0..3)); my @Tx=map("%xmm$_",(8..10)); my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization my @T=("%esi","%edi"); -my $j=0; my $jj=0; my $r=0; my $sn=0; +my $j=0; my $jj=0; my $r=0; my $sn=0; my $rx=0; my $K_XX_XX="%r11"; -my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13)); -my @rndkey=("%xmm14","%xmm15"); +my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13)); # for enc +my @rndkey=("%xmm14","%xmm15"); # for enc +my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15)); # for dec + +if (1) { # reassign for Atom Silvermont + # The goal is to minimize amount of instructions with more than + # 3 prefix bytes. Or in more practical terms to keep AES-NI *and* + # SSSE3 instructions to upper half of the register bank. + @X=map("%xmm$_",(8..11,4..7)); + @Tx=map("%xmm$_",(12,13,3)); + ($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15)); + @rndkey=("%xmm0","%xmm1"); +} sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; @@ -129,7 +175,7 @@ my $_ror=sub { &ror(@_) }; $code.=<<___; .type aesni_cbc_sha1_enc_ssse3,\@function,6 -.align 16 +.align 32 aesni_cbc_sha1_enc_ssse3: mov `($win64?56:8)`(%rsp),$inp # load 7th argument #shr \$6,$len # debugging artefact @@ -161,16 +207,16 @@ $code.=<<___; mov $in0,%r12 # reassign arguments mov $out,%r13 mov $len,%r14 - mov $key,%r15 + lea 112($key),%r15 # size optimization movdqu ($ivp),$iv # load IV mov $ivp,88(%rsp) # save $ivp ___ -my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments +($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments my $rounds="${ivp}d"; $code.=<<___; shl \$6,$len sub $in0,$out - mov 240($key),$rounds + mov 240-112($key),$rounds add $inp,$len # end of input lea K_XX_XX(%rip),$K_XX_XX @@ -180,19 +226,22 @@ $code.=<<___; mov 12($ctx),$D mov $B,@T[0] # magic seed mov 16($ctx),$E + mov $C,@T[1] + xor $D,@T[1] + and @T[1],@T[0] - movdqa 64($K_XX_XX),@X[2] # pbswap mask + movdqa 64($K_XX_XX),@Tx[2] # pbswap mask movdqa 0($K_XX_XX),@Tx[1] # K_00_19 movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] movdqu 16($inp),@X[-3&7] movdqu 32($inp),@X[-2&7] movdqu 48($inp),@X[-1&7] - pshufb @X[2],@X[-4&7] # byte swap + pshufb @Tx[2],@X[-4&7] # byte swap + pshufb @Tx[2],@X[-3&7] + pshufb @Tx[2],@X[-2&7] add \$64,$inp - pshufb @X[2],@X[-3&7] - pshufb @X[2],@X[-2&7] - pshufb @X[2],@X[-1&7] paddd @Tx[1],@X[-4&7] # add K_00_19 + pshufb @Tx[2],@X[-1&7] paddd @Tx[1],@X[-3&7] paddd @Tx[1],@X[-2&7] movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU @@ -201,8 +250,8 @@ $code.=<<___; psubd @Tx[1],@X[-3&7] movdqa @X[-2&7],32(%rsp) psubd @Tx[1],@X[-2&7] - movups ($key),$rndkey0 # $key[0] - movups 16($key),$rndkey[0] # forward reference + movups -112($key),$rndkey0 # $key[0] + movups 16-112($key),$rndkey[0] # forward reference jmp .Loop_ssse3 ___ @@ -219,31 +268,31 @@ ___ ___ $code.=<<___; xorps $in,$iv + movups `32+16*$k-112`($key),$rndkey[1] aesenc $rndkey[0],$iv - movups `32+16*$k`($key),$rndkey[1] ___ } elsif ($k==9) { $sn++; $code.=<<___; cmp \$11,$rounds jb .Laesenclast$sn - movups `32+16*($k+0)`($key),$rndkey[1] + movups `32+16*($k+0)-112`($key),$rndkey[1] aesenc $rndkey[0],$iv - movups `32+16*($k+1)`($key),$rndkey[0] + movups `32+16*($k+1)-112`($key),$rndkey[0] aesenc $rndkey[1],$iv je .Laesenclast$sn - movups `32+16*($k+2)`($key),$rndkey[1] + movups `32+16*($k+2)-112`($key),$rndkey[1] aesenc $rndkey[0],$iv - movups `32+16*($k+3)`($key),$rndkey[0] + movups `32+16*($k+3)-112`($key),$rndkey[0] aesenc $rndkey[1],$iv .Laesenclast$sn: aesenclast $rndkey[0],$iv - movups 16($key),$rndkey[1] # forward reference + movups 16-112($key),$rndkey[1] # forward reference ___ } else { $code.=<<___; + movups `32+16*$k-112`($key),$rndkey[1] aesenc $rndkey[0],$iv - movups `32+16*$k`($key),$rndkey[1] ___ } $r++; unshift(@rndkey,pop(@rndkey)); @@ -255,74 +304,75 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions my ($a,$b,$c,$d,$e); - &movdqa (@X[0],@X[-3&7]); - eval(shift(@insns)); + eval(shift(@insns)); # ror + &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]); eval(shift(@insns)); &movdqa (@Tx[0],@X[-1&7]); - &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" + &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); - &paddd (@Tx[1],@X[-1&7]); + &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8); eval(shift(@insns)); + eval(shift(@insns)); # rol eval(shift(@insns)); &psrldq (@Tx[0],4); # "X[-3]", 3 dwords eval(shift(@insns)); eval(shift(@insns)); + &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" eval(shift(@insns)); - eval(shift(@insns)); - + eval(shift(@insns)); # ror &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); - eval(shift(@insns)); &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" eval(shift(@insns)); - eval(shift(@insns)); + eval(shift(@insns)); # rol &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); &movdqa (@Tx[2],@X[0]); - &movdqa (@Tx[0],@X[0]); - eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); + eval(shift(@insns)); # ror + &movdqa (@Tx[0],@X[0]); eval(shift(@insns)); &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword &paddd (@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); &psrld (@Tx[0],31); eval(shift(@insns)); + eval(shift(@insns)); # rol eval(shift(@insns)); &movdqa (@Tx[1],@Tx[2]); eval(shift(@insns)); eval(shift(@insns)); &psrld (@Tx[2],30); - &por (@X[0],@Tx[0]); # "X[0]"<<<=1 eval(shift(@insns)); + eval(shift(@insns)); # ror + &por (@X[0],@Tx[0]); # "X[0]"<<<=1 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pslld (@Tx[1],2); &pxor (@X[0],@Tx[2]); - eval(shift(@insns)); eval(shift(@insns)); &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX + eval(shift(@insns)); # rol eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 + &pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79 foreach (@insns) { eval; } # remaining instructions [if any] @@ -333,27 +383,30 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 sub Xupdate_ssse3_32_79() { use integer; my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions + my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions my ($a,$b,$c,$d,$e); - &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8); - eval(shift(@insns)); # body_20_39 + eval(shift(@insns)) if ($Xi==8); &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" - &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]" + eval(shift(@insns)) if ($Xi==8); + eval(shift(@insns)); # body_20_39 eval(shift(@insns)); + eval(shift(@insns)) if (@insns[1] =~ /_ror/); + eval(shift(@insns)) if (@insns[0] =~ /_ror/); + &punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8); eval(shift(@insns)); eval(shift(@insns)); # rol &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" eval(shift(@insns)); - eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); + eval(shift(@insns)); if ($Xi%5) { &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... } else { # ... or load next one &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); } - &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); # ror + &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]" @@ -361,29 +414,31 @@ sub Xupdate_ssse3_32_79() eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol + eval(shift(@insns)) if (@insns[0] =~ /_ror/); &movdqa (@Tx[0],@X[0]); - &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); + &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); # ror eval(shift(@insns)); + eval(shift(@insns)); # body_20_39 &pslld (@X[0],2); - eval(shift(@insns)); # body_20_39 eval(shift(@insns)); - &psrld (@Tx[0],30); eval(shift(@insns)); - eval(shift(@insns)); # rol + &psrld (@Tx[0],30); + eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror - eval(shift(@insns)); &por (@X[0],@Tx[0]); # "X[0]"<<<=2 - eval(shift(@insns)); # body_20_39 eval(shift(@insns)); - &movdqa (@Tx[1],@X[0]) if ($Xi<19); + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)) if (@insns[1] =~ /_rol/); + eval(shift(@insns)) if (@insns[0] =~ /_rol/); + &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0]) eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); @@ -404,9 +459,10 @@ sub Xuplast_ssse3_80() my ($a,$b,$c,$d,$e); eval(shift(@insns)); - &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); @@ -415,17 +471,17 @@ sub Xuplast_ssse3_80() foreach (@insns) { eval; } # remaining instructions &cmp ($inp,$len); - &je (".Ldone_ssse3"); + &je (shift); unshift(@Tx,pop(@Tx)); - &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask + &movdqa (@Tx[2],"64($K_XX_XX)"); # pbswap mask &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19 &movdqu (@X[-4&7],"0($inp)"); # load input &movdqu (@X[-3&7],"16($inp)"); &movdqu (@X[-2&7],"32($inp)"); &movdqu (@X[-1&7],"48($inp)"); - &pshufb (@X[-4&7],@X[2]); # byte swap + &pshufb (@X[-4&7],@Tx[2]); # byte swap &add ($inp,64); $Xi=0; @@ -439,7 +495,10 @@ sub Xloop_ssse3() eval(shift(@insns)); eval(shift(@insns)); - &pshufb (@X[($Xi-3)&7],@X[2]); + eval(shift(@insns)); + &pshufb (@X[($Xi-3)&7],@Tx[2]); + eval(shift(@insns)); + eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[($Xi-4)&7],@Tx[1]); @@ -450,6 +509,8 @@ sub Xloop_ssse3() &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); &psubd (@X[($Xi-4)&7],@Tx[1]); foreach (@insns) { eval; } @@ -465,76 +526,106 @@ sub Xtail_ssse3() foreach (@insns) { eval; } } -sub body_00_19 () { - use integer; - my ($k,$n); - my @r=( +my @body_00_19 = ( '($a,$b,$c,$d,$e)=@V;'. - '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer - '&xor ($c,$d);', - '&mov (@T[1],$a);', # $b in next round - '&$_rol ($a,5);', - '&and (@T[0],$c);', # ($b&($c^$d)) - '&xor ($c,$d);', # restore $c - '&xor (@T[0],$d);', - '&add ($e,$a);', '&$_ror ($b,$j?7:2);', # $b>>>2 - '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' + '&xor (@T[0],$d);', + '&mov (@T[1],$a);', # $b for next round + + '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer + '&xor ($b,$c);', # $c^$d for next round + + '&$_rol ($a,5);', + '&add ($e,@T[0]);', + '&and (@T[1],$b);', # ($b&($c^$d)) for next round + + '&xor ($b,$c);', # restore $b + '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); + +sub body_00_19 () { # ((c^d)&b)^d + # on start @T[0]=(c^d)&b + return &body_20_39() if ($rx==19); $rx++; + + use integer; + my ($k,$n); + my @r=@body_00_19; + $n = scalar(@r); $k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); $jj++; + return @r; } -sub body_20_39 () { - use integer; - my ($k,$n); - my @r=( +my @body_20_39 = ( '($a,$b,$c,$d,$e)=@V;'. - '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer - '&xor (@T[0],$d);', # ($b^$d) - '&mov (@T[1],$a);', # $b in next round + '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer + '&xor (@T[0],$d) if($j==19);'. + '&xor (@T[0],$c) if($j> 19);', # ($b^$d^$c) + '&mov (@T[1],$a);', # $b for next round + '&$_rol ($a,5);', - '&xor (@T[0],$c);', # ($b^$d^$c) - '&add ($e,$a);', + '&add ($e,@T[0]);', + '&xor (@T[1],$c) if ($j< 79);', # $b^$d for next round + '&$_ror ($b,7);', # $b>>>2 - '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' + '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); + +sub body_20_39 () { # b^d^c + # on entry @T[0]=b^d + return &body_40_59() if ($rx==39); $rx++; + + use integer; + my ($k,$n); + my @r=@body_20_39; + $n = scalar(@r); $k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds - @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); + @r[$k%$n].='&$aesenc();' if ($jj==$k/$n && $rx!=20); $jj++; + return @r; } -sub body_40_59 () { - use integer; - my ($k,$n); - my @r=( +my @body_40_59 = ( '($a,$b,$c,$d,$e)=@V;'. - '&mov (@T[1],$c);', - '&xor ($c,$d);', - '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer - '&and (@T[1],$d);', - '&and (@T[0],$c);', # ($b&($c^$d)) + '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer + '&and (@T[0],$c) if ($j>=40);', # (b^c)&(c^d) + '&xor ($c,$d) if ($j>=40);', # restore $c + '&$_ror ($b,7);', # $b>>>2 - '&add ($e,@T[1]);', - '&mov (@T[1],$a);', # $b in next round + '&mov (@T[1],$a);', # $b for next round + '&xor (@T[0],$c);', + '&$_rol ($a,5);', '&add ($e,@T[0]);', - '&xor ($c,$d);', # restore $c - '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' + '&xor (@T[1],$c) if ($j==59);'. + '&xor (@T[1],$b) if ($j< 59);', # b^c for next round + + '&xor ($b,$c) if ($j< 59);', # c^d for next round + '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); + +sub body_40_59 () { # ((b^c)&(c^d))^c + # on entry @T[0]=(b^c), (c^=d) + $rx++; + + use integer; + my ($k,$n); + my @r=@body_40_59; + $n = scalar(@r); $k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds - @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); + @r[$k%$n].='&$aesenc();' if ($jj==$k/$n && $rx!=40); $jj++; + return @r; } $code.=<<___; -.align 16 +.align 32 .Loop_ssse3: ___ &Xupdate_ssse3_16_31(\&body_00_19); @@ -553,7 +644,7 @@ ___ &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_20_39); - &Xuplast_ssse3_80(\&body_20_39); # can jump to "done" + &Xuplast_ssse3_80(\&body_20_39,".Ldone_ssse3"); # can jump to "done" $saved_j=$j; @saved_V=@V; $saved_r=$r; @saved_rndkey=@rndkey; @@ -575,11 +666,13 @@ $code.=<<___; mov @T[0],4($ctx) mov @T[0],$B # magic seed mov $C,8($ctx) + mov $C,@T[1] mov $D,12($ctx) + xor $D,@T[1] mov $E,16($ctx) + and @T[1],@T[0] jmp .Loop_ssse3 -.align 16 .Ldone_ssse3: ___ $jj=$j=$saved_j; @V=@saved_V; @@ -631,7 +724,278 @@ $code.=<<___; .size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3 ___ -$j=$jj=$r=$sn=0; + if ($stitched_decrypt) {{{ +# reset +($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); +$j=$jj=$r=$rx=0; +$Xi=4; + +# reassign for Atom Silvermont (see above) +($inout0,$inout1,$inout2,$inout3,$rndkey0)=map("%xmm$_",(0..4)); +@X=map("%xmm$_",(8..13,6,7)); +@Tx=map("%xmm$_",(14,15,5)); + +my @aes256_dec = ( + '&movdqu($inout0,"0x00($in0)");', + '&movdqu($inout1,"0x10($in0)"); &pxor ($inout0,$rndkey0);', + '&movdqu($inout2,"0x20($in0)"); &pxor ($inout1,$rndkey0);', + '&movdqu($inout3,"0x30($in0)"); &pxor ($inout2,$rndkey0);', + + '&pxor ($inout3,$rndkey0); &movups ($rndkey0,"16-112($key)");', + '&movaps("64(%rsp)",@X[2]);', # save IV, originally @X[3] + undef,undef + ); +for ($i=0;$i<13;$i++) { + push (@aes256_dec,( + '&aesdec ($inout0,$rndkey0);', + '&aesdec ($inout1,$rndkey0);', + '&aesdec ($inout2,$rndkey0);', + '&aesdec ($inout3,$rndkey0); &movups($rndkey0,"'.(16*($i+2)-112).'($key)");' + )); + push (@aes256_dec,(undef,undef)) if (($i>=3 && $i<=5) || $i>=11); + push (@aes256_dec,(undef,undef)) if ($i==5); +} +push(@aes256_dec,( + '&aesdeclast ($inout0,$rndkey0); &movups (@X[0],"0x00($in0)");', + '&aesdeclast ($inout1,$rndkey0); &movups (@X[1],"0x10($in0)");', + '&aesdeclast ($inout2,$rndkey0); &movups (@X[2],"0x20($in0)");', + '&aesdeclast ($inout3,$rndkey0); &movups (@X[3],"0x30($in0)");', + + '&xorps ($inout0,"64(%rsp)"); &movdqu ($rndkey0,"-112($key)");', + '&xorps ($inout1,@X[0]); &movups ("0x00($out,$in0)",$inout0);', + '&xorps ($inout2,@X[1]); &movups ("0x10($out,$in0)",$inout1);', + '&xorps ($inout3,@X[2]); &movups ("0x20($out,$in0)",$inout2);', + + '&movups ("0x30($out,$in0)",$inout3);' + )); + +sub body_00_19_dec () { # ((c^d)&b)^d + # on start @T[0]=(c^d)&b + return &body_20_39_dec() if ($rx==19); + + my @r=@body_00_19; + + unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]); + $rx++; + + return @r; +} + +sub body_20_39_dec () { # b^d^c + # on entry @T[0]=b^d + return &body_40_59_dec() if ($rx==39); + + my @r=@body_20_39; + + unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]); + $rx++; + + return @r; +} + +sub body_40_59_dec () { # ((b^c)&(c^d))^c + # on entry @T[0]=(b^c), (c^=d) + + my @r=@body_40_59; + + unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]); + $rx++; + + return @r; +} + +$code.=<<___; +.globl aesni256_cbc_sha1_dec +.type aesni256_cbc_sha1_dec,\@abi-omnipotent +.align 32 +aesni256_cbc_sha1_dec: + # caller should check for SSSE3 and AES-NI bits + mov OPENSSL_ia32cap_P+0(%rip),%r10d + mov OPENSSL_ia32cap_P+4(%rip),%r11d +___ +$code.=<<___ if ($avx); + and \$`1<<28`,%r11d # mask AVX bit + and \$`1<<30`,%r10d # mask "Intel CPU" bit + or %r11d,%r10d + cmp \$`1<<28|1<<30`,%r10d + je aesni256_cbc_sha1_dec_avx +___ +$code.=<<___; + jmp aesni256_cbc_sha1_dec_ssse3 + ret +.size aesni256_cbc_sha1_dec,.-aesni256_cbc_sha1_dec + +.type aesni256_cbc_sha1_dec_ssse3,\@function,6 +.align 32 +aesni256_cbc_sha1_dec_ssse3: + mov `($win64?56:8)`(%rsp),$inp # load 7th argument + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + lea `-104-($win64?10*16:0)`(%rsp),%rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,96+0(%rsp) + movaps %xmm7,96+16(%rsp) + movaps %xmm8,96+32(%rsp) + movaps %xmm9,96+48(%rsp) + movaps %xmm10,96+64(%rsp) + movaps %xmm11,96+80(%rsp) + movaps %xmm12,96+96(%rsp) + movaps %xmm13,96+112(%rsp) + movaps %xmm14,96+128(%rsp) + movaps %xmm15,96+144(%rsp) +.Lprologue_dec_ssse3: +___ +$code.=<<___; + mov $in0,%r12 # reassign arguments + mov $out,%r13 + mov $len,%r14 + lea 112($key),%r15 # size optimization + movdqu ($ivp),@X[3] # load IV + #mov $ivp,88(%rsp) # save $ivp +___ +($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments +$code.=<<___; + shl \$6,$len + sub $in0,$out + add $inp,$len # end of input + + lea K_XX_XX(%rip),$K_XX_XX + mov 0($ctx),$A # load context + mov 4($ctx),$B + mov 8($ctx),$C + mov 12($ctx),$D + mov $B,@T[0] # magic seed + mov 16($ctx),$E + mov $C,@T[1] + xor $D,@T[1] + and @T[1],@T[0] + + movdqa 64($K_XX_XX),@Tx[2] # pbswap mask + movdqa 0($K_XX_XX),@Tx[1] # K_00_19 + movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] + movdqu 16($inp),@X[-3&7] + movdqu 32($inp),@X[-2&7] + movdqu 48($inp),@X[-1&7] + pshufb @Tx[2],@X[-4&7] # byte swap + add \$64,$inp + pshufb @Tx[2],@X[-3&7] + pshufb @Tx[2],@X[-2&7] + pshufb @Tx[2],@X[-1&7] + paddd @Tx[1],@X[-4&7] # add K_00_19 + paddd @Tx[1],@X[-3&7] + paddd @Tx[1],@X[-2&7] + movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU + psubd @Tx[1],@X[-4&7] # restore X[] + movdqa @X[-3&7],16(%rsp) + psubd @Tx[1],@X[-3&7] + movdqa @X[-2&7],32(%rsp) + psubd @Tx[1],@X[-2&7] + movdqu -112($key),$rndkey0 # $key[0] + jmp .Loop_dec_ssse3 + +.align 32 +.Loop_dec_ssse3: +___ + &Xupdate_ssse3_16_31(\&body_00_19_dec); + &Xupdate_ssse3_16_31(\&body_00_19_dec); + &Xupdate_ssse3_16_31(\&body_00_19_dec); + &Xupdate_ssse3_16_31(\&body_00_19_dec); + &Xupdate_ssse3_32_79(\&body_00_19_dec); + &Xupdate_ssse3_32_79(\&body_20_39_dec); + &Xupdate_ssse3_32_79(\&body_20_39_dec); + &Xupdate_ssse3_32_79(\&body_20_39_dec); + &Xupdate_ssse3_32_79(\&body_20_39_dec); + &Xupdate_ssse3_32_79(\&body_20_39_dec); + &Xupdate_ssse3_32_79(\&body_40_59_dec); + &Xupdate_ssse3_32_79(\&body_40_59_dec); + &Xupdate_ssse3_32_79(\&body_40_59_dec); + &Xupdate_ssse3_32_79(\&body_40_59_dec); + &Xupdate_ssse3_32_79(\&body_40_59_dec); + &Xupdate_ssse3_32_79(\&body_20_39_dec); + &Xuplast_ssse3_80(\&body_20_39_dec,".Ldone_dec_ssse3"); # can jump to "done" + + $saved_j=$j; @saved_V=@V; + $saved_rx=$rx; + + &Xloop_ssse3(\&body_20_39_dec); + &Xloop_ssse3(\&body_20_39_dec); + &Xloop_ssse3(\&body_20_39_dec); + + eval(@aes256_dec[-1]); # last store +$code.=<<___; + lea 64($in0),$in0 + + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + add 12($ctx),$D + mov $A,0($ctx) + add 16($ctx),$E + mov @T[0],4($ctx) + mov @T[0],$B # magic seed + mov $C,8($ctx) + mov $C,@T[1] + mov $D,12($ctx) + xor $D,@T[1] + mov $E,16($ctx) + and @T[1],@T[0] + jmp .Loop_dec_ssse3 + +.Ldone_dec_ssse3: +___ + $jj=$j=$saved_j; @V=@saved_V; + $rx=$saved_rx; + + &Xtail_ssse3(\&body_20_39_dec); + &Xtail_ssse3(\&body_20_39_dec); + &Xtail_ssse3(\&body_20_39_dec); + + eval(@aes256_dec[-1]); # last store +$code.=<<___; + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + mov $A,0($ctx) + add 12($ctx),$D + mov @T[0],4($ctx) + add 16($ctx),$E + mov $C,8($ctx) + mov $D,12($ctx) + mov $E,16($ctx) + movups @X[3],($ivp) # write IV +___ +$code.=<<___ if ($win64); + movaps 96+0(%rsp),%xmm6 + movaps 96+16(%rsp),%xmm7 + movaps 96+32(%rsp),%xmm8 + movaps 96+48(%rsp),%xmm9 + movaps 96+64(%rsp),%xmm10 + movaps 96+80(%rsp),%xmm11 + movaps 96+96(%rsp),%xmm12 + movaps 96+112(%rsp),%xmm13 + movaps 96+128(%rsp),%xmm14 + movaps 96+144(%rsp),%xmm15 +___ +$code.=<<___; + lea `104+($win64?10*16:0)`(%rsp),%rsi + mov 0(%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lepilogue_dec_ssse3: + ret +.size aesni256_cbc_sha1_dec_ssse3,.-aesni256_cbc_sha1_dec_ssse3 +___ + }}} +$j=$jj=$r=$rx=0; if ($avx) { my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); @@ -641,13 +1005,17 @@ my @X=map("%xmm$_",(4..7,0..3)); my @Tx=map("%xmm$_",(8..10)); my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization my @T=("%esi","%edi"); +my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13)); +my @rndkey=("%xmm14","%xmm15"); +my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15)); # for dec +my $Kx=@Tx[2]; my $_rol=sub { &shld(@_[0],@_) }; my $_ror=sub { &shrd(@_[0],@_) }; $code.=<<___; .type aesni_cbc_sha1_enc_avx,\@function,6 -.align 16 +.align 32 aesni_cbc_sha1_enc_avx: mov `($win64?56:8)`(%rsp),$inp # load 7th argument #shr \$6,$len # debugging artefact @@ -680,17 +1048,16 @@ $code.=<<___; mov $in0,%r12 # reassign arguments mov $out,%r13 mov $len,%r14 - mov $key,%r15 + lea 112($key),%r15 # size optimization vmovdqu ($ivp),$iv # load IV mov $ivp,88(%rsp) # save $ivp ___ -my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments +($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments my $rounds="${ivp}d"; $code.=<<___; shl \$6,$len sub $in0,$out - mov 240($key),$rounds - add \$112,$key # size optimization + mov 240-112($key),$rounds add $inp,$len # end of input lea K_XX_XX(%rip),$K_XX_XX @@ -700,9 +1067,12 @@ $code.=<<___; mov 12($ctx),$D mov $B,@T[0] # magic seed mov 16($ctx),$E + mov $C,@T[1] + xor $D,@T[1] + and @T[1],@T[0] vmovdqa 64($K_XX_XX),@X[2] # pbswap mask - vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19 + vmovdqa 0($K_XX_XX),$Kx # K_00_19 vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] vmovdqu 16($inp),@X[-3&7] vmovdqu 32($inp),@X[-2&7] @@ -712,13 +1082,13 @@ $code.=<<___; vpshufb @X[2],@X[-3&7],@X[-3&7] vpshufb @X[2],@X[-2&7],@X[-2&7] vpshufb @X[2],@X[-1&7],@X[-1&7] - vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19 - vpaddd @Tx[1],@X[-3&7],@X[1] - vpaddd @Tx[1],@X[-2&7],@X[2] + vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19 + vpaddd $Kx,@X[-3&7],@X[1] + vpaddd $Kx,@X[-2&7],@X[2] vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU vmovdqa @X[1],16(%rsp) vmovdqa @X[2],32(%rsp) - vmovups -112($key),$rndkey0 # $key[0] + vmovups -112($key),$rndkey[1] # $key[0] vmovups 16-112($key),$rndkey[0] # forward reference jmp .Loop_avx ___ @@ -728,14 +1098,14 @@ my $aesenc=sub { my ($n,$k)=($r/10,$r%10); if ($k==0) { $code.=<<___; - vmovups `16*$n`($in0),$in # load input - vxorps $rndkey0,$in,$in + vmovdqu `16*$n`($in0),$in # load input + vpxor $rndkey[1],$in,$in ___ $code.=<<___ if ($n); vmovups $iv,`16*($n-1)`($out,$in0) # write output ___ $code.=<<___; - vxorps $in,$iv,$iv + vpxor $in,$iv,$iv vaesenc $rndkey[0],$iv,$iv vmovups `32+16*$k-112`($key),$rndkey[1] ___ @@ -755,6 +1125,7 @@ ___ vmovups `32+16*($k+3)-112`($key),$rndkey[0] .Lvaesenclast$sn: vaesenclast $rndkey[0],$iv,$iv + vmovups -112($key),$rndkey[0] vmovups 16-112($key),$rndkey[1] # forward reference ___ } else { @@ -778,10 +1149,10 @@ sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 eval(shift(@insns)); eval(shift(@insns)); - &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); + &vpaddd (@Tx[1],$Kx,@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); - &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords + &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" @@ -807,31 +1178,31 @@ sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 eval(shift(@insns)); eval(shift(@insns)); - &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword + &vpslldq(@Tx[1],@X[0],12); # "X[0]"<<96, extract one dword &vpaddd (@X[0],@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); - &vpsrld (@Tx[1],@Tx[2],30); &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 + &vpsrld (@Tx[0],@Tx[1],30); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); - &vpslld (@Tx[2],@Tx[2],2); - &vpxor (@X[0],@X[0],@Tx[1]); + &vpslld (@Tx[1],@Tx[1],2); + &vpxor (@X[0],@X[0],@Tx[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); - &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 + &vpxor (@X[0],@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 eval(shift(@insns)); eval(shift(@insns)); - &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX + &vmovdqa ($Kx,eval(16*(($Xi)/5))."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX eval(shift(@insns)); eval(shift(@insns)); @@ -839,7 +1210,6 @@ sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 foreach (@insns) { eval; } # remaining instructions [if any] $Xi++; push(@X,shift(@X)); # "rotate" X[] - push(@Tx,shift(@Tx)); } sub Xupdate_avx_32_79() @@ -858,12 +1228,8 @@ sub Xupdate_avx_32_79() &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" eval(shift(@insns)); eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); - if ($Xi%5) { - &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... - } else { # ... or load next one - &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); - } - &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); + &vpaddd (@Tx[1],$Kx,@X[-1&7]); + &vmovdqa ($Kx,eval(16*($Xi/5))."($K_XX_XX)") if ($Xi%5==0); eval(shift(@insns)); # ror eval(shift(@insns)); @@ -893,7 +1259,6 @@ sub Xupdate_avx_32_79() &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 eval(shift(@insns)); # body_20_39 eval(shift(@insns)); - &vmovdqa (@Tx[1],@X[0]) if ($Xi<19); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); @@ -904,7 +1269,6 @@ sub Xupdate_avx_32_79() foreach (@insns) { eval; } # remaining instructions $Xi++; push(@X,shift(@X)); # "rotate" X[] - push(@Tx,shift(@Tx)); } sub Xuplast_avx_80() @@ -914,28 +1278,26 @@ sub Xuplast_avx_80() my ($a,$b,$c,$d,$e); eval(shift(@insns)); - &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); + &vpaddd (@Tx[1],$Kx,@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); - &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU + &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU foreach (@insns) { eval; } # remaining instructions &cmp ($inp,$len); - &je (".Ldone_avx"); + &je (shift); - unshift(@Tx,pop(@Tx)); - - &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask - &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19 + &vmovdqa(@Tx[1],"64($K_XX_XX)"); # pbswap mask + &vmovdqa($Kx,"0($K_XX_XX)"); # K_00_19 &vmovdqu(@X[-4&7],"0($inp)"); # load input &vmovdqu(@X[-3&7],"16($inp)"); &vmovdqu(@X[-2&7],"32($inp)"); &vmovdqu(@X[-1&7],"48($inp)"); - &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap + &vpshufb(@X[-4&7],@X[-4&7],@Tx[1]); # byte swap &add ($inp,64); $Xi=0; @@ -949,15 +1311,15 @@ sub Xloop_avx() eval(shift(@insns)); eval(shift(@insns)); - &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); + &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@Tx[1]); eval(shift(@insns)); eval(shift(@insns)); - &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]); + &vpaddd (@Tx[0],@X[($Xi-4)&7],$Kx); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); - &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU + &vmovdqa(eval(16*$Xi)."(%rsp)",@Tx[0]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); @@ -975,7 +1337,7 @@ sub Xtail_avx() } $code.=<<___; -.align 16 +.align 32 .Loop_avx: ___ &Xupdate_avx_16_31(\&body_00_19); @@ -994,7 +1356,7 @@ ___ &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_20_39); - &Xuplast_avx_80(\&body_20_39); # can jump to "done" + &Xuplast_avx_80(\&body_20_39,".Ldone_avx"); # can jump to "done" $saved_j=$j; @saved_V=@V; $saved_r=$r; @saved_rndkey=@rndkey; @@ -1016,11 +1378,13 @@ $code.=<<___; mov @T[0],4($ctx) mov @T[0],$B # magic seed mov $C,8($ctx) + mov $C,@T[1] mov $D,12($ctx) + xor $D,@T[1] mov $E,16($ctx) + and @T[1],@T[0] jmp .Loop_avx -.align 16 .Ldone_avx: ___ $jj=$j=$saved_j; @V=@saved_V; @@ -1072,6 +1436,218 @@ $code.=<<___; ret .size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx ___ + + if ($stitched_decrypt) {{{ +# reset +($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); + +$j=$jj=$r=$rx=0; +$Xi=4; + +@aes256_dec = ( + '&vpxor ($inout0,$rndkey0,"0x00($in0)");', + '&vpxor ($inout1,$rndkey0,"0x10($in0)");', + '&vpxor ($inout2,$rndkey0,"0x20($in0)");', + '&vpxor ($inout3,$rndkey0,"0x30($in0)");', + + '&vmovups($rndkey0,"16-112($key)");', + '&vmovups("64(%rsp)",@X[2]);', # save IV, originally @X[3] + undef,undef + ); +for ($i=0;$i<13;$i++) { + push (@aes256_dec,( + '&vaesdec ($inout0,$inout0,$rndkey0);', + '&vaesdec ($inout1,$inout1,$rndkey0);', + '&vaesdec ($inout2,$inout2,$rndkey0);', + '&vaesdec ($inout3,$inout3,$rndkey0); &vmovups($rndkey0,"'.(16*($i+2)-112).'($key)");' + )); + push (@aes256_dec,(undef,undef)) if (($i>=3 && $i<=5) || $i>=11); + push (@aes256_dec,(undef,undef)) if ($i==5); +} +push(@aes256_dec,( + '&vaesdeclast ($inout0,$inout0,$rndkey0); &vmovups(@X[0],"0x00($in0)");', + '&vaesdeclast ($inout1,$inout1,$rndkey0); &vmovups(@X[1],"0x10($in0)");', + '&vaesdeclast ($inout2,$inout2,$rndkey0); &vmovups(@X[2],"0x20($in0)");', + '&vaesdeclast ($inout3,$inout3,$rndkey0); &vmovups(@X[3],"0x30($in0)");', + + '&vxorps ($inout0,$inout0,"64(%rsp)"); &vmovdqu($rndkey0,"-112($key)");', + '&vxorps ($inout1,$inout1,@X[0]); &vmovups("0x00($out,$in0)",$inout0);', + '&vxorps ($inout2,$inout2,@X[1]); &vmovups("0x10($out,$in0)",$inout1);', + '&vxorps ($inout3,$inout3,@X[2]); &vmovups("0x20($out,$in0)",$inout2);', + + '&vmovups ("0x30($out,$in0)",$inout3);' + )); + +$code.=<<___; +.type aesni256_cbc_sha1_dec_avx,\@function,6 +.align 32 +aesni256_cbc_sha1_dec_avx: + mov `($win64?56:8)`(%rsp),$inp # load 7th argument + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + lea `-104-($win64?10*16:0)`(%rsp),%rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,96+0(%rsp) + movaps %xmm7,96+16(%rsp) + movaps %xmm8,96+32(%rsp) + movaps %xmm9,96+48(%rsp) + movaps %xmm10,96+64(%rsp) + movaps %xmm11,96+80(%rsp) + movaps %xmm12,96+96(%rsp) + movaps %xmm13,96+112(%rsp) + movaps %xmm14,96+128(%rsp) + movaps %xmm15,96+144(%rsp) +.Lprologue_dec_avx: +___ +$code.=<<___; + vzeroall + mov $in0,%r12 # reassign arguments + mov $out,%r13 + mov $len,%r14 + lea 112($key),%r15 # size optimization + vmovdqu ($ivp),@X[3] # load IV +___ +($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments +$code.=<<___; + shl \$6,$len + sub $in0,$out + add $inp,$len # end of input + + lea K_XX_XX(%rip),$K_XX_XX + mov 0($ctx),$A # load context + mov 4($ctx),$B + mov 8($ctx),$C + mov 12($ctx),$D + mov $B,@T[0] # magic seed + mov 16($ctx),$E + mov $C,@T[1] + xor $D,@T[1] + and @T[1],@T[0] + + vmovdqa 64($K_XX_XX),@X[2] # pbswap mask + vmovdqa 0($K_XX_XX),$Kx # K_00_19 + vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] + vmovdqu 16($inp),@X[-3&7] + vmovdqu 32($inp),@X[-2&7] + vmovdqu 48($inp),@X[-1&7] + vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap + add \$64,$inp + vpshufb @X[2],@X[-3&7],@X[-3&7] + vpshufb @X[2],@X[-2&7],@X[-2&7] + vpshufb @X[2],@X[-1&7],@X[-1&7] + vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19 + vpaddd $Kx,@X[-3&7],@X[1] + vpaddd $Kx,@X[-2&7],@X[2] + vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU + vmovdqa @X[1],16(%rsp) + vmovdqa @X[2],32(%rsp) + vmovups -112($key),$rndkey0 # $key[0] + jmp .Loop_dec_avx + +.align 32 +.Loop_dec_avx: +___ + &Xupdate_avx_16_31(\&body_00_19_dec); + &Xupdate_avx_16_31(\&body_00_19_dec); + &Xupdate_avx_16_31(\&body_00_19_dec); + &Xupdate_avx_16_31(\&body_00_19_dec); + &Xupdate_avx_32_79(\&body_00_19_dec); + &Xupdate_avx_32_79(\&body_20_39_dec); + &Xupdate_avx_32_79(\&body_20_39_dec); + &Xupdate_avx_32_79(\&body_20_39_dec); + &Xupdate_avx_32_79(\&body_20_39_dec); + &Xupdate_avx_32_79(\&body_20_39_dec); + &Xupdate_avx_32_79(\&body_40_59_dec); + &Xupdate_avx_32_79(\&body_40_59_dec); + &Xupdate_avx_32_79(\&body_40_59_dec); + &Xupdate_avx_32_79(\&body_40_59_dec); + &Xupdate_avx_32_79(\&body_40_59_dec); + &Xupdate_avx_32_79(\&body_20_39_dec); + &Xuplast_avx_80(\&body_20_39_dec,".Ldone_dec_avx"); # can jump to "done" + + $saved_j=$j; @saved_V=@V; + $saved_rx=$rx; + + &Xloop_avx(\&body_20_39_dec); + &Xloop_avx(\&body_20_39_dec); + &Xloop_avx(\&body_20_39_dec); + + eval(@aes256_dec[-1]); # last store +$code.=<<___; + lea 64($in0),$in0 + + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + add 12($ctx),$D + mov $A,0($ctx) + add 16($ctx),$E + mov @T[0],4($ctx) + mov @T[0],$B # magic seed + mov $C,8($ctx) + mov $C,@T[1] + mov $D,12($ctx) + xor $D,@T[1] + mov $E,16($ctx) + and @T[1],@T[0] + jmp .Loop_dec_avx + +.Ldone_dec_avx: +___ + $jj=$j=$saved_j; @V=@saved_V; + $rx=$saved_rx; + + &Xtail_avx(\&body_20_39_dec); + &Xtail_avx(\&body_20_39_dec); + &Xtail_avx(\&body_20_39_dec); + + eval(@aes256_dec[-1]); # last store +$code.=<<___; + + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + mov $A,0($ctx) + add 12($ctx),$D + mov @T[0],4($ctx) + add 16($ctx),$E + mov $C,8($ctx) + mov $D,12($ctx) + mov $E,16($ctx) + vmovups @X[3],($ivp) # write IV + vzeroall +___ +$code.=<<___ if ($win64); + movaps 96+0(%rsp),%xmm6 + movaps 96+16(%rsp),%xmm7 + movaps 96+32(%rsp),%xmm8 + movaps 96+48(%rsp),%xmm9 + movaps 96+64(%rsp),%xmm10 + movaps 96+80(%rsp),%xmm11 + movaps 96+96(%rsp),%xmm12 + movaps 96+112(%rsp),%xmm13 + movaps 96+128(%rsp),%xmm14 + movaps 96+144(%rsp),%xmm15 +___ +$code.=<<___; + lea `104+($win64?10*16:0)`(%rsp),%rsi + mov 0(%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lepilogue_dec_avx: + ret +.size aesni256_cbc_sha1_dec_avx,.-aesni256_cbc_sha1_dec_avx +___ + }}} } $code.=<<___; .align 64 @@ -1081,11 +1657,180 @@ K_XX_XX: .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask +.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 .asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by " .align 64 ___ + if ($shaext) {{{ +($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); + +$rounds="%r11d"; + +($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15)); +@rndkey=("%xmm0","%xmm1"); +$r=0; + +my ($BSWAP,$ABCD,$E,$E_,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(7..12)); +my @MSG=map("%xmm$_",(3..6)); + +$code.=<<___; +.type aesni_cbc_sha1_enc_shaext,\@function,6 +.align 32 +aesni_cbc_sha1_enc_shaext: + mov `($win64?56:8)`(%rsp),$inp # load 7th argument +___ +$code.=<<___ if ($win64); + lea `-8-10*16`(%rsp),%rsp + movaps %xmm6,-8-10*16(%rax) + movaps %xmm7,-8-9*16(%rax) + movaps %xmm8,-8-8*16(%rax) + movaps %xmm9,-8-7*16(%rax) + movaps %xmm10,-8-6*16(%rax) + movaps %xmm11,-8-5*16(%rax) + movaps %xmm12,-8-4*16(%rax) + movaps %xmm13,-8-3*16(%rax) + movaps %xmm14,-8-2*16(%rax) + movaps %xmm15,-8-1*16(%rax) +.Lprologue_shaext: +___ +$code.=<<___; + movdqu ($ctx),$ABCD + movd 16($ctx),$E + movdqa K_XX_XX+0x50(%rip),$BSWAP # byte-n-word swap + + mov 240($key),$rounds + sub $in0,$out + movups ($key),$rndkey0 # $key[0] + movups 16($key),$rndkey[0] # forward reference + lea 112($key),$key # size optimization + + pshufd \$0b00011011,$ABCD,$ABCD # flip word order + pshufd \$0b00011011,$E,$E # flip word order + jmp .Loop_shaext +.align 16 +.Loop_shaext: +___ + &$aesenc(); +$code.=<<___; + movdqu ($inp),@MSG[0] + movdqa $E,$E_SAVE # offload $E + pshufb $BSWAP,@MSG[0] + movdqu 0x10($inp),@MSG[1] + movdqa $ABCD,$ABCD_SAVE # offload $ABCD +___ + &$aesenc(); +$code.=<<___; + pshufb $BSWAP,@MSG[1] + + paddd @MSG[0],$E + movdqu 0x20($inp),@MSG[2] + lea 0x40($inp),$inp + pxor $E_SAVE,@MSG[0] # black magic +___ + &$aesenc(); +$code.=<<___; + pxor $E_SAVE,@MSG[0] # black magic + movdqa $ABCD,$E_ + pshufb $BSWAP,@MSG[2] + sha1rnds4 \$0,$E,$ABCD # 0-3 + sha1nexte @MSG[1],$E_ +___ + &$aesenc(); +$code.=<<___; + sha1msg1 @MSG[1],@MSG[0] + movdqu -0x10($inp),@MSG[3] + movdqa $ABCD,$E + pshufb $BSWAP,@MSG[3] +___ + &$aesenc(); +$code.=<<___; + sha1rnds4 \$0,$E_,$ABCD # 4-7 + sha1nexte @MSG[2],$E + pxor @MSG[2],@MSG[0] + sha1msg1 @MSG[2],@MSG[1] +___ + &$aesenc(); + +for($i=2;$i<20-4;$i++) { +$code.=<<___; + movdqa $ABCD,$E_ + sha1rnds4 \$`int($i/5)`,$E,$ABCD # 8-11 + sha1nexte @MSG[3],$E_ +___ + &$aesenc(); +$code.=<<___; + sha1msg2 @MSG[3],@MSG[0] + pxor @MSG[3],@MSG[1] + sha1msg1 @MSG[3],@MSG[2] +___ + ($E,$E_)=($E_,$E); + push(@MSG,shift(@MSG)); + + &$aesenc(); +} +$code.=<<___; + movdqa $ABCD,$E_ + sha1rnds4 \$3,$E,$ABCD # 64-67 + sha1nexte @MSG[3],$E_ + sha1msg2 @MSG[3],@MSG[0] + pxor @MSG[3],@MSG[1] +___ + &$aesenc(); +$code.=<<___; + movdqa $ABCD,$E + sha1rnds4 \$3,$E_,$ABCD # 68-71 + sha1nexte @MSG[0],$E + sha1msg2 @MSG[0],@MSG[1] +___ + &$aesenc(); +$code.=<<___; + movdqa $E_SAVE,@MSG[0] + movdqa $ABCD,$E_ + sha1rnds4 \$3,$E,$ABCD # 72-75 + sha1nexte @MSG[1],$E_ +___ + &$aesenc(); +$code.=<<___; + movdqa $ABCD,$E + sha1rnds4 \$3,$E_,$ABCD # 76-79 + sha1nexte $MSG[0],$E +___ + while($r<40) { &$aesenc(); } # remaining aesenc's +$code.=<<___; + dec $len + + paddd $ABCD_SAVE,$ABCD + movups $iv,48($out,$in0) # write output + lea 64($in0),$in0 + jnz .Loop_shaext + + pshufd \$0b00011011,$ABCD,$ABCD + pshufd \$0b00011011,$E,$E + movups $iv,($ivp) # write IV + movdqu $ABCD,($ctx) + movd $E,16($ctx) +___ +$code.=<<___ if ($win64); + movaps -8-10*16(%rax),%xmm6 + movaps -8-9*16(%rax),%xmm7 + movaps -8-8*16(%rax),%xmm8 + movaps -8-7*16(%rax),%xmm9 + movaps -8-6*16(%rax),%xmm10 + movaps -8-5*16(%rax),%xmm11 + movaps -8-4*16(%rax),%xmm12 + movaps -8-3*16(%rax),%xmm13 + movaps -8-2*16(%rax),%xmm14 + movaps -8-1*16(%rax),%xmm15 + mov %rax,%rsp +.Lepilogue_shaext: +___ +$code.=<<___; + ret +.size aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext +___ + }}} # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { @@ -1127,7 +1872,21 @@ ssse3_handler: lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail +___ +$code.=<<___ if ($shaext); + lea aesni_cbc_sha1_enc_shaext(%rip),%r10 + cmp %r10,%rbx + jb .Lseh_no_shaext + lea (%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx + .long 0xa548f3fc # cld; rep movsq + lea 168(%rax),%rax # adjust stack pointer + jmp .Lcommon_seh_tail +.Lseh_no_shaext: +___ +$code.=<<___; lea 96(%rax),%rsi lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx @@ -1199,6 +1958,11 @@ $code.=<<___ if ($avx); .rva .LSEH_end_aesni_cbc_sha1_enc_avx .rva .LSEH_info_aesni_cbc_sha1_enc_avx ___ +$code.=<<___ if ($shaext); + .rva .LSEH_begin_aesni_cbc_sha1_enc_shaext + .rva .LSEH_end_aesni_cbc_sha1_enc_shaext + .rva .LSEH_info_aesni_cbc_sha1_enc_shaext +___ $code.=<<___; .section .xdata .align 8 @@ -1213,6 +1977,12 @@ $code.=<<___ if ($avx); .rva ssse3_handler .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] ___ +$code.=<<___ if ($shaext); +.LSEH_info_aesni_cbc_sha1_enc_shaext: + .byte 9,0,0,0 + .rva ssse3_handler + .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[] +___ } #################################################################### @@ -1223,28 +1993,65 @@ sub rex { $rex|=0x04 if($dst>=8); $rex|=0x01 if($src>=8); - push @opcode,$rex|0x40 if($rex); + unshift @opcode,$rex|0x40 if($rex); +} + +sub sha1rnds4 { + if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x0f,0x3a,0xcc); + rex(\@opcode,$3,$2); + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + return ".byte\t".join(',',@opcode); + } else { + return "sha1rnds4\t".@_[0]; + } +} + +sub sha1op38 { + my $instr = shift; + my %opcodelet = ( + "sha1nexte" => 0xc8, + "sha1msg1" => 0xc9, + "sha1msg2" => 0xca ); + + if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x0f,0x38); + rex(\@opcode,$2,$1); + push @opcode,$opcodelet{$instr}; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + return ".byte\t".join(',',@opcode); + } else { + return $instr."\t".@_[0]; + } } sub aesni { my $line=shift; - my @opcode=(0x66); + my @opcode=(0x0f,0x38); if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) { my %opcodelet = ( - "aesenc" => 0xdc, "aesenclast" => 0xdd + "aesenc" => 0xdc, "aesenclast" => 0xdd, + "aesdec" => 0xde, "aesdeclast" => 0xdf ); return undef if (!defined($opcodelet{$1})); rex(\@opcode,$3,$2); - push @opcode,0x0f,0x38,$opcodelet{$1}; - push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + push @opcode,$opcodelet{$1},0xc0|($2&7)|(($3&7)<<3); # ModR/M + unshift @opcode,0x66; return ".byte\t".join(',',@opcode); } return $line; } -$code =~ s/\`([^\`]*)\`/eval($1)/gem; -$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or + s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo or + s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/geo; -print $code; + print $_,"\n"; +} close STDOUT; diff --git a/openssl/crypto/aes/asm/aesni-sha256-x86_64.pl b/openssl/crypto/aes/asm/aesni-sha256-x86_64.pl new file mode 100755 index 000000000..c1fce8983 --- /dev/null +++ b/openssl/crypto/aes/asm/aesni-sha256-x86_64.pl @@ -0,0 +1,1708 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# January 2013 +# +# This is AESNI-CBC+SHA256 stitch implementation. The idea, as spelled +# in http://download.intel.com/design/intarch/papers/323686.pdf, is +# that since AESNI-CBC encrypt exhibit *very* low instruction-level +# parallelism, interleaving it with another algorithm would allow to +# utilize processor resources better and achieve better performance. +# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and +# AESNI code is weaved into it. As SHA256 dominates execution time, +# stitch performance does not depend on AES key length. Below are +# performance numbers in cycles per processed byte, less is better, +# for standalone AESNI-CBC encrypt, standalone SHA256, and stitched +# subroutine: +# +# AES-128/-192/-256+SHA256 this(**)gain +# Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43% +# Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50% +# Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59% +# Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58% +# +# (*) there are XOP, AVX1 and AVX2 code pathes, meaning that +# Westmere is omitted from loop, this is because gain was not +# estimated high enough to justify the effort; +# (**) these are EVP-free results, results obtained with 'speed +# -evp aes-256-cbc-hmac-sha256' will vary by percent or two; + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=12); +} + +if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); +} + +$shaext=$avx; ### set to zero if compiling for 1.0.1 +$avx=1 if (!$shaext && $avx); + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +$func="aesni_cbc_sha256_enc"; +$TABLE="K256"; +$SZ=4; +@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", + "%r8d","%r9d","%r10d","%r11d"); +($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi"); +@Sigma0=( 2,13,22); +@Sigma1=( 6,11,25); +@sigma0=( 7,18, 3); +@sigma1=(17,19,10); +$rounds=64; + +######################################################################## +# void aesni_cbc_sha256_enc(const void *inp, +# void *out, +# size_t length, +# const AES_KEY *key, +# unsigned char *iv, +# SHA256_CTX *ctx, +# const void *in0); +($inp, $out, $len, $key, $ivp, $ctx, $in0) = +("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); + +$Tbl="%rbp"; + +$_inp="16*$SZ+0*8(%rsp)"; +$_out="16*$SZ+1*8(%rsp)"; +$_end="16*$SZ+2*8(%rsp)"; +$_key="16*$SZ+3*8(%rsp)"; +$_ivp="16*$SZ+4*8(%rsp)"; +$_ctx="16*$SZ+5*8(%rsp)"; +$_in0="16*$SZ+6*8(%rsp)"; +$_rsp="16*$SZ+7*8(%rsp)"; +$framesz=16*$SZ+8*8; + +$code=<<___; +.text + +.extern OPENSSL_ia32cap_P +.globl $func +.type $func,\@abi-omnipotent +.align 16 +$func: +___ + if ($avx) { +$code.=<<___; + lea OPENSSL_ia32cap_P(%rip),%r11 + mov \$1,%eax + cmp \$0,`$win64?"%rcx":"%rdi"` + je .Lprobe + mov 0(%r11),%eax + mov 4(%r11),%r10 +___ +$code.=<<___ if ($shaext); + bt \$61,%r10 # check for SHA + jc ${func}_shaext +___ +$code.=<<___; + mov %r10,%r11 + shr \$32,%r11 + + test \$`1<<11`,%r10d # check for XOP + jnz ${func}_xop +___ +$code.=<<___ if ($avx>1); + and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 + cmp \$`1<<8|1<<5|1<<3`,%r11d + je ${func}_avx2 +___ +$code.=<<___; + and \$`1<<30`,%eax # mask "Intel CPU" bit + and \$`1<<28|1<<9`,%r10d # mask AVX+SSSE3 bits + or %eax,%r10d + cmp \$`1<<28|1<<9|1<<30`,%r10d + je ${func}_avx + ud2 +___ + } +$code.=<<___; + xor %eax,%eax + cmp \$0,`$win64?"%rcx":"%rdi"` + je .Lprobe + ud2 +.Lprobe: + ret +.size $func,.-$func + +.align 64 +.type $TABLE,\@object +$TABLE: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f + .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1 + .long 0,0,0,0, 0,0,0,0 + .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by " +.align 64 +___ + +###################################################################### +# SIMD code paths +# +{{{ +($iv,$inout,$roundkey,$temp, + $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15)); + +$aesni_cbc_idx=0; +@aesni_cbc_block = ( +## &vmovdqu ($roundkey,"0x00-0x80($inp)");' +## &vmovdqu ($inout,($inp)); +## &mov ($_inp,$inp); + + '&vpxor ($inout,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0x10-0x80($inp)");', + + '&vpxor ($inout,$inout,$iv);', + + '&vaesenc ($inout,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0x20-0x80($inp)");', + + '&vaesenc ($inout,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0x30-0x80($inp)");', + + '&vaesenc ($inout,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0x40-0x80($inp)");', + + '&vaesenc ($inout,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0x50-0x80($inp)");', + + '&vaesenc ($inout,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0x60-0x80($inp)");', + + '&vaesenc ($inout,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0x70-0x80($inp)");', + + '&vaesenc ($inout,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0x80-0x80($inp)");', + + '&vaesenc ($inout,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0x90-0x80($inp)");', + + '&vaesenc ($inout,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");', + + '&vaesenclast ($temp,$inout,$roundkey);'. + ' &vaesenc ($inout,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");', + + '&vpand ($iv,$temp,$mask10);'. + ' &vaesenc ($inout,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");', + + '&vaesenclast ($temp,$inout,$roundkey);'. + ' &vaesenc ($inout,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0xd0-0x80($inp)");', + + '&vpand ($temp,$temp,$mask12);'. + ' &vaesenc ($inout,$inout,$roundkey);'. + '&vmovdqu ($roundkey,"0xe0-0x80($inp)");', + + '&vpor ($iv,$iv,$temp);'. + ' &vaesenclast ($temp,$inout,$roundkey);'. + ' &vmovdqu ($roundkey,"0x00-0x80($inp)");' + +## &mov ($inp,$_inp); +## &mov ($out,$_out); +## &vpand ($temp,$temp,$mask14); +## &vpor ($iv,$iv,$temp); +## &vmovdqu ($iv,($out,$inp); +## &lea (inp,16($inp)); +); + +my $a4=$T1; +my ($a,$b,$c,$d,$e,$f,$g,$h); + +sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; + my $arg = pop; + $arg = "\$$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. + + '&ror ($a0,$Sigma1[2]-$Sigma1[1])', + '&mov ($a,$a1)', + '&mov ($a4,$f)', + + '&xor ($a0,$e)', + '&ror ($a1,$Sigma0[2]-$Sigma0[1])', + '&xor ($a4,$g)', # f^g + + '&ror ($a0,$Sigma1[1]-$Sigma1[0])', + '&xor ($a1,$a)', + '&and ($a4,$e)', # (f^g)&e + + @aesni_cbc_block[$aesni_cbc_idx++]. + '&xor ($a0,$e)', + '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] + '&mov ($a2,$a)', + + '&ror ($a1,$Sigma0[1]-$Sigma0[0])', + '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g + '&xor ($a2,$b)', # a^b, b^c in next round + + '&ror ($a0,$Sigma1[0])', # Sigma1(e) + '&add ($h,$a4)', # h+=Ch(e,f,g) + '&and ($a3,$a2)', # (b^c)&(a^b) + + '&xor ($a1,$a)', + '&add ($h,$a0)', # h+=Sigma1(e) + '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) + + '&add ($d,$h)', # d+=h + '&ror ($a1,$Sigma0[0])', # Sigma0(a) + '&add ($h,$a3)', # h+=Maj(a,b,c) + + '&mov ($a0,$d)', + '&add ($a1,$h);'. # h+=Sigma0(a) + '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' + ); +} + +if ($avx) {{ +###################################################################### +# XOP code path +# +$code.=<<___; +.type ${func}_xop,\@function,6 +.align 64 +${func}_xop: +.Lxop_shortcut: + mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + mov %rsp,%r11 # copy %rsp + sub \$`$framesz+$win64*16*10`,%rsp + and \$-64,%rsp # align stack frame + + shl \$6,$len + sub $inp,$out # re-bias + sub $inp,$in0 + add $inp,$len # end of input + + #mov $inp,$_inp # saved later + mov $out,$_out + mov $len,$_end + #mov $key,$_key # remains resident in $inp register + mov $ivp,$_ivp + mov $ctx,$_ctx + mov $in0,$_in0 + mov %r11,$_rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,`$framesz+16*0`(%rsp) + movaps %xmm7,`$framesz+16*1`(%rsp) + movaps %xmm8,`$framesz+16*2`(%rsp) + movaps %xmm9,`$framesz+16*3`(%rsp) + movaps %xmm10,`$framesz+16*4`(%rsp) + movaps %xmm11,`$framesz+16*5`(%rsp) + movaps %xmm12,`$framesz+16*6`(%rsp) + movaps %xmm13,`$framesz+16*7`(%rsp) + movaps %xmm14,`$framesz+16*8`(%rsp) + movaps %xmm15,`$framesz+16*9`(%rsp) +___ +$code.=<<___; +.Lprologue_xop: + vzeroall + + mov $inp,%r12 # borrow $a4 + lea 0x80($key),$inp # size optimization, reassign + lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0 + mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 + mov $ctx,%r15 # borrow $a2 + mov $in0,%rsi # borrow $a3 + vmovdqu ($ivp),$iv # load IV + sub \$9,%r14 + + mov $SZ*0(%r15),$A + mov $SZ*1(%r15),$B + mov $SZ*2(%r15),$C + mov $SZ*3(%r15),$D + mov $SZ*4(%r15),$E + mov $SZ*5(%r15),$F + mov $SZ*6(%r15),$G + mov $SZ*7(%r15),$H + + vmovdqa 0x00(%r13,%r14,8),$mask14 + vmovdqa 0x10(%r13,%r14,8),$mask12 + vmovdqa 0x20(%r13,%r14,8),$mask10 + vmovdqu 0x00-0x80($inp),$roundkey + jmp .Lloop_xop +___ + if ($SZ==4) { # SHA256 + my @X = map("%xmm$_",(0..3)); + my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); + +$code.=<<___; +.align 16 +.Lloop_xop: + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 + vmovdqu 0x00(%rsi,%r12),@X[0] + vmovdqu 0x10(%rsi,%r12),@X[1] + vmovdqu 0x20(%rsi,%r12),@X[2] + vmovdqu 0x30(%rsi,%r12),@X[3] + vpshufb $t3,@X[0],@X[0] + lea $TABLE(%rip),$Tbl + vpshufb $t3,@X[1],@X[1] + vpshufb $t3,@X[2],@X[2] + vpaddd 0x00($Tbl),@X[0],$t0 + vpshufb $t3,@X[3],@X[3] + vpaddd 0x20($Tbl),@X[1],$t1 + vpaddd 0x40($Tbl),@X[2],$t2 + vpaddd 0x60($Tbl),@X[3],$t3 + vmovdqa $t0,0x00(%rsp) + mov $A,$a1 + vmovdqa $t1,0x10(%rsp) + mov $B,$a3 + vmovdqa $t2,0x20(%rsp) + xor $C,$a3 # magic + vmovdqa $t3,0x30(%rsp) + mov $E,$a0 + jmp .Lxop_00_47 + +.align 16 +.Lxop_00_47: + sub \$-16*2*$SZ,$Tbl # size optimization + vmovdqu (%r12),$inout # $a4 + mov %r12,$_inp # $a4 +___ +sub XOP_256_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body,&$body,&$body); # 104 instructions + + &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + &vprotd ($t1,$t0,8*$SZ-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpsrld ($t0,$t0,$sigma0[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor ($t0,$t0,$t1); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor ($t0,$t0,$t2); # sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &vpsrld ($t2,@X[3],$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor ($t3,$t3,$t2); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vpsrldq ($t3,$t3,8); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpsrld ($t2,@X[0],$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor ($t3,$t3,$t2); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor ($t3,$t3,$t1); # sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vpslldq ($t3,$t3,8); # 22 instructions + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); + foreach (@insns) { eval; } # remaining instructions + &vmovdqa (16*$j."(%rsp)",$t2); +} + + $aesni_cbc_idx=0; + for ($i=0,$j=0; $j<4; $j++) { + &XOP_256_00_47($j,\&body_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &mov ("%r12",$_inp); # borrow $a4 + &vpand ($temp,$temp,$mask14); + &mov ("%r15",$_out); # borrow $a2 + &vpor ($iv,$iv,$temp); + &vmovdqu ("(%r15,%r12)",$iv); # write output + &lea ("%r12","16(%r12)"); # inp++ + + &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); + &jne (".Lxop_00_47"); + + &vmovdqu ($inout,"(%r12)"); + &mov ($_inp,"%r12"); + + $aesni_cbc_idx=0; + for ($i=0; $i<16; ) { + foreach(body_00_15()) { eval; } + } + } +$code.=<<___; + mov $_inp,%r12 # borrow $a4 + mov $_out,%r13 # borrow $a0 + mov $_ctx,%r15 # borrow $a2 + mov $_in0,%rsi # borrow $a3 + + vpand $mask14,$temp,$temp + mov $a1,$A + vpor $temp,$iv,$iv + vmovdqu $iv,(%r13,%r12) # write output + lea 16(%r12),%r12 # inp++ + + add $SZ*0(%r15),$A + add $SZ*1(%r15),$B + add $SZ*2(%r15),$C + add $SZ*3(%r15),$D + add $SZ*4(%r15),$E + add $SZ*5(%r15),$F + add $SZ*6(%r15),$G + add $SZ*7(%r15),$H + + cmp $_end,%r12 + + mov $A,$SZ*0(%r15) + mov $B,$SZ*1(%r15) + mov $C,$SZ*2(%r15) + mov $D,$SZ*3(%r15) + mov $E,$SZ*4(%r15) + mov $F,$SZ*5(%r15) + mov $G,$SZ*6(%r15) + mov $H,$SZ*7(%r15) + + jb .Lloop_xop + + mov $_ivp,$ivp + mov $_rsp,%rsi + vmovdqu $iv,($ivp) # output IV + vzeroall +___ +$code.=<<___ if ($win64); + movaps `$framesz+16*0`(%rsp),%xmm6 + movaps `$framesz+16*1`(%rsp),%xmm7 + movaps `$framesz+16*2`(%rsp),%xmm8 + movaps `$framesz+16*3`(%rsp),%xmm9 + movaps `$framesz+16*4`(%rsp),%xmm10 + movaps `$framesz+16*5`(%rsp),%xmm11 + movaps `$framesz+16*6`(%rsp),%xmm12 + movaps `$framesz+16*7`(%rsp),%xmm13 + movaps `$framesz+16*8`(%rsp),%xmm14 + movaps `$framesz+16*9`(%rsp),%xmm15 +___ +$code.=<<___; + mov (%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lepilogue_xop: + ret +.size ${func}_xop,.-${func}_xop +___ +###################################################################### +# AVX+shrd code path +# +local *ror = sub { &shrd(@_[0],@_) }; + +$code.=<<___; +.type ${func}_avx,\@function,6 +.align 64 +${func}_avx: +.Lavx_shortcut: + mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + mov %rsp,%r11 # copy %rsp + sub \$`$framesz+$win64*16*10`,%rsp + and \$-64,%rsp # align stack frame + + shl \$6,$len + sub $inp,$out # re-bias + sub $inp,$in0 + add $inp,$len # end of input + + #mov $inp,$_inp # saved later + mov $out,$_out + mov $len,$_end + #mov $key,$_key # remains resident in $inp register + mov $ivp,$_ivp + mov $ctx,$_ctx + mov $in0,$_in0 + mov %r11,$_rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,`$framesz+16*0`(%rsp) + movaps %xmm7,`$framesz+16*1`(%rsp) + movaps %xmm8,`$framesz+16*2`(%rsp) + movaps %xmm9,`$framesz+16*3`(%rsp) + movaps %xmm10,`$framesz+16*4`(%rsp) + movaps %xmm11,`$framesz+16*5`(%rsp) + movaps %xmm12,`$framesz+16*6`(%rsp) + movaps %xmm13,`$framesz+16*7`(%rsp) + movaps %xmm14,`$framesz+16*8`(%rsp) + movaps %xmm15,`$framesz+16*9`(%rsp) +___ +$code.=<<___; +.Lprologue_avx: + vzeroall + + mov $inp,%r12 # borrow $a4 + lea 0x80($key),$inp # size optimization, reassign + lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0 + mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 + mov $ctx,%r15 # borrow $a2 + mov $in0,%rsi # borrow $a3 + vmovdqu ($ivp),$iv # load IV + sub \$9,%r14 + + mov $SZ*0(%r15),$A + mov $SZ*1(%r15),$B + mov $SZ*2(%r15),$C + mov $SZ*3(%r15),$D + mov $SZ*4(%r15),$E + mov $SZ*5(%r15),$F + mov $SZ*6(%r15),$G + mov $SZ*7(%r15),$H + + vmovdqa 0x00(%r13,%r14,8),$mask14 + vmovdqa 0x10(%r13,%r14,8),$mask12 + vmovdqa 0x20(%r13,%r14,8),$mask10 + vmovdqu 0x00-0x80($inp),$roundkey +___ + if ($SZ==4) { # SHA256 + my @X = map("%xmm$_",(0..3)); + my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); + +$code.=<<___; + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 + vmovdqu 0x00(%rsi,%r12),@X[0] + vmovdqu 0x10(%rsi,%r12),@X[1] + vmovdqu 0x20(%rsi,%r12),@X[2] + vmovdqu 0x30(%rsi,%r12),@X[3] + vpshufb $t3,@X[0],@X[0] + lea $TABLE(%rip),$Tbl + vpshufb $t3,@X[1],@X[1] + vpshufb $t3,@X[2],@X[2] + vpaddd 0x00($Tbl),@X[0],$t0 + vpshufb $t3,@X[3],@X[3] + vpaddd 0x20($Tbl),@X[1],$t1 + vpaddd 0x40($Tbl),@X[2],$t2 + vpaddd 0x60($Tbl),@X[3],$t3 + vmovdqa $t0,0x00(%rsp) + mov $A,$a1 + vmovdqa $t1,0x10(%rsp) + mov $B,$a3 + vmovdqa $t2,0x20(%rsp) + xor $C,$a3 # magic + vmovdqa $t3,0x30(%rsp) + mov $E,$a0 + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + sub \$-16*2*$SZ,$Tbl # size optimization + vmovdqu (%r12),$inout # $a4 + mov %r12,$_inp # $a4 +___ +sub Xupdate_256_AVX () { + ( + '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] + '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] + '&vpsrld ($t2,$t0,$sigma0[0]);', + '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] + '&vpsrld ($t3,$t0,$sigma0[2])', + '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', + '&vpxor ($t0,$t3,$t2)', + '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] + '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', + '&vpxor ($t0,$t0,$t1)', + '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', + '&vpxor ($t0,$t0,$t2)', + '&vpsrld ($t2,$t3,$sigma1[2]);', + '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) + '&vpsrlq ($t3,$t3,$sigma1[0]);', + '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) + '&vpxor ($t2,$t2,$t3);', + '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', + '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15]) + '&vpshufd ($t2,$t2,0b10000100)', + '&vpsrldq ($t2,$t2,8)', + '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) + '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] + '&vpsrld ($t2,$t3,$sigma1[2])', + '&vpsrlq ($t3,$t3,$sigma1[0])', + '&vpxor ($t2,$t2,$t3);', + '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', + '&vpxor ($t2,$t2,$t3)', + '&vpshufd ($t2,$t2,0b11101000)', + '&vpslldq ($t2,$t2,8)', + '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) + ); +} + +sub AVX_256_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body,&$body,&$body); # 104 instructions + + foreach (Xupdate_256_AVX()) { # 29 instructions + eval; + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + } + &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); + foreach (@insns) { eval; } # remaining instructions + &vmovdqa (16*$j."(%rsp)",$t2); +} + + $aesni_cbc_idx=0; + for ($i=0,$j=0; $j<4; $j++) { + &AVX_256_00_47($j,\&body_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &mov ("%r12",$_inp); # borrow $a4 + &vpand ($temp,$temp,$mask14); + &mov ("%r15",$_out); # borrow $a2 + &vpor ($iv,$iv,$temp); + &vmovdqu ("(%r15,%r12)",$iv); # write output + &lea ("%r12","16(%r12)"); # inp++ + + &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); + &jne (".Lavx_00_47"); + + &vmovdqu ($inout,"(%r12)"); + &mov ($_inp,"%r12"); + + $aesni_cbc_idx=0; + for ($i=0; $i<16; ) { + foreach(body_00_15()) { eval; } + } + + } +$code.=<<___; + mov $_inp,%r12 # borrow $a4 + mov $_out,%r13 # borrow $a0 + mov $_ctx,%r15 # borrow $a2 + mov $_in0,%rsi # borrow $a3 + + vpand $mask14,$temp,$temp + mov $a1,$A + vpor $temp,$iv,$iv + vmovdqu $iv,(%r13,%r12) # write output + lea 16(%r12),%r12 # inp++ + + add $SZ*0(%r15),$A + add $SZ*1(%r15),$B + add $SZ*2(%r15),$C + add $SZ*3(%r15),$D + add $SZ*4(%r15),$E + add $SZ*5(%r15),$F + add $SZ*6(%r15),$G + add $SZ*7(%r15),$H + + cmp $_end,%r12 + + mov $A,$SZ*0(%r15) + mov $B,$SZ*1(%r15) + mov $C,$SZ*2(%r15) + mov $D,$SZ*3(%r15) + mov $E,$SZ*4(%r15) + mov $F,$SZ*5(%r15) + mov $G,$SZ*6(%r15) + mov $H,$SZ*7(%r15) + jb .Lloop_avx + + mov $_ivp,$ivp + mov $_rsp,%rsi + vmovdqu $iv,($ivp) # output IV + vzeroall +___ +$code.=<<___ if ($win64); + movaps `$framesz+16*0`(%rsp),%xmm6 + movaps `$framesz+16*1`(%rsp),%xmm7 + movaps `$framesz+16*2`(%rsp),%xmm8 + movaps `$framesz+16*3`(%rsp),%xmm9 + movaps `$framesz+16*4`(%rsp),%xmm10 + movaps `$framesz+16*5`(%rsp),%xmm11 + movaps `$framesz+16*6`(%rsp),%xmm12 + movaps `$framesz+16*7`(%rsp),%xmm13 + movaps `$framesz+16*8`(%rsp),%xmm14 + movaps `$framesz+16*9`(%rsp),%xmm15 +___ +$code.=<<___; + mov (%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lepilogue_avx: + ret +.size ${func}_avx,.-${func}_avx +___ + +if ($avx>1) {{ +###################################################################### +# AVX2+BMI code path +# +my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp +my $PUSH8=8*2*$SZ; +use integer; + +sub bodyx_00_15 () { + # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. + + '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] + '&and ($a4,$e)', # f&e + '&rorx ($a0,$e,$Sigma1[2])', + '&rorx ($a2,$e,$Sigma1[1])', + + '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past + '&lea ($h,"($h,$a4)")', + '&andn ($a4,$e,$g)', # ~e&g + '&xor ($a0,$a2)', + + '&rorx ($a1,$e,$Sigma1[0])', + '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) + '&xor ($a0,$a1)', # Sigma1(e) + '&mov ($a2,$a)', + + '&rorx ($a4,$a,$Sigma0[2])', + '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) + '&xor ($a2,$b)', # a^b, b^c in next round + '&rorx ($a1,$a,$Sigma0[1])', + + '&rorx ($a0,$a,$Sigma0[0])', + '&lea ($d,"($d,$h)")', # d+=h + '&and ($a3,$a2)', # (b^c)&(a^b) + @aesni_cbc_block[$aesni_cbc_idx++]. + '&xor ($a1,$a4)', + + '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) + '&xor ($a1,$a0)', # Sigma0(a) + '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c) + '&mov ($a4,$e)', # copy of f in future + + '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' + ); + # and at the finish one has to $a+=$a1 +} + +$code.=<<___; +.type ${func}_avx2,\@function,6 +.align 64 +${func}_avx2: +.Lavx2_shortcut: + mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + mov %rsp,%r11 # copy %rsp + sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp + and \$-256*$SZ,%rsp # align stack frame + add \$`2*$SZ*($rounds-8)`,%rsp + + shl \$6,$len + sub $inp,$out # re-bias + sub $inp,$in0 + add $inp,$len # end of input + + #mov $inp,$_inp # saved later + #mov $out,$_out # kept in $offload + mov $len,$_end + #mov $key,$_key # remains resident in $inp register + mov $ivp,$_ivp + mov $ctx,$_ctx + mov $in0,$_in0 + mov %r11,$_rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,`$framesz+16*0`(%rsp) + movaps %xmm7,`$framesz+16*1`(%rsp) + movaps %xmm8,`$framesz+16*2`(%rsp) + movaps %xmm9,`$framesz+16*3`(%rsp) + movaps %xmm10,`$framesz+16*4`(%rsp) + movaps %xmm11,`$framesz+16*5`(%rsp) + movaps %xmm12,`$framesz+16*6`(%rsp) + movaps %xmm13,`$framesz+16*7`(%rsp) + movaps %xmm14,`$framesz+16*8`(%rsp) + movaps %xmm15,`$framesz+16*9`(%rsp) +___ +$code.=<<___; +.Lprologue_avx2: + vzeroall + + mov $inp,%r13 # borrow $a0 + vpinsrq \$1,$out,$offload,$offload + lea 0x80($key),$inp # size optimization, reassign + lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4 + mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 + mov $ctx,%r15 # borrow $a2 + mov $in0,%rsi # borrow $a3 + vmovdqu ($ivp),$iv # load IV + lea -9(%r14),%r14 + + vmovdqa 0x00(%r12,%r14,8),$mask14 + vmovdqa 0x10(%r12,%r14,8),$mask12 + vmovdqa 0x20(%r12,%r14,8),$mask10 + + sub \$-16*$SZ,%r13 # inp++, size optimization + mov $SZ*0(%r15),$A + lea (%rsi,%r13),%r12 # borrow $a0 + mov $SZ*1(%r15),$B + cmp $len,%r13 # $_end + mov $SZ*2(%r15),$C + cmove %rsp,%r12 # next block or random data + mov $SZ*3(%r15),$D + mov $SZ*4(%r15),$E + mov $SZ*5(%r15),$F + mov $SZ*6(%r15),$G + mov $SZ*7(%r15),$H + vmovdqu 0x00-0x80($inp),$roundkey +___ + if ($SZ==4) { # SHA256 + my @X = map("%ymm$_",(0..3)); + my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7)); + +$code.=<<___; + jmp .Loop_avx2 +.align 16 +.Loop_avx2: + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 + vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0 + vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1 + vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2 + vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3 + + vinserti128 \$1,(%r12),@X[0],@X[0] + vinserti128 \$1,16(%r12),@X[1],@X[1] + vpshufb $t3,@X[0],@X[0] + vinserti128 \$1,32(%r12),@X[2],@X[2] + vpshufb $t3,@X[1],@X[1] + vinserti128 \$1,48(%r12),@X[3],@X[3] + + lea $TABLE(%rip),$Tbl + vpshufb $t3,@X[2],@X[2] + lea -16*$SZ(%r13),%r13 + vpaddd 0x00($Tbl),@X[0],$t0 + vpshufb $t3,@X[3],@X[3] + vpaddd 0x20($Tbl),@X[1],$t1 + vpaddd 0x40($Tbl),@X[2],$t2 + vpaddd 0x60($Tbl),@X[3],$t3 + vmovdqa $t0,0x00(%rsp) + xor $a1,$a1 + vmovdqa $t1,0x20(%rsp) + lea -$PUSH8(%rsp),%rsp + mov $B,$a3 + vmovdqa $t2,0x00(%rsp) + xor $C,$a3 # magic + vmovdqa $t3,0x20(%rsp) + mov $F,$a4 + sub \$-16*2*$SZ,$Tbl # size optimization + jmp .Lavx2_00_47 + +.align 16 +.Lavx2_00_47: + vmovdqu (%r13),$inout + vpinsrq \$0,%r13,$offload,$offload +___ + +sub AVX2_256_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body,&$body,&$body); # 96 instructions +my $base = "+2*$PUSH8(%rsp)"; + + &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0); + foreach (Xupdate_256_AVX()) { # 29 instructions + eval; + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + } + &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); + foreach (@insns) { eval; } # remaining instructions + &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); +} + $aesni_cbc_idx=0; + for ($i=0,$j=0; $j<4; $j++) { + &AVX2_256_00_47($j,\&bodyx_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &vmovq ("%r13",$offload); # borrow $a0 + &vpextrq ("%r15",$offload,1); # borrow $a2 + &vpand ($temp,$temp,$mask14); + &vpor ($iv,$iv,$temp); + &vmovdqu ("(%r15,%r13)",$iv); # write output + &lea ("%r13","16(%r13)"); # inp++ + + &lea ($Tbl,16*2*$SZ."($Tbl)"); + &cmpb (($SZ-1)."($Tbl)",0); + &jne (".Lavx2_00_47"); + + &vmovdqu ($inout,"(%r13)"); + &vpinsrq ($offload,$offload,"%r13",0); + + $aesni_cbc_idx=0; + for ($i=0; $i<16; ) { + my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; + foreach(bodyx_00_15()) { eval; } + } + } +$code.=<<___; + vpextrq \$1,$offload,%r12 # $_out, borrow $a4 + vmovq $offload,%r13 # $_inp, borrow $a0 + mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2 + add $a1,$A + lea `2*$SZ*($rounds-8)`(%rsp),$Tbl + + vpand $mask14,$temp,$temp + vpor $temp,$iv,$iv + vmovdqu $iv,(%r12,%r13) # write output + lea 16(%r13),%r13 + + add $SZ*0(%r15),$A + add $SZ*1(%r15),$B + add $SZ*2(%r15),$C + add $SZ*3(%r15),$D + add $SZ*4(%r15),$E + add $SZ*5(%r15),$F + add $SZ*6(%r15),$G + add $SZ*7(%r15),$H + + mov $A,$SZ*0(%r15) + mov $B,$SZ*1(%r15) + mov $C,$SZ*2(%r15) + mov $D,$SZ*3(%r15) + mov $E,$SZ*4(%r15) + mov $F,$SZ*5(%r15) + mov $G,$SZ*6(%r15) + mov $H,$SZ*7(%r15) + + cmp `$PUSH8+2*8`($Tbl),%r13 # $_end + je .Ldone_avx2 + + xor $a1,$a1 + mov $B,$a3 + mov $F,$a4 + xor $C,$a3 # magic + jmp .Lower_avx2 +.align 16 +.Lower_avx2: + vmovdqu (%r13),$inout + vpinsrq \$0,%r13,$offload,$offload +___ + $aesni_cbc_idx=0; + for ($i=0; $i<16; ) { + my $base="+16($Tbl)"; + foreach(bodyx_00_15()) { eval; } + &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8); + } +$code.=<<___; + vmovq $offload,%r13 # borrow $a0 + vpextrq \$1,$offload,%r15 # borrow $a2 + vpand $mask14,$temp,$temp + vpor $temp,$iv,$iv + lea -$PUSH8($Tbl),$Tbl + vmovdqu $iv,(%r15,%r13) # write output + lea 16(%r13),%r13 # inp++ + cmp %rsp,$Tbl + jae .Lower_avx2 + + mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2 + lea 16*$SZ(%r13),%r13 + mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3 + add $a1,$A + lea `2*$SZ*($rounds-8)`(%rsp),%rsp + + add $SZ*0(%r15),$A + add $SZ*1(%r15),$B + add $SZ*2(%r15),$C + add $SZ*3(%r15),$D + add $SZ*4(%r15),$E + add $SZ*5(%r15),$F + add $SZ*6(%r15),$G + lea (%rsi,%r13),%r12 + add $SZ*7(%r15),$H + + cmp $_end,%r13 + + mov $A,$SZ*0(%r15) + cmove %rsp,%r12 # next block or stale data + mov $B,$SZ*1(%r15) + mov $C,$SZ*2(%r15) + mov $D,$SZ*3(%r15) + mov $E,$SZ*4(%r15) + mov $F,$SZ*5(%r15) + mov $G,$SZ*6(%r15) + mov $H,$SZ*7(%r15) + + jbe .Loop_avx2 + lea (%rsp),$Tbl + +.Ldone_avx2: + lea ($Tbl),%rsp + mov $_ivp,$ivp + mov $_rsp,%rsi + vmovdqu $iv,($ivp) # output IV + vzeroall +___ +$code.=<<___ if ($win64); + movaps `$framesz+16*0`(%rsp),%xmm6 + movaps `$framesz+16*1`(%rsp),%xmm7 + movaps `$framesz+16*2`(%rsp),%xmm8 + movaps `$framesz+16*3`(%rsp),%xmm9 + movaps `$framesz+16*4`(%rsp),%xmm10 + movaps `$framesz+16*5`(%rsp),%xmm11 + movaps `$framesz+16*6`(%rsp),%xmm12 + movaps `$framesz+16*7`(%rsp),%xmm13 + movaps `$framesz+16*8`(%rsp),%xmm14 + movaps `$framesz+16*9`(%rsp),%xmm15 +___ +$code.=<<___; + mov (%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lepilogue_avx2: + ret +.size ${func}_avx2,.-${func}_avx2 +___ +}} +}} +{{ +my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); + +my ($rounds,$Tbl)=("%r11d","%rbx"); + +my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15)); +my @rndkey=("%xmm4","%xmm5"); +my $r=0; +my $sn=0; + +my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9)); +my @MSG=map("%xmm$_",(10..13)); + +my $aesenc=sub { + use integer; + my ($n,$k)=($r/10,$r%10); + if ($k==0) { + $code.=<<___; + movups `16*$n`($in0),$in # load input + xorps $rndkey0,$in +___ + $code.=<<___ if ($n); + movups $iv,`16*($n-1)`($out,$in0) # write output +___ + $code.=<<___; + xorps $in,$iv + movups `32+16*$k-112`($key),$rndkey[1] + aesenc $rndkey[0],$iv +___ + } elsif ($k==9) { + $sn++; + $code.=<<___; + cmp \$11,$rounds + jb .Laesenclast$sn + movups `32+16*($k+0)-112`($key),$rndkey[1] + aesenc $rndkey[0],$iv + movups `32+16*($k+1)-112`($key),$rndkey[0] + aesenc $rndkey[1],$iv + je .Laesenclast$sn + movups `32+16*($k+2)-112`($key),$rndkey[1] + aesenc $rndkey[0],$iv + movups `32+16*($k+3)-112`($key),$rndkey[0] + aesenc $rndkey[1],$iv +.Laesenclast$sn: + aesenclast $rndkey[0],$iv + movups 16-112($key),$rndkey[1] # forward reference + nop +___ + } else { + $code.=<<___; + movups `32+16*$k-112`($key),$rndkey[1] + aesenc $rndkey[0],$iv +___ + } + $r++; unshift(@rndkey,pop(@rndkey)); +}; + +if ($shaext) { +my $Tbl="%rax"; + +$code.=<<___; +.type ${func}_shaext,\@function,6 +.align 32 +${func}_shaext: + mov `($win64?56:8)`(%rsp),$inp # load 7th argument +___ +$code.=<<___ if ($win64); + lea `-8-10*16`(%rsp),%rsp + movaps %xmm6,-8-10*16(%rax) + movaps %xmm7,-8-9*16(%rax) + movaps %xmm8,-8-8*16(%rax) + movaps %xmm9,-8-7*16(%rax) + movaps %xmm10,-8-6*16(%rax) + movaps %xmm11,-8-5*16(%rax) + movaps %xmm12,-8-4*16(%rax) + movaps %xmm13,-8-3*16(%rax) + movaps %xmm14,-8-2*16(%rax) + movaps %xmm15,-8-1*16(%rax) +.Lprologue_shaext: +___ +$code.=<<___; + lea K256+0x80(%rip),$Tbl + movdqu ($ctx),$ABEF # DCBA + movdqu 16($ctx),$CDGH # HGFE + movdqa 0x200-0x80($Tbl),$TMP # byte swap mask + + mov 240($key),$rounds + sub $in0,$out + movups ($key),$rndkey0 # $key[0] + movups 16($key),$rndkey[0] # forward reference + lea 112($key),$key # size optimization + + pshufd \$0x1b,$ABEF,$Wi # ABCD + pshufd \$0xb1,$ABEF,$ABEF # CDAB + pshufd \$0x1b,$CDGH,$CDGH # EFGH + movdqa $TMP,$BSWAP # offload + palignr \$8,$CDGH,$ABEF # ABEF + punpcklqdq $Wi,$CDGH # CDGH + + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu ($inp),@MSG[0] + movdqu 0x10($inp),@MSG[1] + movdqu 0x20($inp),@MSG[2] + pshufb $TMP,@MSG[0] + movdqu 0x30($inp),@MSG[3] + + movdqa 0*32-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + pshufb $TMP,@MSG[1] + movdqa $CDGH,$CDGH_SAVE # offload + movdqa $ABEF,$ABEF_SAVE # offload +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $ABEF,$CDGH # 0-3 + pshufd \$0x0e,$Wi,$Wi +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $CDGH,$ABEF + + movdqa 1*32-0x80($Tbl),$Wi + paddd @MSG[1],$Wi + pshufb $TMP,@MSG[2] + lea 0x40($inp),$inp +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $ABEF,$CDGH # 4-7 + pshufd \$0x0e,$Wi,$Wi +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $CDGH,$ABEF + + movdqa 2*32-0x80($Tbl),$Wi + paddd @MSG[2],$Wi + pshufb $TMP,@MSG[3] + sha256msg1 @MSG[1],@MSG[0] +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $ABEF,$CDGH # 8-11 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[3],$TMP + palignr \$4,@MSG[2],$TMP + paddd $TMP,@MSG[0] +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $CDGH,$ABEF + + movdqa 3*32-0x80($Tbl),$Wi + paddd @MSG[3],$Wi + sha256msg2 @MSG[3],@MSG[0] + sha256msg1 @MSG[2],@MSG[1] +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $ABEF,$CDGH # 12-15 + pshufd \$0x0e,$Wi,$Wi +___ + &$aesenc(); +$code.=<<___; + movdqa @MSG[0],$TMP + palignr \$4,@MSG[3],$TMP + paddd $TMP,@MSG[1] + sha256rnds2 $CDGH,$ABEF +___ +for($i=4;$i<16-3;$i++) { + &$aesenc() if (($r%10)==0); +$code.=<<___; + movdqa $i*32-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + sha256msg2 @MSG[0],@MSG[1] + sha256msg1 @MSG[3],@MSG[2] +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $ABEF,$CDGH # 16-19... + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[1],$TMP + palignr \$4,@MSG[0],$TMP + paddd $TMP,@MSG[2] +___ + &$aesenc(); + &$aesenc() if ($r==19); +$code.=<<___; + sha256rnds2 $CDGH,$ABEF +___ + push(@MSG,shift(@MSG)); +} +$code.=<<___; + movdqa 13*32-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + sha256msg2 @MSG[0],@MSG[1] + sha256msg1 @MSG[3],@MSG[2] +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $ABEF,$CDGH # 52-55 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[1],$TMP + palignr \$4,@MSG[0],$TMP + paddd $TMP,@MSG[2] +___ + &$aesenc(); + &$aesenc(); +$code.=<<___; + sha256rnds2 $CDGH,$ABEF + + movdqa 14*32-0x80($Tbl),$Wi + paddd @MSG[1],$Wi + sha256msg2 @MSG[1],@MSG[2] + movdqa $BSWAP,$TMP +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $ABEF,$CDGH # 56-59 + pshufd \$0x0e,$Wi,$Wi +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $CDGH,$ABEF + + movdqa 15*32-0x80($Tbl),$Wi + paddd @MSG[2],$Wi +___ + &$aesenc(); + &$aesenc(); +$code.=<<___; + sha256rnds2 $ABEF,$CDGH # 60-63 + pshufd \$0x0e,$Wi,$Wi +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $CDGH,$ABEF + #pxor $CDGH,$rndkey0 # black magic +___ + while ($r<40) { &$aesenc(); } # remaining aesenc's +$code.=<<___; + #xorps $CDGH,$rndkey0 # black magic + paddd $CDGH_SAVE,$CDGH + paddd $ABEF_SAVE,$ABEF + + dec $len + movups $iv,48($out,$in0) # write output + lea 64($in0),$in0 + jnz .Loop_shaext + + pshufd \$0xb1,$CDGH,$CDGH # DCHG + pshufd \$0x1b,$ABEF,$TMP # FEBA + pshufd \$0xb1,$ABEF,$ABEF # BAFE + punpckhqdq $CDGH,$ABEF # DCBA + palignr \$8,$TMP,$CDGH # HGFE + + movups $iv,($ivp) # write IV + movdqu $ABEF,($ctx) + movdqu $CDGH,16($ctx) +___ +$code.=<<___ if ($win64); + movaps 0*16(%rsp),%xmm6 + movaps 1*16(%rsp),%xmm7 + movaps 2*16(%rsp),%xmm8 + movaps 3*16(%rsp),%xmm9 + movaps 4*16(%rsp),%xmm10 + movaps 5*16(%rsp),%xmm11 + movaps 6*16(%rsp),%xmm12 + movaps 7*16(%rsp),%xmm13 + movaps 8*16(%rsp),%xmm14 + movaps 9*16(%rsp),%xmm15 + lea 8+10*16(%rsp),%rsp +.Lepilogue_shaext: +___ +$code.=<<___; + ret +.size ${func}_shaext,.-${func}_shaext +___ +} +}}}}} + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___ if ($avx); +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HanderlData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lin_prologue +___ +$code.=<<___ if ($shaext); + lea aesni_cbc_sha256_enc_shaext(%rip),%r10 + cmp %r10,%rbx + jb .Lnot_in_shaext + + lea (%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx + .long 0xa548f3fc # cld; rep movsq + lea 168(%rax),%rax # adjust stack pointer + jmp .Lin_prologue +.Lnot_in_shaext: +___ +$code.=<<___ if ($avx>1); + lea .Lavx2_shortcut(%rip),%r10 + cmp %r10,%rbx # context->RipRbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + + lea 16*$SZ+8*8(%rsi),%rsi # Xmm6- save area + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx + .long 0xa548f3fc # cld; rep movsq + +.Lin_prologue: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler + +.section .pdata + .rva .LSEH_begin_${func}_xop + .rva .LSEH_end_${func}_xop + .rva .LSEH_info_${func}_xop + + .rva .LSEH_begin_${func}_avx + .rva .LSEH_end_${func}_avx + .rva .LSEH_info_${func}_avx +___ +$code.=<<___ if ($avx>1); + .rva .LSEH_begin_${func}_avx2 + .rva .LSEH_end_${func}_avx2 + .rva .LSEH_info_${func}_avx2 +___ +$code.=<<___ if ($shaext); + .rva .LSEH_begin_${func}_shaext + .rva .LSEH_end_${func}_shaext + .rva .LSEH_info_${func}_shaext +___ +$code.=<<___ if ($avx); +.section .xdata +.align 8 +.LSEH_info_${func}_xop: + .byte 9,0,0,0 + .rva se_handler + .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[] + +.LSEH_info_${func}_avx: + .byte 9,0,0,0 + .rva se_handler + .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] +___ +$code.=<<___ if ($avx>1); +.LSEH_info_${func}_avx2: + .byte 9,0,0,0 + .rva se_handler + .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[] +___ +$code.=<<___ if ($shaext); +.LSEH_info_${func}_shaext: + .byte 9,0,0,0 + .rva se_handler + .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[] +___ +} + +#################################################################### +sub rex { + local *opcode=shift; + my ($dst,$src)=@_; + my $rex=0; + + $rex|=0x04 if($dst>=8); + $rex|=0x01 if($src>=8); + unshift @opcode,$rex|0x40 if($rex); +} + +{ + my %opcodelet = ( + "sha256rnds2" => 0xcb, + "sha256msg1" => 0xcc, + "sha256msg2" => 0xcd ); + + sub sha256op38 { + my $instr = shift; + + if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x0f,0x38); + rex(\@opcode,$2,$1); + push @opcode,$opcodelet{$instr}; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + return ".byte\t".join(',',@opcode); + } else { + return $instr."\t".@_[0]; + } + } +} + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem; +print $code; +close STDOUT; diff --git a/openssl/crypto/aes/asm/aesni-x86.pl b/openssl/crypto/aes/asm/aesni-x86.pl index 3dc345b58..3deb86aed 100644 --- a/openssl/crypto/aes/asm/aesni-x86.pl +++ b/openssl/crypto/aes/asm/aesni-x86.pl @@ -1,7 +1,7 @@ #!/usr/bin/env perl # ==================================================================== -# Written by Andy Polyakov for the OpenSSL +# Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -43,6 +43,17 @@ # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09. +###################################################################### +# Current large-block performance in cycles per byte processed with +# 128-bit key (less is better). +# +# CBC en-/decrypt CTR XTS ECB +# Westmere 3.77/1.37 1.37 1.52 1.27 +# * Bridge 5.07/0.98 0.99 1.09 0.91 +# Haswell 4.44/0.80 0.97 1.03 0.72 +# Atom 5.77/3.56 3.67 4.03 3.46 +# Bulldozer 5.80/0.98 1.05 1.24 0.93 + $PREFIX="aesni"; # if $PREFIX is set to "AES", the script # generates drop-in replacement for # crypto/aes/asm/aes-586.pl:-) @@ -54,8 +65,8 @@ require "x86asm.pl"; &asm_init($ARGV[0],$0); -if ($PREFIX eq "aesni") { $movekey=*movups; } -else { $movekey=*movups; } +if ($PREFIX eq "aesni") { $movekey=\&movups; } +else { $movekey=\&movups; } $len="eax"; $rounds="ecx"; @@ -196,37 +207,71 @@ sub aesni_generate1 # fully unrolled loop # every *2nd* cycle. Thus 3x interleave was the one providing optimal # utilization, i.e. when subroutine's throughput is virtually same as # of non-interleaved subroutine [for number of input blocks up to 3]. -# This is why it makes no sense to implement 2x subroutine. -# aes[enc|dec] latency in next processor generation is 8, but the -# instructions can be scheduled every cycle. Optimal interleave for -# new processor is therefore 8x, but it's unfeasible to accommodate it -# in XMM registers addreassable in 32-bit mode and therefore 6x is -# used instead... +# This is why it originally made no sense to implement 2x subroutine. +# But times change and it became appropriate to spend extra 192 bytes +# on 2x subroutine on Atom Silvermont account. For processors that +# can schedule aes[enc|dec] every cycle optimal interleave factor +# equals to corresponding instructions latency. 8x is optimal for +# * Bridge, but it's unfeasible to accommodate such implementation +# in XMM registers addreassable in 32-bit mode and therefore maximum +# of 6x is used instead... + +sub aesni_generate2 +{ my $p=shift; + + &function_begin_B("_aesni_${p}rypt2"); + &$movekey ($rndkey0,&QWP(0,$key)); + &shl ($rounds,4); + &$movekey ($rndkey1,&QWP(16,$key)); + &xorps ($inout0,$rndkey0); + &pxor ($inout1,$rndkey0); + &$movekey ($rndkey0,&QWP(32,$key)); + &lea ($key,&DWP(32,$key,$rounds)); + &neg ($rounds); + &add ($rounds,16); + + &set_label("${p}2_loop"); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0,$key,$rounds)); + &add ($rounds,32); + eval"&aes${p} ($inout0,$rndkey0)"; + eval"&aes${p} ($inout1,$rndkey0)"; + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); + &jnz (&label("${p}2_loop")); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p}last ($inout0,$rndkey0)"; + eval"&aes${p}last ($inout1,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt2"); +} sub aesni_generate3 { my $p=shift; &function_begin_B("_aesni_${p}rypt3"); &$movekey ($rndkey0,&QWP(0,$key)); - &shr ($rounds,1); + &shl ($rounds,4); &$movekey ($rndkey1,&QWP(16,$key)); - &lea ($key,&DWP(32,$key)); &xorps ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); &pxor ($inout2,$rndkey0); - &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey0,&QWP(32,$key)); + &lea ($key,&DWP(32,$key,$rounds)); + &neg ($rounds); + &add ($rounds,16); &set_label("${p}3_loop"); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; - &dec ($rounds); eval"&aes${p} ($inout2,$rndkey1)"; - &$movekey ($rndkey1,&QWP(16,$key)); + &$movekey ($rndkey1,&QWP(0,$key,$rounds)); + &add ($rounds,32); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout1,$rndkey0)"; - &lea ($key,&DWP(32,$key)); eval"&aes${p} ($inout2,$rndkey0)"; - &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); &jnz (&label("${p}3_loop")); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; @@ -248,27 +293,29 @@ sub aesni_generate4 &function_begin_B("_aesni_${p}rypt4"); &$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey1,&QWP(16,$key)); - &shr ($rounds,1); - &lea ($key,&DWP(32,$key)); + &shl ($rounds,4); &xorps ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); &pxor ($inout2,$rndkey0); &pxor ($inout3,$rndkey0); - &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey0,&QWP(32,$key)); + &lea ($key,&DWP(32,$key,$rounds)); + &neg ($rounds); + &data_byte (0x0f,0x1f,0x40,0x00); + &add ($rounds,16); &set_label("${p}4_loop"); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; - &dec ($rounds); eval"&aes${p} ($inout2,$rndkey1)"; eval"&aes${p} ($inout3,$rndkey1)"; - &$movekey ($rndkey1,&QWP(16,$key)); + &$movekey ($rndkey1,&QWP(0,$key,$rounds)); + &add ($rounds,32); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout1,$rndkey0)"; - &lea ($key,&DWP(32,$key)); eval"&aes${p} ($inout2,$rndkey0)"; eval"&aes${p} ($inout3,$rndkey0)"; - &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); &jnz (&label("${p}4_loop")); eval"&aes${p} ($inout0,$rndkey1)"; @@ -289,43 +336,43 @@ sub aesni_generate6 &function_begin_B("_aesni_${p}rypt6"); &static_label("_aesni_${p}rypt6_enter"); &$movekey ($rndkey0,&QWP(0,$key)); - &shr ($rounds,1); + &shl ($rounds,4); &$movekey ($rndkey1,&QWP(16,$key)); - &lea ($key,&DWP(32,$key)); &xorps ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); # pxor does better here - eval"&aes${p} ($inout0,$rndkey1)"; &pxor ($inout2,$rndkey0); - eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p} ($inout0,$rndkey1)"; &pxor ($inout3,$rndkey0); - &dec ($rounds); - eval"&aes${p} ($inout2,$rndkey1)"; &pxor ($inout4,$rndkey0); - eval"&aes${p} ($inout3,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + &lea ($key,&DWP(32,$key,$rounds)); + &neg ($rounds); + eval"&aes${p} ($inout2,$rndkey1)"; &pxor ($inout5,$rndkey0); + &add ($rounds,16); + eval"&aes${p} ($inout3,$rndkey1)"; eval"&aes${p} ($inout4,$rndkey1)"; - &$movekey ($rndkey0,&QWP(0,$key)); eval"&aes${p} ($inout5,$rndkey1)"; + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); &jmp (&label("_aesni_${p}rypt6_enter")); &set_label("${p}6_loop",16); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; - &dec ($rounds); eval"&aes${p} ($inout2,$rndkey1)"; eval"&aes${p} ($inout3,$rndkey1)"; eval"&aes${p} ($inout4,$rndkey1)"; eval"&aes${p} ($inout5,$rndkey1)"; - &set_label("_aesni_${p}rypt6_enter",16); - &$movekey ($rndkey1,&QWP(16,$key)); + &set_label("_aesni_${p}rypt6_enter"); + &$movekey ($rndkey1,&QWP(0,$key,$rounds)); + &add ($rounds,32); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout1,$rndkey0)"; - &lea ($key,&DWP(32,$key)); eval"&aes${p} ($inout2,$rndkey0)"; eval"&aes${p} ($inout3,$rndkey0)"; eval"&aes${p} ($inout4,$rndkey0)"; eval"&aes${p} ($inout5,$rndkey0)"; - &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); &jnz (&label("${p}6_loop")); eval"&aes${p} ($inout0,$rndkey1)"; @@ -343,6 +390,8 @@ sub aesni_generate6 &ret(); &function_end_B("_aesni_${p}rypt6"); } +&aesni_generate2("enc") if ($PREFIX eq "aesni"); +&aesni_generate2("dec"); &aesni_generate3("enc") if ($PREFIX eq "aesni"); &aesni_generate3("dec"); &aesni_generate4("enc") if ($PREFIX eq "aesni"); @@ -446,8 +495,7 @@ if ($PREFIX eq "aesni") { &jmp (&label("ecb_ret")); &set_label("ecb_enc_two",16); - &xorps ($inout2,$inout2); - &call ("_aesni_encrypt3"); + &call ("_aesni_encrypt2"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &jmp (&label("ecb_ret")); @@ -547,8 +595,7 @@ if ($PREFIX eq "aesni") { &jmp (&label("ecb_ret")); &set_label("ecb_dec_two",16); - &xorps ($inout2,$inout2); - &call ("_aesni_decrypt3"); + &call ("_aesni_decrypt2"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &jmp (&label("ecb_ret")); @@ -610,11 +657,13 @@ if ($PREFIX eq "aesni") { &mov (&DWP(24,"esp"),$key_); &mov (&DWP(28,"esp"),$key_); - &shr ($rounds,1); + &shl ($rounds,4); + &mov ($rounds_,16); &lea ($key_,&DWP(0,$key)); &movdqa ($inout3,&QWP(0,"esp")); &movdqa ($inout0,$ivec); - &mov ($rounds_,$rounds); + &lea ($key,&DWP(32,$key,$rounds)); + &sub ($rounds_,$rounds); &pshufb ($ivec,$inout3); &set_label("ccm64_enc_outer"); @@ -625,33 +674,31 @@ if ($PREFIX eq "aesni") { &xorps ($inout0,$rndkey0); &$movekey ($rndkey1,&QWP(16,$key_)); &xorps ($rndkey0,$in0); - &lea ($key,&DWP(32,$key_)); &xorps ($cmac,$rndkey0); # cmac^=inp - &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey0,&QWP(32,$key_)); &set_label("ccm64_enc2_loop"); &aesenc ($inout0,$rndkey1); - &dec ($rounds); &aesenc ($cmac,$rndkey1); - &$movekey ($rndkey1,&QWP(16,$key)); + &$movekey ($rndkey1,&QWP(0,$key,$rounds)); + &add ($rounds,32); &aesenc ($inout0,$rndkey0); - &lea ($key,&DWP(32,$key)); &aesenc ($cmac,$rndkey0); - &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); &jnz (&label("ccm64_enc2_loop")); &aesenc ($inout0,$rndkey1); &aesenc ($cmac,$rndkey1); &paddq ($ivec,&QWP(16,"esp")); + &dec ($len); &aesenclast ($inout0,$rndkey0); &aesenclast ($cmac,$rndkey0); - &dec ($len); &lea ($inp,&DWP(16,$inp)); &xorps ($in0,$inout0); # inp^=E(ivec) &movdqa ($inout0,$ivec); &movups (&QWP(0,$out),$in0); # save output - &lea ($out,&DWP(16,$out)); &pshufb ($inout0,$inout3); + &lea ($out,&DWP(16,$out)); &jnz (&label("ccm64_enc_outer")); &mov ("esp",&DWP(48,"esp")); @@ -700,15 +747,19 @@ if ($PREFIX eq "aesni") { { &aesni_inline_generate1("enc"); } else { &call ("_aesni_encrypt1"); } + &shl ($rounds_,4); + &mov ($rounds,16); &movups ($in0,&QWP(0,$inp)); # load inp &paddq ($ivec,&QWP(16,"esp")); &lea ($inp,&QWP(16,$inp)); + &sub ($rounds,$rounds_); + &lea ($key,&DWP(32,$key_,$rounds_)); + &mov ($rounds_,$rounds); &jmp (&label("ccm64_dec_outer")); &set_label("ccm64_dec_outer",16); &xorps ($in0,$inout0); # inp ^= E(ivec) &movdqa ($inout0,$ivec); - &mov ($rounds,$rounds_); &movups (&QWP(0,$out),$in0); # save output &lea ($out,&DWP(16,$out)); &pshufb ($inout0,$inout3); @@ -717,34 +768,33 @@ if ($PREFIX eq "aesni") { &jz (&label("ccm64_dec_break")); &$movekey ($rndkey0,&QWP(0,$key_)); - &shr ($rounds,1); + &mov ($rounds,$rounds_); &$movekey ($rndkey1,&QWP(16,$key_)); &xorps ($in0,$rndkey0); - &lea ($key,&DWP(32,$key_)); &xorps ($inout0,$rndkey0); &xorps ($cmac,$in0); # cmac^=out - &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey0,&QWP(32,$key_)); &set_label("ccm64_dec2_loop"); &aesenc ($inout0,$rndkey1); - &dec ($rounds); &aesenc ($cmac,$rndkey1); - &$movekey ($rndkey1,&QWP(16,$key)); + &$movekey ($rndkey1,&QWP(0,$key,$rounds)); + &add ($rounds,32); &aesenc ($inout0,$rndkey0); - &lea ($key,&DWP(32,$key)); &aesenc ($cmac,$rndkey0); - &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); &jnz (&label("ccm64_dec2_loop")); &movups ($in0,&QWP(0,$inp)); # load inp &paddq ($ivec,&QWP(16,"esp")); &aesenc ($inout0,$rndkey1); &aesenc ($cmac,$rndkey1); - &lea ($inp,&QWP(16,$inp)); &aesenclast ($inout0,$rndkey0); &aesenclast ($cmac,$rndkey0); + &lea ($inp,&QWP(16,$inp)); &jmp (&label("ccm64_dec_outer")); &set_label("ccm64_dec_break",16); + &mov ($rounds,&DWP(240,$key_)); &mov ($key,$key_); if ($inline) { &aesni_inline_generate1("enc",$cmac,$in0); } @@ -763,7 +813,7 @@ if ($PREFIX eq "aesni") { # const char *ivec); # # Handles only complete blocks, operates on 32-bit counter and -# does not update *ivec! (see engine/eng_aesni.c for details) +# does not update *ivec! (see crypto/modes/ctr128.c for details) # # stack layout: # 0 pshufb mask @@ -810,66 +860,61 @@ if ($PREFIX eq "aesni") { # compose 2 vectors of 3x32-bit counters &bswap ($rounds_); - &pxor ($rndkey1,$rndkey1); &pxor ($rndkey0,$rndkey0); + &pxor ($rndkey1,$rndkey1); &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask - &pinsrd ($rndkey1,$rounds_,0); + &pinsrd ($rndkey0,$rounds_,0); &lea ($key_,&DWP(3,$rounds_)); - &pinsrd ($rndkey0,$key_,0); + &pinsrd ($rndkey1,$key_,0); &inc ($rounds_); - &pinsrd ($rndkey1,$rounds_,1); + &pinsrd ($rndkey0,$rounds_,1); &inc ($key_); - &pinsrd ($rndkey0,$key_,1); + &pinsrd ($rndkey1,$key_,1); &inc ($rounds_); - &pinsrd ($rndkey1,$rounds_,2); + &pinsrd ($rndkey0,$rounds_,2); &inc ($key_); - &pinsrd ($rndkey0,$key_,2); - &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet - &pshufb ($rndkey1,$inout0); # byte swap - &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet + &pinsrd ($rndkey1,$key_,2); + &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet &pshufb ($rndkey0,$inout0); # byte swap + &movdqu ($inout4,&QWP(0,$key)); # key[0] + &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet + &pshufb ($rndkey1,$inout0); # byte swap - &pshufd ($inout0,$rndkey1,3<<6); # place counter to upper dword - &pshufd ($inout1,$rndkey1,2<<6); + &pshufd ($inout0,$rndkey0,3<<6); # place counter to upper dword + &pshufd ($inout1,$rndkey0,2<<6); &cmp ($len,6); &jb (&label("ctr32_tail")); - &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec - &shr ($rounds,1); + &pxor ($inout5,$inout4); # counter-less ivec^key[0] + &shl ($rounds,4); + &mov ($rounds_,16); + &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec^key[0] &mov ($key_,$key); # backup $key - &mov ($rounds_,$rounds); # backup $rounds + &sub ($rounds_,$rounds); # backup twisted $rounds + &lea ($key,&DWP(32,$key,$rounds)); &sub ($len,6); &jmp (&label("ctr32_loop6")); &set_label("ctr32_loop6",16); - &pshufd ($inout2,$rndkey1,1<<6); - &movdqa ($rndkey1,&QWP(32,"esp")); # pull counter-less ivec - &pshufd ($inout3,$rndkey0,3<<6); - &por ($inout0,$rndkey1); # merge counter-less ivec - &pshufd ($inout4,$rndkey0,2<<6); - &por ($inout1,$rndkey1); - &pshufd ($inout5,$rndkey0,1<<6); - &por ($inout2,$rndkey1); - &por ($inout3,$rndkey1); - &por ($inout4,$rndkey1); - &por ($inout5,$rndkey1); - - # inlining _aesni_encrypt6's prologue gives ~4% improvement... - &$movekey ($rndkey0,&QWP(0,$key_)); - &$movekey ($rndkey1,&QWP(16,$key_)); - &lea ($key,&DWP(32,$key_)); - &dec ($rounds); - &pxor ($inout0,$rndkey0); + # inlining _aesni_encrypt6's prologue gives ~6% improvement... + &pshufd ($inout2,$rndkey0,1<<6); + &movdqa ($rndkey0,&QWP(32,"esp")); # pull counter-less ivec + &pshufd ($inout3,$rndkey1,3<<6); + &pxor ($inout0,$rndkey0); # merge counter-less ivec + &pshufd ($inout4,$rndkey1,2<<6); &pxor ($inout1,$rndkey0); - &aesenc ($inout0,$rndkey1); + &pshufd ($inout5,$rndkey1,1<<6); + &$movekey ($rndkey1,&QWP(16,$key_)); &pxor ($inout2,$rndkey0); - &aesenc ($inout1,$rndkey1); &pxor ($inout3,$rndkey0); - &aesenc ($inout2,$rndkey1); + &aesenc ($inout0,$rndkey1); &pxor ($inout4,$rndkey0); - &aesenc ($inout3,$rndkey1); &pxor ($inout5,$rndkey0); + &aesenc ($inout1,$rndkey1); + &$movekey ($rndkey0,&QWP(32,$key_)); + &mov ($rounds,$rounds_); + &aesenc ($inout2,$rndkey1); + &aesenc ($inout3,$rndkey1); &aesenc ($inout4,$rndkey1); - &$movekey ($rndkey0,&QWP(0,$key)); &aesenc ($inout5,$rndkey1); &call (&label("_aesni_encrypt6_enter")); @@ -882,12 +927,12 @@ if ($PREFIX eq "aesni") { &movups (&QWP(0,$out),$inout0); &movdqa ($rndkey0,&QWP(16,"esp")); # load increment &xorps ($inout2,$rndkey1); - &movdqa ($rndkey1,&QWP(48,"esp")); # load 1st triplet + &movdqa ($rndkey1,&QWP(64,"esp")); # load 2nd triplet &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); - &paddd ($rndkey1,$rndkey0); # 1st triplet increment - &paddd ($rndkey0,&QWP(64,"esp")); # 2nd triplet increment + &paddd ($rndkey1,$rndkey0); # 2nd triplet increment + &paddd ($rndkey0,&QWP(48,"esp")); # 1st triplet increment &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask &movups ($inout1,&QWP(0x30,$inp)); @@ -895,44 +940,44 @@ if ($PREFIX eq "aesni") { &xorps ($inout3,$inout1); &movups ($inout1,&QWP(0x50,$inp)); &lea ($inp,&DWP(0x60,$inp)); - &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet - &pshufb ($rndkey1,$inout0); # byte swap + &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet + &pshufb ($rndkey0,$inout0); # byte swap &xorps ($inout4,$inout2); &movups (&QWP(0x30,$out),$inout3); &xorps ($inout5,$inout1); - &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet - &pshufb ($rndkey0,$inout0); # byte swap + &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet + &pshufb ($rndkey1,$inout0); # byte swap &movups (&QWP(0x40,$out),$inout4); - &pshufd ($inout0,$rndkey1,3<<6); + &pshufd ($inout0,$rndkey0,3<<6); &movups (&QWP(0x50,$out),$inout5); &lea ($out,&DWP(0x60,$out)); - &mov ($rounds,$rounds_); - &pshufd ($inout1,$rndkey1,2<<6); + &pshufd ($inout1,$rndkey0,2<<6); &sub ($len,6); &jnc (&label("ctr32_loop6")); &add ($len,6); &jz (&label("ctr32_ret")); + &movdqu ($inout5,&QWP(0,$key_)); &mov ($key,$key_); - &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds - &movdqa ($inout5,&QWP(32,"esp")); # pull count-less ivec + &pxor ($inout5,&QWP(32,"esp")); # restore count-less ivec + &mov ($rounds,&DWP(240,$key_)); # restore $rounds &set_label("ctr32_tail"); &por ($inout0,$inout5); &cmp ($len,2); &jb (&label("ctr32_one")); - &pshufd ($inout2,$rndkey1,1<<6); + &pshufd ($inout2,$rndkey0,1<<6); &por ($inout1,$inout5); &je (&label("ctr32_two")); - &pshufd ($inout3,$rndkey0,3<<6); + &pshufd ($inout3,$rndkey1,3<<6); &por ($inout2,$inout5); &cmp ($len,4); &jb (&label("ctr32_three")); - &pshufd ($inout4,$rndkey0,2<<6); + &pshufd ($inout4,$rndkey1,2<<6); &por ($inout3,$inout5); &je (&label("ctr32_four")); @@ -970,7 +1015,7 @@ if ($PREFIX eq "aesni") { &jmp (&label("ctr32_ret")); &set_label("ctr32_two",16); - &call ("_aesni_encrypt3"); + &call ("_aesni_encrypt2"); &movups ($inout3,&QWP(0,$inp)); &movups ($inout4,&QWP(0x10,$inp)); &xorps ($inout0,$inout3); @@ -1057,8 +1102,10 @@ if ($PREFIX eq "aesni") { &sub ($len,16*6); &jc (&label("xts_enc_short")); - &shr ($rounds,1); - &mov ($rounds_,$rounds); + &shl ($rounds,4); + &mov ($rounds_,16); + &sub ($rounds_,$rounds); + &lea ($key,&DWP(32,$key,$rounds)); &jmp (&label("xts_enc_loop6")); &set_label("xts_enc_loop6",16); @@ -1080,6 +1127,7 @@ if ($PREFIX eq "aesni") { &pxor ($inout5,$tweak); # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] + &mov ($rounds,$rounds_); # restore $rounds &movdqu ($inout1,&QWP(16*1,$inp)); &xorps ($inout0,$rndkey0); # input^=rndkey[0] &movdqu ($inout2,&QWP(16*2,$inp)); @@ -1096,19 +1144,17 @@ if ($PREFIX eq "aesni") { &pxor ($inout5,$rndkey1); &$movekey ($rndkey1,&QWP(16,$key_)); - &lea ($key,&DWP(32,$key_)); &pxor ($inout1,&QWP(16*1,"esp")); - &aesenc ($inout0,$rndkey1); &pxor ($inout2,&QWP(16*2,"esp")); - &aesenc ($inout1,$rndkey1); + &aesenc ($inout0,$rndkey1); &pxor ($inout3,&QWP(16*3,"esp")); - &dec ($rounds); - &aesenc ($inout2,$rndkey1); &pxor ($inout4,&QWP(16*4,"esp")); - &aesenc ($inout3,$rndkey1); + &aesenc ($inout1,$rndkey1); &pxor ($inout5,$rndkey0); + &$movekey ($rndkey0,&QWP(32,$key_)); + &aesenc ($inout2,$rndkey1); + &aesenc ($inout3,$rndkey1); &aesenc ($inout4,$rndkey1); - &$movekey ($rndkey0,&QWP(0,$key)); &aesenc ($inout5,$rndkey1); &call (&label("_aesni_encrypt6_enter")); @@ -1135,13 +1181,12 @@ if ($PREFIX eq "aesni") { &paddq ($tweak,$tweak); # &psllq($tweak,1); &pand ($twres,$twmask); # isolate carry and residue &pcmpgtd($twtmp,$tweak); # broadcast upper bits - &mov ($rounds,$rounds_); # restore $rounds &pxor ($tweak,$twres); &sub ($len,16*6); &jnc (&label("xts_enc_loop6")); - &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds + &mov ($rounds,&DWP(240,$key_)); # restore $rounds &mov ($key,$key_); # restore $key &mov ($rounds_,$rounds); @@ -1241,9 +1286,8 @@ if ($PREFIX eq "aesni") { &lea ($inp,&DWP(16*2,$inp)); &xorps ($inout0,$inout3); # input^=tweak &xorps ($inout1,$inout4); - &xorps ($inout2,$inout2); - &call ("_aesni_encrypt3"); + &call ("_aesni_encrypt2"); &xorps ($inout0,$inout3); # output^=tweak &xorps ($inout1,$inout4); @@ -1399,8 +1443,10 @@ if ($PREFIX eq "aesni") { &sub ($len,16*6); &jc (&label("xts_dec_short")); - &shr ($rounds,1); - &mov ($rounds_,$rounds); + &shl ($rounds,4); + &mov ($rounds_,16); + &sub ($rounds_,$rounds); + &lea ($key,&DWP(32,$key,$rounds)); &jmp (&label("xts_dec_loop6")); &set_label("xts_dec_loop6",16); @@ -1422,6 +1468,7 @@ if ($PREFIX eq "aesni") { &pxor ($inout5,$tweak); # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] + &mov ($rounds,$rounds_); &movdqu ($inout1,&QWP(16*1,$inp)); &xorps ($inout0,$rndkey0); # input^=rndkey[0] &movdqu ($inout2,&QWP(16*2,$inp)); @@ -1438,19 +1485,17 @@ if ($PREFIX eq "aesni") { &pxor ($inout5,$rndkey1); &$movekey ($rndkey1,&QWP(16,$key_)); - &lea ($key,&DWP(32,$key_)); &pxor ($inout1,&QWP(16*1,"esp")); - &aesdec ($inout0,$rndkey1); &pxor ($inout2,&QWP(16*2,"esp")); - &aesdec ($inout1,$rndkey1); + &aesdec ($inout0,$rndkey1); &pxor ($inout3,&QWP(16*3,"esp")); - &dec ($rounds); - &aesdec ($inout2,$rndkey1); &pxor ($inout4,&QWP(16*4,"esp")); - &aesdec ($inout3,$rndkey1); + &aesdec ($inout1,$rndkey1); &pxor ($inout5,$rndkey0); + &$movekey ($rndkey0,&QWP(32,$key_)); + &aesdec ($inout2,$rndkey1); + &aesdec ($inout3,$rndkey1); &aesdec ($inout4,$rndkey1); - &$movekey ($rndkey0,&QWP(0,$key)); &aesdec ($inout5,$rndkey1); &call (&label("_aesni_decrypt6_enter")); @@ -1477,13 +1522,12 @@ if ($PREFIX eq "aesni") { &paddq ($tweak,$tweak); # &psllq($tweak,1); &pand ($twres,$twmask); # isolate carry and residue &pcmpgtd($twtmp,$tweak); # broadcast upper bits - &mov ($rounds,$rounds_); # restore $rounds &pxor ($tweak,$twres); &sub ($len,16*6); &jnc (&label("xts_dec_loop6")); - &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds + &mov ($rounds,&DWP(240,$key_)); # restore $rounds &mov ($key,$key_); # restore $key &mov ($rounds_,$rounds); @@ -1584,7 +1628,7 @@ if ($PREFIX eq "aesni") { &xorps ($inout0,$inout3); # input^=tweak &xorps ($inout1,$inout4); - &call ("_aesni_decrypt3"); + &call ("_aesni_decrypt2"); &xorps ($inout0,$inout3); # output^=tweak &xorps ($inout1,$inout4); @@ -1816,7 +1860,7 @@ if ($PREFIX eq "aesni") { &movups (&QWP(0x10,$out),$inout1); &lea ($inp,&DWP(0x60,$inp)); &movups (&QWP(0x20,$out),$inout2); - &mov ($rounds,$rounds_) # restore $rounds + &mov ($rounds,$rounds_); # restore $rounds &movups (&QWP(0x30,$out),$inout3); &mov ($key,$key_); # restore $key &movups (&QWP(0x40,$out),$inout4); @@ -1884,8 +1928,7 @@ if ($PREFIX eq "aesni") { &jmp (&label("cbc_dec_tail_collected")); &set_label("cbc_dec_two",16); - &xorps ($inout2,$inout2); - &call ("_aesni_decrypt3"); + &call ("_aesni_decrypt2"); &xorps ($inout0,$ivec); &xorps ($inout1,$in0); &movups (&QWP(0,$out),$inout0); @@ -2015,7 +2058,7 @@ if ($PREFIX eq "aesni") { &set_label("12rounds",16); &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey &mov ($rounds,11); - &$movekey (&QWP(-16,$key),"xmm0") # round 0 + &$movekey (&QWP(-16,$key),"xmm0"); # round 0 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2 &call (&label("key_192a_cold")); &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3 @@ -2152,7 +2195,7 @@ if ($PREFIX eq "aesni") { &mov ($key,&wparam(2)); &call ("_aesni_set_encrypt_key"); &mov ($key,&wparam(2)); - &shl ($rounds,4) # rounds-1 after _aesni_set_encrypt_key + &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key &test ("eax","eax"); &jnz (&label("dec_key_ret")); &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule diff --git a/openssl/crypto/aes/asm/aesni-x86_64.pl b/openssl/crypto/aes/asm/aesni-x86_64.pl index c9270dfdd..5f6174635 100644 --- a/openssl/crypto/aes/asm/aesni-x86_64.pl +++ b/openssl/crypto/aes/asm/aesni-x86_64.pl @@ -1,7 +1,7 @@ #!/usr/bin/env perl # # ==================================================================== -# Written by Andy Polyakov for the OpenSSL +# Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -129,8 +129,8 @@ # # Further data for other parallelizable modes: # -# CBC decrypt 1.16 0.93 0.93 -# CTR 1.14 0.91 n/a +# CBC decrypt 1.16 0.93 0.74 +# CTR 1.14 0.91 0.74 # # Well, given 3x column it's probably inappropriate to call the limit # asymptotic, if it can be surpassed, isn't it? What happens there? @@ -153,10 +153,25 @@ # April 2011 # -# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing -# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like +# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing +# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like # in CTR mode AES instruction interleave factor was chosen to be 6x. +###################################################################### +# Current large-block performance in cycles per byte processed with +# 128-bit key (less is better). +# +# CBC en-/decrypt CTR XTS ECB +# Westmere 3.77/1.25 1.25 1.25 1.26 +# * Bridge 5.07/0.74 0.75 0.90 0.85 +# Haswell 4.44/0.63 0.63 0.73 0.63 +# Atom 5.75/3.54 3.56 4.12 3.87(*) +# Bulldozer 5.77/0.70 0.72 0.90 0.70 +# +# (*) Atom ECB result is suboptimal because of penalties incurred +# by operations on %xmm8-15. As ECB is not considered +# critical, nothing was done to mitigate the problem. + $PREFIX="aesni"; # if $PREFIX is set to "AES", the script # generates drop-in replacement for # crypto/aes/asm/aes-x86_64.pl:-) @@ -180,6 +195,7 @@ $movkey = $PREFIX eq "aesni" ? "movups" : "movups"; ("%rdi","%rsi","%rdx","%rcx"); # Unix order $code=".text\n"; +$code.=".extern OPENSSL_ia32cap_P\n"; $rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!! # this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ... @@ -272,10 +288,49 @@ ___ # every *2nd* cycle. Thus 3x interleave was the one providing optimal # utilization, i.e. when subroutine's throughput is virtually same as # of non-interleaved subroutine [for number of input blocks up to 3]. -# This is why it makes no sense to implement 2x subroutine. -# aes[enc|dec] latency in next processor generation is 8, but the -# instructions can be scheduled every cycle. Optimal interleave for -# new processor is therefore 8x... +# This is why it originally made no sense to implement 2x subroutine. +# But times change and it became appropriate to spend extra 192 bytes +# on 2x subroutine on Atom Silvermont account. For processors that +# can schedule aes[enc|dec] every cycle optimal interleave factor +# equals to corresponding instructions latency. 8x is optimal for +# * Bridge and "super-optimal" for other Intel CPUs... + +sub aesni_generate2 { +my $dir=shift; +# As already mentioned it takes in $key and $rounds, which are *not* +# preserved. $inout[0-1] is cipher/clear text... +$code.=<<___; +.type _aesni_${dir}rypt2,\@abi-omnipotent +.align 16 +_aesni_${dir}rypt2: + $movkey ($key),$rndkey0 + shl \$4,$rounds + $movkey 16($key),$rndkey1 + xorps $rndkey0,$inout0 + xorps $rndkey0,$inout1 + $movkey 32($key),$rndkey0 + lea 32($key,$rounds),$key + neg %rax # $rounds + add \$16,%rax + +.L${dir}_loop2: + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + aes${dir} $rndkey0,$inout0 + aes${dir} $rndkey0,$inout1 + $movkey -16($key,%rax),$rndkey0 + jnz .L${dir}_loop2 + + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + aes${dir}last $rndkey0,$inout0 + aes${dir}last $rndkey0,$inout1 + ret +.size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2 +___ +} sub aesni_generate3 { my $dir=shift; # As already mentioned it takes in $key and $rounds, which are *not* @@ -285,25 +340,26 @@ $code.=<<___; .align 16 _aesni_${dir}rypt3: $movkey ($key),$rndkey0 - shr \$1,$rounds + shl \$4,$rounds $movkey 16($key),$rndkey1 - lea 32($key),$key xorps $rndkey0,$inout0 xorps $rndkey0,$inout1 xorps $rndkey0,$inout2 - $movkey ($key),$rndkey0 + $movkey 32($key),$rndkey0 + lea 32($key,$rounds),$key + neg %rax # $rounds + add \$16,%rax .L${dir}_loop3: aes${dir} $rndkey1,$inout0 aes${dir} $rndkey1,$inout1 - dec $rounds aes${dir} $rndkey1,$inout2 - $movkey 16($key),$rndkey1 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax aes${dir} $rndkey0,$inout0 aes${dir} $rndkey0,$inout1 - lea 32($key),$key aes${dir} $rndkey0,$inout2 - $movkey ($key),$rndkey0 + $movkey -16($key,%rax),$rndkey0 jnz .L${dir}_loop3 aes${dir} $rndkey1,$inout0 @@ -329,28 +385,30 @@ $code.=<<___; .align 16 _aesni_${dir}rypt4: $movkey ($key),$rndkey0 - shr \$1,$rounds + shl \$4,$rounds $movkey 16($key),$rndkey1 - lea 32($key),$key xorps $rndkey0,$inout0 xorps $rndkey0,$inout1 xorps $rndkey0,$inout2 xorps $rndkey0,$inout3 - $movkey ($key),$rndkey0 + $movkey 32($key),$rndkey0 + lea 32($key,$rounds),$key + neg %rax # $rounds + .byte 0x0f,0x1f,0x00 + add \$16,%rax .L${dir}_loop4: aes${dir} $rndkey1,$inout0 aes${dir} $rndkey1,$inout1 - dec $rounds aes${dir} $rndkey1,$inout2 aes${dir} $rndkey1,$inout3 - $movkey 16($key),$rndkey1 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax aes${dir} $rndkey0,$inout0 aes${dir} $rndkey0,$inout1 - lea 32($key),$key aes${dir} $rndkey0,$inout2 aes${dir} $rndkey0,$inout3 - $movkey ($key),$rndkey0 + $movkey -16($key,%rax),$rndkey0 jnz .L${dir}_loop4 aes${dir} $rndkey1,$inout0 @@ -374,43 +432,43 @@ $code.=<<___; .align 16 _aesni_${dir}rypt6: $movkey ($key),$rndkey0 - shr \$1,$rounds + shl \$4,$rounds $movkey 16($key),$rndkey1 - lea 32($key),$key xorps $rndkey0,$inout0 pxor $rndkey0,$inout1 - aes${dir} $rndkey1,$inout0 pxor $rndkey0,$inout2 + aes${dir} $rndkey1,$inout0 + lea 32($key,$rounds),$key + neg %rax # $rounds aes${dir} $rndkey1,$inout1 pxor $rndkey0,$inout3 - aes${dir} $rndkey1,$inout2 pxor $rndkey0,$inout4 - aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey1,$inout2 pxor $rndkey0,$inout5 - dec $rounds + add \$16,%rax + aes${dir} $rndkey1,$inout3 aes${dir} $rndkey1,$inout4 - $movkey ($key),$rndkey0 aes${dir} $rndkey1,$inout5 + $movkey -16($key,%rax),$rndkey0 jmp .L${dir}_loop6_enter .align 16 .L${dir}_loop6: aes${dir} $rndkey1,$inout0 aes${dir} $rndkey1,$inout1 - dec $rounds aes${dir} $rndkey1,$inout2 aes${dir} $rndkey1,$inout3 aes${dir} $rndkey1,$inout4 aes${dir} $rndkey1,$inout5 -.L${dir}_loop6_enter: # happens to be 16-byte aligned - $movkey 16($key),$rndkey1 +.L${dir}_loop6_enter: + $movkey ($key,%rax),$rndkey1 + add \$32,%rax aes${dir} $rndkey0,$inout0 aes${dir} $rndkey0,$inout1 - lea 32($key),$key aes${dir} $rndkey0,$inout2 aes${dir} $rndkey0,$inout3 aes${dir} $rndkey0,$inout4 aes${dir} $rndkey0,$inout5 - $movkey ($key),$rndkey0 + $movkey -16($key,%rax),$rndkey0 jnz .L${dir}_loop6 aes${dir} $rndkey1,$inout0 @@ -438,52 +496,51 @@ $code.=<<___; .align 16 _aesni_${dir}rypt8: $movkey ($key),$rndkey0 - shr \$1,$rounds + shl \$4,$rounds $movkey 16($key),$rndkey1 - lea 32($key),$key xorps $rndkey0,$inout0 xorps $rndkey0,$inout1 - aes${dir} $rndkey1,$inout0 pxor $rndkey0,$inout2 - aes${dir} $rndkey1,$inout1 pxor $rndkey0,$inout3 - aes${dir} $rndkey1,$inout2 pxor $rndkey0,$inout4 - aes${dir} $rndkey1,$inout3 + lea 32($key,$rounds),$key + neg %rax # $rounds + aes${dir} $rndkey1,$inout0 + add \$16,%rax pxor $rndkey0,$inout5 - dec $rounds - aes${dir} $rndkey1,$inout4 + aes${dir} $rndkey1,$inout1 pxor $rndkey0,$inout6 - aes${dir} $rndkey1,$inout5 pxor $rndkey0,$inout7 - $movkey ($key),$rndkey0 + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey1,$inout4 + aes${dir} $rndkey1,$inout5 aes${dir} $rndkey1,$inout6 aes${dir} $rndkey1,$inout7 - $movkey 16($key),$rndkey1 + $movkey -16($key,%rax),$rndkey0 jmp .L${dir}_loop8_enter .align 16 .L${dir}_loop8: aes${dir} $rndkey1,$inout0 aes${dir} $rndkey1,$inout1 - dec $rounds aes${dir} $rndkey1,$inout2 aes${dir} $rndkey1,$inout3 aes${dir} $rndkey1,$inout4 aes${dir} $rndkey1,$inout5 aes${dir} $rndkey1,$inout6 aes${dir} $rndkey1,$inout7 - $movkey 16($key),$rndkey1 -.L${dir}_loop8_enter: # happens to be 16-byte aligned +.L${dir}_loop8_enter: + $movkey ($key,%rax),$rndkey1 + add \$32,%rax aes${dir} $rndkey0,$inout0 aes${dir} $rndkey0,$inout1 - lea 32($key),$key aes${dir} $rndkey0,$inout2 aes${dir} $rndkey0,$inout3 aes${dir} $rndkey0,$inout4 aes${dir} $rndkey0,$inout5 aes${dir} $rndkey0,$inout6 aes${dir} $rndkey0,$inout7 - $movkey ($key),$rndkey0 + $movkey -16($key,%rax),$rndkey0 jnz .L${dir}_loop8 aes${dir} $rndkey1,$inout0 @@ -506,6 +563,8 @@ _aesni_${dir}rypt8: .size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8 ___ } +&aesni_generate2("enc") if ($PREFIX eq "aesni"); +&aesni_generate2("dec"); &aesni_generate3("enc") if ($PREFIX eq "aesni"); &aesni_generate3("dec"); &aesni_generate4("enc") if ($PREFIX eq "aesni"); @@ -637,8 +696,7 @@ $code.=<<___; jmp .Lecb_ret .align 16 .Lecb_enc_two: - xorps $inout2,$inout2 - call _aesni_encrypt3 + call _aesni_encrypt2 movups $inout0,($out) movups $inout1,0x10($out) jmp .Lecb_ret @@ -774,8 +832,7 @@ $code.=<<___; jmp .Lecb_ret .align 16 .Lecb_dec_two: - xorps $inout2,$inout2 - call _aesni_decrypt3 + call _aesni_decrypt2 movups $inout0,($out) movups $inout1,0x10($out) jmp .Lecb_ret @@ -842,7 +899,8 @@ ___ { my $cmac="%r9"; # 6th argument -my $increment="%xmm6"; +my $increment="%xmm9"; +my $iv="%xmm6"; my $bswap_mask="%xmm7"; $code.=<<___; @@ -865,49 +923,49 @@ $code.=<<___; movdqa .Lincrement64(%rip),$increment movdqa .Lbswap_mask(%rip),$bswap_mask - shr \$1,$rounds + shl \$4,$rounds + mov \$16,$rnds_ lea 0($key),$key_ movdqu ($cmac),$inout1 movdqa $iv,$inout0 - mov $rounds,$rnds_ + lea 32($key,$rounds),$key # end of key schedule pshufb $bswap_mask,$iv + sub %rax,%r10 # twisted $rounds jmp .Lccm64_enc_outer .align 16 .Lccm64_enc_outer: $movkey ($key_),$rndkey0 - mov $rnds_,$rounds + mov %r10,%rax movups ($inp),$in0 # load inp xorps $rndkey0,$inout0 # counter $movkey 16($key_),$rndkey1 xorps $in0,$rndkey0 - lea 32($key_),$key xorps $rndkey0,$inout1 # cmac^=inp - $movkey ($key),$rndkey0 + $movkey 32($key_),$rndkey0 .Lccm64_enc2_loop: aesenc $rndkey1,$inout0 - dec $rounds aesenc $rndkey1,$inout1 - $movkey 16($key),$rndkey1 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax aesenc $rndkey0,$inout0 - lea 32($key),$key aesenc $rndkey0,$inout1 - $movkey 0($key),$rndkey0 + $movkey -16($key,%rax),$rndkey0 jnz .Lccm64_enc2_loop aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout1 paddq $increment,$iv + dec $len aesenclast $rndkey0,$inout0 aesenclast $rndkey0,$inout1 - dec $len lea 16($inp),$inp xorps $inout0,$in0 # inp ^= E(iv) movdqa $iv,$inout0 movups $in0,($out) # save output - lea 16($out),$out pshufb $bswap_mask,$inout0 + lea 16($out),$out jnz .Lccm64_enc_outer movups $inout1,($cmac) @@ -953,15 +1011,19 @@ $code.=<<___; ___ &aesni_generate1("enc",$key,$rounds); $code.=<<___; + shl \$4,$rnds_ + mov \$16,$rounds movups ($inp),$in0 # load inp paddq $increment,$iv lea 16($inp),$inp + sub %r10,%rax # twisted $rounds + lea 32($key_,$rnds_),$key # end of key schedule + mov %rax,%r10 jmp .Lccm64_dec_outer .align 16 .Lccm64_dec_outer: xorps $inout0,$in0 # inp ^= E(iv) movdqa $iv,$inout0 - mov $rnds_,$rounds movups $in0,($out) # save output lea 16($out),$out pshufb $bswap_mask,$inout0 @@ -970,36 +1032,36 @@ $code.=<<___; jz .Lccm64_dec_break $movkey ($key_),$rndkey0 - shr \$1,$rounds + mov %r10,%rax $movkey 16($key_),$rndkey1 xorps $rndkey0,$in0 - lea 32($key_),$key xorps $rndkey0,$inout0 xorps $in0,$inout1 # cmac^=out - $movkey ($key),$rndkey0 - + $movkey 32($key_),$rndkey0 + jmp .Lccm64_dec2_loop +.align 16 .Lccm64_dec2_loop: aesenc $rndkey1,$inout0 - dec $rounds aesenc $rndkey1,$inout1 - $movkey 16($key),$rndkey1 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax aesenc $rndkey0,$inout0 - lea 32($key),$key aesenc $rndkey0,$inout1 - $movkey 0($key),$rndkey0 + $movkey -16($key,%rax),$rndkey0 jnz .Lccm64_dec2_loop movups ($inp),$in0 # load inp paddq $increment,$iv aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout1 - lea 16($inp),$inp aesenclast $rndkey0,$inout0 aesenclast $rndkey0,$inout1 + lea 16($inp),$inp jmp .Lccm64_dec_outer .align 16 .Lccm64_dec_break: #xorps $in0,$inout1 # cmac^=out + mov 240($key_),$rounds ___ &aesni_generate1("enc",$key_,$rounds,$inout1,$in0); $code.=<<___; @@ -1024,220 +1086,479 @@ ___ # const char *ivec); # # Handles only complete blocks, operates on 32-bit counter and -# does not update *ivec! (see engine/eng_aesni.c for details) +# does not update *ivec! (see crypto/modes/ctr128.c for details) # +# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov, +# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest. +# Keywords are full unroll and modulo-schedule counter calculations +# with zero-round key xor. { -my $reserved = $win64?0:-0x28; -my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11)); -my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14"); -my $bswap_mask="%xmm15"; +my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15)); +my ($key0,$ctr)=("${key_}d","${ivp}d"); +my $frame_size = 0x80 + ($win64?160:0); $code.=<<___; .globl aesni_ctr32_encrypt_blocks .type aesni_ctr32_encrypt_blocks,\@function,5 .align 16 aesni_ctr32_encrypt_blocks: + lea (%rsp),%rax + push %rbp + sub \$$frame_size,%rsp + and \$-16,%rsp # Linux kernel stack can be incorrectly seeded ___ $code.=<<___ if ($win64); - lea -0xc8(%rsp),%rsp - movaps %xmm6,0x20(%rsp) - movaps %xmm7,0x30(%rsp) - movaps %xmm8,0x40(%rsp) - movaps %xmm9,0x50(%rsp) - movaps %xmm10,0x60(%rsp) - movaps %xmm11,0x70(%rsp) - movaps %xmm12,0x80(%rsp) - movaps %xmm13,0x90(%rsp) - movaps %xmm14,0xa0(%rsp) - movaps %xmm15,0xb0(%rsp) + movaps %xmm6,-0xa8(%rax) + movaps %xmm7,-0x98(%rax) + movaps %xmm8,-0x88(%rax) + movaps %xmm9,-0x78(%rax) + movaps %xmm10,-0x68(%rax) + movaps %xmm11,-0x58(%rax) + movaps %xmm12,-0x48(%rax) + movaps %xmm13,-0x38(%rax) + movaps %xmm14,-0x28(%rax) + movaps %xmm15,-0x18(%rax) .Lctr32_body: ___ $code.=<<___; + lea -8(%rax),%rbp + cmp \$1,$len je .Lctr32_one_shortcut - movdqu ($ivp),$ivec - movdqa .Lbswap_mask(%rip),$bswap_mask - xor $rounds,$rounds - pextrd \$3,$ivec,$rnds_ # pull 32-bit counter - pinsrd \$3,$rounds,$ivec # wipe 32-bit counter - + movdqu ($ivp),$inout0 + movdqu ($key),$rndkey0 + mov 12($ivp),$ctr # counter LSB + pxor $rndkey0,$inout0 + mov 12($key),$key0 # 0-round key LSB + movdqa $inout0,0x00(%rsp) # populate counter block + bswap $ctr + movdqa $inout0,$inout1 + movdqa $inout0,$inout2 + movdqa $inout0,$inout3 + movdqa $inout0,0x40(%rsp) + movdqa $inout0,0x50(%rsp) + movdqa $inout0,0x60(%rsp) + mov %rdx,%r10 # borrow %rdx + movdqa $inout0,0x70(%rsp) + + lea 1($ctr),%rax + lea 2($ctr),%rdx + bswap %eax + bswap %edx + xor $key0,%eax + xor $key0,%edx + pinsrd \$3,%eax,$inout1 + lea 3($ctr),%rax + movdqa $inout1,0x10(%rsp) + pinsrd \$3,%edx,$inout2 + bswap %eax + mov %r10,%rdx # restore %rdx + lea 4($ctr),%r10 + movdqa $inout2,0x20(%rsp) + xor $key0,%eax + bswap %r10d + pinsrd \$3,%eax,$inout3 + xor $key0,%r10d + movdqa $inout3,0x30(%rsp) + lea 5($ctr),%r9 + mov %r10d,0x40+12(%rsp) + bswap %r9d + lea 6($ctr),%r10 mov 240($key),$rounds # key->rounds - bswap $rnds_ - pxor $iv0,$iv0 # vector of 3 32-bit counters - pxor $iv1,$iv1 # vector of 3 32-bit counters - pinsrd \$0,$rnds_,$iv0 - lea 3($rnds_),$key_ - pinsrd \$0,$key_,$iv1 - inc $rnds_ - pinsrd \$1,$rnds_,$iv0 - inc $key_ - pinsrd \$1,$key_,$iv1 - inc $rnds_ - pinsrd \$2,$rnds_,$iv0 - inc $key_ - pinsrd \$2,$key_,$iv1 - movdqa $iv0,$reserved(%rsp) - pshufb $bswap_mask,$iv0 - movdqa $iv1,`$reserved+0x10`(%rsp) - pshufb $bswap_mask,$iv1 - - pshufd \$`3<<6`,$iv0,$inout0 # place counter to upper dword - pshufd \$`2<<6`,$iv0,$inout1 - pshufd \$`1<<6`,$iv0,$inout2 - cmp \$6,$len + xor $key0,%r9d + bswap %r10d + mov %r9d,0x50+12(%rsp) + xor $key0,%r10d + lea 7($ctr),%r9 + mov %r10d,0x60+12(%rsp) + bswap %r9d + mov OPENSSL_ia32cap_P+4(%rip),%r10d + xor $key0,%r9d + and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE + mov %r9d,0x70+12(%rsp) + + $movkey 0x10($key),$rndkey1 + + movdqa 0x40(%rsp),$inout4 + movdqa 0x50(%rsp),$inout5 + + cmp \$8,$len jb .Lctr32_tail - shr \$1,$rounds - mov $key,$key_ # backup $key - mov $rounds,$rnds_ # backup $rounds + sub \$6,$len + cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE + je .Lctr32_6x + + lea 0x80($key),$key # size optimization + sub \$2,$len + jmp .Lctr32_loop8 + +.align 16 +.Lctr32_6x: + shl \$4,$rounds + mov \$48,$rnds_ + bswap $key0 + lea 32($key,$rounds),$key # end of key schedule + sub %rax,%r10 # twisted $rounds jmp .Lctr32_loop6 .align 16 .Lctr32_loop6: - pshufd \$`3<<6`,$iv1,$inout3 - por $ivec,$inout0 # merge counter-less ivec - $movkey ($key_),$rndkey0 - pshufd \$`2<<6`,$iv1,$inout4 - por $ivec,$inout1 - $movkey 16($key_),$rndkey1 - pshufd \$`1<<6`,$iv1,$inout5 - por $ivec,$inout2 - por $ivec,$inout3 - xorps $rndkey0,$inout0 - por $ivec,$inout4 - por $ivec,$inout5 - - # inline _aesni_encrypt6 and interleave last rounds - # with own code... + add \$6,$ctr + $movkey -48($key,$rnds_),$rndkey0 + aesenc $rndkey1,$inout0 + mov $ctr,%eax + xor $key0,%eax + aesenc $rndkey1,$inout1 + movbe %eax,`0x00+12`(%rsp) + lea 1($ctr),%eax + aesenc $rndkey1,$inout2 + xor $key0,%eax + movbe %eax,`0x10+12`(%rsp) + aesenc $rndkey1,$inout3 + lea 2($ctr),%eax + xor $key0,%eax + aesenc $rndkey1,$inout4 + movbe %eax,`0x20+12`(%rsp) + lea 3($ctr),%eax + aesenc $rndkey1,$inout5 + $movkey -32($key,$rnds_),$rndkey1 + xor $key0,%eax - pxor $rndkey0,$inout1 + aesenc $rndkey0,$inout0 + movbe %eax,`0x30+12`(%rsp) + lea 4($ctr),%eax + aesenc $rndkey0,$inout1 + xor $key0,%eax + movbe %eax,`0x40+12`(%rsp) + aesenc $rndkey0,$inout2 + lea 5($ctr),%eax + xor $key0,%eax + aesenc $rndkey0,$inout3 + movbe %eax,`0x50+12`(%rsp) + mov %r10,%rax # mov $rnds_,$rounds + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + $movkey -16($key,$rnds_),$rndkey0 + + call .Lenc_loop6 + + movdqu ($inp),$inout6 + movdqu 0x10($inp),$inout7 + movdqu 0x20($inp),$in0 + movdqu 0x30($inp),$in1 + movdqu 0x40($inp),$in2 + movdqu 0x50($inp),$in3 + lea 0x60($inp),$inp + $movkey -64($key,$rnds_),$rndkey1 + pxor $inout0,$inout6 + movaps 0x00(%rsp),$inout0 + pxor $inout1,$inout7 + movaps 0x10(%rsp),$inout1 + pxor $inout2,$in0 + movaps 0x20(%rsp),$inout2 + pxor $inout3,$in1 + movaps 0x30(%rsp),$inout3 + pxor $inout4,$in2 + movaps 0x40(%rsp),$inout4 + pxor $inout5,$in3 + movaps 0x50(%rsp),$inout5 + movdqu $inout6,($out) + movdqu $inout7,0x10($out) + movdqu $in0,0x20($out) + movdqu $in1,0x30($out) + movdqu $in2,0x40($out) + movdqu $in3,0x50($out) + lea 0x60($out),$out + + sub \$6,$len + jnc .Lctr32_loop6 + + add \$6,$len + jz .Lctr32_done + + lea -48($rnds_),$rounds + lea -80($key,$rnds_),$key # restore $key + neg $rounds + shr \$4,$rounds # restore $rounds + jmp .Lctr32_tail + +.align 32 +.Lctr32_loop8: + add \$8,$ctr + movdqa 0x60(%rsp),$inout6 aesenc $rndkey1,$inout0 - lea 32($key_),$key - pxor $rndkey0,$inout2 + mov $ctr,%r9d + movdqa 0x70(%rsp),$inout7 aesenc $rndkey1,$inout1 - movdqa .Lincrement32(%rip),$iv1 - pxor $rndkey0,$inout3 + bswap %r9d + $movkey 0x20-0x80($key),$rndkey0 aesenc $rndkey1,$inout2 - movdqa $reserved(%rsp),$iv0 - pxor $rndkey0,$inout4 + xor $key0,%r9d + nop aesenc $rndkey1,$inout3 - pxor $rndkey0,$inout5 - $movkey ($key),$rndkey0 - dec $rounds + mov %r9d,0x00+12(%rsp) + lea 1($ctr),%r9 aesenc $rndkey1,$inout4 aesenc $rndkey1,$inout5 - jmp .Lctr32_enc_loop6_enter -.align 16 -.Lctr32_enc_loop6: + aesenc $rndkey1,$inout6 + aesenc $rndkey1,$inout7 + $movkey 0x30-0x80($key),$rndkey1 +___ +for($i=2;$i<8;$i++) { +my $rndkeyx = ($i&1)?$rndkey1:$rndkey0; +$code.=<<___; + bswap %r9d + aesenc $rndkeyx,$inout0 + aesenc $rndkeyx,$inout1 + xor $key0,%r9d + .byte 0x66,0x90 + aesenc $rndkeyx,$inout2 + aesenc $rndkeyx,$inout3 + mov %r9d,`0x10*($i-1)`+12(%rsp) + lea $i($ctr),%r9 + aesenc $rndkeyx,$inout4 + aesenc $rndkeyx,$inout5 + aesenc $rndkeyx,$inout6 + aesenc $rndkeyx,$inout7 + $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx +___ +} +$code.=<<___; + bswap %r9d + aesenc $rndkey0,$inout0 + aesenc $rndkey0,$inout1 + aesenc $rndkey0,$inout2 + xor $key0,%r9d + movdqu 0x00($inp),$in0 + aesenc $rndkey0,$inout3 + mov %r9d,0x70+12(%rsp) + cmp \$11,$rounds + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + aesenc $rndkey0,$inout6 + aesenc $rndkey0,$inout7 + $movkey 0xa0-0x80($key),$rndkey0 + + jb .Lctr32_enc_done + aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout1 - dec $rounds aesenc $rndkey1,$inout2 aesenc $rndkey1,$inout3 aesenc $rndkey1,$inout4 aesenc $rndkey1,$inout5 -.Lctr32_enc_loop6_enter: - $movkey 16($key),$rndkey1 + aesenc $rndkey1,$inout6 + aesenc $rndkey1,$inout7 + $movkey 0xb0-0x80($key),$rndkey1 + aesenc $rndkey0,$inout0 aesenc $rndkey0,$inout1 - lea 32($key),$key aesenc $rndkey0,$inout2 aesenc $rndkey0,$inout3 aesenc $rndkey0,$inout4 aesenc $rndkey0,$inout5 - $movkey ($key),$rndkey0 - jnz .Lctr32_enc_loop6 + aesenc $rndkey0,$inout6 + aesenc $rndkey0,$inout7 + $movkey 0xc0-0x80($key),$rndkey0 + je .Lctr32_enc_done aesenc $rndkey1,$inout0 - paddd $iv1,$iv0 # increment counter vector aesenc $rndkey1,$inout1 - paddd `$reserved+0x10`(%rsp),$iv1 aesenc $rndkey1,$inout2 - movdqa $iv0,$reserved(%rsp) # save counter vector aesenc $rndkey1,$inout3 - movdqa $iv1,`$reserved+0x10`(%rsp) aesenc $rndkey1,$inout4 - pshufb $bswap_mask,$iv0 # byte swap aesenc $rndkey1,$inout5 - pshufb $bswap_mask,$iv1 + aesenc $rndkey1,$inout6 + aesenc $rndkey1,$inout7 + $movkey 0xd0-0x80($key),$rndkey1 - aesenclast $rndkey0,$inout0 - movups ($inp),$in0 # load input - aesenclast $rndkey0,$inout1 + aesenc $rndkey0,$inout0 + aesenc $rndkey0,$inout1 + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + aesenc $rndkey0,$inout6 + aesenc $rndkey0,$inout7 + $movkey 0xe0-0x80($key),$rndkey0 + jmp .Lctr32_enc_done + +.align 16 +.Lctr32_enc_done: + movdqu 0x10($inp),$in1 + pxor $rndkey0,$in0 + movdqu 0x20($inp),$in2 + pxor $rndkey0,$in1 + movdqu 0x30($inp),$in3 + pxor $rndkey0,$in2 + movdqu 0x40($inp),$in4 + pxor $rndkey0,$in3 + movdqu 0x50($inp),$in5 + pxor $rndkey0,$in4 + pxor $rndkey0,$in5 + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 + aesenc $rndkey1,$inout6 + aesenc $rndkey1,$inout7 + movdqu 0x60($inp),$rndkey1 + lea 0x80($inp),$inp + + aesenclast $in0,$inout0 + pxor $rndkey0,$rndkey1 + movdqu 0x70-0x80($inp),$in0 + aesenclast $in1,$inout1 + pxor $rndkey0,$in0 + movdqa 0x00(%rsp),$in1 # load next counter block + aesenclast $in2,$inout2 + aesenclast $in3,$inout3 + movdqa 0x10(%rsp),$in2 + movdqa 0x20(%rsp),$in3 + aesenclast $in4,$inout4 + aesenclast $in5,$inout5 + movdqa 0x30(%rsp),$in4 + movdqa 0x40(%rsp),$in5 + aesenclast $rndkey1,$inout6 + movdqa 0x50(%rsp),$rndkey0 + $movkey 0x10-0x80($key),$rndkey1 + aesenclast $in0,$inout7 + + movups $inout0,($out) # store output + movdqa $in1,$inout0 + movups $inout1,0x10($out) + movdqa $in2,$inout1 + movups $inout2,0x20($out) + movdqa $in3,$inout2 + movups $inout3,0x30($out) + movdqa $in4,$inout3 + movups $inout4,0x40($out) + movdqa $in5,$inout4 + movups $inout5,0x50($out) + movdqa $rndkey0,$inout5 + movups $inout6,0x60($out) + movups $inout7,0x70($out) + lea 0x80($out),$out + + sub \$8,$len + jnc .Lctr32_loop8 + + add \$8,$len + jz .Lctr32_done + lea -0x80($key),$key + +.Lctr32_tail: + lea 16($key),$key + cmp \$4,$len + jb .Lctr32_loop3 + je .Lctr32_loop4 + + shl \$4,$rounds + movdqa 0x60(%rsp),$inout6 + pxor $inout7,$inout7 + + $movkey 16($key),$rndkey0 + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + lea 32-16($key,$rounds),$key + neg %rax + aesenc $rndkey1,$inout2 + add \$16,%rax + movups ($inp),$in0 + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + movups 0x10($inp),$in1 + movups 0x20($inp),$in2 + aesenc $rndkey1,$inout5 + aesenc $rndkey1,$inout6 + + call .Lenc_loop8_enter + + movdqu 0x30($inp),$in3 + pxor $in0,$inout0 + movdqu 0x40($inp),$in0 + pxor $in1,$inout1 + movdqu $inout0,($out) + pxor $in2,$inout2 + movdqu $inout1,0x10($out) + pxor $in3,$inout3 + movdqu $inout2,0x20($out) + pxor $in0,$inout4 + movdqu $inout3,0x30($out) + movdqu $inout4,0x40($out) + cmp \$6,$len + jb .Lctr32_done + + movups 0x50($inp),$in1 + xorps $in1,$inout5 + movups $inout5,0x50($out) + je .Lctr32_done + + movups 0x60($inp),$in2 + xorps $in2,$inout6 + movups $inout6,0x60($out) + jmp .Lctr32_done + +.align 32 +.Lctr32_loop4: + aesenc $rndkey1,$inout0 + lea 16($key),$key + dec $rounds + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + $movkey ($key),$rndkey1 + jnz .Lctr32_loop4 + aesenclast $rndkey1,$inout0 + aesenclast $rndkey1,$inout1 + movups ($inp),$in0 movups 0x10($inp),$in1 - aesenclast $rndkey0,$inout2 + aesenclast $rndkey1,$inout2 + aesenclast $rndkey1,$inout3 movups 0x20($inp),$in2 - aesenclast $rndkey0,$inout3 movups 0x30($inp),$in3 - aesenclast $rndkey0,$inout4 - movups 0x40($inp),$rndkey1 - aesenclast $rndkey0,$inout5 - movups 0x50($inp),$rndkey0 - lea 0x60($inp),$inp - - xorps $inout0,$in0 # xor - pshufd \$`3<<6`,$iv0,$inout0 - xorps $inout1,$in1 - pshufd \$`2<<6`,$iv0,$inout1 - movups $in0,($out) # store output - xorps $inout2,$in2 - pshufd \$`1<<6`,$iv0,$inout2 - movups $in1,0x10($out) - xorps $inout3,$in3 - movups $in2,0x20($out) - xorps $inout4,$rndkey1 - movups $in3,0x30($out) - xorps $inout5,$rndkey0 - movups $rndkey1,0x40($out) - movups $rndkey0,0x50($out) - lea 0x60($out),$out - mov $rnds_,$rounds - sub \$6,$len - jnc .Lctr32_loop6 - add \$6,$len - jz .Lctr32_done - mov $key_,$key # restore $key - lea 1($rounds,$rounds),$rounds # restore original value + xorps $in0,$inout0 + movups $inout0,($out) + xorps $in1,$inout1 + movups $inout1,0x10($out) + pxor $in2,$inout2 + movdqu $inout2,0x20($out) + pxor $in3,$inout3 + movdqu $inout3,0x30($out) + jmp .Lctr32_done + +.align 32 +.Lctr32_loop3: + aesenc $rndkey1,$inout0 + lea 16($key),$key + dec $rounds + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + $movkey ($key),$rndkey1 + jnz .Lctr32_loop3 + aesenclast $rndkey1,$inout0 + aesenclast $rndkey1,$inout1 + aesenclast $rndkey1,$inout2 -.Lctr32_tail: - por $ivec,$inout0 movups ($inp),$in0 + xorps $in0,$inout0 + movups $inout0,($out) cmp \$2,$len - jb .Lctr32_one + jb .Lctr32_done - por $ivec,$inout1 movups 0x10($inp),$in1 - je .Lctr32_two + xorps $in1,$inout1 + movups $inout1,0x10($out) + je .Lctr32_done - pshufd \$`3<<6`,$iv1,$inout3 - por $ivec,$inout2 movups 0x20($inp),$in2 - cmp \$4,$len - jb .Lctr32_three - - pshufd \$`2<<6`,$iv1,$inout4 - por $ivec,$inout3 - movups 0x30($inp),$in3 - je .Lctr32_four - - por $ivec,$inout4 - xorps $inout5,$inout5 - - call _aesni_encrypt6 - - movups 0x40($inp),$rndkey1 - xorps $inout0,$in0 - xorps $inout1,$in1 - movups $in0,($out) - xorps $inout2,$in2 - movups $in1,0x10($out) - xorps $inout3,$in3 - movups $in2,0x20($out) - xorps $inout4,$rndkey1 - movups $in3,0x30($out) - movups $rndkey1,0x40($out) + xorps $in2,$inout2 + movups $inout2,0x20($out) jmp .Lctr32_done .align 16 @@ -1245,64 +1566,32 @@ $code.=<<___; movups ($ivp),$inout0 movups ($inp),$in0 mov 240($key),$rounds # key->rounds -.Lctr32_one: ___ &aesni_generate1("enc",$key,$rounds); $code.=<<___; - xorps $inout0,$in0 - movups $in0,($out) - jmp .Lctr32_done - -.align 16 -.Lctr32_two: - xorps $inout2,$inout2 - call _aesni_encrypt3 - xorps $inout0,$in0 - xorps $inout1,$in1 - movups $in0,($out) - movups $in1,0x10($out) - jmp .Lctr32_done - -.align 16 -.Lctr32_three: - call _aesni_encrypt3 - xorps $inout0,$in0 - xorps $inout1,$in1 - movups $in0,($out) - xorps $inout2,$in2 - movups $in1,0x10($out) - movups $in2,0x20($out) + xorps $in0,$inout0 + movups $inout0,($out) jmp .Lctr32_done .align 16 -.Lctr32_four: - call _aesni_encrypt4 - xorps $inout0,$in0 - xorps $inout1,$in1 - movups $in0,($out) - xorps $inout2,$in2 - movups $in1,0x10($out) - xorps $inout3,$in3 - movups $in2,0x20($out) - movups $in3,0x30($out) - .Lctr32_done: ___ $code.=<<___ if ($win64); - movaps 0x20(%rsp),%xmm6 - movaps 0x30(%rsp),%xmm7 - movaps 0x40(%rsp),%xmm8 - movaps 0x50(%rsp),%xmm9 - movaps 0x60(%rsp),%xmm10 - movaps 0x70(%rsp),%xmm11 - movaps 0x80(%rsp),%xmm12 - movaps 0x90(%rsp),%xmm13 - movaps 0xa0(%rsp),%xmm14 - movaps 0xb0(%rsp),%xmm15 - lea 0xc8(%rsp),%rsp -.Lctr32_ret: + movaps -0xa0(%rbp),%xmm6 + movaps -0x90(%rbp),%xmm7 + movaps -0x80(%rbp),%xmm8 + movaps -0x70(%rbp),%xmm9 + movaps -0x60(%rbp),%xmm10 + movaps -0x50(%rbp),%xmm11 + movaps -0x40(%rbp),%xmm12 + movaps -0x30(%rbp),%xmm13 + movaps -0x20(%rbp),%xmm14 + movaps -0x10(%rbp),%xmm15 ___ $code.=<<___; + lea (%rbp),%rsp + pop %rbp +.Lctr32_epilogue: ret .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks ___ @@ -1317,243 +1606,286 @@ ___ my @tweak=map("%xmm$_",(10..15)); my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]); my ($key2,$ivp,$len_)=("%r8","%r9","%r9"); -my $frame_size = 0x68 + ($win64?160:0); +my $frame_size = 0x70 + ($win64?160:0); $code.=<<___; .globl aesni_xts_encrypt .type aesni_xts_encrypt,\@function,6 .align 16 aesni_xts_encrypt: - lea -$frame_size(%rsp),%rsp + lea (%rsp),%rax + push %rbp + sub \$$frame_size,%rsp + and \$-16,%rsp # Linux kernel stack can be incorrectly seeded ___ $code.=<<___ if ($win64); - movaps %xmm6,0x60(%rsp) - movaps %xmm7,0x70(%rsp) - movaps %xmm8,0x80(%rsp) - movaps %xmm9,0x90(%rsp) - movaps %xmm10,0xa0(%rsp) - movaps %xmm11,0xb0(%rsp) - movaps %xmm12,0xc0(%rsp) - movaps %xmm13,0xd0(%rsp) - movaps %xmm14,0xe0(%rsp) - movaps %xmm15,0xf0(%rsp) + movaps %xmm6,-0xa8(%rax) + movaps %xmm7,-0x98(%rax) + movaps %xmm8,-0x88(%rax) + movaps %xmm9,-0x78(%rax) + movaps %xmm10,-0x68(%rax) + movaps %xmm11,-0x58(%rax) + movaps %xmm12,-0x48(%rax) + movaps %xmm13,-0x38(%rax) + movaps %xmm14,-0x28(%rax) + movaps %xmm15,-0x18(%rax) .Lxts_enc_body: ___ $code.=<<___; - movups ($ivp),@tweak[5] # load clear-text tweak + lea -8(%rax),%rbp + movups ($ivp),$inout0 # load clear-text tweak mov 240(%r8),$rounds # key2->rounds mov 240($key),$rnds_ # key1->rounds ___ # generate the tweak - &aesni_generate1("enc",$key2,$rounds,@tweak[5]); + &aesni_generate1("enc",$key2,$rounds,$inout0); $code.=<<___; + $movkey ($key),$rndkey0 # zero round key mov $key,$key_ # backup $key mov $rnds_,$rounds # backup $rounds + shl \$4,$rnds_ mov $len,$len_ # backup $len and \$-16,$len + $movkey 16($key,$rnds_),$rndkey1 # last round key + movdqa .Lxts_magic(%rip),$twmask - pxor $twtmp,$twtmp - pcmpgtd @tweak[5],$twtmp # broadcast upper bits + movdqa $inout0,@tweak[5] + pshufd \$0x5f,$inout0,$twres + pxor $rndkey0,$rndkey1 ___ + # alternative tweak calculation algorithm is based on suggestions + # by Shay Gueron. psrad doesn't conflict with AES-NI instructions + # and should help in the future... for ($i=0;$i<4;$i++) { $code.=<<___; - pshufd \$0x13,$twtmp,$twres - pxor $twtmp,$twtmp + movdqa $twres,$twtmp + paddd $twres,$twres movdqa @tweak[5],@tweak[$i] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak - pand $twmask,$twres # isolate carry and residue - pcmpgtd @tweak[5],$twtmp # broadcat upper bits - pxor $twres,@tweak[5] + psrad \$31,$twtmp # broadcast upper bits + paddq @tweak[5],@tweak[5] + pand $twmask,$twtmp + pxor $rndkey0,@tweak[$i] + pxor $twtmp,@tweak[5] ___ } $code.=<<___; + movdqa @tweak[5],@tweak[4] + psrad \$31,$twres + paddq @tweak[5],@tweak[5] + pand $twmask,$twres + pxor $rndkey0,@tweak[4] + pxor $twres,@tweak[5] + movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] + sub \$16*6,$len jc .Lxts_enc_short - shr \$1,$rounds - sub \$1,$rounds - mov $rounds,$rnds_ + mov \$16+96,$rounds + lea 32($key_,$rnds_),$key # end of key schedule + sub %r10,%rax # twisted $rounds + $movkey 16($key_),$rndkey1 + mov %rax,%r10 # backup twisted $rounds + lea .Lxts_magic(%rip),%r8 jmp .Lxts_enc_grandloop -.align 16 +.align 32 .Lxts_enc_grandloop: - pshufd \$0x13,$twtmp,$twres - movdqa @tweak[5],@tweak[4] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak movdqu `16*0`($inp),$inout0 # load input - pand $twmask,$twres # isolate carry and residue + movdqa $rndkey0,$twmask movdqu `16*1`($inp),$inout1 - pxor $twres,@tweak[5] - + pxor @tweak[0],$inout0 movdqu `16*2`($inp),$inout2 - pxor @tweak[0],$inout0 # input^=tweak - movdqu `16*3`($inp),$inout3 pxor @tweak[1],$inout1 - movdqu `16*4`($inp),$inout4 + aesenc $rndkey1,$inout0 + movdqu `16*3`($inp),$inout3 pxor @tweak[2],$inout2 - movdqu `16*5`($inp),$inout5 - lea `16*6`($inp),$inp + aesenc $rndkey1,$inout1 + movdqu `16*4`($inp),$inout4 pxor @tweak[3],$inout3 - $movkey ($key_),$rndkey0 + aesenc $rndkey1,$inout2 + movdqu `16*5`($inp),$inout5 + pxor @tweak[5],$twmask # round[0]^=tweak[5] + movdqa 0x60(%rsp),$twres # load round[0]^round[last] pxor @tweak[4],$inout4 - pxor @tweak[5],$inout5 + aesenc $rndkey1,$inout3 + $movkey 32($key_),$rndkey0 + lea `16*6`($inp),$inp + pxor $twmask,$inout5 - # inline _aesni_encrypt6 and interleave first and last rounds - # with own code... - $movkey 16($key_),$rndkey1 - pxor $rndkey0,$inout0 - pxor $rndkey0,$inout1 - movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks - aesenc $rndkey1,$inout0 - lea 32($key_),$key - pxor $rndkey0,$inout2 + pxor $twres,@tweak[0] + aesenc $rndkey1,$inout4 + pxor $twres,@tweak[1] + movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key + aesenc $rndkey1,$inout5 + $movkey 48($key_),$rndkey1 + pxor $twres,@tweak[2] + + aesenc $rndkey0,$inout0 + pxor $twres,@tweak[3] movdqa @tweak[1],`16*1`(%rsp) - aesenc $rndkey1,$inout1 - pxor $rndkey0,$inout3 + aesenc $rndkey0,$inout1 + pxor $twres,@tweak[4] movdqa @tweak[2],`16*2`(%rsp) - aesenc $rndkey1,$inout2 - pxor $rndkey0,$inout4 - movdqa @tweak[3],`16*3`(%rsp) - aesenc $rndkey1,$inout3 - pxor $rndkey0,$inout5 - $movkey ($key),$rndkey0 - dec $rounds + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + pxor $twres,$twmask movdqa @tweak[4],`16*4`(%rsp) - aesenc $rndkey1,$inout4 - movdqa @tweak[5],`16*5`(%rsp) - aesenc $rndkey1,$inout5 - pxor $twtmp,$twtmp - pcmpgtd @tweak[5],$twtmp - jmp .Lxts_enc_loop6_enter - -.align 16 + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + $movkey 64($key_),$rndkey0 + movdqa $twmask,`16*5`(%rsp) + pshufd \$0x5f,@tweak[5],$twres + jmp .Lxts_enc_loop6 +.align 32 .Lxts_enc_loop6: aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout1 - dec $rounds aesenc $rndkey1,$inout2 aesenc $rndkey1,$inout3 aesenc $rndkey1,$inout4 aesenc $rndkey1,$inout5 -.Lxts_enc_loop6_enter: - $movkey 16($key),$rndkey1 + $movkey -64($key,%rax),$rndkey1 + add \$32,%rax + aesenc $rndkey0,$inout0 aesenc $rndkey0,$inout1 - lea 32($key),$key aesenc $rndkey0,$inout2 aesenc $rndkey0,$inout3 aesenc $rndkey0,$inout4 aesenc $rndkey0,$inout5 - $movkey ($key),$rndkey0 + $movkey -80($key,%rax),$rndkey0 jnz .Lxts_enc_loop6 - pshufd \$0x13,$twtmp,$twres - pxor $twtmp,$twtmp - paddq @tweak[5],@tweak[5] # psllq 1,$tweak + movdqa (%r8),$twmask + movdqa $twres,$twtmp + paddd $twres,$twres aesenc $rndkey1,$inout0 - pand $twmask,$twres # isolate carry and residue + paddq @tweak[5],@tweak[5] + psrad \$31,$twtmp aesenc $rndkey1,$inout1 - pcmpgtd @tweak[5],$twtmp # broadcast upper bits + pand $twmask,$twtmp + $movkey ($key_),@tweak[0] # load round[0] aesenc $rndkey1,$inout2 - pxor $twres,@tweak[5] aesenc $rndkey1,$inout3 aesenc $rndkey1,$inout4 + pxor $twtmp,@tweak[5] + movaps @tweak[0],@tweak[1] # copy round[0] aesenc $rndkey1,$inout5 - $movkey 16($key),$rndkey1 + $movkey -64($key),$rndkey1 - pshufd \$0x13,$twtmp,$twres - pxor $twtmp,$twtmp - movdqa @tweak[5],@tweak[0] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak + movdqa $twres,$twtmp aesenc $rndkey0,$inout0 - pand $twmask,$twres # isolate carry and residue + paddd $twres,$twres + pxor @tweak[5],@tweak[0] aesenc $rndkey0,$inout1 - pcmpgtd @tweak[5],$twtmp # broadcat upper bits + psrad \$31,$twtmp + paddq @tweak[5],@tweak[5] aesenc $rndkey0,$inout2 - pxor $twres,@tweak[5] aesenc $rndkey0,$inout3 + pand $twmask,$twtmp + movaps @tweak[1],@tweak[2] aesenc $rndkey0,$inout4 + pxor $twtmp,@tweak[5] + movdqa $twres,$twtmp aesenc $rndkey0,$inout5 - $movkey 32($key),$rndkey0 + $movkey -48($key),$rndkey0 - pshufd \$0x13,$twtmp,$twres - pxor $twtmp,$twtmp - movdqa @tweak[5],@tweak[1] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak + paddd $twres,$twres aesenc $rndkey1,$inout0 - pand $twmask,$twres # isolate carry and residue + pxor @tweak[5],@tweak[1] + psrad \$31,$twtmp aesenc $rndkey1,$inout1 - pcmpgtd @tweak[5],$twtmp # broadcat upper bits + paddq @tweak[5],@tweak[5] + pand $twmask,$twtmp aesenc $rndkey1,$inout2 - pxor $twres,@tweak[5] aesenc $rndkey1,$inout3 + movdqa @tweak[3],`16*3`(%rsp) + pxor $twtmp,@tweak[5] aesenc $rndkey1,$inout4 + movaps @tweak[2],@tweak[3] + movdqa $twres,$twtmp aesenc $rndkey1,$inout5 + $movkey -32($key),$rndkey1 - pshufd \$0x13,$twtmp,$twres - pxor $twtmp,$twtmp - movdqa @tweak[5],@tweak[2] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak - aesenclast $rndkey0,$inout0 - pand $twmask,$twres # isolate carry and residue - aesenclast $rndkey0,$inout1 - pcmpgtd @tweak[5],$twtmp # broadcat upper bits - aesenclast $rndkey0,$inout2 - pxor $twres,@tweak[5] - aesenclast $rndkey0,$inout3 - aesenclast $rndkey0,$inout4 - aesenclast $rndkey0,$inout5 - - pshufd \$0x13,$twtmp,$twres - pxor $twtmp,$twtmp - movdqa @tweak[5],@tweak[3] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak - xorps `16*0`(%rsp),$inout0 # output^=tweak - pand $twmask,$twres # isolate carry and residue - xorps `16*1`(%rsp),$inout1 - pcmpgtd @tweak[5],$twtmp # broadcat upper bits + paddd $twres,$twres + aesenc $rndkey0,$inout0 + pxor @tweak[5],@tweak[2] + psrad \$31,$twtmp + aesenc $rndkey0,$inout1 + paddq @tweak[5],@tweak[5] + pand $twmask,$twtmp + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + aesenc $rndkey0,$inout4 + pxor $twtmp,@tweak[5] + movaps @tweak[3],@tweak[4] + aesenc $rndkey0,$inout5 + + movdqa $twres,$rndkey0 + paddd $twres,$twres + aesenc $rndkey1,$inout0 + pxor @tweak[5],@tweak[3] + psrad \$31,$rndkey0 + aesenc $rndkey1,$inout1 + paddq @tweak[5],@tweak[5] + pand $twmask,$rndkey0 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + pxor $rndkey0,@tweak[5] + $movkey ($key_),$rndkey0 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 + $movkey 16($key_),$rndkey1 + + pxor @tweak[5],@tweak[4] + aesenclast `16*0`(%rsp),$inout0 + psrad \$31,$twres + paddq @tweak[5],@tweak[5] + aesenclast `16*1`(%rsp),$inout1 + aesenclast `16*2`(%rsp),$inout2 + pand $twmask,$twres + mov %r10,%rax # restore $rounds + aesenclast `16*3`(%rsp),$inout3 + aesenclast `16*4`(%rsp),$inout4 + aesenclast `16*5`(%rsp),$inout5 pxor $twres,@tweak[5] - xorps `16*2`(%rsp),$inout2 - movups $inout0,`16*0`($out) # write output - xorps `16*3`(%rsp),$inout3 - movups $inout1,`16*1`($out) - xorps `16*4`(%rsp),$inout4 - movups $inout2,`16*2`($out) - xorps `16*5`(%rsp),$inout5 - movups $inout3,`16*3`($out) - mov $rnds_,$rounds # restore $rounds - movups $inout4,`16*4`($out) - movups $inout5,`16*5`($out) lea `16*6`($out),$out + movups $inout0,`-16*6`($out) # write output + movups $inout1,`-16*5`($out) + movups $inout2,`-16*4`($out) + movups $inout3,`-16*3`($out) + movups $inout4,`-16*2`($out) + movups $inout5,`-16*1`($out) sub \$16*6,$len jnc .Lxts_enc_grandloop - lea 3($rounds,$rounds),$rounds # restore original value + mov \$16+96,$rounds + sub $rnds_,$rounds mov $key_,$key # restore $key - mov $rounds,$rnds_ # backup $rounds + shr \$4,$rounds # restore original value .Lxts_enc_short: + mov $rounds,$rnds_ # backup $rounds + pxor $rndkey0,@tweak[0] add \$16*6,$len jz .Lxts_enc_done + pxor $rndkey0,@tweak[1] cmp \$0x20,$len jb .Lxts_enc_one + pxor $rndkey0,@tweak[2] je .Lxts_enc_two + pxor $rndkey0,@tweak[3] cmp \$0x40,$len jb .Lxts_enc_three + pxor $rndkey0,@tweak[4] je .Lxts_enc_four - pshufd \$0x13,$twtmp,$twres - movdqa @tweak[5],@tweak[4] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak - movdqu ($inp),$inout0 - pand $twmask,$twres # isolate carry and residue - movdqu 16*1($inp),$inout1 - pxor $twres,@tweak[5] - + movdqu ($inp),$inout0 + movdqu 16*1($inp),$inout1 movdqu 16*2($inp),$inout2 pxor @tweak[0],$inout0 movdqu 16*3($inp),$inout3 @@ -1602,7 +1934,7 @@ $code.=<<___; xorps @tweak[0],$inout0 xorps @tweak[1],$inout1 - call _aesni_encrypt3 + call _aesni_encrypt2 xorps @tweak[0],$inout0 movdqa @tweak[2],@tweak[0] @@ -1648,15 +1980,15 @@ $code.=<<___; call _aesni_encrypt4 - xorps @tweak[0],$inout0 - movdqa @tweak[5],@tweak[0] - xorps @tweak[1],$inout1 - xorps @tweak[2],$inout2 - movups $inout0,($out) - xorps @tweak[3],$inout3 - movups $inout1,16*1($out) - movups $inout2,16*2($out) - movups $inout3,16*3($out) + pxor @tweak[0],$inout0 + movdqa @tweak[4],@tweak[0] + pxor @tweak[1],$inout1 + pxor @tweak[2],$inout2 + movdqu $inout0,($out) + pxor @tweak[3],$inout3 + movdqu $inout1,16*1($out) + movdqu $inout2,16*2($out) + movdqu $inout3,16*3($out) lea 16*4($out),$out jmp .Lxts_enc_done @@ -1691,19 +2023,20 @@ $code.=<<___; .Lxts_enc_ret: ___ $code.=<<___ if ($win64); - movaps 0x60(%rsp),%xmm6 - movaps 0x70(%rsp),%xmm7 - movaps 0x80(%rsp),%xmm8 - movaps 0x90(%rsp),%xmm9 - movaps 0xa0(%rsp),%xmm10 - movaps 0xb0(%rsp),%xmm11 - movaps 0xc0(%rsp),%xmm12 - movaps 0xd0(%rsp),%xmm13 - movaps 0xe0(%rsp),%xmm14 - movaps 0xf0(%rsp),%xmm15 + movaps -0xa0(%rbp),%xmm6 + movaps -0x90(%rbp),%xmm7 + movaps -0x80(%rbp),%xmm8 + movaps -0x70(%rbp),%xmm9 + movaps -0x60(%rbp),%xmm10 + movaps -0x50(%rbp),%xmm11 + movaps -0x40(%rbp),%xmm12 + movaps -0x30(%rbp),%xmm13 + movaps -0x20(%rbp),%xmm14 + movaps -0x10(%rbp),%xmm15 ___ $code.=<<___; - lea $frame_size(%rsp),%rsp + lea (%rbp),%rsp + pop %rbp .Lxts_enc_epilogue: ret .size aesni_xts_encrypt,.-aesni_xts_encrypt @@ -1714,28 +2047,32 @@ $code.=<<___; .type aesni_xts_decrypt,\@function,6 .align 16 aesni_xts_decrypt: - lea -$frame_size(%rsp),%rsp + lea (%rsp),%rax + push %rbp + sub \$$frame_size,%rsp + and \$-16,%rsp # Linux kernel stack can be incorrectly seeded ___ $code.=<<___ if ($win64); - movaps %xmm6,0x60(%rsp) - movaps %xmm7,0x70(%rsp) - movaps %xmm8,0x80(%rsp) - movaps %xmm9,0x90(%rsp) - movaps %xmm10,0xa0(%rsp) - movaps %xmm11,0xb0(%rsp) - movaps %xmm12,0xc0(%rsp) - movaps %xmm13,0xd0(%rsp) - movaps %xmm14,0xe0(%rsp) - movaps %xmm15,0xf0(%rsp) + movaps %xmm6,-0xa8(%rax) + movaps %xmm7,-0x98(%rax) + movaps %xmm8,-0x88(%rax) + movaps %xmm9,-0x78(%rax) + movaps %xmm10,-0x68(%rax) + movaps %xmm11,-0x58(%rax) + movaps %xmm12,-0x48(%rax) + movaps %xmm13,-0x38(%rax) + movaps %xmm14,-0x28(%rax) + movaps %xmm15,-0x18(%rax) .Lxts_dec_body: ___ $code.=<<___; - movups ($ivp),@tweak[5] # load clear-text tweak + lea -8(%rax),%rbp + movups ($ivp),$inout0 # load clear-text tweak mov 240($key2),$rounds # key2->rounds mov 240($key),$rnds_ # key1->rounds ___ # generate the tweak - &aesni_generate1("enc",$key2,$rounds,@tweak[5]); + &aesni_generate1("enc",$key2,$rounds,$inout0); $code.=<<___; xor %eax,%eax # if ($len%16) len-=16; test \$15,$len @@ -1743,213 +2080,249 @@ $code.=<<___; shl \$4,%rax sub %rax,$len + $movkey ($key),$rndkey0 # zero round key mov $key,$key_ # backup $key mov $rnds_,$rounds # backup $rounds + shl \$4,$rnds_ mov $len,$len_ # backup $len and \$-16,$len + $movkey 16($key,$rnds_),$rndkey1 # last round key + movdqa .Lxts_magic(%rip),$twmask - pxor $twtmp,$twtmp - pcmpgtd @tweak[5],$twtmp # broadcast upper bits + movdqa $inout0,@tweak[5] + pshufd \$0x5f,$inout0,$twres + pxor $rndkey0,$rndkey1 ___ for ($i=0;$i<4;$i++) { $code.=<<___; - pshufd \$0x13,$twtmp,$twres - pxor $twtmp,$twtmp + movdqa $twres,$twtmp + paddd $twres,$twres movdqa @tweak[5],@tweak[$i] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak - pand $twmask,$twres # isolate carry and residue - pcmpgtd @tweak[5],$twtmp # broadcat upper bits - pxor $twres,@tweak[5] + psrad \$31,$twtmp # broadcast upper bits + paddq @tweak[5],@tweak[5] + pand $twmask,$twtmp + pxor $rndkey0,@tweak[$i] + pxor $twtmp,@tweak[5] ___ } $code.=<<___; + movdqa @tweak[5],@tweak[4] + psrad \$31,$twres + paddq @tweak[5],@tweak[5] + pand $twmask,$twres + pxor $rndkey0,@tweak[4] + pxor $twres,@tweak[5] + movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] + sub \$16*6,$len jc .Lxts_dec_short - shr \$1,$rounds - sub \$1,$rounds - mov $rounds,$rnds_ + mov \$16+96,$rounds + lea 32($key_,$rnds_),$key # end of key schedule + sub %r10,%rax # twisted $rounds + $movkey 16($key_),$rndkey1 + mov %rax,%r10 # backup twisted $rounds + lea .Lxts_magic(%rip),%r8 jmp .Lxts_dec_grandloop -.align 16 +.align 32 .Lxts_dec_grandloop: - pshufd \$0x13,$twtmp,$twres - movdqa @tweak[5],@tweak[4] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak movdqu `16*0`($inp),$inout0 # load input - pand $twmask,$twres # isolate carry and residue + movdqa $rndkey0,$twmask movdqu `16*1`($inp),$inout1 - pxor $twres,@tweak[5] - + pxor @tweak[0],$inout0 movdqu `16*2`($inp),$inout2 - pxor @tweak[0],$inout0 # input^=tweak - movdqu `16*3`($inp),$inout3 pxor @tweak[1],$inout1 - movdqu `16*4`($inp),$inout4 + aesdec $rndkey1,$inout0 + movdqu `16*3`($inp),$inout3 pxor @tweak[2],$inout2 - movdqu `16*5`($inp),$inout5 - lea `16*6`($inp),$inp + aesdec $rndkey1,$inout1 + movdqu `16*4`($inp),$inout4 pxor @tweak[3],$inout3 - $movkey ($key_),$rndkey0 + aesdec $rndkey1,$inout2 + movdqu `16*5`($inp),$inout5 + pxor @tweak[5],$twmask # round[0]^=tweak[5] + movdqa 0x60(%rsp),$twres # load round[0]^round[last] pxor @tweak[4],$inout4 - pxor @tweak[5],$inout5 + aesdec $rndkey1,$inout3 + $movkey 32($key_),$rndkey0 + lea `16*6`($inp),$inp + pxor $twmask,$inout5 - # inline _aesni_decrypt6 and interleave first and last rounds - # with own code... - $movkey 16($key_),$rndkey1 - pxor $rndkey0,$inout0 - pxor $rndkey0,$inout1 - movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks - aesdec $rndkey1,$inout0 - lea 32($key_),$key - pxor $rndkey0,$inout2 - movdqa @tweak[1],`16*1`(%rsp) - aesdec $rndkey1,$inout1 - pxor $rndkey0,$inout3 - movdqa @tweak[2],`16*2`(%rsp) - aesdec $rndkey1,$inout2 - pxor $rndkey0,$inout4 - movdqa @tweak[3],`16*3`(%rsp) - aesdec $rndkey1,$inout3 - pxor $rndkey0,$inout5 - $movkey ($key),$rndkey0 - dec $rounds - movdqa @tweak[4],`16*4`(%rsp) + pxor $twres,@tweak[0] aesdec $rndkey1,$inout4 - movdqa @tweak[5],`16*5`(%rsp) + pxor $twres,@tweak[1] + movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key aesdec $rndkey1,$inout5 - pxor $twtmp,$twtmp - pcmpgtd @tweak[5],$twtmp - jmp .Lxts_dec_loop6_enter + $movkey 48($key_),$rndkey1 + pxor $twres,@tweak[2] -.align 16 + aesdec $rndkey0,$inout0 + pxor $twres,@tweak[3] + movdqa @tweak[1],`16*1`(%rsp) + aesdec $rndkey0,$inout1 + pxor $twres,@tweak[4] + movdqa @tweak[2],`16*2`(%rsp) + aesdec $rndkey0,$inout2 + aesdec $rndkey0,$inout3 + pxor $twres,$twmask + movdqa @tweak[4],`16*4`(%rsp) + aesdec $rndkey0,$inout4 + aesdec $rndkey0,$inout5 + $movkey 64($key_),$rndkey0 + movdqa $twmask,`16*5`(%rsp) + pshufd \$0x5f,@tweak[5],$twres + jmp .Lxts_dec_loop6 +.align 32 .Lxts_dec_loop6: aesdec $rndkey1,$inout0 aesdec $rndkey1,$inout1 - dec $rounds aesdec $rndkey1,$inout2 aesdec $rndkey1,$inout3 aesdec $rndkey1,$inout4 aesdec $rndkey1,$inout5 -.Lxts_dec_loop6_enter: - $movkey 16($key),$rndkey1 + $movkey -64($key,%rax),$rndkey1 + add \$32,%rax + aesdec $rndkey0,$inout0 aesdec $rndkey0,$inout1 - lea 32($key),$key aesdec $rndkey0,$inout2 aesdec $rndkey0,$inout3 aesdec $rndkey0,$inout4 aesdec $rndkey0,$inout5 - $movkey ($key),$rndkey0 + $movkey -80($key,%rax),$rndkey0 jnz .Lxts_dec_loop6 - pshufd \$0x13,$twtmp,$twres - pxor $twtmp,$twtmp - paddq @tweak[5],@tweak[5] # psllq 1,$tweak + movdqa (%r8),$twmask + movdqa $twres,$twtmp + paddd $twres,$twres aesdec $rndkey1,$inout0 - pand $twmask,$twres # isolate carry and residue + paddq @tweak[5],@tweak[5] + psrad \$31,$twtmp aesdec $rndkey1,$inout1 - pcmpgtd @tweak[5],$twtmp # broadcast upper bits + pand $twmask,$twtmp + $movkey ($key_),@tweak[0] # load round[0] aesdec $rndkey1,$inout2 - pxor $twres,@tweak[5] aesdec $rndkey1,$inout3 aesdec $rndkey1,$inout4 + pxor $twtmp,@tweak[5] + movaps @tweak[0],@tweak[1] # copy round[0] aesdec $rndkey1,$inout5 - $movkey 16($key),$rndkey1 + $movkey -64($key),$rndkey1 - pshufd \$0x13,$twtmp,$twres - pxor $twtmp,$twtmp - movdqa @tweak[5],@tweak[0] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak + movdqa $twres,$twtmp aesdec $rndkey0,$inout0 - pand $twmask,$twres # isolate carry and residue + paddd $twres,$twres + pxor @tweak[5],@tweak[0] aesdec $rndkey0,$inout1 - pcmpgtd @tweak[5],$twtmp # broadcat upper bits + psrad \$31,$twtmp + paddq @tweak[5],@tweak[5] aesdec $rndkey0,$inout2 - pxor $twres,@tweak[5] aesdec $rndkey0,$inout3 + pand $twmask,$twtmp + movaps @tweak[1],@tweak[2] aesdec $rndkey0,$inout4 + pxor $twtmp,@tweak[5] + movdqa $twres,$twtmp aesdec $rndkey0,$inout5 - $movkey 32($key),$rndkey0 + $movkey -48($key),$rndkey0 - pshufd \$0x13,$twtmp,$twres - pxor $twtmp,$twtmp - movdqa @tweak[5],@tweak[1] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak + paddd $twres,$twres aesdec $rndkey1,$inout0 - pand $twmask,$twres # isolate carry and residue + pxor @tweak[5],@tweak[1] + psrad \$31,$twtmp aesdec $rndkey1,$inout1 - pcmpgtd @tweak[5],$twtmp # broadcat upper bits + paddq @tweak[5],@tweak[5] + pand $twmask,$twtmp aesdec $rndkey1,$inout2 - pxor $twres,@tweak[5] aesdec $rndkey1,$inout3 + movdqa @tweak[3],`16*3`(%rsp) + pxor $twtmp,@tweak[5] aesdec $rndkey1,$inout4 + movaps @tweak[2],@tweak[3] + movdqa $twres,$twtmp aesdec $rndkey1,$inout5 + $movkey -32($key),$rndkey1 - pshufd \$0x13,$twtmp,$twres - pxor $twtmp,$twtmp - movdqa @tweak[5],@tweak[2] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak - aesdeclast $rndkey0,$inout0 - pand $twmask,$twres # isolate carry and residue - aesdeclast $rndkey0,$inout1 - pcmpgtd @tweak[5],$twtmp # broadcat upper bits - aesdeclast $rndkey0,$inout2 - pxor $twres,@tweak[5] - aesdeclast $rndkey0,$inout3 - aesdeclast $rndkey0,$inout4 - aesdeclast $rndkey0,$inout5 - - pshufd \$0x13,$twtmp,$twres - pxor $twtmp,$twtmp - movdqa @tweak[5],@tweak[3] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak - xorps `16*0`(%rsp),$inout0 # output^=tweak - pand $twmask,$twres # isolate carry and residue - xorps `16*1`(%rsp),$inout1 - pcmpgtd @tweak[5],$twtmp # broadcat upper bits + paddd $twres,$twres + aesdec $rndkey0,$inout0 + pxor @tweak[5],@tweak[2] + psrad \$31,$twtmp + aesdec $rndkey0,$inout1 + paddq @tweak[5],@tweak[5] + pand $twmask,$twtmp + aesdec $rndkey0,$inout2 + aesdec $rndkey0,$inout3 + aesdec $rndkey0,$inout4 + pxor $twtmp,@tweak[5] + movaps @tweak[3],@tweak[4] + aesdec $rndkey0,$inout5 + + movdqa $twres,$rndkey0 + paddd $twres,$twres + aesdec $rndkey1,$inout0 + pxor @tweak[5],@tweak[3] + psrad \$31,$rndkey0 + aesdec $rndkey1,$inout1 + paddq @tweak[5],@tweak[5] + pand $twmask,$rndkey0 + aesdec $rndkey1,$inout2 + aesdec $rndkey1,$inout3 + pxor $rndkey0,@tweak[5] + $movkey ($key_),$rndkey0 + aesdec $rndkey1,$inout4 + aesdec $rndkey1,$inout5 + $movkey 16($key_),$rndkey1 + + pxor @tweak[5],@tweak[4] + aesdeclast `16*0`(%rsp),$inout0 + psrad \$31,$twres + paddq @tweak[5],@tweak[5] + aesdeclast `16*1`(%rsp),$inout1 + aesdeclast `16*2`(%rsp),$inout2 + pand $twmask,$twres + mov %r10,%rax # restore $rounds + aesdeclast `16*3`(%rsp),$inout3 + aesdeclast `16*4`(%rsp),$inout4 + aesdeclast `16*5`(%rsp),$inout5 pxor $twres,@tweak[5] - xorps `16*2`(%rsp),$inout2 - movups $inout0,`16*0`($out) # write output - xorps `16*3`(%rsp),$inout3 - movups $inout1,`16*1`($out) - xorps `16*4`(%rsp),$inout4 - movups $inout2,`16*2`($out) - xorps `16*5`(%rsp),$inout5 - movups $inout3,`16*3`($out) - mov $rnds_,$rounds # restore $rounds - movups $inout4,`16*4`($out) - movups $inout5,`16*5`($out) lea `16*6`($out),$out + movups $inout0,`-16*6`($out) # write output + movups $inout1,`-16*5`($out) + movups $inout2,`-16*4`($out) + movups $inout3,`-16*3`($out) + movups $inout4,`-16*2`($out) + movups $inout5,`-16*1`($out) sub \$16*6,$len jnc .Lxts_dec_grandloop - lea 3($rounds,$rounds),$rounds # restore original value + mov \$16+96,$rounds + sub $rnds_,$rounds mov $key_,$key # restore $key - mov $rounds,$rnds_ # backup $rounds + shr \$4,$rounds # restore original value .Lxts_dec_short: + mov $rounds,$rnds_ # backup $rounds + pxor $rndkey0,@tweak[0] + pxor $rndkey0,@tweak[1] add \$16*6,$len jz .Lxts_dec_done + pxor $rndkey0,@tweak[2] cmp \$0x20,$len jb .Lxts_dec_one + pxor $rndkey0,@tweak[3] je .Lxts_dec_two + pxor $rndkey0,@tweak[4] cmp \$0x40,$len jb .Lxts_dec_three je .Lxts_dec_four - pshufd \$0x13,$twtmp,$twres - movdqa @tweak[5],@tweak[4] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak - movdqu ($inp),$inout0 - pand $twmask,$twres # isolate carry and residue - movdqu 16*1($inp),$inout1 - pxor $twres,@tweak[5] - + movdqu ($inp),$inout0 + movdqu 16*1($inp),$inout1 movdqu 16*2($inp),$inout2 pxor @tweak[0],$inout0 movdqu 16*3($inp),$inout3 @@ -2008,7 +2381,7 @@ $code.=<<___; xorps @tweak[0],$inout0 xorps @tweak[1],$inout1 - call _aesni_decrypt3 + call _aesni_decrypt2 xorps @tweak[0],$inout0 movdqa @tweak[2],@tweak[0] @@ -2034,7 +2407,7 @@ $code.=<<___; xorps @tweak[0],$inout0 movdqa @tweak[3],@tweak[0] xorps @tweak[1],$inout1 - movdqa @tweak[5],@tweak[1] + movdqa @tweak[4],@tweak[1] xorps @tweak[2],$inout2 movups $inout0,($out) movups $inout1,16*1($out) @@ -2044,14 +2417,8 @@ $code.=<<___; .align 16 .Lxts_dec_four: - pshufd \$0x13,$twtmp,$twres - movdqa @tweak[5],@tweak[4] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak - movups ($inp),$inout0 - pand $twmask,$twres # isolate carry and residue - movups 16*1($inp),$inout1 - pxor $twres,@tweak[5] - + movups ($inp),$inout0 + movups 16*1($inp),$inout1 movups 16*2($inp),$inout2 xorps @tweak[0],$inout0 movups 16*3($inp),$inout3 @@ -2062,16 +2429,16 @@ $code.=<<___; call _aesni_decrypt4 - xorps @tweak[0],$inout0 + pxor @tweak[0],$inout0 movdqa @tweak[4],@tweak[0] - xorps @tweak[1],$inout1 + pxor @tweak[1],$inout1 movdqa @tweak[5],@tweak[1] - xorps @tweak[2],$inout2 - movups $inout0,($out) - xorps @tweak[3],$inout3 - movups $inout1,16*1($out) - movups $inout2,16*2($out) - movups $inout3,16*3($out) + pxor @tweak[2],$inout2 + movdqu $inout0,($out) + pxor @tweak[3],$inout3 + movdqu $inout1,16*1($out) + movdqu $inout2,16*2($out) + movdqu $inout3,16*3($out) lea 16*4($out),$out jmp .Lxts_dec_done @@ -2117,19 +2484,20 @@ $code.=<<___; .Lxts_dec_ret: ___ $code.=<<___ if ($win64); - movaps 0x60(%rsp),%xmm6 - movaps 0x70(%rsp),%xmm7 - movaps 0x80(%rsp),%xmm8 - movaps 0x90(%rsp),%xmm9 - movaps 0xa0(%rsp),%xmm10 - movaps 0xb0(%rsp),%xmm11 - movaps 0xc0(%rsp),%xmm12 - movaps 0xd0(%rsp),%xmm13 - movaps 0xe0(%rsp),%xmm14 - movaps 0xf0(%rsp),%xmm15 + movaps -0xa0(%rbp),%xmm6 + movaps -0x90(%rbp),%xmm7 + movaps -0x80(%rbp),%xmm8 + movaps -0x70(%rbp),%xmm9 + movaps -0x60(%rbp),%xmm10 + movaps -0x50(%rbp),%xmm11 + movaps -0x40(%rbp),%xmm12 + movaps -0x30(%rbp),%xmm13 + movaps -0x20(%rbp),%xmm14 + movaps -0x10(%rbp),%xmm15 ___ $code.=<<___; - lea $frame_size(%rsp),%rsp + lea (%rbp),%rsp + pop %rbp .Lxts_dec_epilogue: ret .size aesni_xts_decrypt,.-aesni_xts_decrypt @@ -2141,7 +2509,10 @@ ___ # size_t length, const AES_KEY *key, # unsigned char *ivp,const int enc); { -my $reserved = $win64?0x40:-0x18; # used in decrypt +my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt +my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15)); +my $inp_=$key_; + $code.=<<___; .globl ${PREFIX}_cbc_encrypt .type ${PREFIX}_cbc_encrypt,\@function,6 @@ -2197,276 +2568,398 @@ $code.=<<___; #--------------------------- CBC DECRYPT ------------------------------# .align 16 .Lcbc_decrypt: + lea (%rsp),%rax + push %rbp + sub \$$frame_size,%rsp + and \$-16,%rsp # Linux kernel stack can be incorrectly seeded ___ $code.=<<___ if ($win64); - lea -0x58(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) - movaps %xmm8,0x20(%rsp) - movaps %xmm9,0x30(%rsp) + movaps %xmm6,0x10(%rsp) + movaps %xmm7,0x20(%rsp) + movaps %xmm8,0x30(%rsp) + movaps %xmm9,0x40(%rsp) + movaps %xmm10,0x50(%rsp) + movaps %xmm11,0x60(%rsp) + movaps %xmm12,0x70(%rsp) + movaps %xmm13,0x80(%rsp) + movaps %xmm14,0x90(%rsp) + movaps %xmm15,0xa0(%rsp) .Lcbc_decrypt_body: ___ $code.=<<___; + lea -8(%rax),%rbp movups ($ivp),$iv mov $rnds_,$rounds - cmp \$0x70,$len + cmp \$0x50,$len jbe .Lcbc_dec_tail - shr \$1,$rnds_ - sub \$0x70,$len - mov $rnds_,$rounds - movaps $iv,$reserved(%rsp) + + $movkey ($key),$rndkey0 + movdqu 0x00($inp),$inout0 # load input + movdqu 0x10($inp),$inout1 + movdqa $inout0,$in0 + movdqu 0x20($inp),$inout2 + movdqa $inout1,$in1 + movdqu 0x30($inp),$inout3 + movdqa $inout2,$in2 + movdqu 0x40($inp),$inout4 + movdqa $inout3,$in3 + movdqu 0x50($inp),$inout5 + movdqa $inout4,$in4 + mov OPENSSL_ia32cap_P+4(%rip),%r9d + cmp \$0x70,$len + jbe .Lcbc_dec_six_or_seven + + and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE + sub \$0x50,$len + cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE + je .Lcbc_dec_loop6_enter + sub \$0x20,$len + lea 0x70($key),$key # size optimization jmp .Lcbc_dec_loop8_enter .align 16 .Lcbc_dec_loop8: - movaps $rndkey0,$reserved(%rsp) # save IV movups $inout7,($out) lea 0x10($out),$out .Lcbc_dec_loop8_enter: - $movkey ($key),$rndkey0 - movups ($inp),$inout0 # load input - movups 0x10($inp),$inout1 - $movkey 16($key),$rndkey1 + movdqu 0x60($inp),$inout6 + pxor $rndkey0,$inout0 + movdqu 0x70($inp),$inout7 + pxor $rndkey0,$inout1 + $movkey 0x10-0x70($key),$rndkey1 + pxor $rndkey0,$inout2 + xor $inp_,$inp_ + cmp \$0x70,$len # is there at least 0x60 bytes ahead? + pxor $rndkey0,$inout3 + pxor $rndkey0,$inout4 + pxor $rndkey0,$inout5 + pxor $rndkey0,$inout6 - lea 32($key),$key - movdqu 0x20($inp),$inout2 - xorps $rndkey0,$inout0 - movdqu 0x30($inp),$inout3 - xorps $rndkey0,$inout1 - movdqu 0x40($inp),$inout4 aesdec $rndkey1,$inout0 - pxor $rndkey0,$inout2 - movdqu 0x50($inp),$inout5 + pxor $rndkey0,$inout7 + $movkey 0x20-0x70($key),$rndkey0 aesdec $rndkey1,$inout1 - pxor $rndkey0,$inout3 - movdqu 0x60($inp),$inout6 aesdec $rndkey1,$inout2 - pxor $rndkey0,$inout4 - movdqu 0x70($inp),$inout7 aesdec $rndkey1,$inout3 - pxor $rndkey0,$inout5 - dec $rounds aesdec $rndkey1,$inout4 - pxor $rndkey0,$inout6 aesdec $rndkey1,$inout5 - pxor $rndkey0,$inout7 - $movkey ($key),$rndkey0 aesdec $rndkey1,$inout6 + setnc ${inp_}b + shl \$7,$inp_ aesdec $rndkey1,$inout7 - $movkey 16($key),$rndkey1 - - call .Ldec_loop8_enter + add $inp,$inp_ + $movkey 0x30-0x70($key),$rndkey1 +___ +for($i=1;$i<12;$i++) { +my $rndkeyx = ($i&1)?$rndkey0:$rndkey1; +$code.=<<___ if ($i==7); + cmp \$11,$rounds +___ +$code.=<<___; + aesdec $rndkeyx,$inout0 + aesdec $rndkeyx,$inout1 + aesdec $rndkeyx,$inout2 + aesdec $rndkeyx,$inout3 + aesdec $rndkeyx,$inout4 + aesdec $rndkeyx,$inout5 + aesdec $rndkeyx,$inout6 + aesdec $rndkeyx,$inout7 + $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx +___ +$code.=<<___ if ($i<6 || (!($i&1) && $i>7)); + nop +___ +$code.=<<___ if ($i==7); + jb .Lcbc_dec_done +___ +$code.=<<___ if ($i==9); + je .Lcbc_dec_done +___ +$code.=<<___ if ($i==11); + jmp .Lcbc_dec_done +___ +} +$code.=<<___; +.align 16 +.Lcbc_dec_done: + aesdec $rndkey1,$inout0 + aesdec $rndkey1,$inout1 + pxor $rndkey0,$iv + pxor $rndkey0,$in0 + aesdec $rndkey1,$inout2 + aesdec $rndkey1,$inout3 + pxor $rndkey0,$in1 + pxor $rndkey0,$in2 + aesdec $rndkey1,$inout4 + aesdec $rndkey1,$inout5 + pxor $rndkey0,$in3 + pxor $rndkey0,$in4 + aesdec $rndkey1,$inout6 + aesdec $rndkey1,$inout7 + movdqu 0x50($inp),$rndkey1 + + aesdeclast $iv,$inout0 + movdqu 0x60($inp),$iv # borrow $iv + pxor $rndkey0,$rndkey1 + aesdeclast $in0,$inout1 + pxor $rndkey0,$iv + movdqu 0x70($inp),$rndkey0 # next IV + aesdeclast $in1,$inout2 + lea 0x80($inp),$inp + movdqu 0x00($inp_),$in0 + aesdeclast $in2,$inout3 + aesdeclast $in3,$inout4 + movdqu 0x10($inp_),$in1 + movdqu 0x20($inp_),$in2 + aesdeclast $in4,$inout5 + aesdeclast $rndkey1,$inout6 + movdqu 0x30($inp_),$in3 + movdqu 0x40($inp_),$in4 + aesdeclast $iv,$inout7 + movdqa $rndkey0,$iv # return $iv + movdqu 0x50($inp_),$rndkey1 + $movkey -0x70($key),$rndkey0 + + movups $inout0,($out) # store output + movdqa $in0,$inout0 + movups $inout1,0x10($out) + movdqa $in1,$inout1 + movups $inout2,0x20($out) + movdqa $in2,$inout2 + movups $inout3,0x30($out) + movdqa $in3,$inout3 + movups $inout4,0x40($out) + movdqa $in4,$inout4 + movups $inout5,0x50($out) + movdqa $rndkey1,$inout5 + movups $inout6,0x60($out) + lea 0x70($out),$out - movups ($inp),$rndkey1 # re-load input - movups 0x10($inp),$rndkey0 - xorps $reserved(%rsp),$inout0 # ^= IV - xorps $rndkey1,$inout1 - movups 0x20($inp),$rndkey1 - xorps $rndkey0,$inout2 - movups 0x30($inp),$rndkey0 - xorps $rndkey1,$inout3 - movups 0x40($inp),$rndkey1 - xorps $rndkey0,$inout4 - movups 0x50($inp),$rndkey0 - xorps $rndkey1,$inout5 - movups 0x60($inp),$rndkey1 - xorps $rndkey0,$inout6 - movups 0x70($inp),$rndkey0 # IV - xorps $rndkey1,$inout7 - movups $inout0,($out) - movups $inout1,0x10($out) - movups $inout2,0x20($out) - movups $inout3,0x30($out) - mov $rnds_,$rounds # restore $rounds - movups $inout4,0x40($out) - mov $key_,$key # restore $key - movups $inout5,0x50($out) - lea 0x80($inp),$inp - movups $inout6,0x60($out) - lea 0x70($out),$out sub \$0x80,$len ja .Lcbc_dec_loop8 movaps $inout7,$inout0 - movaps $rndkey0,$iv + lea -0x70($key),$key add \$0x70,$len jle .Lcbc_dec_tail_collected - movups $inout0,($out) - lea 1($rnds_,$rnds_),$rounds + movups $inout7,($out) lea 0x10($out),$out + cmp \$0x50,$len + jbe .Lcbc_dec_tail + + movaps $in0,$inout0 +.Lcbc_dec_six_or_seven: + cmp \$0x60,$len + ja .Lcbc_dec_seven + + movaps $inout5,$inout6 + call _aesni_decrypt6 + pxor $iv,$inout0 # ^= IV + movaps $inout6,$iv + pxor $in0,$inout1 + movdqu $inout0,($out) + pxor $in1,$inout2 + movdqu $inout1,0x10($out) + pxor $in2,$inout3 + movdqu $inout2,0x20($out) + pxor $in3,$inout4 + movdqu $inout3,0x30($out) + pxor $in4,$inout5 + movdqu $inout4,0x40($out) + lea 0x50($out),$out + movdqa $inout5,$inout0 + jmp .Lcbc_dec_tail_collected + +.align 16 +.Lcbc_dec_seven: + movups 0x60($inp),$inout6 + xorps $inout7,$inout7 + call _aesni_decrypt8 + movups 0x50($inp),$inout7 + pxor $iv,$inout0 # ^= IV + movups 0x60($inp),$iv + pxor $in0,$inout1 + movdqu $inout0,($out) + pxor $in1,$inout2 + movdqu $inout1,0x10($out) + pxor $in2,$inout3 + movdqu $inout2,0x20($out) + pxor $in3,$inout4 + movdqu $inout3,0x30($out) + pxor $in4,$inout5 + movdqu $inout4,0x40($out) + pxor $inout7,$inout6 + movdqu $inout5,0x50($out) + lea 0x60($out),$out + movdqa $inout6,$inout0 + jmp .Lcbc_dec_tail_collected + +.align 16 +.Lcbc_dec_loop6: + movups $inout5,($out) + lea 0x10($out),$out + movdqu 0x00($inp),$inout0 # load input + movdqu 0x10($inp),$inout1 + movdqa $inout0,$in0 + movdqu 0x20($inp),$inout2 + movdqa $inout1,$in1 + movdqu 0x30($inp),$inout3 + movdqa $inout2,$in2 + movdqu 0x40($inp),$inout4 + movdqa $inout3,$in3 + movdqu 0x50($inp),$inout5 + movdqa $inout4,$in4 +.Lcbc_dec_loop6_enter: + lea 0x60($inp),$inp + movdqa $inout5,$inout6 + + call _aesni_decrypt6 + + pxor $iv,$inout0 # ^= IV + movdqa $inout6,$iv + pxor $in0,$inout1 + movdqu $inout0,($out) + pxor $in1,$inout2 + movdqu $inout1,0x10($out) + pxor $in2,$inout3 + movdqu $inout2,0x20($out) + pxor $in3,$inout4 + mov $key_,$key + movdqu $inout3,0x30($out) + pxor $in4,$inout5 + mov $rnds_,$rounds + movdqu $inout4,0x40($out) + lea 0x50($out),$out + sub \$0x60,$len + ja .Lcbc_dec_loop6 + + movdqa $inout5,$inout0 + add \$0x50,$len + jle .Lcbc_dec_tail_collected + movups $inout5,($out) + lea 0x10($out),$out + .Lcbc_dec_tail: movups ($inp),$inout0 - movaps $inout0,$in0 - cmp \$0x10,$len + sub \$0x10,$len jbe .Lcbc_dec_one movups 0x10($inp),$inout1 - movaps $inout1,$in1 - cmp \$0x20,$len + movaps $inout0,$in0 + sub \$0x10,$len jbe .Lcbc_dec_two movups 0x20($inp),$inout2 - movaps $inout2,$in2 - cmp \$0x30,$len + movaps $inout1,$in1 + sub \$0x10,$len jbe .Lcbc_dec_three movups 0x30($inp),$inout3 - cmp \$0x40,$len + movaps $inout2,$in2 + sub \$0x10,$len jbe .Lcbc_dec_four movups 0x40($inp),$inout4 - cmp \$0x50,$len - jbe .Lcbc_dec_five - - movups 0x50($inp),$inout5 - cmp \$0x60,$len - jbe .Lcbc_dec_six - - movups 0x60($inp),$inout6 - movaps $iv,$reserved(%rsp) # save IV - call _aesni_decrypt8 - movups ($inp),$rndkey1 - movups 0x10($inp),$rndkey0 - xorps $reserved(%rsp),$inout0 # ^= IV - xorps $rndkey1,$inout1 - movups 0x20($inp),$rndkey1 - xorps $rndkey0,$inout2 - movups 0x30($inp),$rndkey0 - xorps $rndkey1,$inout3 - movups 0x40($inp),$rndkey1 - xorps $rndkey0,$inout4 - movups 0x50($inp),$rndkey0 - xorps $rndkey1,$inout5 - movups 0x60($inp),$iv # IV - xorps $rndkey0,$inout6 - movups $inout0,($out) - movups $inout1,0x10($out) - movups $inout2,0x20($out) - movups $inout3,0x30($out) - movups $inout4,0x40($out) - movups $inout5,0x50($out) - lea 0x60($out),$out - movaps $inout6,$inout0 - sub \$0x70,$len + movaps $inout3,$in3 + movaps $inout4,$in4 + xorps $inout5,$inout5 + call _aesni_decrypt6 + pxor $iv,$inout0 + movaps $in4,$iv + pxor $in0,$inout1 + movdqu $inout0,($out) + pxor $in1,$inout2 + movdqu $inout1,0x10($out) + pxor $in2,$inout3 + movdqu $inout2,0x20($out) + pxor $in3,$inout4 + movdqu $inout3,0x30($out) + lea 0x40($out),$out + movdqa $inout4,$inout0 + sub \$0x10,$len jmp .Lcbc_dec_tail_collected + .align 16 .Lcbc_dec_one: + movaps $inout0,$in0 ___ &aesni_generate1("dec",$key,$rounds); $code.=<<___; xorps $iv,$inout0 movaps $in0,$iv - sub \$0x10,$len jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_two: - xorps $inout2,$inout2 - call _aesni_decrypt3 - xorps $iv,$inout0 - xorps $in0,$inout1 - movups $inout0,($out) + movaps $inout1,$in1 + call _aesni_decrypt2 + pxor $iv,$inout0 movaps $in1,$iv - movaps $inout1,$inout0 + pxor $in0,$inout1 + movdqu $inout0,($out) + movdqa $inout1,$inout0 lea 0x10($out),$out - sub \$0x20,$len jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_three: + movaps $inout2,$in2 call _aesni_decrypt3 - xorps $iv,$inout0 - xorps $in0,$inout1 - movups $inout0,($out) - xorps $in1,$inout2 - movups $inout1,0x10($out) + pxor $iv,$inout0 movaps $in2,$iv - movaps $inout2,$inout0 + pxor $in0,$inout1 + movdqu $inout0,($out) + pxor $in1,$inout2 + movdqu $inout1,0x10($out) + movdqa $inout2,$inout0 lea 0x20($out),$out - sub \$0x30,$len jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_four: + movaps $inout3,$in3 call _aesni_decrypt4 - xorps $iv,$inout0 - movups 0x30($inp),$iv - xorps $in0,$inout1 - movups $inout0,($out) - xorps $in1,$inout2 - movups $inout1,0x10($out) - xorps $in2,$inout3 - movups $inout2,0x20($out) - movaps $inout3,$inout0 + pxor $iv,$inout0 + movaps $in3,$iv + pxor $in0,$inout1 + movdqu $inout0,($out) + pxor $in1,$inout2 + movdqu $inout1,0x10($out) + pxor $in2,$inout3 + movdqu $inout2,0x20($out) + movdqa $inout3,$inout0 lea 0x30($out),$out - sub \$0x40,$len - jmp .Lcbc_dec_tail_collected -.align 16 -.Lcbc_dec_five: - xorps $inout5,$inout5 - call _aesni_decrypt6 - movups 0x10($inp),$rndkey1 - movups 0x20($inp),$rndkey0 - xorps $iv,$inout0 - xorps $in0,$inout1 - xorps $rndkey1,$inout2 - movups 0x30($inp),$rndkey1 - xorps $rndkey0,$inout3 - movups 0x40($inp),$iv - xorps $rndkey1,$inout4 - movups $inout0,($out) - movups $inout1,0x10($out) - movups $inout2,0x20($out) - movups $inout3,0x30($out) - lea 0x40($out),$out - movaps $inout4,$inout0 - sub \$0x50,$len - jmp .Lcbc_dec_tail_collected -.align 16 -.Lcbc_dec_six: - call _aesni_decrypt6 - movups 0x10($inp),$rndkey1 - movups 0x20($inp),$rndkey0 - xorps $iv,$inout0 - xorps $in0,$inout1 - xorps $rndkey1,$inout2 - movups 0x30($inp),$rndkey1 - xorps $rndkey0,$inout3 - movups 0x40($inp),$rndkey0 - xorps $rndkey1,$inout4 - movups 0x50($inp),$iv - xorps $rndkey0,$inout5 - movups $inout0,($out) - movups $inout1,0x10($out) - movups $inout2,0x20($out) - movups $inout3,0x30($out) - movups $inout4,0x40($out) - lea 0x50($out),$out - movaps $inout5,$inout0 - sub \$0x60,$len jmp .Lcbc_dec_tail_collected + .align 16 .Lcbc_dec_tail_collected: - and \$15,$len movups $iv,($ivp) + and \$15,$len jnz .Lcbc_dec_tail_partial movups $inout0,($out) jmp .Lcbc_dec_ret .align 16 .Lcbc_dec_tail_partial: - movaps $inout0,$reserved(%rsp) + movaps $inout0,(%rsp) mov \$16,%rcx mov $out,%rdi sub $len,%rcx - lea $reserved(%rsp),%rsi + lea (%rsp),%rsi .long 0x9066A4F3 # rep movsb .Lcbc_dec_ret: ___ $code.=<<___ if ($win64); - movaps (%rsp),%xmm6 - movaps 0x10(%rsp),%xmm7 - movaps 0x20(%rsp),%xmm8 - movaps 0x30(%rsp),%xmm9 - lea 0x58(%rsp),%rsp + movaps 0x10(%rsp),%xmm6 + movaps 0x20(%rsp),%xmm7 + movaps 0x30(%rsp),%xmm8 + movaps 0x40(%rsp),%xmm9 + movaps 0x50(%rsp),%xmm10 + movaps 0x60(%rsp),%xmm11 + movaps 0x70(%rsp),%xmm12 + movaps 0x80(%rsp),%xmm13 + movaps 0x90(%rsp),%xmm14 + movaps 0xa0(%rsp),%xmm15 ___ $code.=<<___; + lea (%rbp),%rsp + pop %rbp .Lcbc_ret: ret .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt @@ -2733,6 +3226,8 @@ $code.=<<___; .long 1,0,0,0 .Lxts_magic: .long 0x87,0,1,0 +.Lincrement1: + .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 .asciz "AES for Intel AES-NI, CRYPTOGAMS by " .align 64 @@ -2791,45 +3286,9 @@ ecb_ccm64_se_handler: jmp .Lcommon_seh_tail .size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler -.type ctr32_se_handler,\@abi-omnipotent +.type ctr_xts_se_handler,\@abi-omnipotent .align 16 -ctr32_se_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - lea .Lctr32_body(%rip),%r10 - cmp %r10,%rbx # context->Rip<"prologue" label - jb .Lcommon_seh_tail - - mov 152($context),%rax # pull context->Rsp - - lea .Lctr32_ret(%rip),%r10 - cmp %r10,%rbx - jae .Lcommon_seh_tail - - lea 0x20(%rax),%rsi # %xmm save area - lea 512($context),%rdi # &context.Xmm6 - mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) - .long 0xa548f3fc # cld; rep movsq - lea 0xc8(%rax),%rax # adjust stack pointer - - jmp .Lcommon_seh_tail -.size ctr32_se_handler,.-ctr32_se_handler - -.type xts_se_handler,\@abi-omnipotent -.align 16 -xts_se_handler: +ctr_xts_se_handler: push %rsi push %rdi push %rbx @@ -2859,14 +3318,14 @@ xts_se_handler: cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail - lea 0x60(%rax),%rsi # %xmm save area + mov 160($context),%rax # pull context->Rbp + lea -0xa0(%rax),%rsi # %xmm save area lea 512($context),%rdi # & context.Xmm6 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq - lea 0x68+160(%rax),%rax # adjust stack pointer - jmp .Lcommon_seh_tail -.size xts_se_handler,.-xts_se_handler + jmp .Lcommon_rbp_tail +.size ctr_xts_se_handler,.-ctr_xts_se_handler ___ $code.=<<___; .type cbc_se_handler,\@abi-omnipotent @@ -2898,11 +3357,16 @@ cbc_se_handler: cmp %r10,%rbx # context->Rip>="epilogue" label jae .Lcommon_seh_tail - lea 0(%rax),%rsi # top of stack + lea 16(%rax),%rsi # %xmm save area lea 512($context),%rdi # &context.Xmm6 - mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) + mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq - lea 0x58(%rax),%rax # adjust stack pointer + +.Lcommon_rbp_tail: + mov 160($context),%rax # pull context->Rbp + mov (%rax),%rbp # restore saved %rbp + lea 8(%rax),%rax # adjust stack pointer + mov %rbp,160($context) # restore context->Rbp jmp .Lcommon_seh_tail .Lrestore_cbc_rax: @@ -3006,14 +3470,15 @@ $code.=<<___ if ($PREFIX eq "aesni"); .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[] .LSEH_info_ctr32: .byte 9,0,0,0 - .rva ctr32_se_handler + .rva ctr_xts_se_handler + .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[] .LSEH_info_xts_enc: .byte 9,0,0,0 - .rva xts_se_handler + .rva ctr_xts_se_handler .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] .LSEH_info_xts_dec: .byte 9,0,0,0 - .rva xts_se_handler + .rva ctr_xts_se_handler .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] ___ $code.=<<___; @@ -3060,11 +3525,30 @@ sub aesni { push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M return ".byte\t".join(',',@opcode); } + elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) { + my %opcodelet = ( + "aesenc" => 0xdc, "aesenclast" => 0xdd, + "aesdec" => 0xde, "aesdeclast" => 0xdf + ); + return undef if (!defined($opcodelet{$1})); + my $off = $2; + push @opcode,0x44 if ($3>=8); + push @opcode,0x0f,0x38,$opcodelet{$1}; + push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M + push @opcode,($off=~/^0/?oct($off):$off)&0xff; + return ".byte\t".join(',',@opcode); + } return $line; } +sub movbe { + ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift; +} + $code =~ s/\`([^\`]*)\`/eval($1)/gem; $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; +#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact +$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem; print $code; diff --git a/openssl/crypto/aes/asm/aesp8-ppc.pl b/openssl/crypto/aes/asm/aesp8-ppc.pl new file mode 100755 index 000000000..a1891cc03 --- /dev/null +++ b/openssl/crypto/aes/asm/aesp8-ppc.pl @@ -0,0 +1,1942 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements support for AES instructions as per PowerISA +# specification version 2.07, first implemented by POWER8 processor. +# The module is endian-agnostic in sense that it supports both big- +# and little-endian cases. Data alignment in parallelizable modes is +# handled with VSX loads and stores, which implies MSR.VSX flag being +# set. It should also be noted that ISA specification doesn't prohibit +# alignment exceptions for these instructions on page boundaries. +# Initially alignment was handled in pure AltiVec/VMX way [when data +# is aligned programmatically, which in turn guarantees exception- +# free execution], but it turned to hamper performance when vcipher +# instructions are interleaved. It's reckoned that eventual +# misalignment penalties at page boundaries are in average lower +# than additional overhead in pure AltiVec approach. + +$flavour = shift; + +if ($flavour =~ /64/) { + $SIZE_T =8; + $LRSAVE =2*$SIZE_T; + $STU ="stdu"; + $POP ="ld"; + $PUSH ="std"; + $UCMP ="cmpld"; + $SHL ="sldi"; +} elsif ($flavour =~ /32/) { + $SIZE_T =4; + $LRSAVE =$SIZE_T; + $STU ="stwu"; + $POP ="lwz"; + $PUSH ="stw"; + $UCMP ="cmplw"; + $SHL ="slwi"; +} else { die "nonsense $flavour"; } + +$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; + +$FRAME=8*$SIZE_T; +$prefix="aes_p8"; + +$sp="r1"; +$vrsave="r12"; + +######################################################################### +{{{ # Key setup procedures # +my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8)); +my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6)); +my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11)); + +$code.=<<___; +.machine "any" + +.text + +.align 7 +rcon: +.long 0x01000000, 0x01000000, 0x01000000, 0x01000000 ?rev +.long 0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000 ?rev +.long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c ?rev +.long 0,0,0,0 ?asis +Lconsts: + mflr r0 + bcl 20,31,\$+4 + mflr $ptr #vvvvv "distance between . and rcon + addi $ptr,$ptr,-0x48 + mtlr r0 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 +.asciz "AES for PowerISA 2.07, CRYPTOGAMS by " + +.globl .${prefix}_set_encrypt_key +.align 5 +.${prefix}_set_encrypt_key: +Lset_encrypt_key: + mflr r11 + $PUSH r11,$LRSAVE($sp) + + li $ptr,-1 + ${UCMP}i $inp,0 + beq- Lenc_key_abort # if ($inp==0) return -1; + ${UCMP}i $out,0 + beq- Lenc_key_abort # if ($out==0) return -1; + li $ptr,-2 + cmpwi $bits,128 + blt- Lenc_key_abort + cmpwi $bits,256 + bgt- Lenc_key_abort + andi. r0,$bits,0x3f + bne- Lenc_key_abort + + lis r0,0xfff0 + mfspr $vrsave,256 + mtspr 256,r0 + + bl Lconsts + mtlr r11 + + neg r9,$inp + lvx $in0,0,$inp + addi $inp,$inp,15 # 15 is not typo + lvsr $key,0,r9 # borrow $key + li r8,0x20 + cmpwi $bits,192 + lvx $in1,0,$inp + le?vspltisb $mask,0x0f # borrow $mask + lvx $rcon,0,$ptr + le?vxor $key,$key,$mask # adjust for byte swap + lvx $mask,r8,$ptr + addi $ptr,$ptr,0x10 + vperm $in0,$in0,$in1,$key # align [and byte swap in LE] + li $cnt,8 + vxor $zero,$zero,$zero + mtctr $cnt + + ?lvsr $outperm,0,$out + vspltisb $outmask,-1 + lvx $outhead,0,$out + ?vperm $outmask,$zero,$outmask,$outperm + + blt Loop128 + addi $inp,$inp,8 + beq L192 + addi $inp,$inp,8 + b L256 + +.align 4 +Loop128: + vperm $key,$in0,$in0,$mask # rotate-n-splat + vsldoi $tmp,$zero,$in0,12 # >>32 + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + vcipherlast $key,$key,$rcon + stvx $stage,0,$out + addi $out,$out,16 + + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vadduwm $rcon,$rcon,$rcon + vxor $in0,$in0,$key + bdnz Loop128 + + lvx $rcon,0,$ptr # last two round keys + + vperm $key,$in0,$in0,$mask # rotate-n-splat + vsldoi $tmp,$zero,$in0,12 # >>32 + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + vcipherlast $key,$key,$rcon + stvx $stage,0,$out + addi $out,$out,16 + + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vadduwm $rcon,$rcon,$rcon + vxor $in0,$in0,$key + + vperm $key,$in0,$in0,$mask # rotate-n-splat + vsldoi $tmp,$zero,$in0,12 # >>32 + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + vcipherlast $key,$key,$rcon + stvx $stage,0,$out + addi $out,$out,16 + + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vxor $in0,$in0,$key + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + stvx $stage,0,$out + + addi $inp,$out,15 # 15 is not typo + addi $out,$out,0x50 + + li $rounds,10 + b Ldone + +.align 4 +L192: + lvx $tmp,0,$inp + li $cnt,4 + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + stvx $stage,0,$out + addi $out,$out,16 + vperm $in1,$in1,$tmp,$key # align [and byte swap in LE] + vspltisb $key,8 # borrow $key + mtctr $cnt + vsububm $mask,$mask,$key # adjust the mask + +Loop192: + vperm $key,$in1,$in1,$mask # roate-n-splat + vsldoi $tmp,$zero,$in0,12 # >>32 + vcipherlast $key,$key,$rcon + + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + + vsldoi $stage,$zero,$in1,8 + vspltw $tmp,$in0,3 + vxor $tmp,$tmp,$in1 + vsldoi $in1,$zero,$in1,12 # >>32 + vadduwm $rcon,$rcon,$rcon + vxor $in1,$in1,$tmp + vxor $in0,$in0,$key + vxor $in1,$in1,$key + vsldoi $stage,$stage,$in0,8 + + vperm $key,$in1,$in1,$mask # rotate-n-splat + vsldoi $tmp,$zero,$in0,12 # >>32 + vperm $outtail,$stage,$stage,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + vcipherlast $key,$key,$rcon + stvx $stage,0,$out + addi $out,$out,16 + + vsldoi $stage,$in0,$in1,8 + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vperm $outtail,$stage,$stage,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + stvx $stage,0,$out + addi $out,$out,16 + + vspltw $tmp,$in0,3 + vxor $tmp,$tmp,$in1 + vsldoi $in1,$zero,$in1,12 # >>32 + vadduwm $rcon,$rcon,$rcon + vxor $in1,$in1,$tmp + vxor $in0,$in0,$key + vxor $in1,$in1,$key + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + stvx $stage,0,$out + addi $inp,$out,15 # 15 is not typo + addi $out,$out,16 + bdnz Loop192 + + li $rounds,12 + addi $out,$out,0x20 + b Ldone + +.align 4 +L256: + lvx $tmp,0,$inp + li $cnt,7 + li $rounds,14 + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + stvx $stage,0,$out + addi $out,$out,16 + vperm $in1,$in1,$tmp,$key # align [and byte swap in LE] + mtctr $cnt + +Loop256: + vperm $key,$in1,$in1,$mask # rotate-n-splat + vsldoi $tmp,$zero,$in0,12 # >>32 + vperm $outtail,$in1,$in1,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + vcipherlast $key,$key,$rcon + stvx $stage,0,$out + addi $out,$out,16 + + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vadduwm $rcon,$rcon,$rcon + vxor $in0,$in0,$key + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + stvx $stage,0,$out + addi $inp,$out,15 # 15 is not typo + addi $out,$out,16 + bdz Ldone + + vspltw $key,$in0,3 # just splat + vsldoi $tmp,$zero,$in1,12 # >>32 + vsbox $key,$key + + vxor $in1,$in1,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in1,$in1,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in1,$in1,$tmp + + vxor $in1,$in1,$key + b Loop256 + +.align 4 +Ldone: + lvx $in1,0,$inp # redundant in aligned case + vsel $in1,$outhead,$in1,$outmask + stvx $in1,0,$inp + li $ptr,0 + mtspr 256,$vrsave + stw $rounds,0($out) + +Lenc_key_abort: + mr r3,$ptr + blr + .long 0 + .byte 0,12,0x14,1,0,0,3,0 + .long 0 +.size .${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key + +.globl .${prefix}_set_decrypt_key +.align 5 +.${prefix}_set_decrypt_key: + $STU $sp,-$FRAME($sp) + mflr r10 + $PUSH r10,$FRAME+$LRSAVE($sp) + bl Lset_encrypt_key + mtlr r10 + + cmpwi r3,0 + bne- Ldec_key_abort + + slwi $cnt,$rounds,4 + subi $inp,$out,240 # first round key + srwi $rounds,$rounds,1 + add $out,$inp,$cnt # last round key + mtctr $rounds + +Ldeckey: + lwz r0, 0($inp) + lwz r6, 4($inp) + lwz r7, 8($inp) + lwz r8, 12($inp) + addi $inp,$inp,16 + lwz r9, 0($out) + lwz r10,4($out) + lwz r11,8($out) + lwz r12,12($out) + stw r0, 0($out) + stw r6, 4($out) + stw r7, 8($out) + stw r8, 12($out) + subi $out,$out,16 + stw r9, -16($inp) + stw r10,-12($inp) + stw r11,-8($inp) + stw r12,-4($inp) + bdnz Ldeckey + + xor r3,r3,r3 # return value +Ldec_key_abort: + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,1,0x80,0,3,0 + .long 0 +.size .${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key +___ +}}} +######################################################################### +{{{ # Single block en- and decrypt procedures # +sub gen_block () { +my $dir = shift; +my $n = $dir eq "de" ? "n" : ""; +my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7)); + +$code.=<<___; +.globl .${prefix}_${dir}crypt +.align 5 +.${prefix}_${dir}crypt: + lwz $rounds,240($key) + lis r0,0xfc00 + mfspr $vrsave,256 + li $idx,15 # 15 is not typo + mtspr 256,r0 + + lvx v0,0,$inp + neg r11,$out + lvx v1,$idx,$inp + lvsl v2,0,$inp # inpperm + le?vspltisb v4,0x0f + ?lvsl v3,0,r11 # outperm + le?vxor v2,v2,v4 + li $idx,16 + vperm v0,v0,v1,v2 # align [and byte swap in LE] + lvx v1,0,$key + ?lvsl v5,0,$key # keyperm + srwi $rounds,$rounds,1 + lvx v2,$idx,$key + addi $idx,$idx,16 + subi $rounds,$rounds,1 + ?vperm v1,v1,v2,v5 # align round key + + vxor v0,v0,v1 + lvx v1,$idx,$key + addi $idx,$idx,16 + mtctr $rounds + +Loop_${dir}c: + ?vperm v2,v2,v1,v5 + v${n}cipher v0,v0,v2 + lvx v2,$idx,$key + addi $idx,$idx,16 + ?vperm v1,v1,v2,v5 + v${n}cipher v0,v0,v1 + lvx v1,$idx,$key + addi $idx,$idx,16 + bdnz Loop_${dir}c + + ?vperm v2,v2,v1,v5 + v${n}cipher v0,v0,v2 + lvx v2,$idx,$key + ?vperm v1,v1,v2,v5 + v${n}cipherlast v0,v0,v1 + + vspltisb v2,-1 + vxor v1,v1,v1 + li $idx,15 # 15 is not typo + ?vperm v2,v1,v2,v3 # outmask + le?vxor v3,v3,v4 + lvx v1,0,$out # outhead + vperm v0,v0,v0,v3 # rotate [and byte swap in LE] + vsel v1,v1,v0,v2 + lvx v4,$idx,$out + stvx v1,0,$out + vsel v0,v0,v4,v2 + stvx v0,$idx,$out + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 +.size .${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt +___ +} +&gen_block("en"); +&gen_block("de"); +}}} +######################################################################### +{{{ # CBC en- and decrypt procedures # +my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10)); +my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3)); +my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)= + map("v$_",(4..10)); +$code.=<<___; +.globl .${prefix}_cbc_encrypt +.align 5 +.${prefix}_cbc_encrypt: + ${UCMP}i $len,16 + bltlr- + + cmpwi $enc,0 # test direction + lis r0,0xffe0 + mfspr $vrsave,256 + mtspr 256,r0 + + li $idx,15 + vxor $rndkey0,$rndkey0,$rndkey0 + le?vspltisb $tmp,0x0f + + lvx $ivec,0,$ivp # load [unaligned] iv + lvsl $inpperm,0,$ivp + lvx $inptail,$idx,$ivp + le?vxor $inpperm,$inpperm,$tmp + vperm $ivec,$ivec,$inptail,$inpperm + + neg r11,$inp + ?lvsl $keyperm,0,$key # prepare for unaligned key + lwz $rounds,240($key) + + lvsr $inpperm,0,r11 # prepare for unaligned load + lvx $inptail,0,$inp + addi $inp,$inp,15 # 15 is not typo + le?vxor $inpperm,$inpperm,$tmp + + ?lvsr $outperm,0,$out # prepare for unaligned store + vspltisb $outmask,-1 + lvx $outhead,0,$out + ?vperm $outmask,$rndkey0,$outmask,$outperm + le?vxor $outperm,$outperm,$tmp + + srwi $rounds,$rounds,1 + li $idx,16 + subi $rounds,$rounds,1 + beq Lcbc_dec + +Lcbc_enc: + vmr $inout,$inptail + lvx $inptail,0,$inp + addi $inp,$inp,16 + mtctr $rounds + subi $len,$len,16 # len-=16 + + lvx $rndkey0,0,$key + vperm $inout,$inout,$inptail,$inpperm + lvx $rndkey1,$idx,$key + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key + addi $idx,$idx,16 + vxor $inout,$inout,$ivec + +Loop_cbc_enc: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipher $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key + addi $idx,$idx,16 + bdnz Loop_cbc_enc + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key + li $idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipherlast $ivec,$inout,$rndkey0 + ${UCMP}i $len,16 + + vperm $tmp,$ivec,$ivec,$outperm + vsel $inout,$outhead,$tmp,$outmask + vmr $outhead,$tmp + stvx $inout,0,$out + addi $out,$out,16 + bge Lcbc_enc + + b Lcbc_done + +.align 4 +Lcbc_dec: + ${UCMP}i $len,128 + bge _aesp8_cbc_decrypt8x + vmr $tmp,$inptail + lvx $inptail,0,$inp + addi $inp,$inp,16 + mtctr $rounds + subi $len,$len,16 # len-=16 + + lvx $rndkey0,0,$key + vperm $tmp,$tmp,$inptail,$inpperm + lvx $rndkey1,$idx,$key + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$tmp,$rndkey0 + lvx $rndkey0,$idx,$key + addi $idx,$idx,16 + +Loop_cbc_dec: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vncipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vncipher $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key + addi $idx,$idx,16 + bdnz Loop_cbc_dec + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vncipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key + li $idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vncipherlast $inout,$inout,$rndkey0 + ${UCMP}i $len,16 + + vxor $inout,$inout,$ivec + vmr $ivec,$tmp + vperm $tmp,$inout,$inout,$outperm + vsel $inout,$outhead,$tmp,$outmask + vmr $outhead,$tmp + stvx $inout,0,$out + addi $out,$out,16 + bge Lcbc_dec + +Lcbc_done: + addi $out,$out,-1 + lvx $inout,0,$out # redundant in aligned case + vsel $inout,$outhead,$inout,$outmask + stvx $inout,0,$out + + neg $enc,$ivp # write [unaligned] iv + li $idx,15 # 15 is not typo + vxor $rndkey0,$rndkey0,$rndkey0 + vspltisb $outmask,-1 + le?vspltisb $tmp,0x0f + ?lvsl $outperm,0,$enc + ?vperm $outmask,$rndkey0,$outmask,$outperm + le?vxor $outperm,$outperm,$tmp + lvx $outhead,0,$ivp + vperm $ivec,$ivec,$ivec,$outperm + vsel $inout,$outhead,$ivec,$outmask + lvx $inptail,$idx,$ivp + stvx $inout,0,$ivp + vsel $inout,$ivec,$inptail,$outmask + stvx $inout,$idx,$ivp + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,6,0 + .long 0 +___ +######################################################################### +{{ # Optimized CBC decrypt procedure # +my $key_="r11"; +my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31)); + $x00=0 if ($flavour =~ /osx/); +my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13)); +my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21)); +my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys + # v26-v31 last 6 round keys +my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment + +$code.=<<___; +.align 5 +_aesp8_cbc_decrypt8x: + $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) + li r10,`$FRAME+8*16+15` + li r11,`$FRAME+8*16+31` + stvx v20,r10,$sp # ABI says so + addi r10,r10,32 + stvx v21,r11,$sp + addi r11,r11,32 + stvx v22,r10,$sp + addi r10,r10,32 + stvx v23,r11,$sp + addi r11,r11,32 + stvx v24,r10,$sp + addi r10,r10,32 + stvx v25,r11,$sp + addi r11,r11,32 + stvx v26,r10,$sp + addi r10,r10,32 + stvx v27,r11,$sp + addi r11,r11,32 + stvx v28,r10,$sp + addi r10,r10,32 + stvx v29,r11,$sp + addi r11,r11,32 + stvx v30,r10,$sp + stvx v31,r11,$sp + li r0,-1 + stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave + li $x10,0x10 + $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) + li $x20,0x20 + $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) + li $x30,0x30 + $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) + li $x40,0x40 + $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) + li $x50,0x50 + $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) + li $x60,0x60 + $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) + li $x70,0x70 + mtspr 256,r0 + + subi $rounds,$rounds,3 # -4 in total + subi $len,$len,128 # bias + + lvx $rndkey0,$x00,$key # load key schedule + lvx v30,$x10,$key + addi $key,$key,0x20 + lvx v31,$x00,$key + ?vperm $rndkey0,$rndkey0,v30,$keyperm + addi $key_,$sp,$FRAME+15 + mtctr $rounds + +Load_cbc_dec_key: + ?vperm v24,v30,v31,$keyperm + lvx v30,$x10,$key + addi $key,$key,0x20 + stvx v24,$x00,$key_ # off-load round[1] + ?vperm v25,v31,v30,$keyperm + lvx v31,$x00,$key + stvx v25,$x10,$key_ # off-load round[2] + addi $key_,$key_,0x20 + bdnz Load_cbc_dec_key + + lvx v26,$x10,$key + ?vperm v24,v30,v31,$keyperm + lvx v27,$x20,$key + stvx v24,$x00,$key_ # off-load round[3] + ?vperm v25,v31,v26,$keyperm + lvx v28,$x30,$key + stvx v25,$x10,$key_ # off-load round[4] + addi $key_,$sp,$FRAME+15 # rewind $key_ + ?vperm v26,v26,v27,$keyperm + lvx v29,$x40,$key + ?vperm v27,v27,v28,$keyperm + lvx v30,$x50,$key + ?vperm v28,v28,v29,$keyperm + lvx v31,$x60,$key + ?vperm v29,v29,v30,$keyperm + lvx $out0,$x70,$key # borrow $out0 + ?vperm v30,v30,v31,$keyperm + lvx v24,$x00,$key_ # pre-load round[1] + ?vperm v31,v31,$out0,$keyperm + lvx v25,$x10,$key_ # pre-load round[2] + + #lvx $inptail,0,$inp # "caller" already did this + #addi $inp,$inp,15 # 15 is not typo + subi $inp,$inp,15 # undo "caller" + + le?li $idx,8 + lvx_u $in0,$x00,$inp # load first 8 "words" + le?lvsl $inpperm,0,$idx + le?vspltisb $tmp,0x0f + lvx_u $in1,$x10,$inp + le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u + lvx_u $in2,$x20,$inp + le?vperm $in0,$in0,$in0,$inpperm + lvx_u $in3,$x30,$inp + le?vperm $in1,$in1,$in1,$inpperm + lvx_u $in4,$x40,$inp + le?vperm $in2,$in2,$in2,$inpperm + vxor $out0,$in0,$rndkey0 + lvx_u $in5,$x50,$inp + le?vperm $in3,$in3,$in3,$inpperm + vxor $out1,$in1,$rndkey0 + lvx_u $in6,$x60,$inp + le?vperm $in4,$in4,$in4,$inpperm + vxor $out2,$in2,$rndkey0 + lvx_u $in7,$x70,$inp + addi $inp,$inp,0x80 + le?vperm $in5,$in5,$in5,$inpperm + vxor $out3,$in3,$rndkey0 + le?vperm $in6,$in6,$in6,$inpperm + vxor $out4,$in4,$rndkey0 + le?vperm $in7,$in7,$in7,$inpperm + vxor $out5,$in5,$rndkey0 + vxor $out6,$in6,$rndkey0 + vxor $out7,$in7,$rndkey0 + + mtctr $rounds + b Loop_cbc_dec8x +.align 5 +Loop_cbc_dec8x: + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + vncipher $out6,$out6,v24 + vncipher $out7,$out7,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + vncipher $out5,$out5,v25 + vncipher $out6,$out6,v25 + vncipher $out7,$out7,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_cbc_dec8x + + subic $len,$len,128 # $len-=128 + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + vncipher $out6,$out6,v24 + vncipher $out7,$out7,v24 + + subfe. r0,r0,r0 # borrow?-1:0 + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + vncipher $out5,$out5,v25 + vncipher $out6,$out6,v25 + vncipher $out7,$out7,v25 + + and r0,r0,$len + vncipher $out0,$out0,v26 + vncipher $out1,$out1,v26 + vncipher $out2,$out2,v26 + vncipher $out3,$out3,v26 + vncipher $out4,$out4,v26 + vncipher $out5,$out5,v26 + vncipher $out6,$out6,v26 + vncipher $out7,$out7,v26 + + add $inp,$inp,r0 # $inp is adjusted in such + # way that at exit from the + # loop inX-in7 are loaded + # with last "words" + vncipher $out0,$out0,v27 + vncipher $out1,$out1,v27 + vncipher $out2,$out2,v27 + vncipher $out3,$out3,v27 + vncipher $out4,$out4,v27 + vncipher $out5,$out5,v27 + vncipher $out6,$out6,v27 + vncipher $out7,$out7,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vncipher $out0,$out0,v28 + vncipher $out1,$out1,v28 + vncipher $out2,$out2,v28 + vncipher $out3,$out3,v28 + vncipher $out4,$out4,v28 + vncipher $out5,$out5,v28 + vncipher $out6,$out6,v28 + vncipher $out7,$out7,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + + vncipher $out0,$out0,v29 + vncipher $out1,$out1,v29 + vncipher $out2,$out2,v29 + vncipher $out3,$out3,v29 + vncipher $out4,$out4,v29 + vncipher $out5,$out5,v29 + vncipher $out6,$out6,v29 + vncipher $out7,$out7,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + + vncipher $out0,$out0,v30 + vxor $ivec,$ivec,v31 # xor with last round key + vncipher $out1,$out1,v30 + vxor $in0,$in0,v31 + vncipher $out2,$out2,v30 + vxor $in1,$in1,v31 + vncipher $out3,$out3,v30 + vxor $in2,$in2,v31 + vncipher $out4,$out4,v30 + vxor $in3,$in3,v31 + vncipher $out5,$out5,v30 + vxor $in4,$in4,v31 + vncipher $out6,$out6,v30 + vxor $in5,$in5,v31 + vncipher $out7,$out7,v30 + vxor $in6,$in6,v31 + + vncipherlast $out0,$out0,$ivec + vncipherlast $out1,$out1,$in0 + lvx_u $in0,$x00,$inp # load next input block + vncipherlast $out2,$out2,$in1 + lvx_u $in1,$x10,$inp + vncipherlast $out3,$out3,$in2 + le?vperm $in0,$in0,$in0,$inpperm + lvx_u $in2,$x20,$inp + vncipherlast $out4,$out4,$in3 + le?vperm $in1,$in1,$in1,$inpperm + lvx_u $in3,$x30,$inp + vncipherlast $out5,$out5,$in4 + le?vperm $in2,$in2,$in2,$inpperm + lvx_u $in4,$x40,$inp + vncipherlast $out6,$out6,$in5 + le?vperm $in3,$in3,$in3,$inpperm + lvx_u $in5,$x50,$inp + vncipherlast $out7,$out7,$in6 + le?vperm $in4,$in4,$in4,$inpperm + lvx_u $in6,$x60,$inp + vmr $ivec,$in7 + le?vperm $in5,$in5,$in5,$inpperm + lvx_u $in7,$x70,$inp + addi $inp,$inp,0x80 + + le?vperm $out0,$out0,$out0,$inpperm + le?vperm $out1,$out1,$out1,$inpperm + stvx_u $out0,$x00,$out + le?vperm $in6,$in6,$in6,$inpperm + vxor $out0,$in0,$rndkey0 + le?vperm $out2,$out2,$out2,$inpperm + stvx_u $out1,$x10,$out + le?vperm $in7,$in7,$in7,$inpperm + vxor $out1,$in1,$rndkey0 + le?vperm $out3,$out3,$out3,$inpperm + stvx_u $out2,$x20,$out + vxor $out2,$in2,$rndkey0 + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x30,$out + vxor $out3,$in3,$rndkey0 + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x40,$out + vxor $out4,$in4,$rndkey0 + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x50,$out + vxor $out5,$in5,$rndkey0 + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x60,$out + vxor $out6,$in6,$rndkey0 + stvx_u $out7,$x70,$out + addi $out,$out,0x80 + vxor $out7,$in7,$rndkey0 + + mtctr $rounds + beq Loop_cbc_dec8x # did $len-=128 borrow? + + addic. $len,$len,128 + beq Lcbc_dec8x_done + nop + nop + +Loop_cbc_dec8x_tail: # up to 7 "words" tail... + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + vncipher $out6,$out6,v24 + vncipher $out7,$out7,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + vncipher $out5,$out5,v25 + vncipher $out6,$out6,v25 + vncipher $out7,$out7,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_cbc_dec8x_tail + + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + vncipher $out6,$out6,v24 + vncipher $out7,$out7,v24 + + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + vncipher $out5,$out5,v25 + vncipher $out6,$out6,v25 + vncipher $out7,$out7,v25 + + vncipher $out1,$out1,v26 + vncipher $out2,$out2,v26 + vncipher $out3,$out3,v26 + vncipher $out4,$out4,v26 + vncipher $out5,$out5,v26 + vncipher $out6,$out6,v26 + vncipher $out7,$out7,v26 + + vncipher $out1,$out1,v27 + vncipher $out2,$out2,v27 + vncipher $out3,$out3,v27 + vncipher $out4,$out4,v27 + vncipher $out5,$out5,v27 + vncipher $out6,$out6,v27 + vncipher $out7,$out7,v27 + + vncipher $out1,$out1,v28 + vncipher $out2,$out2,v28 + vncipher $out3,$out3,v28 + vncipher $out4,$out4,v28 + vncipher $out5,$out5,v28 + vncipher $out6,$out6,v28 + vncipher $out7,$out7,v28 + + vncipher $out1,$out1,v29 + vncipher $out2,$out2,v29 + vncipher $out3,$out3,v29 + vncipher $out4,$out4,v29 + vncipher $out5,$out5,v29 + vncipher $out6,$out6,v29 + vncipher $out7,$out7,v29 + + vncipher $out1,$out1,v30 + vxor $ivec,$ivec,v31 # last round key + vncipher $out2,$out2,v30 + vxor $in1,$in1,v31 + vncipher $out3,$out3,v30 + vxor $in2,$in2,v31 + vncipher $out4,$out4,v30 + vxor $in3,$in3,v31 + vncipher $out5,$out5,v30 + vxor $in4,$in4,v31 + vncipher $out6,$out6,v30 + vxor $in5,$in5,v31 + vncipher $out7,$out7,v30 + vxor $in6,$in6,v31 + + cmplwi $len,32 # switch($len) + blt Lcbc_dec8x_one + nop + beq Lcbc_dec8x_two + cmplwi $len,64 + blt Lcbc_dec8x_three + nop + beq Lcbc_dec8x_four + cmplwi $len,96 + blt Lcbc_dec8x_five + nop + beq Lcbc_dec8x_six + +Lcbc_dec8x_seven: + vncipherlast $out1,$out1,$ivec + vncipherlast $out2,$out2,$in1 + vncipherlast $out3,$out3,$in2 + vncipherlast $out4,$out4,$in3 + vncipherlast $out5,$out5,$in4 + vncipherlast $out6,$out6,$in5 + vncipherlast $out7,$out7,$in6 + vmr $ivec,$in7 + + le?vperm $out1,$out1,$out1,$inpperm + le?vperm $out2,$out2,$out2,$inpperm + stvx_u $out1,$x00,$out + le?vperm $out3,$out3,$out3,$inpperm + stvx_u $out2,$x10,$out + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x20,$out + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x30,$out + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x40,$out + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x50,$out + stvx_u $out7,$x60,$out + addi $out,$out,0x70 + b Lcbc_dec8x_done + +.align 5 +Lcbc_dec8x_six: + vncipherlast $out2,$out2,$ivec + vncipherlast $out3,$out3,$in2 + vncipherlast $out4,$out4,$in3 + vncipherlast $out5,$out5,$in4 + vncipherlast $out6,$out6,$in5 + vncipherlast $out7,$out7,$in6 + vmr $ivec,$in7 + + le?vperm $out2,$out2,$out2,$inpperm + le?vperm $out3,$out3,$out3,$inpperm + stvx_u $out2,$x00,$out + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x10,$out + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x20,$out + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x30,$out + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x40,$out + stvx_u $out7,$x50,$out + addi $out,$out,0x60 + b Lcbc_dec8x_done + +.align 5 +Lcbc_dec8x_five: + vncipherlast $out3,$out3,$ivec + vncipherlast $out4,$out4,$in3 + vncipherlast $out5,$out5,$in4 + vncipherlast $out6,$out6,$in5 + vncipherlast $out7,$out7,$in6 + vmr $ivec,$in7 + + le?vperm $out3,$out3,$out3,$inpperm + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x00,$out + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x10,$out + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x20,$out + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x30,$out + stvx_u $out7,$x40,$out + addi $out,$out,0x50 + b Lcbc_dec8x_done + +.align 5 +Lcbc_dec8x_four: + vncipherlast $out4,$out4,$ivec + vncipherlast $out5,$out5,$in4 + vncipherlast $out6,$out6,$in5 + vncipherlast $out7,$out7,$in6 + vmr $ivec,$in7 + + le?vperm $out4,$out4,$out4,$inpperm + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x00,$out + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x10,$out + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x20,$out + stvx_u $out7,$x30,$out + addi $out,$out,0x40 + b Lcbc_dec8x_done + +.align 5 +Lcbc_dec8x_three: + vncipherlast $out5,$out5,$ivec + vncipherlast $out6,$out6,$in5 + vncipherlast $out7,$out7,$in6 + vmr $ivec,$in7 + + le?vperm $out5,$out5,$out5,$inpperm + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x00,$out + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x10,$out + stvx_u $out7,$x20,$out + addi $out,$out,0x30 + b Lcbc_dec8x_done + +.align 5 +Lcbc_dec8x_two: + vncipherlast $out6,$out6,$ivec + vncipherlast $out7,$out7,$in6 + vmr $ivec,$in7 + + le?vperm $out6,$out6,$out6,$inpperm + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x00,$out + stvx_u $out7,$x10,$out + addi $out,$out,0x20 + b Lcbc_dec8x_done + +.align 5 +Lcbc_dec8x_one: + vncipherlast $out7,$out7,$ivec + vmr $ivec,$in7 + + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out7,0,$out + addi $out,$out,0x10 + +Lcbc_dec8x_done: + le?vperm $ivec,$ivec,$ivec,$inpperm + stvx_u $ivec,0,$ivp # write [unaligned] iv + + li r10,`$FRAME+15` + li r11,`$FRAME+31` + stvx $inpperm,r10,$sp # wipe copies of round keys + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + stvx $inpperm,r10,$sp + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + stvx $inpperm,r10,$sp + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + stvx $inpperm,r10,$sp + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + + mtspr 256,$vrsave + lvx v20,r10,$sp # ABI says so + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) + $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) + $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) + $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) + $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) + $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) + addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` + blr + .long 0 + .byte 0,12,0x04,0,0x80,6,6,0 + .long 0 +.size .${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt +___ +}} }}} + +######################################################################### +{{{ # CTR procedure[s] # +my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10)); +my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3)); +my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)= + map("v$_",(4..11)); +my $dat=$tmp; + +$code.=<<___; +.globl .${prefix}_ctr32_encrypt_blocks +.align 5 +.${prefix}_ctr32_encrypt_blocks: + ${UCMP}i $len,1 + bltlr- + + lis r0,0xfff0 + mfspr $vrsave,256 + mtspr 256,r0 + + li $idx,15 + vxor $rndkey0,$rndkey0,$rndkey0 + le?vspltisb $tmp,0x0f + + lvx $ivec,0,$ivp # load [unaligned] iv + lvsl $inpperm,0,$ivp + lvx $inptail,$idx,$ivp + vspltisb $one,1 + le?vxor $inpperm,$inpperm,$tmp + vperm $ivec,$ivec,$inptail,$inpperm + vsldoi $one,$rndkey0,$one,1 + + neg r11,$inp + ?lvsl $keyperm,0,$key # prepare for unaligned key + lwz $rounds,240($key) + + lvsr $inpperm,0,r11 # prepare for unaligned load + lvx $inptail,0,$inp + addi $inp,$inp,15 # 15 is not typo + le?vxor $inpperm,$inpperm,$tmp + + srwi $rounds,$rounds,1 + li $idx,16 + subi $rounds,$rounds,1 + + ${UCMP}i $len,8 + bge _aesp8_ctr32_encrypt8x + + ?lvsr $outperm,0,$out # prepare for unaligned store + vspltisb $outmask,-1 + lvx $outhead,0,$out + ?vperm $outmask,$rndkey0,$outmask,$outperm + le?vxor $outperm,$outperm,$tmp + + lvx $rndkey0,0,$key + mtctr $rounds + lvx $rndkey1,$idx,$key + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$ivec,$rndkey0 + lvx $rndkey0,$idx,$key + addi $idx,$idx,16 + b Loop_ctr32_enc + +.align 5 +Loop_ctr32_enc: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipher $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key + addi $idx,$idx,16 + bdnz Loop_ctr32_enc + + vadduwm $ivec,$ivec,$one + vmr $dat,$inptail + lvx $inptail,0,$inp + addi $inp,$inp,16 + subic. $len,$len,1 # blocks-- + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key + vperm $dat,$dat,$inptail,$inpperm + li $idx,16 + ?vperm $rndkey1,$rndkey0,$rndkey1,$keyperm + lvx $rndkey0,0,$key + vxor $dat,$dat,$rndkey1 # last round key + vcipherlast $inout,$inout,$dat + + lvx $rndkey1,$idx,$key + addi $idx,$idx,16 + vperm $inout,$inout,$inout,$outperm + vsel $dat,$outhead,$inout,$outmask + mtctr $rounds + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vmr $outhead,$inout + vxor $inout,$ivec,$rndkey0 + lvx $rndkey0,$idx,$key + addi $idx,$idx,16 + stvx $dat,0,$out + addi $out,$out,16 + bne Loop_ctr32_enc + + addi $out,$out,-1 + lvx $inout,0,$out # redundant in aligned case + vsel $inout,$outhead,$inout,$outmask + stvx $inout,0,$out + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,6,0 + .long 0 +___ +######################################################################### +{{ # Optimized CTR procedure # +my $key_="r11"; +my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31)); + $x00=0 if ($flavour =~ /osx/); +my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14)); +my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22)); +my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys + # v26-v31 last 6 round keys +my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment +my ($two,$three,$four)=($outhead,$outperm,$outmask); + +$code.=<<___; +.align 5 +_aesp8_ctr32_encrypt8x: + $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) + li r10,`$FRAME+8*16+15` + li r11,`$FRAME+8*16+31` + stvx v20,r10,$sp # ABI says so + addi r10,r10,32 + stvx v21,r11,$sp + addi r11,r11,32 + stvx v22,r10,$sp + addi r10,r10,32 + stvx v23,r11,$sp + addi r11,r11,32 + stvx v24,r10,$sp + addi r10,r10,32 + stvx v25,r11,$sp + addi r11,r11,32 + stvx v26,r10,$sp + addi r10,r10,32 + stvx v27,r11,$sp + addi r11,r11,32 + stvx v28,r10,$sp + addi r10,r10,32 + stvx v29,r11,$sp + addi r11,r11,32 + stvx v30,r10,$sp + stvx v31,r11,$sp + li r0,-1 + stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave + li $x10,0x10 + $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) + li $x20,0x20 + $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) + li $x30,0x30 + $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) + li $x40,0x40 + $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) + li $x50,0x50 + $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) + li $x60,0x60 + $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) + li $x70,0x70 + mtspr 256,r0 + + subi $rounds,$rounds,3 # -4 in total + + lvx $rndkey0,$x00,$key # load key schedule + lvx v30,$x10,$key + addi $key,$key,0x20 + lvx v31,$x00,$key + ?vperm $rndkey0,$rndkey0,v30,$keyperm + addi $key_,$sp,$FRAME+15 + mtctr $rounds + +Load_ctr32_enc_key: + ?vperm v24,v30,v31,$keyperm + lvx v30,$x10,$key + addi $key,$key,0x20 + stvx v24,$x00,$key_ # off-load round[1] + ?vperm v25,v31,v30,$keyperm + lvx v31,$x00,$key + stvx v25,$x10,$key_ # off-load round[2] + addi $key_,$key_,0x20 + bdnz Load_ctr32_enc_key + + lvx v26,$x10,$key + ?vperm v24,v30,v31,$keyperm + lvx v27,$x20,$key + stvx v24,$x00,$key_ # off-load round[3] + ?vperm v25,v31,v26,$keyperm + lvx v28,$x30,$key + stvx v25,$x10,$key_ # off-load round[4] + addi $key_,$sp,$FRAME+15 # rewind $key_ + ?vperm v26,v26,v27,$keyperm + lvx v29,$x40,$key + ?vperm v27,v27,v28,$keyperm + lvx v30,$x50,$key + ?vperm v28,v28,v29,$keyperm + lvx v31,$x60,$key + ?vperm v29,v29,v30,$keyperm + lvx $out0,$x70,$key # borrow $out0 + ?vperm v30,v30,v31,$keyperm + lvx v24,$x00,$key_ # pre-load round[1] + ?vperm v31,v31,$out0,$keyperm + lvx v25,$x10,$key_ # pre-load round[2] + + vadduwm $two,$one,$one + subi $inp,$inp,15 # undo "caller" + $SHL $len,$len,4 + + vadduwm $out1,$ivec,$one # counter values ... + vadduwm $out2,$ivec,$two + vxor $out0,$ivec,$rndkey0 # ... xored with rndkey[0] + le?li $idx,8 + vadduwm $out3,$out1,$two + vxor $out1,$out1,$rndkey0 + le?lvsl $inpperm,0,$idx + vadduwm $out4,$out2,$two + vxor $out2,$out2,$rndkey0 + le?vspltisb $tmp,0x0f + vadduwm $out5,$out3,$two + vxor $out3,$out3,$rndkey0 + le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u + vadduwm $out6,$out4,$two + vxor $out4,$out4,$rndkey0 + vadduwm $out7,$out5,$two + vxor $out5,$out5,$rndkey0 + vadduwm $ivec,$out6,$two # next counter value + vxor $out6,$out6,$rndkey0 + vxor $out7,$out7,$rndkey0 + + mtctr $rounds + b Loop_ctr32_enc8x +.align 5 +Loop_ctr32_enc8x: + vcipher $out0,$out0,v24 + vcipher $out1,$out1,v24 + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 + vcipher $out4,$out4,v24 + vcipher $out5,$out5,v24 + vcipher $out6,$out6,v24 + vcipher $out7,$out7,v24 +Loop_ctr32_enc8x_middle: + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vcipher $out4,$out4,v25 + vcipher $out5,$out5,v25 + vcipher $out6,$out6,v25 + vcipher $out7,$out7,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_ctr32_enc8x + + subic r11,$len,256 # $len-256, borrow $key_ + vcipher $out0,$out0,v24 + vcipher $out1,$out1,v24 + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 + vcipher $out4,$out4,v24 + vcipher $out5,$out5,v24 + vcipher $out6,$out6,v24 + vcipher $out7,$out7,v24 + + subfe r0,r0,r0 # borrow?-1:0 + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vcipher $out4,$out4,v25 + vcipher $out5,$out5,v25 + vcipher $out6,$out6,v25 + vcipher $out7,$out7,v25 + + and r0,r0,r11 + addi $key_,$sp,$FRAME+15 # rewind $key_ + vcipher $out0,$out0,v26 + vcipher $out1,$out1,v26 + vcipher $out2,$out2,v26 + vcipher $out3,$out3,v26 + vcipher $out4,$out4,v26 + vcipher $out5,$out5,v26 + vcipher $out6,$out6,v26 + vcipher $out7,$out7,v26 + lvx v24,$x00,$key_ # re-pre-load round[1] + + subic $len,$len,129 # $len-=129 + vcipher $out0,$out0,v27 + addi $len,$len,1 # $len-=128 really + vcipher $out1,$out1,v27 + vcipher $out2,$out2,v27 + vcipher $out3,$out3,v27 + vcipher $out4,$out4,v27 + vcipher $out5,$out5,v27 + vcipher $out6,$out6,v27 + vcipher $out7,$out7,v27 + lvx v25,$x10,$key_ # re-pre-load round[2] + + vcipher $out0,$out0,v28 + lvx_u $in0,$x00,$inp # load input + vcipher $out1,$out1,v28 + lvx_u $in1,$x10,$inp + vcipher $out2,$out2,v28 + lvx_u $in2,$x20,$inp + vcipher $out3,$out3,v28 + lvx_u $in3,$x30,$inp + vcipher $out4,$out4,v28 + lvx_u $in4,$x40,$inp + vcipher $out5,$out5,v28 + lvx_u $in5,$x50,$inp + vcipher $out6,$out6,v28 + lvx_u $in6,$x60,$inp + vcipher $out7,$out7,v28 + lvx_u $in7,$x70,$inp + addi $inp,$inp,0x80 + + vcipher $out0,$out0,v29 + le?vperm $in0,$in0,$in0,$inpperm + vcipher $out1,$out1,v29 + le?vperm $in1,$in1,$in1,$inpperm + vcipher $out2,$out2,v29 + le?vperm $in2,$in2,$in2,$inpperm + vcipher $out3,$out3,v29 + le?vperm $in3,$in3,$in3,$inpperm + vcipher $out4,$out4,v29 + le?vperm $in4,$in4,$in4,$inpperm + vcipher $out5,$out5,v29 + le?vperm $in5,$in5,$in5,$inpperm + vcipher $out6,$out6,v29 + le?vperm $in6,$in6,$in6,$inpperm + vcipher $out7,$out7,v29 + le?vperm $in7,$in7,$in7,$inpperm + + add $inp,$inp,r0 # $inp is adjusted in such + # way that at exit from the + # loop inX-in7 are loaded + # with last "words" + subfe. r0,r0,r0 # borrow?-1:0 + vcipher $out0,$out0,v30 + vxor $in0,$in0,v31 # xor with last round key + vcipher $out1,$out1,v30 + vxor $in1,$in1,v31 + vcipher $out2,$out2,v30 + vxor $in2,$in2,v31 + vcipher $out3,$out3,v30 + vxor $in3,$in3,v31 + vcipher $out4,$out4,v30 + vxor $in4,$in4,v31 + vcipher $out5,$out5,v30 + vxor $in5,$in5,v31 + vcipher $out6,$out6,v30 + vxor $in6,$in6,v31 + vcipher $out7,$out7,v30 + vxor $in7,$in7,v31 + + bne Lctr32_enc8x_break # did $len-129 borrow? + + vcipherlast $in0,$out0,$in0 + vcipherlast $in1,$out1,$in1 + vadduwm $out1,$ivec,$one # counter values ... + vcipherlast $in2,$out2,$in2 + vadduwm $out2,$ivec,$two + vxor $out0,$ivec,$rndkey0 # ... xored with rndkey[0] + vcipherlast $in3,$out3,$in3 + vadduwm $out3,$out1,$two + vxor $out1,$out1,$rndkey0 + vcipherlast $in4,$out4,$in4 + vadduwm $out4,$out2,$two + vxor $out2,$out2,$rndkey0 + vcipherlast $in5,$out5,$in5 + vadduwm $out5,$out3,$two + vxor $out3,$out3,$rndkey0 + vcipherlast $in6,$out6,$in6 + vadduwm $out6,$out4,$two + vxor $out4,$out4,$rndkey0 + vcipherlast $in7,$out7,$in7 + vadduwm $out7,$out5,$two + vxor $out5,$out5,$rndkey0 + le?vperm $in0,$in0,$in0,$inpperm + vadduwm $ivec,$out6,$two # next counter value + vxor $out6,$out6,$rndkey0 + le?vperm $in1,$in1,$in1,$inpperm + vxor $out7,$out7,$rndkey0 + mtctr $rounds + + vcipher $out0,$out0,v24 + stvx_u $in0,$x00,$out + le?vperm $in2,$in2,$in2,$inpperm + vcipher $out1,$out1,v24 + stvx_u $in1,$x10,$out + le?vperm $in3,$in3,$in3,$inpperm + vcipher $out2,$out2,v24 + stvx_u $in2,$x20,$out + le?vperm $in4,$in4,$in4,$inpperm + vcipher $out3,$out3,v24 + stvx_u $in3,$x30,$out + le?vperm $in5,$in5,$in5,$inpperm + vcipher $out4,$out4,v24 + stvx_u $in4,$x40,$out + le?vperm $in6,$in6,$in6,$inpperm + vcipher $out5,$out5,v24 + stvx_u $in5,$x50,$out + le?vperm $in7,$in7,$in7,$inpperm + vcipher $out6,$out6,v24 + stvx_u $in6,$x60,$out + vcipher $out7,$out7,v24 + stvx_u $in7,$x70,$out + addi $out,$out,0x80 + + b Loop_ctr32_enc8x_middle + +.align 5 +Lctr32_enc8x_break: + cmpwi $len,-0x60 + blt Lctr32_enc8x_one + nop + beq Lctr32_enc8x_two + cmpwi $len,-0x40 + blt Lctr32_enc8x_three + nop + beq Lctr32_enc8x_four + cmpwi $len,-0x20 + blt Lctr32_enc8x_five + nop + beq Lctr32_enc8x_six + cmpwi $len,0x00 + blt Lctr32_enc8x_seven + +Lctr32_enc8x_eight: + vcipherlast $out0,$out0,$in0 + vcipherlast $out1,$out1,$in1 + vcipherlast $out2,$out2,$in2 + vcipherlast $out3,$out3,$in3 + vcipherlast $out4,$out4,$in4 + vcipherlast $out5,$out5,$in5 + vcipherlast $out6,$out6,$in6 + vcipherlast $out7,$out7,$in7 + + le?vperm $out0,$out0,$out0,$inpperm + le?vperm $out1,$out1,$out1,$inpperm + stvx_u $out0,$x00,$out + le?vperm $out2,$out2,$out2,$inpperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$inpperm + stvx_u $out2,$x20,$out + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x30,$out + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x40,$out + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x50,$out + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x60,$out + stvx_u $out7,$x70,$out + addi $out,$out,0x80 + b Lctr32_enc8x_done + +.align 5 +Lctr32_enc8x_seven: + vcipherlast $out0,$out0,$in1 + vcipherlast $out1,$out1,$in2 + vcipherlast $out2,$out2,$in3 + vcipherlast $out3,$out3,$in4 + vcipherlast $out4,$out4,$in5 + vcipherlast $out5,$out5,$in6 + vcipherlast $out6,$out6,$in7 + + le?vperm $out0,$out0,$out0,$inpperm + le?vperm $out1,$out1,$out1,$inpperm + stvx_u $out0,$x00,$out + le?vperm $out2,$out2,$out2,$inpperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$inpperm + stvx_u $out2,$x20,$out + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x30,$out + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x40,$out + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x50,$out + stvx_u $out6,$x60,$out + addi $out,$out,0x70 + b Lctr32_enc8x_done + +.align 5 +Lctr32_enc8x_six: + vcipherlast $out0,$out0,$in2 + vcipherlast $out1,$out1,$in3 + vcipherlast $out2,$out2,$in4 + vcipherlast $out3,$out3,$in5 + vcipherlast $out4,$out4,$in6 + vcipherlast $out5,$out5,$in7 + + le?vperm $out0,$out0,$out0,$inpperm + le?vperm $out1,$out1,$out1,$inpperm + stvx_u $out0,$x00,$out + le?vperm $out2,$out2,$out2,$inpperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$inpperm + stvx_u $out2,$x20,$out + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x30,$out + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x40,$out + stvx_u $out5,$x50,$out + addi $out,$out,0x60 + b Lctr32_enc8x_done + +.align 5 +Lctr32_enc8x_five: + vcipherlast $out0,$out0,$in3 + vcipherlast $out1,$out1,$in4 + vcipherlast $out2,$out2,$in5 + vcipherlast $out3,$out3,$in6 + vcipherlast $out4,$out4,$in7 + + le?vperm $out0,$out0,$out0,$inpperm + le?vperm $out1,$out1,$out1,$inpperm + stvx_u $out0,$x00,$out + le?vperm $out2,$out2,$out2,$inpperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$inpperm + stvx_u $out2,$x20,$out + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x30,$out + stvx_u $out4,$x40,$out + addi $out,$out,0x50 + b Lctr32_enc8x_done + +.align 5 +Lctr32_enc8x_four: + vcipherlast $out0,$out0,$in4 + vcipherlast $out1,$out1,$in5 + vcipherlast $out2,$out2,$in6 + vcipherlast $out3,$out3,$in7 + + le?vperm $out0,$out0,$out0,$inpperm + le?vperm $out1,$out1,$out1,$inpperm + stvx_u $out0,$x00,$out + le?vperm $out2,$out2,$out2,$inpperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$inpperm + stvx_u $out2,$x20,$out + stvx_u $out3,$x30,$out + addi $out,$out,0x40 + b Lctr32_enc8x_done + +.align 5 +Lctr32_enc8x_three: + vcipherlast $out0,$out0,$in5 + vcipherlast $out1,$out1,$in6 + vcipherlast $out2,$out2,$in7 + + le?vperm $out0,$out0,$out0,$inpperm + le?vperm $out1,$out1,$out1,$inpperm + stvx_u $out0,$x00,$out + le?vperm $out2,$out2,$out2,$inpperm + stvx_u $out1,$x10,$out + stvx_u $out2,$x20,$out + addi $out,$out,0x30 + b Lcbc_dec8x_done + +.align 5 +Lctr32_enc8x_two: + vcipherlast $out0,$out0,$in6 + vcipherlast $out1,$out1,$in7 + + le?vperm $out0,$out0,$out0,$inpperm + le?vperm $out1,$out1,$out1,$inpperm + stvx_u $out0,$x00,$out + stvx_u $out1,$x10,$out + addi $out,$out,0x20 + b Lcbc_dec8x_done + +.align 5 +Lctr32_enc8x_one: + vcipherlast $out0,$out0,$in7 + + le?vperm $out0,$out0,$out0,$inpperm + stvx_u $out0,0,$out + addi $out,$out,0x10 + +Lctr32_enc8x_done: + li r10,`$FRAME+15` + li r11,`$FRAME+31` + stvx $inpperm,r10,$sp # wipe copies of round keys + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + stvx $inpperm,r10,$sp + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + stvx $inpperm,r10,$sp + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + stvx $inpperm,r10,$sp + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + + mtspr 256,$vrsave + lvx v20,r10,$sp # ABI says so + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) + $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) + $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) + $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) + $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) + $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) + addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` + blr + .long 0 + .byte 0,12,0x04,0,0x80,6,6,0 + .long 0 +.size .${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks +___ +}} }}} + +my $consts=1; +foreach(split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/geo; + + # constants table endian-specific conversion + if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) { + my $conv=$3; + my @bytes=(); + + # convert to endian-agnostic format + if ($1 eq "long") { + foreach (split(/,\s*/,$2)) { + my $l = /^0/?oct:int; + push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff; + } + } else { + @bytes = map(/^0/?oct:int,split(/,\s*/,$2)); + } + + # little-endian conversion + if ($flavour =~ /le$/o) { + SWITCH: for($conv) { + /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; }; + /\?rev/ && do { @bytes=reverse(@bytes); last; }; + } + } + + #emit + print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n"; + next; + } + $consts=0 if (m/Lconsts:/o); # end of table + + # instructions prefixed with '?' are endian-specific and need + # to be adjusted accordingly... + if ($flavour =~ /le$/o) { # little-endian + s/le\?//o or + s/be\?/#be#/o or + s/\?lvsr/lvsl/o or + s/\?lvsl/lvsr/o or + s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or + s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or + s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o; + } else { # big-endian + s/le\?/#le#/o or + s/be\?//o or + s/\?([a-z]+)/$1/o; + } + + print $_,"\n"; +} + +close STDOUT; diff --git a/openssl/crypto/aes/asm/aest4-sparcv9.pl b/openssl/crypto/aes/asm/aest4-sparcv9.pl new file mode 100755 index 000000000..536f23b47 --- /dev/null +++ b/openssl/crypto/aes/asm/aest4-sparcv9.pl @@ -0,0 +1,919 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by David S. Miller and Andy Polyakov +# . The module is licensed under 2-clause BSD +# license. October 2012. All rights reserved. +# ==================================================================== + +###################################################################### +# AES for SPARC T4. +# +# AES round instructions complete in 3 cycles and can be issued every +# cycle. It means that round calculations should take 4*rounds cycles, +# because any given round instruction depends on result of *both* +# previous instructions: +# +# |0 |1 |2 |3 |4 +# |01|01|01| +# |23|23|23| +# |01|01|... +# |23|... +# +# Provided that fxor [with IV] takes 3 cycles to complete, critical +# path length for CBC encrypt would be 3+4*rounds, or in other words +# it should process one byte in at least (3+4*rounds)/16 cycles. This +# estimate doesn't account for "collateral" instructions, such as +# fetching input from memory, xor-ing it with zero-round key and +# storing the result. Yet, *measured* performance [for data aligned +# at 64-bit boundary!] deviates from this equation by less than 0.5%: +# +# 128-bit key 192- 256- +# CBC encrypt 2.70/2.90(*) 3.20/3.40 3.70/3.90 +# (*) numbers after slash are for +# misaligned data. +# +# Out-of-order execution logic managed to fully overlap "collateral" +# instructions with those on critical path. Amazing! +# +# As with Intel AES-NI, question is if it's possible to improve +# performance of parallelizeable modes by interleaving round +# instructions. Provided round instruction latency and throughput +# optimal interleave factor is 2. But can we expect 2x performance +# improvement? Well, as round instructions can be issued one per +# cycle, they don't saturate the 2-way issue pipeline and therefore +# there is room for "collateral" calculations... Yet, 2x speed-up +# over CBC encrypt remains unattaintable: +# +# 128-bit key 192- 256- +# CBC decrypt 1.64/2.11 1.89/2.37 2.23/2.61 +# CTR 1.64/2.08(*) 1.89/2.33 2.23/2.61 +# (*) numbers after slash are for +# misaligned data. +# +# Estimates based on amount of instructions under assumption that +# round instructions are not pairable with any other instruction +# suggest that latter is the actual case and pipeline runs +# underutilized. It should be noted that T4 out-of-order execution +# logic is so capable that performance gain from 2x interleave is +# not even impressive, ~7-13% over non-interleaved code, largest +# for 256-bit keys. + +# To anchor to something else, software implementation processes +# one byte in 29 cycles with 128-bit key on same processor. Intel +# Sandy Bridge encrypts byte in 5.07 cycles in CBC mode and decrypts +# in 0.93, naturally with AES-NI. + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "sparcv9_modes.pl"; + +&asm_init(@ARGV); + +$::evp=1; # if $evp is set to 0, script generates module with +# AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry +# points. These however are not fully compatible with openssl/aes.h, +# because they expect AES_KEY to be aligned at 64-bit boundary. When +# used through EVP, alignment is arranged at EVP layer. Second thing +# that is arranged by EVP is at least 32-bit alignment of IV. + +###################################################################### +# single-round subroutines +# +{ +my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5)); + +$code.=<<___ if ($::abibits==64); +.register %g2,#scratch +.register %g3,#scratch + +___ +$code.=<<___; +.text + +.globl aes_t4_encrypt +.align 32 +aes_t4_encrypt: + andcc $inp, 7, %g1 ! is input aligned? + andn $inp, 7, $inp + + ldx [$key + 0], %g4 + ldx [$key + 8], %g5 + + ldx [$inp + 0], %o4 + bz,pt %icc, 1f + ldx [$inp + 8], %o5 + ldx [$inp + 16], $inp + sll %g1, 3, %g1 + sub %g0, %g1, %o3 + sllx %o4, %g1, %o4 + sllx %o5, %g1, %g1 + srlx %o5, %o3, %o5 + srlx $inp, %o3, %o3 + or %o5, %o4, %o4 + or %o3, %g1, %o5 +1: + ld [$key + 240], $rounds + ldd [$key + 16], %f12 + ldd [$key + 24], %f14 + xor %g4, %o4, %o4 + xor %g5, %o5, %o5 + movxtod %o4, %f0 + movxtod %o5, %f2 + srl $rounds, 1, $rounds + ldd [$key + 32], %f16 + sub $rounds, 1, $rounds + ldd [$key + 40], %f18 + add $key, 48, $key + +.Lenc: + aes_eround01 %f12, %f0, %f2, %f4 + aes_eround23 %f14, %f0, %f2, %f2 + ldd [$key + 0], %f12 + ldd [$key + 8], %f14 + sub $rounds,1,$rounds + aes_eround01 %f16, %f4, %f2, %f0 + aes_eround23 %f18, %f4, %f2, %f2 + ldd [$key + 16], %f16 + ldd [$key + 24], %f18 + brnz,pt $rounds, .Lenc + add $key, 32, $key + + andcc $out, 7, $tmp ! is output aligned? + aes_eround01 %f12, %f0, %f2, %f4 + aes_eround23 %f14, %f0, %f2, %f2 + aes_eround01_l %f16, %f4, %f2, %f0 + aes_eround23_l %f18, %f4, %f2, %f2 + + bnz,pn %icc, 2f + nop + + std %f0, [$out + 0] + retl + std %f2, [$out + 8] + +2: alignaddrl $out, %g0, $out + mov 0xff, $mask + srl $mask, $tmp, $mask + + faligndata %f0, %f0, %f4 + faligndata %f0, %f2, %f6 + faligndata %f2, %f2, %f8 + + stda %f4, [$out + $mask]0xc0 ! partial store + std %f6, [$out + 8] + add $out, 16, $out + orn %g0, $mask, $mask + retl + stda %f8, [$out + $mask]0xc0 ! partial store +.type aes_t4_encrypt,#function +.size aes_t4_encrypt,.-aes_t4_encrypt + +.globl aes_t4_decrypt +.align 32 +aes_t4_decrypt: + andcc $inp, 7, %g1 ! is input aligned? + andn $inp, 7, $inp + + ldx [$key + 0], %g4 + ldx [$key + 8], %g5 + + ldx [$inp + 0], %o4 + bz,pt %icc, 1f + ldx [$inp + 8], %o5 + ldx [$inp + 16], $inp + sll %g1, 3, %g1 + sub %g0, %g1, %o3 + sllx %o4, %g1, %o4 + sllx %o5, %g1, %g1 + srlx %o5, %o3, %o5 + srlx $inp, %o3, %o3 + or %o5, %o4, %o4 + or %o3, %g1, %o5 +1: + ld [$key + 240], $rounds + ldd [$key + 16], %f12 + ldd [$key + 24], %f14 + xor %g4, %o4, %o4 + xor %g5, %o5, %o5 + movxtod %o4, %f0 + movxtod %o5, %f2 + srl $rounds, 1, $rounds + ldd [$key + 32], %f16 + sub $rounds, 1, $rounds + ldd [$key + 40], %f18 + add $key, 48, $key + +.Ldec: + aes_dround01 %f12, %f0, %f2, %f4 + aes_dround23 %f14, %f0, %f2, %f2 + ldd [$key + 0], %f12 + ldd [$key + 8], %f14 + sub $rounds,1,$rounds + aes_dround01 %f16, %f4, %f2, %f0 + aes_dround23 %f18, %f4, %f2, %f2 + ldd [$key + 16], %f16 + ldd [$key + 24], %f18 + brnz,pt $rounds, .Ldec + add $key, 32, $key + + andcc $out, 7, $tmp ! is output aligned? + aes_dround01 %f12, %f0, %f2, %f4 + aes_dround23 %f14, %f0, %f2, %f2 + aes_dround01_l %f16, %f4, %f2, %f0 + aes_dround23_l %f18, %f4, %f2, %f2 + + bnz,pn %icc, 2f + nop + + std %f0, [$out + 0] + retl + std %f2, [$out + 8] + +2: alignaddrl $out, %g0, $out + mov 0xff, $mask + srl $mask, $tmp, $mask + + faligndata %f0, %f0, %f4 + faligndata %f0, %f2, %f6 + faligndata %f2, %f2, %f8 + + stda %f4, [$out + $mask]0xc0 ! partial store + std %f6, [$out + 8] + add $out, 16, $out + orn %g0, $mask, $mask + retl + stda %f8, [$out + $mask]0xc0 ! partial store +.type aes_t4_decrypt,#function +.size aes_t4_decrypt,.-aes_t4_decrypt +___ +} + +###################################################################### +# key setup subroutines +# +{ +my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5)); +$code.=<<___; +.globl aes_t4_set_encrypt_key +.align 32 +aes_t4_set_encrypt_key: +.Lset_encrypt_key: + and $inp, 7, $tmp + alignaddr $inp, %g0, $inp + cmp $bits, 192 + ldd [$inp + 0], %f0 + bl,pt %icc,.L128 + ldd [$inp + 8], %f2 + + be,pt %icc,.L192 + ldd [$inp + 16], %f4 + brz,pt $tmp, .L256aligned + ldd [$inp + 24], %f6 + + ldd [$inp + 32], %f8 + faligndata %f0, %f2, %f0 + faligndata %f2, %f4, %f2 + faligndata %f4, %f6, %f4 + faligndata %f6, %f8, %f6 +.L256aligned: +___ +for ($i=0; $i<6; $i++) { + $code.=<<___; + std %f0, [$out + `32*$i+0`] + aes_kexpand1 %f0, %f6, $i, %f0 + std %f2, [$out + `32*$i+8`] + aes_kexpand2 %f2, %f0, %f2 + std %f4, [$out + `32*$i+16`] + aes_kexpand0 %f4, %f2, %f4 + std %f6, [$out + `32*$i+24`] + aes_kexpand2 %f6, %f4, %f6 +___ +} +$code.=<<___; + std %f0, [$out + `32*$i+0`] + aes_kexpand1 %f0, %f6, $i, %f0 + std %f2, [$out + `32*$i+8`] + aes_kexpand2 %f2, %f0, %f2 + std %f4, [$out + `32*$i+16`] + std %f6, [$out + `32*$i+24`] + std %f0, [$out + `32*$i+32`] + std %f2, [$out + `32*$i+40`] + + mov 14, $tmp + st $tmp, [$out + 240] + retl + xor %o0, %o0, %o0 + +.align 16 +.L192: + brz,pt $tmp, .L192aligned + nop + + ldd [$inp + 24], %f6 + faligndata %f0, %f2, %f0 + faligndata %f2, %f4, %f2 + faligndata %f4, %f6, %f4 +.L192aligned: +___ +for ($i=0; $i<7; $i++) { + $code.=<<___; + std %f0, [$out + `24*$i+0`] + aes_kexpand1 %f0, %f4, $i, %f0 + std %f2, [$out + `24*$i+8`] + aes_kexpand2 %f2, %f0, %f2 + std %f4, [$out + `24*$i+16`] + aes_kexpand2 %f4, %f2, %f4 +___ +} +$code.=<<___; + std %f0, [$out + `24*$i+0`] + aes_kexpand1 %f0, %f4, $i, %f0 + std %f2, [$out + `24*$i+8`] + aes_kexpand2 %f2, %f0, %f2 + std %f4, [$out + `24*$i+16`] + std %f0, [$out + `24*$i+24`] + std %f2, [$out + `24*$i+32`] + + mov 12, $tmp + st $tmp, [$out + 240] + retl + xor %o0, %o0, %o0 + +.align 16 +.L128: + brz,pt $tmp, .L128aligned + nop + + ldd [$inp + 16], %f4 + faligndata %f0, %f2, %f0 + faligndata %f2, %f4, %f2 +.L128aligned: +___ +for ($i=0; $i<10; $i++) { + $code.=<<___; + std %f0, [$out + `16*$i+0`] + aes_kexpand1 %f0, %f2, $i, %f0 + std %f2, [$out + `16*$i+8`] + aes_kexpand2 %f2, %f0, %f2 +___ +} +$code.=<<___; + std %f0, [$out + `16*$i+0`] + std %f2, [$out + `16*$i+8`] + + mov 10, $tmp + st $tmp, [$out + 240] + retl + xor %o0, %o0, %o0 +.type aes_t4_set_encrypt_key,#function +.size aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key + +.globl aes_t4_set_decrypt_key +.align 32 +aes_t4_set_decrypt_key: + mov %o7, %o5 + call .Lset_encrypt_key + nop + + mov %o5, %o7 + sll $tmp, 4, $inp ! $tmp is number of rounds + add $tmp, 2, $tmp + add $out, $inp, $inp ! $inp=$out+16*rounds + srl $tmp, 2, $tmp ! $tmp=(rounds+2)/4 + +.Lkey_flip: + ldd [$out + 0], %f0 + ldd [$out + 8], %f2 + ldd [$out + 16], %f4 + ldd [$out + 24], %f6 + ldd [$inp + 0], %f8 + ldd [$inp + 8], %f10 + ldd [$inp - 16], %f12 + ldd [$inp - 8], %f14 + sub $tmp, 1, $tmp + std %f0, [$inp + 0] + std %f2, [$inp + 8] + std %f4, [$inp - 16] + std %f6, [$inp - 8] + std %f8, [$out + 0] + std %f10, [$out + 8] + std %f12, [$out + 16] + std %f14, [$out + 24] + add $out, 32, $out + brnz $tmp, .Lkey_flip + sub $inp, 32, $inp + + retl + xor %o0, %o0, %o0 +.type aes_t4_set_decrypt_key,#function +.size aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key +___ +} + +{{{ +my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5)); +my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7)); + +$code.=<<___; +.align 32 +_aes128_encrypt_1x: +___ +for ($i=0; $i<4; $i++) { + $code.=<<___; + aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4 + aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2 + aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0 + aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2 +___ +} +$code.=<<___; + aes_eround01 %f48, %f0, %f2, %f4 + aes_eround23 %f50, %f0, %f2, %f2 + aes_eround01_l %f52, %f4, %f2, %f0 + retl + aes_eround23_l %f54, %f4, %f2, %f2 +.type _aes128_encrypt_1x,#function +.size _aes128_encrypt_1x,.-_aes128_encrypt_1x + +.align 32 +_aes128_encrypt_2x: +___ +for ($i=0; $i<4; $i++) { + $code.=<<___; + aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8 + aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2 + aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10 + aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6 + aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0 + aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2 + aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4 + aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6 +___ +} +$code.=<<___; + aes_eround01 %f48, %f0, %f2, %f8 + aes_eround23 %f50, %f0, %f2, %f2 + aes_eround01 %f48, %f4, %f6, %f10 + aes_eround23 %f50, %f4, %f6, %f6 + aes_eround01_l %f52, %f8, %f2, %f0 + aes_eround23_l %f54, %f8, %f2, %f2 + aes_eround01_l %f52, %f10, %f6, %f4 + retl + aes_eround23_l %f54, %f10, %f6, %f6 +.type _aes128_encrypt_2x,#function +.size _aes128_encrypt_2x,.-_aes128_encrypt_2x + +.align 32 +_aes128_loadkey: + ldx [$key + 0], %g4 + ldx [$key + 8], %g5 +___ +for ($i=2; $i<22;$i++) { # load key schedule + $code.=<<___; + ldd [$key + `8*$i`], %f`12+2*$i` +___ +} +$code.=<<___; + retl + nop +.type _aes128_loadkey,#function +.size _aes128_loadkey,.-_aes128_loadkey +_aes128_load_enckey=_aes128_loadkey +_aes128_load_deckey=_aes128_loadkey + +___ + +&alg_cbc_encrypt_implement("aes",128); +if ($::evp) { + &alg_ctr32_implement("aes",128); + &alg_xts_implement("aes",128,"en"); + &alg_xts_implement("aes",128,"de"); +} +&alg_cbc_decrypt_implement("aes",128); + +$code.=<<___; +.align 32 +_aes128_decrypt_1x: +___ +for ($i=0; $i<4; $i++) { + $code.=<<___; + aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4 + aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2 + aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0 + aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2 +___ +} +$code.=<<___; + aes_dround01 %f48, %f0, %f2, %f4 + aes_dround23 %f50, %f0, %f2, %f2 + aes_dround01_l %f52, %f4, %f2, %f0 + retl + aes_dround23_l %f54, %f4, %f2, %f2 +.type _aes128_decrypt_1x,#function +.size _aes128_decrypt_1x,.-_aes128_decrypt_1x + +.align 32 +_aes128_decrypt_2x: +___ +for ($i=0; $i<4; $i++) { + $code.=<<___; + aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8 + aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2 + aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10 + aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6 + aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0 + aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2 + aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4 + aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6 +___ +} +$code.=<<___; + aes_dround01 %f48, %f0, %f2, %f8 + aes_dround23 %f50, %f0, %f2, %f2 + aes_dround01 %f48, %f4, %f6, %f10 + aes_dround23 %f50, %f4, %f6, %f6 + aes_dround01_l %f52, %f8, %f2, %f0 + aes_dround23_l %f54, %f8, %f2, %f2 + aes_dround01_l %f52, %f10, %f6, %f4 + retl + aes_dround23_l %f54, %f10, %f6, %f6 +.type _aes128_decrypt_2x,#function +.size _aes128_decrypt_2x,.-_aes128_decrypt_2x +___ + +$code.=<<___; +.align 32 +_aes192_encrypt_1x: +___ +for ($i=0; $i<5; $i++) { + $code.=<<___; + aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4 + aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2 + aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0 + aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2 +___ +} +$code.=<<___; + aes_eround01 %f56, %f0, %f2, %f4 + aes_eround23 %f58, %f0, %f2, %f2 + aes_eround01_l %f60, %f4, %f2, %f0 + retl + aes_eround23_l %f62, %f4, %f2, %f2 +.type _aes192_encrypt_1x,#function +.size _aes192_encrypt_1x,.-_aes192_encrypt_1x + +.align 32 +_aes192_encrypt_2x: +___ +for ($i=0; $i<5; $i++) { + $code.=<<___; + aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8 + aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2 + aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10 + aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6 + aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0 + aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2 + aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4 + aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6 +___ +} +$code.=<<___; + aes_eround01 %f56, %f0, %f2, %f8 + aes_eround23 %f58, %f0, %f2, %f2 + aes_eround01 %f56, %f4, %f6, %f10 + aes_eround23 %f58, %f4, %f6, %f6 + aes_eround01_l %f60, %f8, %f2, %f0 + aes_eround23_l %f62, %f8, %f2, %f2 + aes_eround01_l %f60, %f10, %f6, %f4 + retl + aes_eround23_l %f62, %f10, %f6, %f6 +.type _aes192_encrypt_2x,#function +.size _aes192_encrypt_2x,.-_aes192_encrypt_2x + +.align 32 +_aes256_encrypt_1x: + aes_eround01 %f16, %f0, %f2, %f4 + aes_eround23 %f18, %f0, %f2, %f2 + ldd [$key + 208], %f16 + ldd [$key + 216], %f18 + aes_eround01 %f20, %f4, %f2, %f0 + aes_eround23 %f22, %f4, %f2, %f2 + ldd [$key + 224], %f20 + ldd [$key + 232], %f22 +___ +for ($i=1; $i<6; $i++) { + $code.=<<___; + aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4 + aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2 + aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0 + aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2 +___ +} +$code.=<<___; + aes_eround01 %f16, %f0, %f2, %f4 + aes_eround23 %f18, %f0, %f2, %f2 + ldd [$key + 16], %f16 + ldd [$key + 24], %f18 + aes_eround01_l %f20, %f4, %f2, %f0 + aes_eround23_l %f22, %f4, %f2, %f2 + ldd [$key + 32], %f20 + retl + ldd [$key + 40], %f22 +.type _aes256_encrypt_1x,#function +.size _aes256_encrypt_1x,.-_aes256_encrypt_1x + +.align 32 +_aes256_encrypt_2x: + aes_eround01 %f16, %f0, %f2, %f8 + aes_eround23 %f18, %f0, %f2, %f2 + aes_eround01 %f16, %f4, %f6, %f10 + aes_eround23 %f18, %f4, %f6, %f6 + ldd [$key + 208], %f16 + ldd [$key + 216], %f18 + aes_eround01 %f20, %f8, %f2, %f0 + aes_eround23 %f22, %f8, %f2, %f2 + aes_eround01 %f20, %f10, %f6, %f4 + aes_eround23 %f22, %f10, %f6, %f6 + ldd [$key + 224], %f20 + ldd [$key + 232], %f22 +___ +for ($i=1; $i<6; $i++) { + $code.=<<___; + aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8 + aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2 + aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10 + aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6 + aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0 + aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2 + aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4 + aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6 +___ +} +$code.=<<___; + aes_eround01 %f16, %f0, %f2, %f8 + aes_eround23 %f18, %f0, %f2, %f2 + aes_eround01 %f16, %f4, %f6, %f10 + aes_eround23 %f18, %f4, %f6, %f6 + ldd [$key + 16], %f16 + ldd [$key + 24], %f18 + aes_eround01_l %f20, %f8, %f2, %f0 + aes_eround23_l %f22, %f8, %f2, %f2 + aes_eround01_l %f20, %f10, %f6, %f4 + aes_eround23_l %f22, %f10, %f6, %f6 + ldd [$key + 32], %f20 + retl + ldd [$key + 40], %f22 +.type _aes256_encrypt_2x,#function +.size _aes256_encrypt_2x,.-_aes256_encrypt_2x + +.align 32 +_aes192_loadkey: + ldx [$key + 0], %g4 + ldx [$key + 8], %g5 +___ +for ($i=2; $i<26;$i++) { # load key schedule + $code.=<<___; + ldd [$key + `8*$i`], %f`12+2*$i` +___ +} +$code.=<<___; + retl + nop +.type _aes192_loadkey,#function +.size _aes192_loadkey,.-_aes192_loadkey +_aes256_loadkey=_aes192_loadkey +_aes192_load_enckey=_aes192_loadkey +_aes192_load_deckey=_aes192_loadkey +_aes256_load_enckey=_aes192_loadkey +_aes256_load_deckey=_aes192_loadkey +___ + +&alg_cbc_encrypt_implement("aes",256); +&alg_cbc_encrypt_implement("aes",192); +if ($::evp) { + &alg_ctr32_implement("aes",256); + &alg_xts_implement("aes",256,"en"); + &alg_xts_implement("aes",256,"de"); + &alg_ctr32_implement("aes",192); +} +&alg_cbc_decrypt_implement("aes",192); +&alg_cbc_decrypt_implement("aes",256); + +$code.=<<___; +.align 32 +_aes256_decrypt_1x: + aes_dround01 %f16, %f0, %f2, %f4 + aes_dround23 %f18, %f0, %f2, %f2 + ldd [$key + 208], %f16 + ldd [$key + 216], %f18 + aes_dround01 %f20, %f4, %f2, %f0 + aes_dround23 %f22, %f4, %f2, %f2 + ldd [$key + 224], %f20 + ldd [$key + 232], %f22 +___ +for ($i=1; $i<6; $i++) { + $code.=<<___; + aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4 + aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2 + aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0 + aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2 +___ +} +$code.=<<___; + aes_dround01 %f16, %f0, %f2, %f4 + aes_dround23 %f18, %f0, %f2, %f2 + ldd [$key + 16], %f16 + ldd [$key + 24], %f18 + aes_dround01_l %f20, %f4, %f2, %f0 + aes_dround23_l %f22, %f4, %f2, %f2 + ldd [$key + 32], %f20 + retl + ldd [$key + 40], %f22 +.type _aes256_decrypt_1x,#function +.size _aes256_decrypt_1x,.-_aes256_decrypt_1x + +.align 32 +_aes256_decrypt_2x: + aes_dround01 %f16, %f0, %f2, %f8 + aes_dround23 %f18, %f0, %f2, %f2 + aes_dround01 %f16, %f4, %f6, %f10 + aes_dround23 %f18, %f4, %f6, %f6 + ldd [$key + 208], %f16 + ldd [$key + 216], %f18 + aes_dround01 %f20, %f8, %f2, %f0 + aes_dround23 %f22, %f8, %f2, %f2 + aes_dround01 %f20, %f10, %f6, %f4 + aes_dround23 %f22, %f10, %f6, %f6 + ldd [$key + 224], %f20 + ldd [$key + 232], %f22 +___ +for ($i=1; $i<6; $i++) { + $code.=<<___; + aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8 + aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2 + aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10 + aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6 + aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0 + aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2 + aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4 + aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6 +___ +} +$code.=<<___; + aes_dround01 %f16, %f0, %f2, %f8 + aes_dround23 %f18, %f0, %f2, %f2 + aes_dround01 %f16, %f4, %f6, %f10 + aes_dround23 %f18, %f4, %f6, %f6 + ldd [$key + 16], %f16 + ldd [$key + 24], %f18 + aes_dround01_l %f20, %f8, %f2, %f0 + aes_dround23_l %f22, %f8, %f2, %f2 + aes_dround01_l %f20, %f10, %f6, %f4 + aes_dround23_l %f22, %f10, %f6, %f6 + ldd [$key + 32], %f20 + retl + ldd [$key + 40], %f22 +.type _aes256_decrypt_2x,#function +.size _aes256_decrypt_2x,.-_aes256_decrypt_2x + +.align 32 +_aes192_decrypt_1x: +___ +for ($i=0; $i<5; $i++) { + $code.=<<___; + aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4 + aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2 + aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0 + aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2 +___ +} +$code.=<<___; + aes_dround01 %f56, %f0, %f2, %f4 + aes_dround23 %f58, %f0, %f2, %f2 + aes_dround01_l %f60, %f4, %f2, %f0 + retl + aes_dround23_l %f62, %f4, %f2, %f2 +.type _aes192_decrypt_1x,#function +.size _aes192_decrypt_1x,.-_aes192_decrypt_1x + +.align 32 +_aes192_decrypt_2x: +___ +for ($i=0; $i<5; $i++) { + $code.=<<___; + aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8 + aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2 + aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10 + aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6 + aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0 + aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2 + aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4 + aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6 +___ +} +$code.=<<___; + aes_dround01 %f56, %f0, %f2, %f8 + aes_dround23 %f58, %f0, %f2, %f2 + aes_dround01 %f56, %f4, %f6, %f10 + aes_dround23 %f58, %f4, %f6, %f6 + aes_dround01_l %f60, %f8, %f2, %f0 + aes_dround23_l %f62, %f8, %f2, %f2 + aes_dround01_l %f60, %f10, %f6, %f4 + retl + aes_dround23_l %f62, %f10, %f6, %f6 +.type _aes192_decrypt_2x,#function +.size _aes192_decrypt_2x,.-_aes192_decrypt_2x +___ +}}} + +if (!$::evp) { +$code.=<<___; +.global AES_encrypt +AES_encrypt=aes_t4_encrypt +.global AES_decrypt +AES_decrypt=aes_t4_decrypt +.global AES_set_encrypt_key +.align 32 +AES_set_encrypt_key: + andcc %o2, 7, %g0 ! check alignment + bnz,a,pn %icc, 1f + mov -1, %o0 + brz,a,pn %o0, 1f + mov -1, %o0 + brz,a,pn %o2, 1f + mov -1, %o0 + andncc %o1, 0x1c0, %g0 + bnz,a,pn %icc, 1f + mov -2, %o0 + cmp %o1, 128 + bl,a,pn %icc, 1f + mov -2, %o0 + b aes_t4_set_encrypt_key + nop +1: retl + nop +.type AES_set_encrypt_key,#function +.size AES_set_encrypt_key,.-AES_set_encrypt_key + +.global AES_set_decrypt_key +.align 32 +AES_set_decrypt_key: + andcc %o2, 7, %g0 ! check alignment + bnz,a,pn %icc, 1f + mov -1, %o0 + brz,a,pn %o0, 1f + mov -1, %o0 + brz,a,pn %o2, 1f + mov -1, %o0 + andncc %o1, 0x1c0, %g0 + bnz,a,pn %icc, 1f + mov -2, %o0 + cmp %o1, 128 + bl,a,pn %icc, 1f + mov -2, %o0 + b aes_t4_set_decrypt_key + nop +1: retl + nop +.type AES_set_decrypt_key,#function +.size AES_set_decrypt_key,.-AES_set_decrypt_key +___ + +my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5)); + +$code.=<<___; +.globl AES_cbc_encrypt +.align 32 +AES_cbc_encrypt: + ld [$key + 240], %g1 + nop + brz $enc, .Lcbc_decrypt + cmp %g1, 12 + + bl,pt %icc, aes128_t4_cbc_encrypt + nop + be,pn %icc, aes192_t4_cbc_encrypt + nop + ba aes256_t4_cbc_encrypt + nop + +.Lcbc_decrypt: + bl,pt %icc, aes128_t4_cbc_decrypt + nop + be,pn %icc, aes192_t4_cbc_decrypt + nop + ba aes256_t4_cbc_decrypt + nop +.type AES_cbc_encrypt,#function +.size AES_cbc_encrypt,.-AES_cbc_encrypt +___ +} +$code.=<<___; +.asciz "AES for SPARC T4, David S. Miller, Andy Polyakov" +.align 4 +___ + +&emit_assembler(); + +close STDOUT; diff --git a/openssl/crypto/aes/asm/aesv8-armx.pl b/openssl/crypto/aes/asm/aesv8-armx.pl new file mode 100755 index 000000000..1e93f8685 --- /dev/null +++ b/openssl/crypto/aes/asm/aesv8-armx.pl @@ -0,0 +1,962 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements support for ARMv8 AES instructions. The +# module is endian-agnostic in sense that it supports both big- and +# little-endian cases. As does it support both 32- and 64-bit modes +# of operation. Latter is achieved by limiting amount of utilized +# registers to 16, which implies additional NEON load and integer +# instructions. This has no effect on mighty Apple A7, where results +# are literally equal to the theoretical estimates based on AES +# instruction latencies and issue rates. On Cortex-A53, an in-order +# execution core, this costs up to 10-15%, which is partially +# compensated by implementing dedicated code path for 128-bit +# CBC encrypt case. On Cortex-A57 parallelizable mode performance +# seems to be limited by sheer amount of NEON instructions... +# +# Performance in cycles per byte processed with 128-bit key: +# +# CBC enc CBC dec CTR +# Apple A7 2.39 1.20 1.20 +# Cortex-A53 2.45 1.87 1.94 +# Cortex-A57 3.64 1.34 1.32 + +$flavour = shift; +open STDOUT,">".shift; + +$prefix="aes_v8"; + +$code=<<___; +#include "arm_arch.h" + +#if __ARM_MAX_ARCH__>=7 +.text +___ +$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/); +$code.=".arch armv7-a\n.fpu neon\n.code 32\n" if ($flavour !~ /64/); + #^^^^^^ this is done to simplify adoption by not depending + # on latest binutils. + +# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax, +# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to +# maintain both 32- and 64-bit codes within single module and +# transliterate common code to either flavour with regex vodoo. +# +{{{ +my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12"); +my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)= + $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10)); + + +$code.=<<___; +.align 5 +rcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.globl ${prefix}_set_encrypt_key +.type ${prefix}_set_encrypt_key,%function +.align 5 +${prefix}_set_encrypt_key: +.Lenc_key: +___ +$code.=<<___ if ($flavour =~ /64/); + stp x29,x30,[sp,#-16]! + add x29,sp,#0 +___ +$code.=<<___; + mov $ptr,#-1 + cmp $inp,#0 + b.eq .Lenc_key_abort + cmp $out,#0 + b.eq .Lenc_key_abort + mov $ptr,#-2 + cmp $bits,#128 + b.lt .Lenc_key_abort + cmp $bits,#256 + b.gt .Lenc_key_abort + tst $bits,#0x3f + b.ne .Lenc_key_abort + + adr $ptr,rcon + cmp $bits,#192 + + veor $zero,$zero,$zero + vld1.8 {$in0},[$inp],#16 + mov $bits,#8 // reuse $bits + vld1.32 {$rcon,$mask},[$ptr],#32 + + b.lt .Loop128 + b.eq .L192 + b .L256 + +.align 4 +.Loop128: + vtbl.8 $key,{$in0},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in0},[$out],#16 + aese $key,$zero + subs $bits,$bits,#1 + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $key,$key,$rcon + veor $in0,$in0,$tmp + vshl.u8 $rcon,$rcon,#1 + veor $in0,$in0,$key + b.ne .Loop128 + + vld1.32 {$rcon},[$ptr] + + vtbl.8 $key,{$in0},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in0},[$out],#16 + aese $key,$zero + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $key,$key,$rcon + veor $in0,$in0,$tmp + vshl.u8 $rcon,$rcon,#1 + veor $in0,$in0,$key + + vtbl.8 $key,{$in0},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in0},[$out],#16 + aese $key,$zero + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $key,$key,$rcon + veor $in0,$in0,$tmp + veor $in0,$in0,$key + vst1.32 {$in0},[$out] + add $out,$out,#0x50 + + mov $rounds,#10 + b .Ldone + +.align 4 +.L192: + vld1.8 {$in1},[$inp],#8 + vmov.i8 $key,#8 // borrow $key + vst1.32 {$in0},[$out],#16 + vsub.i8 $mask,$mask,$key // adjust the mask + +.Loop192: + vtbl.8 $key,{$in1},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in1},[$out],#8 + aese $key,$zero + subs $bits,$bits,#1 + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + + vdup.32 $tmp,${in0}[3] + veor $tmp,$tmp,$in1 + veor $key,$key,$rcon + vext.8 $in1,$zero,$in1,#12 + vshl.u8 $rcon,$rcon,#1 + veor $in1,$in1,$tmp + veor $in0,$in0,$key + veor $in1,$in1,$key + vst1.32 {$in0},[$out],#16 + b.ne .Loop192 + + mov $rounds,#12 + add $out,$out,#0x20 + b .Ldone + +.align 4 +.L256: + vld1.8 {$in1},[$inp] + mov $bits,#7 + mov $rounds,#14 + vst1.32 {$in0},[$out],#16 + +.Loop256: + vtbl.8 $key,{$in1},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in1},[$out],#16 + aese $key,$zero + subs $bits,$bits,#1 + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $key,$key,$rcon + veor $in0,$in0,$tmp + vshl.u8 $rcon,$rcon,#1 + veor $in0,$in0,$key + vst1.32 {$in0},[$out],#16 + b.eq .Ldone + + vdup.32 $key,${in0}[3] // just splat + vext.8 $tmp,$zero,$in1,#12 + aese $key,$zero + + veor $in1,$in1,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in1,$in1,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in1,$in1,$tmp + + veor $in1,$in1,$key + b .Loop256 + +.Ldone: + str $rounds,[$out] + mov $ptr,#0 + +.Lenc_key_abort: + mov x0,$ptr // return value + `"ldr x29,[sp],#16" if ($flavour =~ /64/)` + ret +.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key + +.globl ${prefix}_set_decrypt_key +.type ${prefix}_set_decrypt_key,%function +.align 5 +${prefix}_set_decrypt_key: +___ +$code.=<<___ if ($flavour =~ /64/); + stp x29,x30,[sp,#-16]! + add x29,sp,#0 +___ +$code.=<<___ if ($flavour !~ /64/); + stmdb sp!,{r4,lr} +___ +$code.=<<___; + bl .Lenc_key + + cmp x0,#0 + b.ne .Ldec_key_abort + + sub $out,$out,#240 // restore original $out + mov x4,#-16 + add $inp,$out,x12,lsl#4 // end of key schedule + + vld1.32 {v0.16b},[$out] + vld1.32 {v1.16b},[$inp] + vst1.32 {v0.16b},[$inp],x4 + vst1.32 {v1.16b},[$out],#16 + +.Loop_imc: + vld1.32 {v0.16b},[$out] + vld1.32 {v1.16b},[$inp] + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + vst1.32 {v0.16b},[$inp],x4 + vst1.32 {v1.16b},[$out],#16 + cmp $inp,$out + b.hi .Loop_imc + + vld1.32 {v0.16b},[$out] + aesimc v0.16b,v0.16b + vst1.32 {v0.16b},[$inp] + + eor x0,x0,x0 // return value +.Ldec_key_abort: +___ +$code.=<<___ if ($flavour !~ /64/); + ldmia sp!,{r4,pc} +___ +$code.=<<___ if ($flavour =~ /64/); + ldp x29,x30,[sp],#16 + ret +___ +$code.=<<___; +.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key +___ +}}} +{{{ +sub gen_block () { +my $dir = shift; +my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc"); +my ($inp,$out,$key)=map("x$_",(0..2)); +my $rounds="w3"; +my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3)); + +$code.=<<___; +.globl ${prefix}_${dir}crypt +.type ${prefix}_${dir}crypt,%function +.align 5 +${prefix}_${dir}crypt: + ldr $rounds,[$key,#240] + vld1.32 {$rndkey0},[$key],#16 + vld1.8 {$inout},[$inp] + sub $rounds,$rounds,#2 + vld1.32 {$rndkey1},[$key],#16 + +.Loop_${dir}c: + aes$e $inout,$rndkey0 + vld1.32 {$rndkey0},[$key],#16 + aes$mc $inout,$inout + subs $rounds,$rounds,#2 + aes$e $inout,$rndkey1 + vld1.32 {$rndkey1},[$key],#16 + aes$mc $inout,$inout + b.gt .Loop_${dir}c + + aes$e $inout,$rndkey0 + vld1.32 {$rndkey0},[$key] + aes$mc $inout,$inout + aes$e $inout,$rndkey1 + veor $inout,$inout,$rndkey0 + + vst1.8 {$inout},[$out] + ret +.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt +___ +} +&gen_block("en"); +&gen_block("de"); +}}} +{{{ +my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5"; +my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12"); +my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); + +my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); + +### q8-q15 preloaded key schedule + +$code.=<<___; +.globl ${prefix}_cbc_encrypt +.type ${prefix}_cbc_encrypt,%function +.align 5 +${prefix}_cbc_encrypt: +___ +$code.=<<___ if ($flavour =~ /64/); + stp x29,x30,[sp,#-16]! + add x29,sp,#0 +___ +$code.=<<___ if ($flavour !~ /64/); + mov ip,sp + stmdb sp!,{r4-r8,lr} + vstmdb sp!,{d8-d15} @ ABI specification says so + ldmia ip,{r4-r5} @ load remaining args +___ +$code.=<<___; + subs $len,$len,#16 + mov $step,#16 + b.lo .Lcbc_abort + cclr $step,eq + + cmp $enc,#0 // en- or decrypting? + ldr $rounds,[$key,#240] + and $len,$len,#-16 + vld1.8 {$ivec},[$ivp] + vld1.8 {$dat},[$inp],$step + + vld1.32 {q8-q9},[$key] // load key schedule... + sub $rounds,$rounds,#6 + add $key_,$key,x5,lsl#4 // pointer to last 7 round keys + sub $rounds,$rounds,#2 + vld1.32 {q10-q11},[$key_],#32 + vld1.32 {q12-q13},[$key_],#32 + vld1.32 {q14-q15},[$key_],#32 + vld1.32 {$rndlast},[$key_] + + add $key_,$key,#32 + mov $cnt,$rounds + b.eq .Lcbc_dec + + cmp $rounds,#2 + veor $dat,$dat,$ivec + veor $rndzero_n_last,q8,$rndlast + b.eq .Lcbc_enc128 + +.Loop_cbc_enc: + aese $dat,q8 + vld1.32 {q8},[$key_],#16 + aesmc $dat,$dat + subs $cnt,$cnt,#2 + aese $dat,q9 + vld1.32 {q9},[$key_],#16 + aesmc $dat,$dat + b.gt .Loop_cbc_enc + + aese $dat,q8 + aesmc $dat,$dat + subs $len,$len,#16 + aese $dat,q9 + aesmc $dat,$dat + cclr $step,eq + aese $dat,q10 + aesmc $dat,$dat + add $key_,$key,#16 + aese $dat,q11 + aesmc $dat,$dat + vld1.8 {q8},[$inp],$step + aese $dat,q12 + aesmc $dat,$dat + veor q8,q8,$rndzero_n_last + aese $dat,q13 + aesmc $dat,$dat + vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] + aese $dat,q14 + aesmc $dat,$dat + aese $dat,q15 + + mov $cnt,$rounds + veor $ivec,$dat,$rndlast + vst1.8 {$ivec},[$out],#16 + b.hs .Loop_cbc_enc + + b .Lcbc_done + +.align 5 +.Lcbc_enc128: + vld1.32 {$in0-$in1},[$key_] + aese $dat,q8 + aesmc $dat,$dat + b .Lenter_cbc_enc128 +.Loop_cbc_enc128: + aese $dat,q8 + aesmc $dat,$dat + vst1.8 {$ivec},[$out],#16 +.Lenter_cbc_enc128: + aese $dat,q9 + aesmc $dat,$dat + subs $len,$len,#16 + aese $dat,$in0 + aesmc $dat,$dat + cclr $step,eq + aese $dat,$in1 + aesmc $dat,$dat + aese $dat,q10 + aesmc $dat,$dat + aese $dat,q11 + aesmc $dat,$dat + vld1.8 {q8},[$inp],$step + aese $dat,q12 + aesmc $dat,$dat + aese $dat,q13 + aesmc $dat,$dat + aese $dat,q14 + aesmc $dat,$dat + veor q8,q8,$rndzero_n_last + aese $dat,q15 + veor $ivec,$dat,$rndlast + b.hs .Loop_cbc_enc128 + + vst1.8 {$ivec},[$out],#16 + b .Lcbc_done +___ +{ +my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); +$code.=<<___; +.align 5 +.Lcbc_dec: + vld1.8 {$dat2},[$inp],#16 + subs $len,$len,#32 // bias + add $cnt,$rounds,#2 + vorr $in1,$dat,$dat + vorr $dat1,$dat,$dat + vorr $in2,$dat2,$dat2 + b.lo .Lcbc_dec_tail + + vorr $dat1,$dat2,$dat2 + vld1.8 {$dat2},[$inp],#16 + vorr $in0,$dat,$dat + vorr $in1,$dat1,$dat1 + vorr $in2,$dat2,$dat2 + +.Loop3x_cbc_dec: + aesd $dat0,q8 + aesd $dat1,q8 + aesd $dat2,q8 + vld1.32 {q8},[$key_],#16 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + subs $cnt,$cnt,#2 + aesd $dat0,q9 + aesd $dat1,q9 + aesd $dat2,q9 + vld1.32 {q9},[$key_],#16 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + b.gt .Loop3x_cbc_dec + + aesd $dat0,q8 + aesd $dat1,q8 + aesd $dat2,q8 + veor $tmp0,$ivec,$rndlast + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + veor $tmp1,$in0,$rndlast + aesd $dat0,q9 + aesd $dat1,q9 + aesd $dat2,q9 + veor $tmp2,$in1,$rndlast + subs $len,$len,#0x30 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + vorr $ivec,$in2,$in2 + mov.lo x6,$len // x6, $cnt, is zero at this point + aesd $dat0,q12 + aesd $dat1,q12 + aesd $dat2,q12 + add $inp,$inp,x6 // $inp is adjusted in such way that + // at exit from the loop $dat1-$dat2 + // are loaded with last "words" + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + mov $key_,$key + aesd $dat0,q13 + aesd $dat1,q13 + aesd $dat2,q13 + vld1.8 {$in0},[$inp],#16 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + vld1.8 {$in1},[$inp],#16 + aesd $dat0,q14 + aesd $dat1,q14 + aesd $dat2,q14 + vld1.8 {$in2},[$inp],#16 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] + aesd $dat0,q15 + aesd $dat1,q15 + aesd $dat2,q15 + + add $cnt,$rounds,#2 + veor $tmp0,$tmp0,$dat0 + veor $tmp1,$tmp1,$dat1 + veor $dat2,$dat2,$tmp2 + vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] + vorr $dat0,$in0,$in0 + vst1.8 {$tmp0},[$out],#16 + vorr $dat1,$in1,$in1 + vst1.8 {$tmp1},[$out],#16 + vst1.8 {$dat2},[$out],#16 + vorr $dat2,$in2,$in2 + b.hs .Loop3x_cbc_dec + + cmn $len,#0x30 + b.eq .Lcbc_done + nop + +.Lcbc_dec_tail: + aesd $dat1,q8 + aesd $dat2,q8 + vld1.32 {q8},[$key_],#16 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + subs $cnt,$cnt,#2 + aesd $dat1,q9 + aesd $dat2,q9 + vld1.32 {q9},[$key_],#16 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + b.gt .Lcbc_dec_tail + + aesd $dat1,q8 + aesd $dat2,q8 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + aesd $dat1,q9 + aesd $dat2,q9 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + aesd $dat1,q12 + aesd $dat2,q12 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + cmn $len,#0x20 + aesd $dat1,q13 + aesd $dat2,q13 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + veor $tmp1,$ivec,$rndlast + aesd $dat1,q14 + aesd $dat2,q14 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + veor $tmp2,$in1,$rndlast + aesd $dat1,q15 + aesd $dat2,q15 + b.eq .Lcbc_dec_one + veor $tmp1,$tmp1,$dat1 + veor $tmp2,$tmp2,$dat2 + vorr $ivec,$in2,$in2 + vst1.8 {$tmp1},[$out],#16 + vst1.8 {$tmp2},[$out],#16 + b .Lcbc_done + +.Lcbc_dec_one: + veor $tmp1,$tmp1,$dat2 + vorr $ivec,$in2,$in2 + vst1.8 {$tmp1},[$out],#16 + +.Lcbc_done: + vst1.8 {$ivec},[$ivp] +.Lcbc_abort: +___ +} +$code.=<<___ if ($flavour !~ /64/); + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r8,pc} +___ +$code.=<<___ if ($flavour =~ /64/); + ldr x29,[sp],#16 + ret +___ +$code.=<<___; +.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt +___ +}}} +{{{ +my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); +my ($rounds,$cnt,$key_)=("w5","w6","x7"); +my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12)); +my $step="x12"; # aliases with $tctr2 + +my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); +my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); + +my ($dat,$tmp)=($dat0,$tmp0); + +### q8-q15 preloaded key schedule + +$code.=<<___; +.globl ${prefix}_ctr32_encrypt_blocks +.type ${prefix}_ctr32_encrypt_blocks,%function +.align 5 +${prefix}_ctr32_encrypt_blocks: +___ +$code.=<<___ if ($flavour =~ /64/); + stp x29,x30,[sp,#-16]! + add x29,sp,#0 +___ +$code.=<<___ if ($flavour !~ /64/); + mov ip,sp + stmdb sp!,{r4-r10,lr} + vstmdb sp!,{d8-d15} @ ABI specification says so + ldr r4, [ip] @ load remaining arg +___ +$code.=<<___; + ldr $rounds,[$key,#240] + + ldr $ctr, [$ivp, #12] + vld1.32 {$dat0},[$ivp] + + vld1.32 {q8-q9},[$key] // load key schedule... + sub $rounds,$rounds,#4 + mov $step,#16 + cmp $len,#2 + add $key_,$key,x5,lsl#4 // pointer to last 5 round keys + sub $rounds,$rounds,#2 + vld1.32 {q12-q13},[$key_],#32 + vld1.32 {q14-q15},[$key_],#32 + vld1.32 {$rndlast},[$key_] + add $key_,$key,#32 + mov $cnt,$rounds + cclr $step,lo +#ifndef __ARMEB__ + rev $ctr, $ctr +#endif + vorr $dat1,$dat0,$dat0 + add $tctr1, $ctr, #1 + vorr $dat2,$dat0,$dat0 + add $ctr, $ctr, #2 + vorr $ivec,$dat0,$dat0 + rev $tctr1, $tctr1 + vmov.32 ${dat1}[3],$tctr1 + b.ls .Lctr32_tail + rev $tctr2, $ctr + sub $len,$len,#3 // bias + vmov.32 ${dat2}[3],$tctr2 + b .Loop3x_ctr32 + +.align 4 +.Loop3x_ctr32: + aese $dat0,q8 + aese $dat1,q8 + aese $dat2,q8 + vld1.32 {q8},[$key_],#16 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + aesmc $dat2,$dat2 + subs $cnt,$cnt,#2 + aese $dat0,q9 + aese $dat1,q9 + aese $dat2,q9 + vld1.32 {q9},[$key_],#16 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + aesmc $dat2,$dat2 + b.gt .Loop3x_ctr32 + + aese $dat0,q8 + aese $dat1,q8 + aese $dat2,q8 + mov $key_,$key + aesmc $tmp0,$dat0 + vld1.8 {$in0},[$inp],#16 + aesmc $tmp1,$dat1 + aesmc $dat2,$dat2 + vorr $dat0,$ivec,$ivec + aese $tmp0,q9 + vld1.8 {$in1},[$inp],#16 + aese $tmp1,q9 + aese $dat2,q9 + vorr $dat1,$ivec,$ivec + aesmc $tmp0,$tmp0 + vld1.8 {$in2},[$inp],#16 + aesmc $tmp1,$tmp1 + aesmc $tmp2,$dat2 + vorr $dat2,$ivec,$ivec + add $tctr0,$ctr,#1 + aese $tmp0,q12 + aese $tmp1,q12 + aese $tmp2,q12 + veor $in0,$in0,$rndlast + add $tctr1,$ctr,#2 + aesmc $tmp0,$tmp0 + aesmc $tmp1,$tmp1 + aesmc $tmp2,$tmp2 + veor $in1,$in1,$rndlast + add $ctr,$ctr,#3 + aese $tmp0,q13 + aese $tmp1,q13 + aese $tmp2,q13 + veor $in2,$in2,$rndlast + rev $tctr0,$tctr0 + aesmc $tmp0,$tmp0 + vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] + aesmc $tmp1,$tmp1 + aesmc $tmp2,$tmp2 + vmov.32 ${dat0}[3], $tctr0 + rev $tctr1,$tctr1 + aese $tmp0,q14 + aese $tmp1,q14 + aese $tmp2,q14 + vmov.32 ${dat1}[3], $tctr1 + rev $tctr2,$ctr + aesmc $tmp0,$tmp0 + aesmc $tmp1,$tmp1 + aesmc $tmp2,$tmp2 + vmov.32 ${dat2}[3], $tctr2 + subs $len,$len,#3 + aese $tmp0,q15 + aese $tmp1,q15 + aese $tmp2,q15 + + mov $cnt,$rounds + veor $in0,$in0,$tmp0 + veor $in1,$in1,$tmp1 + veor $in2,$in2,$tmp2 + vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] + vst1.8 {$in0},[$out],#16 + vst1.8 {$in1},[$out],#16 + vst1.8 {$in2},[$out],#16 + b.hs .Loop3x_ctr32 + + adds $len,$len,#3 + b.eq .Lctr32_done + cmp $len,#1 + mov $step,#16 + cclr $step,eq + +.Lctr32_tail: + aese $dat0,q8 + aese $dat1,q8 + vld1.32 {q8},[$key_],#16 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + subs $cnt,$cnt,#2 + aese $dat0,q9 + aese $dat1,q9 + vld1.32 {q9},[$key_],#16 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + b.gt .Lctr32_tail + + aese $dat0,q8 + aese $dat1,q8 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + aese $dat0,q9 + aese $dat1,q9 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + vld1.8 {$in0},[$inp],$step + aese $dat0,q12 + aese $dat1,q12 + vld1.8 {$in1},[$inp] + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + aese $dat0,q13 + aese $dat1,q13 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + aese $dat0,q14 + aese $dat1,q14 + veor $in0,$in0,$rndlast + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + veor $in1,$in1,$rndlast + aese $dat0,q15 + aese $dat1,q15 + + cmp $len,#1 + veor $in0,$in0,$dat0 + veor $in1,$in1,$dat1 + vst1.8 {$in0},[$out],#16 + b.eq .Lctr32_done + vst1.8 {$in1},[$out] + +.Lctr32_done: +___ +$code.=<<___ if ($flavour !~ /64/); + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r10,pc} +___ +$code.=<<___ if ($flavour =~ /64/); + ldr x29,[sp],#16 + ret +___ +$code.=<<___; +.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks +___ +}}} +$code.=<<___; +#endif +___ +######################################## +if ($flavour =~ /64/) { ######## 64-bit code + my %opcode = ( + "aesd" => 0x4e285800, "aese" => 0x4e284800, + "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 ); + + local *unaes = sub { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5), + $mnemonic,$arg; + }; + + foreach(split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/geo; + + s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers + s/@\s/\/\//o; # old->new style commentary + + #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or + s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or + s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or + s/vmov\.i8/movi/o or # fix up legacy mnemonics + s/vext\.8/ext/o or + s/vrev32\.8/rev32/o or + s/vtst\.8/cmtst/o or + s/vshr/ushr/o or + s/^(\s+)v/$1/o or # strip off v prefix + s/\bbx\s+lr\b/ret/o; + + # fix up remainig legacy suffixes + s/\.[ui]?8//o; + m/\],#8/o and s/\.16b/\.8b/go; + s/\.[ui]?32//o and s/\.16b/\.4s/go; + s/\.[ui]?64//o and s/\.16b/\.2d/go; + s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; + + print $_,"\n"; + } +} else { ######## 32-bit code + my %opcode = ( + "aesd" => 0xf3b00340, "aese" => 0xf3b00300, + "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 ); + + local *unaes = sub { + my ($mnemonic,$arg)=@_; + + if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) { + my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) + |(($2&7)<<1) |(($2&8)<<2); + # since ARMv7 instructions are always encoded little-endian. + # correct solution is to use .inst directive, but older + # assemblers don't implement it:-( + sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", + $word&0xff,($word>>8)&0xff, + ($word>>16)&0xff,($word>>24)&0xff, + $mnemonic,$arg; + } + }; + + sub unvtbl { + my $arg=shift; + + $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o && + sprintf "vtbl.8 d%d,{q%d},d%d\n\t". + "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1; + } + + sub unvdup32 { + my $arg=shift; + + $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && + sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; + } + + sub unvmov32 { + my $arg=shift; + + $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o && + sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3; + } + + foreach(split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/geo; + + s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers + s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers + s/\/\/\s?/@ /o; # new->old style commentary + + # fix up remainig new-style suffixes + s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or + s/\],#[0-9]+/]!/o; + + s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or + s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or + s/vtbl\.8\s+(.*)/unvtbl($1)/geo or + s/vdup\.32\s+(.*)/unvdup32($1)/geo or + s/vmov\.32\s+(.*)/unvmov32($1)/geo or + s/^(\s+)b\./$1b/o or + s/^(\s+)mov\./$1mov/o or + s/^(\s+)ret/$1bx\tlr/o; + + print $_,"\n"; + } +} + +close STDOUT; diff --git a/openssl/crypto/aes/asm/bsaes-armv7.pl b/openssl/crypto/aes/asm/bsaes-armv7.pl new file mode 100755 index 000000000..fcc81d1a4 --- /dev/null +++ b/openssl/crypto/aes/asm/bsaes-armv7.pl @@ -0,0 +1,2469 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# +# Specific modes and adaptation for Linux kernel by Ard Biesheuvel +# . Permission to use under GPL terms is +# granted. +# ==================================================================== + +# Bit-sliced AES for ARM NEON +# +# February 2012. +# +# This implementation is direct adaptation of bsaes-x86_64 module for +# ARM NEON. Except that this module is endian-neutral [in sense that +# it can be compiled for either endianness] by courtesy of vld1.8's +# neutrality. Initial version doesn't implement interface to OpenSSL, +# only low-level primitives and unsupported entry points, just enough +# to collect performance results, which for Cortex-A8 core are: +# +# encrypt 19.5 cycles per byte processed with 128-bit key +# decrypt 22.1 cycles per byte processed with 128-bit key +# key conv. 440 cycles per 128-bit key/0.18 of 8x block +# +# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7, +# which is [much] worse than anticipated (for further details see +# http://www.openssl.org/~appro/Snapdragon-S4.html). +# +# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code +# manages in 20.0 cycles]. +# +# When comparing to x86_64 results keep in mind that NEON unit is +# [mostly] single-issue and thus can't [fully] benefit from +# instruction-level parallelism. And when comparing to aes-armv4 +# results keep in mind key schedule conversion overhead (see +# bsaes-x86_64.pl for further details)... +# +# + +# April-August 2013 +# +# Add CBC, CTR and XTS subroutines, adapt for kernel use. +# +# + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +my ($inp,$out,$len,$key)=("r0","r1","r2","r3"); +my @XMM=map("q$_",(0..15)); + +{ +my ($key,$rounds,$const)=("r4","r5","r6"); + +sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } +sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } + +sub Sbox { +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb +# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb +my @b=@_[0..7]; +my @t=@_[8..11]; +my @s=@_[12..15]; + &InBasisChange (@b); + &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); + &OutBasisChange (@b[7,1,4,2,6,5,0,3]); +} + +sub InBasisChange { +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb +# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb +my @b=@_[0..7]; +$code.=<<___; + veor @b[2], @b[2], @b[1] + veor @b[5], @b[5], @b[6] + veor @b[3], @b[3], @b[0] + veor @b[6], @b[6], @b[2] + veor @b[5], @b[5], @b[0] + + veor @b[6], @b[6], @b[3] + veor @b[3], @b[3], @b[7] + veor @b[7], @b[7], @b[5] + veor @b[3], @b[3], @b[4] + veor @b[4], @b[4], @b[5] + + veor @b[2], @b[2], @b[7] + veor @b[3], @b[3], @b[1] + veor @b[1], @b[1], @b[5] +___ +} + +sub OutBasisChange { +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb +# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb +my @b=@_[0..7]; +$code.=<<___; + veor @b[0], @b[0], @b[6] + veor @b[1], @b[1], @b[4] + veor @b[4], @b[4], @b[6] + veor @b[2], @b[2], @b[0] + veor @b[6], @b[6], @b[1] + + veor @b[1], @b[1], @b[5] + veor @b[5], @b[5], @b[3] + veor @b[3], @b[3], @b[7] + veor @b[7], @b[7], @b[5] + veor @b[2], @b[2], @b[5] + + veor @b[4], @b[4], @b[7] +___ +} + +sub InvSbox { +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb +# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb +my @b=@_[0..7]; +my @t=@_[8..11]; +my @s=@_[12..15]; + &InvInBasisChange (@b); + &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); + &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); +} + +sub InvInBasisChange { # OutBasisChange in reverse (with twist) +my @b=@_[5,1,2,6,3,7,0,4]; +$code.=<<___ + veor @b[1], @b[1], @b[7] + veor @b[4], @b[4], @b[7] + + veor @b[7], @b[7], @b[5] + veor @b[1], @b[1], @b[3] + veor @b[2], @b[2], @b[5] + veor @b[3], @b[3], @b[7] + + veor @b[6], @b[6], @b[1] + veor @b[2], @b[2], @b[0] + veor @b[5], @b[5], @b[3] + veor @b[4], @b[4], @b[6] + veor @b[0], @b[0], @b[6] + veor @b[1], @b[1], @b[4] +___ +} + +sub InvOutBasisChange { # InBasisChange in reverse +my @b=@_[2,5,7,3,6,1,0,4]; +$code.=<<___; + veor @b[1], @b[1], @b[5] + veor @b[2], @b[2], @b[7] + + veor @b[3], @b[3], @b[1] + veor @b[4], @b[4], @b[5] + veor @b[7], @b[7], @b[5] + veor @b[3], @b[3], @b[4] + veor @b[5], @b[5], @b[0] + veor @b[3], @b[3], @b[7] + veor @b[6], @b[6], @b[2] + veor @b[2], @b[2], @b[1] + veor @b[6], @b[6], @b[3] + + veor @b[3], @b[3], @b[0] + veor @b[5], @b[5], @b[6] +___ +} + +sub Mul_GF4 { +#;************************************************************* +#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * +#;************************************************************* +my ($x0,$x1,$y0,$y1,$t0,$t1)=@_; +$code.=<<___; + veor $t0, $y0, $y1 + vand $t0, $t0, $x0 + veor $x0, $x0, $x1 + vand $t1, $x1, $y0 + vand $x0, $x0, $y1 + veor $x1, $t1, $t0 + veor $x0, $x0, $t1 +___ +} + +sub Mul_GF4_N { # not used, see next subroutine +# multiply and scale by N +my ($x0,$x1,$y0,$y1,$t0)=@_; +$code.=<<___; + veor $t0, $y0, $y1 + vand $t0, $t0, $x0 + veor $x0, $x0, $x1 + vand $x1, $x1, $y0 + vand $x0, $x0, $y1 + veor $x1, $x1, $x0 + veor $x0, $x0, $t0 +___ +} + +sub Mul_GF4_N_GF4 { +# interleaved Mul_GF4_N and Mul_GF4 +my ($x0,$x1,$y0,$y1,$t0, + $x2,$x3,$y2,$y3,$t1)=@_; +$code.=<<___; + veor $t0, $y0, $y1 + veor $t1, $y2, $y3 + vand $t0, $t0, $x0 + vand $t1, $t1, $x2 + veor $x0, $x0, $x1 + veor $x2, $x2, $x3 + vand $x1, $x1, $y0 + vand $x3, $x3, $y2 + vand $x0, $x0, $y1 + vand $x2, $x2, $y3 + veor $x1, $x1, $x0 + veor $x2, $x2, $x3 + veor $x0, $x0, $t0 + veor $x3, $x3, $t1 +___ +} +sub Mul_GF16_2 { +my @x=@_[0..7]; +my @y=@_[8..11]; +my @t=@_[12..15]; +$code.=<<___; + veor @t[0], @x[0], @x[2] + veor @t[1], @x[1], @x[3] +___ + &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]); +$code.=<<___; + veor @y[0], @y[0], @y[2] + veor @y[1], @y[1], @y[3] +___ + Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], + @x[2], @x[3], @y[2], @y[3], @t[2]); +$code.=<<___; + veor @x[0], @x[0], @t[0] + veor @x[2], @x[2], @t[0] + veor @x[1], @x[1], @t[1] + veor @x[3], @x[3], @t[1] + + veor @t[0], @x[4], @x[6] + veor @t[1], @x[5], @x[7] +___ + &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], + @x[6], @x[7], @y[2], @y[3], @t[2]); +$code.=<<___; + veor @y[0], @y[0], @y[2] + veor @y[1], @y[1], @y[3] +___ + &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]); +$code.=<<___; + veor @x[4], @x[4], @t[0] + veor @x[6], @x[6], @t[0] + veor @x[5], @x[5], @t[1] + veor @x[7], @x[7], @t[1] +___ +} +sub Inv_GF256 { +#;******************************************************************** +#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * +#;******************************************************************** +my @x=@_[0..7]; +my @t=@_[8..11]; +my @s=@_[12..15]; +# direct optimizations from hardware +$code.=<<___; + veor @t[3], @x[4], @x[6] + veor @t[2], @x[5], @x[7] + veor @t[1], @x[1], @x[3] + veor @s[1], @x[7], @x[6] + vmov @t[0], @t[2] + veor @s[0], @x[0], @x[2] + + vorr @t[2], @t[2], @t[1] + veor @s[3], @t[3], @t[0] + vand @s[2], @t[3], @s[0] + vorr @t[3], @t[3], @s[0] + veor @s[0], @s[0], @t[1] + vand @t[0], @t[0], @t[1] + veor @t[1], @x[3], @x[2] + vand @s[3], @s[3], @s[0] + vand @s[1], @s[1], @t[1] + veor @t[1], @x[4], @x[5] + veor @s[0], @x[1], @x[0] + veor @t[3], @t[3], @s[1] + veor @t[2], @t[2], @s[1] + vand @s[1], @t[1], @s[0] + vorr @t[1], @t[1], @s[0] + veor @t[3], @t[3], @s[3] + veor @t[0], @t[0], @s[1] + veor @t[2], @t[2], @s[2] + veor @t[1], @t[1], @s[3] + veor @t[0], @t[0], @s[2] + vand @s[0], @x[7], @x[3] + veor @t[1], @t[1], @s[2] + vand @s[1], @x[6], @x[2] + vand @s[2], @x[5], @x[1] + vorr @s[3], @x[4], @x[0] + veor @t[3], @t[3], @s[0] + veor @t[1], @t[1], @s[2] + veor @t[0], @t[0], @s[3] + veor @t[2], @t[2], @s[1] + + @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 + + @ new smaller inversion + + vand @s[2], @t[3], @t[1] + vmov @s[0], @t[0] + + veor @s[1], @t[2], @s[2] + veor @s[3], @t[0], @s[2] + veor @s[2], @t[0], @s[2] @ @s[2]=@s[3] + + vbsl @s[1], @t[1], @t[0] + vbsl @s[3], @t[3], @t[2] + veor @t[3], @t[3], @t[2] + + vbsl @s[0], @s[1], @s[2] + vbsl @t[0], @s[2], @s[1] + + vand @s[2], @s[0], @s[3] + veor @t[1], @t[1], @t[0] + + veor @s[2], @s[2], @t[3] +___ +# output in s3, s2, s1, t1 + +# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 + +# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 + &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); + +### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb +} + +# AES linear components + +sub ShiftRows { +my @x=@_[0..7]; +my @t=@_[8..11]; +my $mask=pop; +$code.=<<___; + vldmia $key!, {@t[0]-@t[3]} + veor @t[0], @t[0], @x[0] + veor @t[1], @t[1], @x[1] + vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)` + vldmia $key!, {@t[0]} + veor @t[2], @t[2], @x[2] + vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)` + vldmia $key!, {@t[1]} + veor @t[3], @t[3], @x[3] + vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)` + vldmia $key!, {@t[2]} + vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)` + vldmia $key!, {@t[3]} + veor @t[0], @t[0], @x[4] + veor @t[1], @t[1], @x[5] + vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)` + veor @t[2], @t[2], @x[6] + vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)` + veor @t[3], @t[3], @x[7] + vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)` + vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)` +___ +} + +sub MixColumns { +# modified to emit output in order suitable for feeding back to aesenc[last] +my @x=@_[0..7]; +my @t=@_[8..15]; +my $inv=@_[16]; # optional +$code.=<<___; + vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32 + vext.8 @t[1], @x[1], @x[1], #12 + veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32) + vext.8 @t[2], @x[2], @x[2], #12 + veor @x[1], @x[1], @t[1] + vext.8 @t[3], @x[3], @x[3], #12 + veor @x[2], @x[2], @t[2] + vext.8 @t[4], @x[4], @x[4], #12 + veor @x[3], @x[3], @t[3] + vext.8 @t[5], @x[5], @x[5], #12 + veor @x[4], @x[4], @t[4] + vext.8 @t[6], @x[6], @x[6], #12 + veor @x[5], @x[5], @t[5] + vext.8 @t[7], @x[7], @x[7], #12 + veor @x[6], @x[6], @t[6] + + veor @t[1], @t[1], @x[0] + veor @x[7], @x[7], @t[7] + vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64) + veor @t[2], @t[2], @x[1] + veor @t[0], @t[0], @x[7] + veor @t[1], @t[1], @x[7] + vext.8 @x[1], @x[1], @x[1], #8 + veor @t[5], @t[5], @x[4] + veor @x[0], @x[0], @t[0] + veor @t[6], @t[6], @x[5] + veor @x[1], @x[1], @t[1] + vext.8 @t[0], @x[4], @x[4], #8 + veor @t[4], @t[4], @x[3] + vext.8 @t[1], @x[5], @x[5], #8 + veor @t[7], @t[7], @x[6] + vext.8 @x[4], @x[3], @x[3], #8 + veor @t[3], @t[3], @x[2] + vext.8 @x[5], @x[7], @x[7], #8 + veor @t[4], @t[4], @x[7] + vext.8 @x[3], @x[6], @x[6], #8 + veor @t[3], @t[3], @x[7] + vext.8 @x[6], @x[2], @x[2], #8 + veor @x[7], @t[1], @t[5] +___ +$code.=<<___ if (!$inv); + veor @x[2], @t[0], @t[4] + veor @x[4], @x[4], @t[3] + veor @x[5], @x[5], @t[7] + veor @x[3], @x[3], @t[6] + @ vmov @x[2], @t[0] + veor @x[6], @x[6], @t[2] + @ vmov @x[7], @t[1] +___ +$code.=<<___ if ($inv); + veor @t[3], @t[3], @x[4] + veor @x[5], @x[5], @t[7] + veor @x[2], @x[3], @t[6] + veor @x[3], @t[0], @t[4] + veor @x[4], @x[6], @t[2] + vmov @x[6], @t[3] + @ vmov @x[7], @t[1] +___ +} + +sub InvMixColumns_orig { +my @x=@_[0..7]; +my @t=@_[8..15]; + +$code.=<<___; + @ multiplication by 0x0e + vext.8 @t[7], @x[7], @x[7], #12 + vmov @t[2], @x[2] + veor @x[2], @x[2], @x[5] @ 2 5 + veor @x[7], @x[7], @x[5] @ 7 5 + vext.8 @t[0], @x[0], @x[0], #12 + vmov @t[5], @x[5] + veor @x[5], @x[5], @x[0] @ 5 0 [1] + veor @x[0], @x[0], @x[1] @ 0 1 + vext.8 @t[1], @x[1], @x[1], #12 + veor @x[1], @x[1], @x[2] @ 1 25 + veor @x[0], @x[0], @x[6] @ 01 6 [2] + vext.8 @t[3], @x[3], @x[3], #12 + veor @x[1], @x[1], @x[3] @ 125 3 [4] + veor @x[2], @x[2], @x[0] @ 25 016 [3] + veor @x[3], @x[3], @x[7] @ 3 75 + veor @x[7], @x[7], @x[6] @ 75 6 [0] + vext.8 @t[6], @x[6], @x[6], #12 + vmov @t[4], @x[4] + veor @x[6], @x[6], @x[4] @ 6 4 + veor @x[4], @x[4], @x[3] @ 4 375 [6] + veor @x[3], @x[3], @x[7] @ 375 756=36 + veor @x[6], @x[6], @t[5] @ 64 5 [7] + veor @x[3], @x[3], @t[2] @ 36 2 + vext.8 @t[5], @t[5], @t[5], #12 + veor @x[3], @x[3], @t[4] @ 362 4 [5] +___ + my @y = @x[7,5,0,2,1,3,4,6]; +$code.=<<___; + @ multiplication by 0x0b + veor @y[1], @y[1], @y[0] + veor @y[0], @y[0], @t[0] + vext.8 @t[2], @t[2], @t[2], #12 + veor @y[1], @y[1], @t[1] + veor @y[0], @y[0], @t[5] + vext.8 @t[4], @t[4], @t[4], #12 + veor @y[1], @y[1], @t[6] + veor @y[0], @y[0], @t[7] + veor @t[7], @t[7], @t[6] @ clobber t[7] + + veor @y[3], @y[3], @t[0] + veor @y[1], @y[1], @y[0] + vext.8 @t[0], @t[0], @t[0], #12 + veor @y[2], @y[2], @t[1] + veor @y[4], @y[4], @t[1] + vext.8 @t[1], @t[1], @t[1], #12 + veor @y[2], @y[2], @t[2] + veor @y[3], @y[3], @t[2] + veor @y[5], @y[5], @t[2] + veor @y[2], @y[2], @t[7] + vext.8 @t[2], @t[2], @t[2], #12 + veor @y[3], @y[3], @t[3] + veor @y[6], @y[6], @t[3] + veor @y[4], @y[4], @t[3] + veor @y[7], @y[7], @t[4] + vext.8 @t[3], @t[3], @t[3], #12 + veor @y[5], @y[5], @t[4] + veor @y[7], @y[7], @t[7] + veor @t[7], @t[7], @t[5] @ clobber t[7] even more + veor @y[3], @y[3], @t[5] + veor @y[4], @y[4], @t[4] + + veor @y[5], @y[5], @t[7] + vext.8 @t[4], @t[4], @t[4], #12 + veor @y[6], @y[6], @t[7] + veor @y[4], @y[4], @t[7] + + veor @t[7], @t[7], @t[5] + vext.8 @t[5], @t[5], @t[5], #12 + + @ multiplication by 0x0d + veor @y[4], @y[4], @y[7] + veor @t[7], @t[7], @t[6] @ restore t[7] + veor @y[7], @y[7], @t[4] + vext.8 @t[6], @t[6], @t[6], #12 + veor @y[2], @y[2], @t[0] + veor @y[7], @y[7], @t[5] + vext.8 @t[7], @t[7], @t[7], #12 + veor @y[2], @y[2], @t[2] + + veor @y[3], @y[3], @y[1] + veor @y[1], @y[1], @t[1] + veor @y[0], @y[0], @t[0] + veor @y[3], @y[3], @t[0] + veor @y[1], @y[1], @t[5] + veor @y[0], @y[0], @t[5] + vext.8 @t[0], @t[0], @t[0], #12 + veor @y[1], @y[1], @t[7] + veor @y[0], @y[0], @t[6] + veor @y[3], @y[3], @y[1] + veor @y[4], @y[4], @t[1] + vext.8 @t[1], @t[1], @t[1], #12 + + veor @y[7], @y[7], @t[7] + veor @y[4], @y[4], @t[2] + veor @y[5], @y[5], @t[2] + veor @y[2], @y[2], @t[6] + veor @t[6], @t[6], @t[3] @ clobber t[6] + vext.8 @t[2], @t[2], @t[2], #12 + veor @y[4], @y[4], @y[7] + veor @y[3], @y[3], @t[6] + + veor @y[6], @y[6], @t[6] + veor @y[5], @y[5], @t[5] + vext.8 @t[5], @t[5], @t[5], #12 + veor @y[6], @y[6], @t[4] + vext.8 @t[4], @t[4], @t[4], #12 + veor @y[5], @y[5], @t[6] + veor @y[6], @y[6], @t[7] + vext.8 @t[7], @t[7], @t[7], #12 + veor @t[6], @t[6], @t[3] @ restore t[6] + vext.8 @t[3], @t[3], @t[3], #12 + + @ multiplication by 0x09 + veor @y[4], @y[4], @y[1] + veor @t[1], @t[1], @y[1] @ t[1]=y[1] + veor @t[0], @t[0], @t[5] @ clobber t[0] + vext.8 @t[6], @t[6], @t[6], #12 + veor @t[1], @t[1], @t[5] + veor @y[3], @y[3], @t[0] + veor @t[0], @t[0], @y[0] @ t[0]=y[0] + veor @t[1], @t[1], @t[6] + veor @t[6], @t[6], @t[7] @ clobber t[6] + veor @y[4], @y[4], @t[1] + veor @y[7], @y[7], @t[4] + veor @y[6], @y[6], @t[3] + veor @y[5], @y[5], @t[2] + veor @t[4], @t[4], @y[4] @ t[4]=y[4] + veor @t[3], @t[3], @y[3] @ t[3]=y[3] + veor @t[5], @t[5], @y[5] @ t[5]=y[5] + veor @t[2], @t[2], @y[2] @ t[2]=y[2] + veor @t[3], @t[3], @t[7] + veor @XMM[5], @t[5], @t[6] + veor @XMM[6], @t[6], @y[6] @ t[6]=y[6] + veor @XMM[2], @t[2], @t[6] + veor @XMM[7], @t[7], @y[7] @ t[7]=y[7] + + vmov @XMM[0], @t[0] + vmov @XMM[1], @t[1] + @ vmov @XMM[2], @t[2] + vmov @XMM[3], @t[3] + vmov @XMM[4], @t[4] + @ vmov @XMM[5], @t[5] + @ vmov @XMM[6], @t[6] + @ vmov @XMM[7], @t[7] +___ +} + +sub InvMixColumns { +my @x=@_[0..7]; +my @t=@_[8..15]; + +# Thanks to Jussi Kivilinna for providing pointer to +# +# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | +# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | +# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | +# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 | + +$code.=<<___; + @ multiplication by 0x05-0x00-0x04-0x00 + vext.8 @t[0], @x[0], @x[0], #8 + vext.8 @t[6], @x[6], @x[6], #8 + vext.8 @t[7], @x[7], @x[7], #8 + veor @t[0], @t[0], @x[0] + vext.8 @t[1], @x[1], @x[1], #8 + veor @t[6], @t[6], @x[6] + vext.8 @t[2], @x[2], @x[2], #8 + veor @t[7], @t[7], @x[7] + vext.8 @t[3], @x[3], @x[3], #8 + veor @t[1], @t[1], @x[1] + vext.8 @t[4], @x[4], @x[4], #8 + veor @t[2], @t[2], @x[2] + vext.8 @t[5], @x[5], @x[5], #8 + veor @t[3], @t[3], @x[3] + veor @t[4], @t[4], @x[4] + veor @t[5], @t[5], @x[5] + + veor @x[0], @x[0], @t[6] + veor @x[1], @x[1], @t[6] + veor @x[2], @x[2], @t[0] + veor @x[4], @x[4], @t[2] + veor @x[3], @x[3], @t[1] + veor @x[1], @x[1], @t[7] + veor @x[2], @x[2], @t[7] + veor @x[4], @x[4], @t[6] + veor @x[5], @x[5], @t[3] + veor @x[3], @x[3], @t[6] + veor @x[6], @x[6], @t[4] + veor @x[4], @x[4], @t[7] + veor @x[5], @x[5], @t[7] + veor @x[7], @x[7], @t[5] +___ + &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6 +} + +sub swapmove { +my ($a,$b,$n,$mask,$t)=@_; +$code.=<<___; + vshr.u64 $t, $b, #$n + veor $t, $t, $a + vand $t, $t, $mask + veor $a, $a, $t + vshl.u64 $t, $t, #$n + veor $b, $b, $t +___ +} +sub swapmove2x { +my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; +$code.=<<___; + vshr.u64 $t0, $b0, #$n + vshr.u64 $t1, $b1, #$n + veor $t0, $t0, $a0 + veor $t1, $t1, $a1 + vand $t0, $t0, $mask + vand $t1, $t1, $mask + veor $a0, $a0, $t0 + vshl.u64 $t0, $t0, #$n + veor $a1, $a1, $t1 + vshl.u64 $t1, $t1, #$n + veor $b0, $b0, $t0 + veor $b1, $b1, $t1 +___ +} + +sub bitslice { +my @x=reverse(@_[0..7]); +my ($t0,$t1,$t2,$t3)=@_[8..11]; +$code.=<<___; + vmov.i8 $t0,#0x55 @ compose .LBS0 + vmov.i8 $t1,#0x33 @ compose .LBS1 +___ + &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); + &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); +$code.=<<___; + vmov.i8 $t0,#0x0f @ compose .LBS2 +___ + &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); + &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); + + &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); + &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); +} + +$code.=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" + +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} +# define VFP_ABI_POP vldmia sp!,{d8-d15} +# define VFP_ABI_FRAME 0x40 +#else +# define VFP_ABI_PUSH +# define VFP_ABI_POP +# define VFP_ABI_FRAME 0 +# define BSAES_ASM_EXTENDED_KEY +# define XTS_CHAIN_TWEAK +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__ +#endif + +#ifdef __thumb__ +# define adrl adr +#endif + +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.text +.syntax unified @ ARMv7-capable assembler is expected to handle this +#ifdef __thumb2__ +.thumb +#else +.code 32 +#endif + +.type _bsaes_decrypt8,%function +.align 4 +_bsaes_decrypt8: + adr $const,_bsaes_decrypt8 + vldmia $key!, {@XMM[9]} @ round 0 key + add $const,$const,#.LM0ISR-_bsaes_decrypt8 + + vldmia $const!, {@XMM[8]} @ .LM0ISR + veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key + veor @XMM[11], @XMM[1], @XMM[9] + vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])` + veor @XMM[12], @XMM[2], @XMM[9] + vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])` + veor @XMM[13], @XMM[3], @XMM[9] + vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])` + veor @XMM[14], @XMM[4], @XMM[9] + vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])` + veor @XMM[15], @XMM[5], @XMM[9] + vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])` + veor @XMM[10], @XMM[6], @XMM[9] + vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])` + veor @XMM[11], @XMM[7], @XMM[9] + vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])` + vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])` +___ + &bitslice (@XMM[0..7, 8..11]); +$code.=<<___; + sub $rounds,$rounds,#1 + b .Ldec_sbox +.align 4 +.Ldec_loop: +___ + &ShiftRows (@XMM[0..7, 8..12]); +$code.=".Ldec_sbox:\n"; + &InvSbox (@XMM[0..7, 8..15]); +$code.=<<___; + subs $rounds,$rounds,#1 + bcc .Ldec_done +___ + &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]); +$code.=<<___; + vldmia $const, {@XMM[12]} @ .LISR + ite eq @ Thumb2 thing, sanity check in ARM + addeq $const,$const,#0x10 + bne .Ldec_loop + vldmia $const, {@XMM[12]} @ .LISRM0 + b .Ldec_loop +.align 4 +.Ldec_done: +___ + &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]); +$code.=<<___; + vldmia $key, {@XMM[8]} @ last round key + veor @XMM[6], @XMM[6], @XMM[8] + veor @XMM[4], @XMM[4], @XMM[8] + veor @XMM[2], @XMM[2], @XMM[8] + veor @XMM[7], @XMM[7], @XMM[8] + veor @XMM[3], @XMM[3], @XMM[8] + veor @XMM[5], @XMM[5], @XMM[8] + veor @XMM[0], @XMM[0], @XMM[8] + veor @XMM[1], @XMM[1], @XMM[8] + bx lr +.size _bsaes_decrypt8,.-_bsaes_decrypt8 + +.type _bsaes_const,%object +.align 6 +_bsaes_const: +.LM0ISR: @ InvShiftRows constants + .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 +.LISR: + .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 +.LISRM0: + .quad 0x01040b0e0205080f, 0x0306090c00070a0d +.LM0SR: @ ShiftRows constants + .quad 0x0a0e02060f03070b, 0x0004080c05090d01 +.LSR: + .quad 0x0504070600030201, 0x0f0e0d0c0a09080b +.LSRM0: + .quad 0x0304090e00050a0f, 0x01060b0c0207080d +.LM0: + .quad 0x02060a0e03070b0f, 0x0004080c0105090d +.LREVM0SR: + .quad 0x090d01050c000408, 0x03070b0f060a0e02 +.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by " +.align 6 +.size _bsaes_const,.-_bsaes_const + +.type _bsaes_encrypt8,%function +.align 4 +_bsaes_encrypt8: + adr $const,_bsaes_encrypt8 + vldmia $key!, {@XMM[9]} @ round 0 key + sub $const,$const,#_bsaes_encrypt8-.LM0SR + + vldmia $const!, {@XMM[8]} @ .LM0SR +_bsaes_encrypt8_alt: + veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key + veor @XMM[11], @XMM[1], @XMM[9] + vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])` + veor @XMM[12], @XMM[2], @XMM[9] + vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])` + veor @XMM[13], @XMM[3], @XMM[9] + vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])` + veor @XMM[14], @XMM[4], @XMM[9] + vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])` + veor @XMM[15], @XMM[5], @XMM[9] + vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])` + veor @XMM[10], @XMM[6], @XMM[9] + vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])` + veor @XMM[11], @XMM[7], @XMM[9] + vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])` + vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])` +_bsaes_encrypt8_bitslice: +___ + &bitslice (@XMM[0..7, 8..11]); +$code.=<<___; + sub $rounds,$rounds,#1 + b .Lenc_sbox +.align 4 +.Lenc_loop: +___ + &ShiftRows (@XMM[0..7, 8..12]); +$code.=".Lenc_sbox:\n"; + &Sbox (@XMM[0..7, 8..15]); +$code.=<<___; + subs $rounds,$rounds,#1 + bcc .Lenc_done +___ + &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); +$code.=<<___; + vldmia $const, {@XMM[12]} @ .LSR + ite eq @ Thumb2 thing, samity check in ARM + addeq $const,$const,#0x10 + bne .Lenc_loop + vldmia $const, {@XMM[12]} @ .LSRM0 + b .Lenc_loop +.align 4 +.Lenc_done: +___ + # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb + &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]); +$code.=<<___; + vldmia $key, {@XMM[8]} @ last round key + veor @XMM[4], @XMM[4], @XMM[8] + veor @XMM[6], @XMM[6], @XMM[8] + veor @XMM[3], @XMM[3], @XMM[8] + veor @XMM[7], @XMM[7], @XMM[8] + veor @XMM[2], @XMM[2], @XMM[8] + veor @XMM[5], @XMM[5], @XMM[8] + veor @XMM[0], @XMM[0], @XMM[8] + veor @XMM[1], @XMM[1], @XMM[8] + bx lr +.size _bsaes_encrypt8,.-_bsaes_encrypt8 +___ +} +{ +my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6"); + +sub bitslice_key { +my @x=reverse(@_[0..7]); +my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; + + &swapmove (@x[0,1],1,$bs0,$t2,$t3); +$code.=<<___; + @ &swapmove(@x[2,3],1,$t0,$t2,$t3); + vmov @x[2], @x[0] + vmov @x[3], @x[1] +___ + #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); + + &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); +$code.=<<___; + @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); + vmov @x[4], @x[0] + vmov @x[6], @x[2] + vmov @x[5], @x[1] + vmov @x[7], @x[3] +___ + &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); + &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); +} + +$code.=<<___; +.type _bsaes_key_convert,%function +.align 4 +_bsaes_key_convert: + adr $const,_bsaes_key_convert + vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key + sub $const,$const,#_bsaes_key_convert-.LM0 + vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key + + vmov.i8 @XMM[8], #0x01 @ bit masks + vmov.i8 @XMM[9], #0x02 + vmov.i8 @XMM[10], #0x04 + vmov.i8 @XMM[11], #0x08 + vmov.i8 @XMM[12], #0x10 + vmov.i8 @XMM[13], #0x20 + vldmia $const, {@XMM[14]} @ .LM0 + +#ifdef __ARMEL__ + vrev32.8 @XMM[7], @XMM[7] + vrev32.8 @XMM[15], @XMM[15] +#endif + sub $rounds,$rounds,#1 + vstmia $out!, {@XMM[7]} @ save round 0 key + b .Lkey_loop + +.align 4 +.Lkey_loop: + vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])` + vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])` + vmov.i8 @XMM[6], #0x40 + vmov.i8 @XMM[15], #0x80 + + vtst.8 @XMM[0], @XMM[7], @XMM[8] + vtst.8 @XMM[1], @XMM[7], @XMM[9] + vtst.8 @XMM[2], @XMM[7], @XMM[10] + vtst.8 @XMM[3], @XMM[7], @XMM[11] + vtst.8 @XMM[4], @XMM[7], @XMM[12] + vtst.8 @XMM[5], @XMM[7], @XMM[13] + vtst.8 @XMM[6], @XMM[7], @XMM[6] + vtst.8 @XMM[7], @XMM[7], @XMM[15] + vld1.8 {@XMM[15]}, [$inp]! @ load next round key + vmvn @XMM[0], @XMM[0] @ "pnot" + vmvn @XMM[1], @XMM[1] + vmvn @XMM[5], @XMM[5] + vmvn @XMM[6], @XMM[6] +#ifdef __ARMEL__ + vrev32.8 @XMM[15], @XMM[15] +#endif + subs $rounds,$rounds,#1 + vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key + bne .Lkey_loop + + vmov.i8 @XMM[7],#0x63 @ compose .L63 + @ don't save last round key + bx lr +.size _bsaes_key_convert,.-_bsaes_key_convert +___ +} + +if (0) { # following four functions are unsupported interface + # used for benchmarking... +$code.=<<___; +.globl bsaes_enc_key_convert +.type bsaes_enc_key_convert,%function +.align 4 +bsaes_enc_key_convert: + stmdb sp!,{r4-r6,lr} + vstmdb sp!,{d8-d15} @ ABI specification says so + + ldr r5,[$inp,#240] @ pass rounds + mov r4,$inp @ pass key + mov r12,$out @ pass key schedule + bl _bsaes_key_convert + veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key + vstmia r12, {@XMM[7]} @ save last round key + + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r6,pc} +.size bsaes_enc_key_convert,.-bsaes_enc_key_convert + +.globl bsaes_encrypt_128 +.type bsaes_encrypt_128,%function +.align 4 +bsaes_encrypt_128: + stmdb sp!,{r4-r6,lr} + vstmdb sp!,{d8-d15} @ ABI specification says so +.Lenc128_loop: + vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input + vld1.8 {@XMM[2]-@XMM[3]}, [$inp]! + mov r4,$key @ pass the key + vld1.8 {@XMM[4]-@XMM[5]}, [$inp]! + mov r5,#10 @ pass rounds + vld1.8 {@XMM[6]-@XMM[7]}, [$inp]! + + bl _bsaes_encrypt8 + + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output + vst1.8 {@XMM[4]}, [$out]! + vst1.8 {@XMM[6]}, [$out]! + vst1.8 {@XMM[3]}, [$out]! + vst1.8 {@XMM[7]}, [$out]! + vst1.8 {@XMM[2]}, [$out]! + subs $len,$len,#0x80 + vst1.8 {@XMM[5]}, [$out]! + bhi .Lenc128_loop + + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r6,pc} +.size bsaes_encrypt_128,.-bsaes_encrypt_128 + +.globl bsaes_dec_key_convert +.type bsaes_dec_key_convert,%function +.align 4 +bsaes_dec_key_convert: + stmdb sp!,{r4-r6,lr} + vstmdb sp!,{d8-d15} @ ABI specification says so + + ldr r5,[$inp,#240] @ pass rounds + mov r4,$inp @ pass key + mov r12,$out @ pass key schedule + bl _bsaes_key_convert + vldmia $out, {@XMM[6]} + vstmia r12, {@XMM[15]} @ save last round key + veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key + vstmia $out, {@XMM[7]} + + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r6,pc} +.size bsaes_dec_key_convert,.-bsaes_dec_key_convert + +.globl bsaes_decrypt_128 +.type bsaes_decrypt_128,%function +.align 4 +bsaes_decrypt_128: + stmdb sp!,{r4-r6,lr} + vstmdb sp!,{d8-d15} @ ABI specification says so +.Ldec128_loop: + vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input + vld1.8 {@XMM[2]-@XMM[3]}, [$inp]! + mov r4,$key @ pass the key + vld1.8 {@XMM[4]-@XMM[5]}, [$inp]! + mov r5,#10 @ pass rounds + vld1.8 {@XMM[6]-@XMM[7]}, [$inp]! + + bl _bsaes_decrypt8 + + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output + vst1.8 {@XMM[6]}, [$out]! + vst1.8 {@XMM[4]}, [$out]! + vst1.8 {@XMM[2]}, [$out]! + vst1.8 {@XMM[7]}, [$out]! + vst1.8 {@XMM[3]}, [$out]! + subs $len,$len,#0x80 + vst1.8 {@XMM[5]}, [$out]! + bhi .Ldec128_loop + + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r6,pc} +.size bsaes_decrypt_128,.-bsaes_decrypt_128 +___ +} +{ +my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10)); +my ($keysched)=("sp"); + +$code.=<<___; +.extern AES_cbc_encrypt +.extern AES_decrypt + +.global bsaes_cbc_encrypt +.type bsaes_cbc_encrypt,%function +.align 5 +bsaes_cbc_encrypt: +#ifndef __KERNEL__ + cmp $len, #128 +#ifndef __thumb__ + blo AES_cbc_encrypt +#else + bhs 1f + b AES_cbc_encrypt +1: +#endif +#endif + + @ it is up to the caller to make sure we are called with enc == 0 + + mov ip, sp + stmdb sp!, {r4-r10, lr} + VFP_ABI_PUSH + ldr $ivp, [ip] @ IV is 1st arg on the stack + mov $len, $len, lsr#4 @ len in 16 byte blocks + sub sp, #0x10 @ scratch space to carry over the IV + mov $fp, sp @ save sp + + ldr $rounds, [$key, #240] @ get # of rounds +#ifndef BSAES_ASM_EXTENDED_KEY + @ allocate the key schedule on the stack + sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key + add r12, #`128-32` @ sifze of bit-slices key schedule + + @ populate the key schedule + mov r4, $key @ pass key + mov r5, $rounds @ pass # of rounds + mov sp, r12 @ sp is $keysched + bl _bsaes_key_convert + vldmia $keysched, {@XMM[6]} + vstmia r12, {@XMM[15]} @ save last round key + veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key + vstmia $keysched, {@XMM[7]} +#else + ldr r12, [$key, #244] + eors r12, #1 + beq 0f + + @ populate the key schedule + str r12, [$key, #244] + mov r4, $key @ pass key + mov r5, $rounds @ pass # of rounds + add r12, $key, #248 @ pass key schedule + bl _bsaes_key_convert + add r4, $key, #248 + vldmia r4, {@XMM[6]} + vstmia r12, {@XMM[15]} @ save last round key + veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key + vstmia r4, {@XMM[7]} + +.align 2 +0: +#endif + + vld1.8 {@XMM[15]}, [$ivp] @ load IV + b .Lcbc_dec_loop + +.align 4 +.Lcbc_dec_loop: + subs $len, $len, #0x8 + bmi .Lcbc_dec_loop_finish + + vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input + vld1.8 {@XMM[2]-@XMM[3]}, [$inp]! +#ifndef BSAES_ASM_EXTENDED_KEY + mov r4, $keysched @ pass the key +#else + add r4, $key, #248 +#endif + vld1.8 {@XMM[4]-@XMM[5]}, [$inp]! + mov r5, $rounds + vld1.8 {@XMM[6]-@XMM[7]}, [$inp] + sub $inp, $inp, #0x60 + vstmia $fp, {@XMM[15]} @ put aside IV + + bl _bsaes_decrypt8 + + vldmia $fp, {@XMM[14]} @ reload IV + vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input + veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV + vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! + veor @XMM[1], @XMM[1], @XMM[8] + veor @XMM[6], @XMM[6], @XMM[9] + vld1.8 {@XMM[12]-@XMM[13]}, [$inp]! + veor @XMM[4], @XMM[4], @XMM[10] + veor @XMM[2], @XMM[2], @XMM[11] + vld1.8 {@XMM[14]-@XMM[15]}, [$inp]! + veor @XMM[7], @XMM[7], @XMM[12] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output + veor @XMM[3], @XMM[3], @XMM[13] + vst1.8 {@XMM[6]}, [$out]! + veor @XMM[5], @XMM[5], @XMM[14] + vst1.8 {@XMM[4]}, [$out]! + vst1.8 {@XMM[2]}, [$out]! + vst1.8 {@XMM[7]}, [$out]! + vst1.8 {@XMM[3]}, [$out]! + vst1.8 {@XMM[5]}, [$out]! + + b .Lcbc_dec_loop + +.Lcbc_dec_loop_finish: + adds $len, $len, #8 + beq .Lcbc_dec_done + + vld1.8 {@XMM[0]}, [$inp]! @ load input + cmp $len, #2 + blo .Lcbc_dec_one + vld1.8 {@XMM[1]}, [$inp]! +#ifndef BSAES_ASM_EXTENDED_KEY + mov r4, $keysched @ pass the key +#else + add r4, $key, #248 +#endif + mov r5, $rounds + vstmia $fp, {@XMM[15]} @ put aside IV + beq .Lcbc_dec_two + vld1.8 {@XMM[2]}, [$inp]! + cmp $len, #4 + blo .Lcbc_dec_three + vld1.8 {@XMM[3]}, [$inp]! + beq .Lcbc_dec_four + vld1.8 {@XMM[4]}, [$inp]! + cmp $len, #6 + blo .Lcbc_dec_five + vld1.8 {@XMM[5]}, [$inp]! + beq .Lcbc_dec_six + vld1.8 {@XMM[6]}, [$inp]! + sub $inp, $inp, #0x70 + + bl _bsaes_decrypt8 + + vldmia $fp, {@XMM[14]} @ reload IV + vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input + veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV + vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! + veor @XMM[1], @XMM[1], @XMM[8] + veor @XMM[6], @XMM[6], @XMM[9] + vld1.8 {@XMM[12]-@XMM[13]}, [$inp]! + veor @XMM[4], @XMM[4], @XMM[10] + veor @XMM[2], @XMM[2], @XMM[11] + vld1.8 {@XMM[15]}, [$inp]! + veor @XMM[7], @XMM[7], @XMM[12] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output + veor @XMM[3], @XMM[3], @XMM[13] + vst1.8 {@XMM[6]}, [$out]! + vst1.8 {@XMM[4]}, [$out]! + vst1.8 {@XMM[2]}, [$out]! + vst1.8 {@XMM[7]}, [$out]! + vst1.8 {@XMM[3]}, [$out]! + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_six: + sub $inp, $inp, #0x60 + bl _bsaes_decrypt8 + vldmia $fp,{@XMM[14]} @ reload IV + vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input + veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV + vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! + veor @XMM[1], @XMM[1], @XMM[8] + veor @XMM[6], @XMM[6], @XMM[9] + vld1.8 {@XMM[12]}, [$inp]! + veor @XMM[4], @XMM[4], @XMM[10] + veor @XMM[2], @XMM[2], @XMM[11] + vld1.8 {@XMM[15]}, [$inp]! + veor @XMM[7], @XMM[7], @XMM[12] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output + vst1.8 {@XMM[6]}, [$out]! + vst1.8 {@XMM[4]}, [$out]! + vst1.8 {@XMM[2]}, [$out]! + vst1.8 {@XMM[7]}, [$out]! + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_five: + sub $inp, $inp, #0x50 + bl _bsaes_decrypt8 + vldmia $fp, {@XMM[14]} @ reload IV + vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input + veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV + vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! + veor @XMM[1], @XMM[1], @XMM[8] + veor @XMM[6], @XMM[6], @XMM[9] + vld1.8 {@XMM[15]}, [$inp]! + veor @XMM[4], @XMM[4], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output + veor @XMM[2], @XMM[2], @XMM[11] + vst1.8 {@XMM[6]}, [$out]! + vst1.8 {@XMM[4]}, [$out]! + vst1.8 {@XMM[2]}, [$out]! + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_four: + sub $inp, $inp, #0x40 + bl _bsaes_decrypt8 + vldmia $fp, {@XMM[14]} @ reload IV + vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input + veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV + vld1.8 {@XMM[10]}, [$inp]! + veor @XMM[1], @XMM[1], @XMM[8] + veor @XMM[6], @XMM[6], @XMM[9] + vld1.8 {@XMM[15]}, [$inp]! + veor @XMM[4], @XMM[4], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output + vst1.8 {@XMM[6]}, [$out]! + vst1.8 {@XMM[4]}, [$out]! + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_three: + sub $inp, $inp, #0x30 + bl _bsaes_decrypt8 + vldmia $fp, {@XMM[14]} @ reload IV + vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input + veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV + vld1.8 {@XMM[15]}, [$inp]! + veor @XMM[1], @XMM[1], @XMM[8] + veor @XMM[6], @XMM[6], @XMM[9] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output + vst1.8 {@XMM[6]}, [$out]! + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_two: + sub $inp, $inp, #0x20 + bl _bsaes_decrypt8 + vldmia $fp, {@XMM[14]} @ reload IV + vld1.8 {@XMM[8]}, [$inp]! @ reload input + veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV + vld1.8 {@XMM[15]}, [$inp]! @ reload input + veor @XMM[1], @XMM[1], @XMM[8] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_one: + sub $inp, $inp, #0x10 + mov $rounds, $out @ save original out pointer + mov $out, $fp @ use the iv scratch space as out buffer + mov r2, $key + vmov @XMM[4],@XMM[15] @ just in case ensure that IV + vmov @XMM[5],@XMM[0] @ and input are preserved + bl AES_decrypt + vld1.8 {@XMM[0]}, [$fp,:64] @ load result + veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV + vmov @XMM[15], @XMM[5] @ @XMM[5] holds input + vst1.8 {@XMM[0]}, [$rounds] @ write output + +.Lcbc_dec_done: +#ifndef BSAES_ASM_EXTENDED_KEY + vmov.i32 q0, #0 + vmov.i32 q1, #0 +.Lcbc_dec_bzero: @ wipe key schedule [if any] + vstmia $keysched!, {q0-q1} + cmp $keysched, $fp + bne .Lcbc_dec_bzero +#endif + + mov sp, $fp + add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb + vst1.8 {@XMM[15]}, [$ivp] @ return IV + VFP_ABI_POP + ldmia sp!, {r4-r10, pc} +.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt +___ +} +{ +my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10))); +my $const = "r6"; # shared with _bsaes_encrypt8_alt +my $keysched = "sp"; + +$code.=<<___; +.extern AES_encrypt +.global bsaes_ctr32_encrypt_blocks +.type bsaes_ctr32_encrypt_blocks,%function +.align 5 +bsaes_ctr32_encrypt_blocks: + cmp $len, #8 @ use plain AES for + blo .Lctr_enc_short @ small sizes + + mov ip, sp + stmdb sp!, {r4-r10, lr} + VFP_ABI_PUSH + ldr $ctr, [ip] @ ctr is 1st arg on the stack + sub sp, sp, #0x10 @ scratch space to carry over the ctr + mov $fp, sp @ save sp + + ldr $rounds, [$key, #240] @ get # of rounds +#ifndef BSAES_ASM_EXTENDED_KEY + @ allocate the key schedule on the stack + sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key + add r12, #`128-32` @ size of bit-sliced key schedule + + @ populate the key schedule + mov r4, $key @ pass key + mov r5, $rounds @ pass # of rounds + mov sp, r12 @ sp is $keysched + bl _bsaes_key_convert + veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key + vstmia r12, {@XMM[7]} @ save last round key + + vld1.8 {@XMM[0]}, [$ctr] @ load counter + add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr + vldmia $keysched, {@XMM[4]} @ load round0 key +#else + ldr r12, [$key, #244] + eors r12, #1 + beq 0f + + @ populate the key schedule + str r12, [$key, #244] + mov r4, $key @ pass key + mov r5, $rounds @ pass # of rounds + add r12, $key, #248 @ pass key schedule + bl _bsaes_key_convert + veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key + vstmia r12, {@XMM[7]} @ save last round key + +.align 2 +0: add r12, $key, #248 + vld1.8 {@XMM[0]}, [$ctr] @ load counter + adrl $ctr, .LREVM0SR @ borrow $ctr + vldmia r12, {@XMM[4]} @ load round0 key + sub sp, #0x10 @ place for adjusted round0 key +#endif + + vmov.i32 @XMM[8],#1 @ compose 1<<96 + veor @XMM[9],@XMM[9],@XMM[9] + vrev32.8 @XMM[0],@XMM[0] + vext.8 @XMM[8],@XMM[9],@XMM[8],#4 + vrev32.8 @XMM[4],@XMM[4] + vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96 + vstmia $keysched, {@XMM[4]} @ save adjusted round0 key + b .Lctr_enc_loop + +.align 4 +.Lctr_enc_loop: + vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96 + vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1 + vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2 + vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3 + vadd.u32 @XMM[4], @XMM[1], @XMM[10] + vadd.u32 @XMM[5], @XMM[2], @XMM[10] + vadd.u32 @XMM[6], @XMM[3], @XMM[10] + vadd.u32 @XMM[7], @XMM[4], @XMM[10] + vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter + + @ Borrow prologue from _bsaes_encrypt8 to use the opportunity + @ to flip byte order in 32-bit counter + + vldmia $keysched, {@XMM[9]} @ load round0 key +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, $keysched, #0x10 @ pass next round key +#else + add r4, $key, #`248+16` +#endif + vldmia $ctr, {@XMM[8]} @ .LREVM0SR + mov r5, $rounds @ pass rounds + vstmia $fp, {@XMM[10]} @ save next counter + sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants + + bl _bsaes_encrypt8_alt + + subs $len, $len, #8 + blo .Lctr_enc_loop_done + + vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input + vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! + veor @XMM[0], @XMM[8] + veor @XMM[1], @XMM[9] + vld1.8 {@XMM[12]-@XMM[13]}, [$inp]! + veor @XMM[4], @XMM[10] + veor @XMM[6], @XMM[11] + vld1.8 {@XMM[14]-@XMM[15]}, [$inp]! + veor @XMM[3], @XMM[12] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output + veor @XMM[7], @XMM[13] + veor @XMM[2], @XMM[14] + vst1.8 {@XMM[4]}, [$out]! + veor @XMM[5], @XMM[15] + vst1.8 {@XMM[6]}, [$out]! + vmov.i32 @XMM[8], #1 @ compose 1<<96 + vst1.8 {@XMM[3]}, [$out]! + veor @XMM[9], @XMM[9], @XMM[9] + vst1.8 {@XMM[7]}, [$out]! + vext.8 @XMM[8], @XMM[9], @XMM[8], #4 + vst1.8 {@XMM[2]}, [$out]! + vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96 + vst1.8 {@XMM[5]}, [$out]! + vldmia $fp, {@XMM[0]} @ load counter + + bne .Lctr_enc_loop + b .Lctr_enc_done + +.align 4 +.Lctr_enc_loop_done: + add $len, $len, #8 + vld1.8 {@XMM[8]}, [$inp]! @ load input + veor @XMM[0], @XMM[8] + vst1.8 {@XMM[0]}, [$out]! @ write output + cmp $len, #2 + blo .Lctr_enc_done + vld1.8 {@XMM[9]}, [$inp]! + veor @XMM[1], @XMM[9] + vst1.8 {@XMM[1]}, [$out]! + beq .Lctr_enc_done + vld1.8 {@XMM[10]}, [$inp]! + veor @XMM[4], @XMM[10] + vst1.8 {@XMM[4]}, [$out]! + cmp $len, #4 + blo .Lctr_enc_done + vld1.8 {@XMM[11]}, [$inp]! + veor @XMM[6], @XMM[11] + vst1.8 {@XMM[6]}, [$out]! + beq .Lctr_enc_done + vld1.8 {@XMM[12]}, [$inp]! + veor @XMM[3], @XMM[12] + vst1.8 {@XMM[3]}, [$out]! + cmp $len, #6 + blo .Lctr_enc_done + vld1.8 {@XMM[13]}, [$inp]! + veor @XMM[7], @XMM[13] + vst1.8 {@XMM[7]}, [$out]! + beq .Lctr_enc_done + vld1.8 {@XMM[14]}, [$inp] + veor @XMM[2], @XMM[14] + vst1.8 {@XMM[2]}, [$out]! + +.Lctr_enc_done: + vmov.i32 q0, #0 + vmov.i32 q1, #0 +#ifndef BSAES_ASM_EXTENDED_KEY +.Lctr_enc_bzero: @ wipe key schedule [if any] + vstmia $keysched!, {q0-q1} + cmp $keysched, $fp + bne .Lctr_enc_bzero +#else + vstmia $keysched, {q0-q1} +#endif + + mov sp, $fp + add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb + VFP_ABI_POP + ldmia sp!, {r4-r10, pc} @ return + +.align 4 +.Lctr_enc_short: + ldr ip, [sp] @ ctr pointer is passed on stack + stmdb sp!, {r4-r8, lr} + + mov r4, $inp @ copy arguments + mov r5, $out + mov r6, $len + mov r7, $key + ldr r8, [ip, #12] @ load counter LSW + vld1.8 {@XMM[1]}, [ip] @ load whole counter value +#ifdef __ARMEL__ + rev r8, r8 +#endif + sub sp, sp, #0x10 + vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value + sub sp, sp, #0x10 + +.Lctr_enc_short_loop: + add r0, sp, #0x10 @ input counter value + mov r1, sp @ output on the stack + mov r2, r7 @ key + + bl AES_encrypt + + vld1.8 {@XMM[0]}, [r4]! @ load input + vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter + add r8, r8, #1 +#ifdef __ARMEL__ + rev r0, r8 + str r0, [sp, #0x1c] @ next counter value +#else + str r8, [sp, #0x1c] @ next counter value +#endif + veor @XMM[0],@XMM[0],@XMM[1] + vst1.8 {@XMM[0]}, [r5]! @ store output + subs r6, r6, #1 + bne .Lctr_enc_short_loop + + vmov.i32 q0, #0 + vmov.i32 q1, #0 + vstmia sp!, {q0-q1} + + ldmia sp!, {r4-r8, pc} +.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks +___ +} +{ +###################################################################### +# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len, +# const AES_KEY *key1, const AES_KEY *key2, +# const unsigned char iv[16]); +# +my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3))); +my $const="r6"; # returned by _bsaes_key_convert +my $twmask=@XMM[5]; +my @T=@XMM[6..7]; + +$code.=<<___; +.globl bsaes_xts_encrypt +.type bsaes_xts_encrypt,%function +.align 4 +bsaes_xts_encrypt: + mov ip, sp + stmdb sp!, {r4-r10, lr} @ 0x20 + VFP_ABI_PUSH + mov r6, sp @ future $fp + + mov $inp, r0 + mov $out, r1 + mov $len, r2 + mov $key, r3 + + sub r0, sp, #0x10 @ 0x10 + bic r0, #0xf @ align at 16 bytes + mov sp, r0 + +#ifdef XTS_CHAIN_TWEAK + ldr r0, [ip] @ pointer to input tweak +#else + @ generate initial tweak + ldr r0, [ip, #4] @ iv[] + mov r1, sp + ldr r2, [ip, #0] @ key2 + bl AES_encrypt + mov r0,sp @ pointer to initial tweak +#endif + + ldr $rounds, [$key, #240] @ get # of rounds + mov $fp, r6 +#ifndef BSAES_ASM_EXTENDED_KEY + @ allocate the key schedule on the stack + sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key + @ add r12, #`128-32` @ size of bit-sliced key schedule + sub r12, #`32+16` @ place for tweak[9] + + @ populate the key schedule + mov r4, $key @ pass key + mov r5, $rounds @ pass # of rounds + mov sp, r12 + add r12, #0x90 @ pass key schedule + bl _bsaes_key_convert + veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key + vstmia r12, {@XMM[7]} @ save last round key +#else + ldr r12, [$key, #244] + eors r12, #1 + beq 0f + + str r12, [$key, #244] + mov r4, $key @ pass key + mov r5, $rounds @ pass # of rounds + add r12, $key, #248 @ pass key schedule + bl _bsaes_key_convert + veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key + vstmia r12, {@XMM[7]} + +.align 2 +0: sub sp, #0x90 @ place for tweak[9] +#endif + + vld1.8 {@XMM[8]}, [r0] @ initial tweak + adr $magic, .Lxts_magic + + subs $len, #0x80 + blo .Lxts_enc_short + b .Lxts_enc_loop + +.align 4 +.Lxts_enc_loop: + vldmia $magic, {$twmask} @ load XTS magic + vshr.s64 @T[0], @XMM[8], #63 + mov r0, sp + vand @T[0], @T[0], $twmask +___ +for($i=9;$i<16;$i++) { +$code.=<<___; + vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1] + vst1.64 {@XMM[$i-1]}, [r0,:128]! + vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")` + vshr.s64 @T[1], @XMM[$i], #63 + veor @XMM[$i], @XMM[$i], @T[0] + vand @T[1], @T[1], $twmask +___ + @T=reverse(@T); + +$code.=<<___ if ($i>=10); + vld1.8 {@XMM[$i-10]}, [$inp]! +___ +$code.=<<___ if ($i>=11); + veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3] +___ +} +$code.=<<___; + vadd.u64 @XMM[8], @XMM[15], @XMM[15] + vst1.64 {@XMM[15]}, [r0,:128]! + vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")` + veor @XMM[8], @XMM[8], @T[0] + vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak + + vld1.8 {@XMM[6]-@XMM[7]}, [$inp]! + veor @XMM[5], @XMM[5], @XMM[13] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[6], @XMM[6], @XMM[14] + mov r5, $rounds @ pass rounds + veor @XMM[7], @XMM[7], @XMM[15] + mov r0, sp + + bl _bsaes_encrypt8 + + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]! + veor @XMM[1], @XMM[1], @XMM[ 9] + veor @XMM[8], @XMM[4], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + veor @XMM[9], @XMM[6], @XMM[11] + vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]! + veor @XMM[10], @XMM[3], @XMM[12] + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! + veor @XMM[11], @XMM[7], @XMM[13] + veor @XMM[12], @XMM[2], @XMM[14] + vst1.8 {@XMM[10]-@XMM[11]}, [$out]! + veor @XMM[13], @XMM[5], @XMM[15] + vst1.8 {@XMM[12]-@XMM[13]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + + subs $len, #0x80 + bpl .Lxts_enc_loop + +.Lxts_enc_short: + adds $len, #0x70 + bmi .Lxts_enc_done + + vldmia $magic, {$twmask} @ load XTS magic + vshr.s64 @T[0], @XMM[8], #63 + mov r0, sp + vand @T[0], @T[0], $twmask +___ +for($i=9;$i<16;$i++) { +$code.=<<___; + vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1] + vst1.64 {@XMM[$i-1]}, [r0,:128]! + vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")` + vshr.s64 @T[1], @XMM[$i], #63 + veor @XMM[$i], @XMM[$i], @T[0] + vand @T[1], @T[1], $twmask +___ + @T=reverse(@T); + +$code.=<<___ if ($i>=10); + vld1.8 {@XMM[$i-10]}, [$inp]! + subs $len, #0x10 + bmi .Lxts_enc_`$i-9` +___ +$code.=<<___ if ($i>=11); + veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3] +___ +} +$code.=<<___; + sub $len, #0x10 + vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak + + vld1.8 {@XMM[6]}, [$inp]! + veor @XMM[5], @XMM[5], @XMM[13] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[6], @XMM[6], @XMM[14] + mov r5, $rounds @ pass rounds + mov r0, sp + + bl _bsaes_encrypt8 + + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]! + veor @XMM[1], @XMM[1], @XMM[ 9] + veor @XMM[8], @XMM[4], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + veor @XMM[9], @XMM[6], @XMM[11] + vld1.64 {@XMM[14]}, [r0,:128]! + veor @XMM[10], @XMM[3], @XMM[12] + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! + veor @XMM[11], @XMM[7], @XMM[13] + veor @XMM[12], @XMM[2], @XMM[14] + vst1.8 {@XMM[10]-@XMM[11]}, [$out]! + vst1.8 {@XMM[12]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + b .Lxts_enc_done +.align 4 +.Lxts_enc_6: + vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak + + veor @XMM[4], @XMM[4], @XMM[12] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[5], @XMM[5], @XMM[13] + mov r5, $rounds @ pass rounds + mov r0, sp + + bl _bsaes_encrypt8 + + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]! + veor @XMM[1], @XMM[1], @XMM[ 9] + veor @XMM[8], @XMM[4], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + veor @XMM[9], @XMM[6], @XMM[11] + veor @XMM[10], @XMM[3], @XMM[12] + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! + veor @XMM[11], @XMM[7], @XMM[13] + vst1.8 {@XMM[10]-@XMM[11]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + b .Lxts_enc_done + +@ put this in range for both ARM and Thumb mode adr instructions +.align 5 +.Lxts_magic: + .quad 1, 0x87 + +.align 5 +.Lxts_enc_5: + vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak + + veor @XMM[3], @XMM[3], @XMM[11] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[4], @XMM[4], @XMM[12] + mov r5, $rounds @ pass rounds + mov r0, sp + + bl _bsaes_encrypt8 + + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + vld1.64 {@XMM[12]}, [r0,:128]! + veor @XMM[1], @XMM[1], @XMM[ 9] + veor @XMM[8], @XMM[4], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + veor @XMM[9], @XMM[6], @XMM[11] + veor @XMM[10], @XMM[3], @XMM[12] + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! + vst1.8 {@XMM[10]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + b .Lxts_enc_done +.align 4 +.Lxts_enc_4: + vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak + + veor @XMM[2], @XMM[2], @XMM[10] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[3], @XMM[3], @XMM[11] + mov r5, $rounds @ pass rounds + mov r0, sp + + bl _bsaes_encrypt8 + + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + veor @XMM[1], @XMM[1], @XMM[ 9] + veor @XMM[8], @XMM[4], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + veor @XMM[9], @XMM[6], @XMM[11] + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + b .Lxts_enc_done +.align 4 +.Lxts_enc_3: + vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak + + veor @XMM[1], @XMM[1], @XMM[9] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[2], @XMM[2], @XMM[10] + mov r5, $rounds @ pass rounds + mov r0, sp + + bl _bsaes_encrypt8 + + vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]! + vld1.64 {@XMM[10]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + veor @XMM[1], @XMM[1], @XMM[ 9] + veor @XMM[8], @XMM[4], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + vst1.8 {@XMM[8]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + b .Lxts_enc_done +.align 4 +.Lxts_enc_2: + vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak + + veor @XMM[0], @XMM[0], @XMM[8] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[1], @XMM[1], @XMM[9] + mov r5, $rounds @ pass rounds + mov r0, sp + + bl _bsaes_encrypt8 + + vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + veor @XMM[1], @XMM[1], @XMM[ 9] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + b .Lxts_enc_done +.align 4 +.Lxts_enc_1: + mov r0, sp + veor @XMM[0], @XMM[8] + mov r1, sp + vst1.8 {@XMM[0]}, [sp,:128] + mov r2, $key + mov r4, $fp @ preserve fp + + bl AES_encrypt + + vld1.8 {@XMM[0]}, [sp,:128] + veor @XMM[0], @XMM[0], @XMM[8] + vst1.8 {@XMM[0]}, [$out]! + mov $fp, r4 + + vmov @XMM[8], @XMM[9] @ next round tweak + +.Lxts_enc_done: +#ifndef XTS_CHAIN_TWEAK + adds $len, #0x10 + beq .Lxts_enc_ret + sub r6, $out, #0x10 + +.Lxts_enc_steal: + ldrb r0, [$inp], #1 + ldrb r1, [$out, #-0x10] + strb r0, [$out, #-0x10] + strb r1, [$out], #1 + + subs $len, #1 + bhi .Lxts_enc_steal + + vld1.8 {@XMM[0]}, [r6] + mov r0, sp + veor @XMM[0], @XMM[0], @XMM[8] + mov r1, sp + vst1.8 {@XMM[0]}, [sp,:128] + mov r2, $key + mov r4, $fp @ preserve fp + + bl AES_encrypt + + vld1.8 {@XMM[0]}, [sp,:128] + veor @XMM[0], @XMM[0], @XMM[8] + vst1.8 {@XMM[0]}, [r6] + mov $fp, r4 +#endif + +.Lxts_enc_ret: + bic r0, $fp, #0xf + vmov.i32 q0, #0 + vmov.i32 q1, #0 +#ifdef XTS_CHAIN_TWEAK + ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak +#endif +.Lxts_enc_bzero: @ wipe key schedule [if any] + vstmia sp!, {q0-q1} + cmp sp, r0 + bne .Lxts_enc_bzero + + mov sp, $fp +#ifdef XTS_CHAIN_TWEAK + vst1.8 {@XMM[8]}, [r1] +#endif + VFP_ABI_POP + ldmia sp!, {r4-r10, pc} @ return + +.size bsaes_xts_encrypt,.-bsaes_xts_encrypt + +.globl bsaes_xts_decrypt +.type bsaes_xts_decrypt,%function +.align 4 +bsaes_xts_decrypt: + mov ip, sp + stmdb sp!, {r4-r10, lr} @ 0x20 + VFP_ABI_PUSH + mov r6, sp @ future $fp + + mov $inp, r0 + mov $out, r1 + mov $len, r2 + mov $key, r3 + + sub r0, sp, #0x10 @ 0x10 + bic r0, #0xf @ align at 16 bytes + mov sp, r0 + +#ifdef XTS_CHAIN_TWEAK + ldr r0, [ip] @ pointer to input tweak +#else + @ generate initial tweak + ldr r0, [ip, #4] @ iv[] + mov r1, sp + ldr r2, [ip, #0] @ key2 + bl AES_encrypt + mov r0, sp @ pointer to initial tweak +#endif + + ldr $rounds, [$key, #240] @ get # of rounds + mov $fp, r6 +#ifndef BSAES_ASM_EXTENDED_KEY + @ allocate the key schedule on the stack + sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key + @ add r12, #`128-32` @ size of bit-sliced key schedule + sub r12, #`32+16` @ place for tweak[9] + + @ populate the key schedule + mov r4, $key @ pass key + mov r5, $rounds @ pass # of rounds + mov sp, r12 + add r12, #0x90 @ pass key schedule + bl _bsaes_key_convert + add r4, sp, #0x90 + vldmia r4, {@XMM[6]} + vstmia r12, {@XMM[15]} @ save last round key + veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key + vstmia r4, {@XMM[7]} +#else + ldr r12, [$key, #244] + eors r12, #1 + beq 0f + + str r12, [$key, #244] + mov r4, $key @ pass key + mov r5, $rounds @ pass # of rounds + add r12, $key, #248 @ pass key schedule + bl _bsaes_key_convert + add r4, $key, #248 + vldmia r4, {@XMM[6]} + vstmia r12, {@XMM[15]} @ save last round key + veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key + vstmia r4, {@XMM[7]} + +.align 2 +0: sub sp, #0x90 @ place for tweak[9] +#endif + vld1.8 {@XMM[8]}, [r0] @ initial tweak + adr $magic, .Lxts_magic + + tst $len, #0xf @ if not multiple of 16 + it ne @ Thumb2 thing, sanity check in ARM + subne $len, #0x10 @ subtract another 16 bytes + subs $len, #0x80 + + blo .Lxts_dec_short + b .Lxts_dec_loop + +.align 4 +.Lxts_dec_loop: + vldmia $magic, {$twmask} @ load XTS magic + vshr.s64 @T[0], @XMM[8], #63 + mov r0, sp + vand @T[0], @T[0], $twmask +___ +for($i=9;$i<16;$i++) { +$code.=<<___; + vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1] + vst1.64 {@XMM[$i-1]}, [r0,:128]! + vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")` + vshr.s64 @T[1], @XMM[$i], #63 + veor @XMM[$i], @XMM[$i], @T[0] + vand @T[1], @T[1], $twmask +___ + @T=reverse(@T); + +$code.=<<___ if ($i>=10); + vld1.8 {@XMM[$i-10]}, [$inp]! +___ +$code.=<<___ if ($i>=11); + veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3] +___ +} +$code.=<<___; + vadd.u64 @XMM[8], @XMM[15], @XMM[15] + vst1.64 {@XMM[15]}, [r0,:128]! + vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")` + veor @XMM[8], @XMM[8], @T[0] + vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak + + vld1.8 {@XMM[6]-@XMM[7]}, [$inp]! + veor @XMM[5], @XMM[5], @XMM[13] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[6], @XMM[6], @XMM[14] + mov r5, $rounds @ pass rounds + veor @XMM[7], @XMM[7], @XMM[15] + mov r0, sp + + bl _bsaes_decrypt8 + + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]! + veor @XMM[1], @XMM[1], @XMM[ 9] + veor @XMM[8], @XMM[6], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + veor @XMM[9], @XMM[4], @XMM[11] + vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]! + veor @XMM[10], @XMM[2], @XMM[12] + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! + veor @XMM[11], @XMM[7], @XMM[13] + veor @XMM[12], @XMM[3], @XMM[14] + vst1.8 {@XMM[10]-@XMM[11]}, [$out]! + veor @XMM[13], @XMM[5], @XMM[15] + vst1.8 {@XMM[12]-@XMM[13]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + + subs $len, #0x80 + bpl .Lxts_dec_loop + +.Lxts_dec_short: + adds $len, #0x70 + bmi .Lxts_dec_done + + vldmia $magic, {$twmask} @ load XTS magic + vshr.s64 @T[0], @XMM[8], #63 + mov r0, sp + vand @T[0], @T[0], $twmask +___ +for($i=9;$i<16;$i++) { +$code.=<<___; + vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1] + vst1.64 {@XMM[$i-1]}, [r0,:128]! + vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")` + vshr.s64 @T[1], @XMM[$i], #63 + veor @XMM[$i], @XMM[$i], @T[0] + vand @T[1], @T[1], $twmask +___ + @T=reverse(@T); + +$code.=<<___ if ($i>=10); + vld1.8 {@XMM[$i-10]}, [$inp]! + subs $len, #0x10 + bmi .Lxts_dec_`$i-9` +___ +$code.=<<___ if ($i>=11); + veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3] +___ +} +$code.=<<___; + sub $len, #0x10 + vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak + + vld1.8 {@XMM[6]}, [$inp]! + veor @XMM[5], @XMM[5], @XMM[13] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[6], @XMM[6], @XMM[14] + mov r5, $rounds @ pass rounds + mov r0, sp + + bl _bsaes_decrypt8 + + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]! + veor @XMM[1], @XMM[1], @XMM[ 9] + veor @XMM[8], @XMM[6], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + veor @XMM[9], @XMM[4], @XMM[11] + vld1.64 {@XMM[14]}, [r0,:128]! + veor @XMM[10], @XMM[2], @XMM[12] + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! + veor @XMM[11], @XMM[7], @XMM[13] + veor @XMM[12], @XMM[3], @XMM[14] + vst1.8 {@XMM[10]-@XMM[11]}, [$out]! + vst1.8 {@XMM[12]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + b .Lxts_dec_done +.align 4 +.Lxts_dec_6: + vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak + + veor @XMM[4], @XMM[4], @XMM[12] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[5], @XMM[5], @XMM[13] + mov r5, $rounds @ pass rounds + mov r0, sp + + bl _bsaes_decrypt8 + + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]! + veor @XMM[1], @XMM[1], @XMM[ 9] + veor @XMM[8], @XMM[6], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + veor @XMM[9], @XMM[4], @XMM[11] + veor @XMM[10], @XMM[2], @XMM[12] + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! + veor @XMM[11], @XMM[7], @XMM[13] + vst1.8 {@XMM[10]-@XMM[11]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + b .Lxts_dec_done +.align 4 +.Lxts_dec_5: + vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak + + veor @XMM[3], @XMM[3], @XMM[11] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[4], @XMM[4], @XMM[12] + mov r5, $rounds @ pass rounds + mov r0, sp + + bl _bsaes_decrypt8 + + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + vld1.64 {@XMM[12]}, [r0,:128]! + veor @XMM[1], @XMM[1], @XMM[ 9] + veor @XMM[8], @XMM[6], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + veor @XMM[9], @XMM[4], @XMM[11] + veor @XMM[10], @XMM[2], @XMM[12] + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! + vst1.8 {@XMM[10]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + b .Lxts_dec_done +.align 4 +.Lxts_dec_4: + vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak + + veor @XMM[2], @XMM[2], @XMM[10] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[3], @XMM[3], @XMM[11] + mov r5, $rounds @ pass rounds + mov r0, sp + + bl _bsaes_decrypt8 + + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + veor @XMM[1], @XMM[1], @XMM[ 9] + veor @XMM[8], @XMM[6], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + veor @XMM[9], @XMM[4], @XMM[11] + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + b .Lxts_dec_done +.align 4 +.Lxts_dec_3: + vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak + + veor @XMM[1], @XMM[1], @XMM[9] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[2], @XMM[2], @XMM[10] + mov r5, $rounds @ pass rounds + mov r0, sp + + bl _bsaes_decrypt8 + + vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]! + vld1.64 {@XMM[10]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + veor @XMM[1], @XMM[1], @XMM[ 9] + veor @XMM[8], @XMM[6], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + vst1.8 {@XMM[8]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + b .Lxts_dec_done +.align 4 +.Lxts_dec_2: + vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak + + veor @XMM[0], @XMM[0], @XMM[8] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[1], @XMM[1], @XMM[9] + mov r5, $rounds @ pass rounds + mov r0, sp + + bl _bsaes_decrypt8 + + vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + veor @XMM[1], @XMM[1], @XMM[ 9] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + b .Lxts_dec_done +.align 4 +.Lxts_dec_1: + mov r0, sp + veor @XMM[0], @XMM[8] + mov r1, sp + vst1.8 {@XMM[0]}, [sp,:128] + mov r2, $key + mov r4, $fp @ preserve fp + mov r5, $magic @ preserve magic + + bl AES_decrypt + + vld1.8 {@XMM[0]}, [sp,:128] + veor @XMM[0], @XMM[0], @XMM[8] + vst1.8 {@XMM[0]}, [$out]! + mov $fp, r4 + mov $magic, r5 + + vmov @XMM[8], @XMM[9] @ next round tweak + +.Lxts_dec_done: +#ifndef XTS_CHAIN_TWEAK + adds $len, #0x10 + beq .Lxts_dec_ret + + @ calculate one round of extra tweak for the stolen ciphertext + vldmia $magic, {$twmask} + vshr.s64 @XMM[6], @XMM[8], #63 + vand @XMM[6], @XMM[6], $twmask + vadd.u64 @XMM[9], @XMM[8], @XMM[8] + vswp `&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")` + veor @XMM[9], @XMM[9], @XMM[6] + + @ perform the final decryption with the last tweak value + vld1.8 {@XMM[0]}, [$inp]! + mov r0, sp + veor @XMM[0], @XMM[0], @XMM[9] + mov r1, sp + vst1.8 {@XMM[0]}, [sp,:128] + mov r2, $key + mov r4, $fp @ preserve fp + + bl AES_decrypt + + vld1.8 {@XMM[0]}, [sp,:128] + veor @XMM[0], @XMM[0], @XMM[9] + vst1.8 {@XMM[0]}, [$out] + + mov r6, $out +.Lxts_dec_steal: + ldrb r1, [$out] + ldrb r0, [$inp], #1 + strb r1, [$out, #0x10] + strb r0, [$out], #1 + + subs $len, #1 + bhi .Lxts_dec_steal + + vld1.8 {@XMM[0]}, [r6] + mov r0, sp + veor @XMM[0], @XMM[8] + mov r1, sp + vst1.8 {@XMM[0]}, [sp,:128] + mov r2, $key + + bl AES_decrypt + + vld1.8 {@XMM[0]}, [sp,:128] + veor @XMM[0], @XMM[0], @XMM[8] + vst1.8 {@XMM[0]}, [r6] + mov $fp, r4 +#endif + +.Lxts_dec_ret: + bic r0, $fp, #0xf + vmov.i32 q0, #0 + vmov.i32 q1, #0 +#ifdef XTS_CHAIN_TWEAK + ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak +#endif +.Lxts_dec_bzero: @ wipe key schedule [if any] + vstmia sp!, {q0-q1} + cmp sp, r0 + bne .Lxts_dec_bzero + + mov sp, $fp +#ifdef XTS_CHAIN_TWEAK + vst1.8 {@XMM[8]}, [r1] +#endif + VFP_ABI_POP + ldmia sp!, {r4-r10, pc} @ return + +.size bsaes_xts_decrypt,.-bsaes_xts_decrypt +___ +} +$code.=<<___; +#endif +___ + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + +print $code; + +close STDOUT; diff --git a/openssl/crypto/aes/asm/bsaes-x86_64.pl b/openssl/crypto/aes/asm/bsaes-x86_64.pl index 41b90f084..3f7d33c45 100644 --- a/openssl/crypto/aes/asm/bsaes-x86_64.pl +++ b/openssl/crypto/aes/asm/bsaes-x86_64.pl @@ -38,8 +38,9 @@ # Emilia's this(*) difference # # Core 2 9.30 8.69 +7% -# Nehalem(**) 7.63 6.98 +9% -# Atom 17.1 17.4 -2%(***) +# Nehalem(**) 7.63 6.88 +11% +# Atom 17.1 16.4 +4% +# Silvermont - 12.9 # # (*) Comparison is not completely fair, because "this" is ECB, # i.e. no extra processing such as counter values calculation @@ -50,14 +51,6 @@ # (**) Results were collected on Westmere, which is considered to # be equivalent to Nehalem for this code. # -# (***) Slowdown on Atom is rather strange per se, because original -# implementation has a number of 9+-bytes instructions, which -# are bad for Atom front-end, and which I eliminated completely. -# In attempt to address deterioration sbox() was tested in FP -# SIMD "domain" (movaps instead of movdqa, xorps instead of -# pxor, etc.). While it resulted in nominal 4% improvement on -# Atom, it hurted Westmere by more than 2x factor. -# # As for key schedule conversion subroutine. Interface to OpenSSL # relies on per-invocation on-the-fly conversion. This naturally # has impact on performance, especially for short inputs. Conversion @@ -67,7 +60,7 @@ # conversion conversion/8x block # Core 2 240 0.22 # Nehalem 180 0.20 -# Atom 430 0.19 +# Atom 430 0.20 # # The ratio values mean that 128-byte blocks will be processed # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%, @@ -83,9 +76,10 @@ # Add decryption procedure. Performance in CPU cycles spent to decrypt # one byte out of 4096-byte buffer with 128-bit key is: # -# Core 2 9.83 -# Nehalem 7.74 -# Atom 19.0 +# Core 2 9.98 +# Nehalem 7.80 +# Atom 17.9 +# Silvermont 14.0 # # November 2011. # @@ -434,21 +428,21 @@ my $mask=pop; $code.=<<___; pxor 0x00($key),@x[0] pxor 0x10($key),@x[1] - pshufb $mask,@x[0] pxor 0x20($key),@x[2] - pshufb $mask,@x[1] pxor 0x30($key),@x[3] - pshufb $mask,@x[2] + pshufb $mask,@x[0] + pshufb $mask,@x[1] pxor 0x40($key),@x[4] - pshufb $mask,@x[3] pxor 0x50($key),@x[5] - pshufb $mask,@x[4] + pshufb $mask,@x[2] + pshufb $mask,@x[3] pxor 0x60($key),@x[6] - pshufb $mask,@x[5] pxor 0x70($key),@x[7] + pshufb $mask,@x[4] + pshufb $mask,@x[5] pshufb $mask,@x[6] - lea 0x80($key),$key pshufb $mask,@x[7] + lea 0x80($key),$key ___ } @@ -820,18 +814,18 @@ _bsaes_encrypt8: movdqa 0x50($const), @XMM[8] # .LM0SR pxor @XMM[9], @XMM[0] # xor with round0 key pxor @XMM[9], @XMM[1] - pshufb @XMM[8], @XMM[0] pxor @XMM[9], @XMM[2] - pshufb @XMM[8], @XMM[1] pxor @XMM[9], @XMM[3] - pshufb @XMM[8], @XMM[2] + pshufb @XMM[8], @XMM[0] + pshufb @XMM[8], @XMM[1] pxor @XMM[9], @XMM[4] - pshufb @XMM[8], @XMM[3] pxor @XMM[9], @XMM[5] - pshufb @XMM[8], @XMM[4] + pshufb @XMM[8], @XMM[2] + pshufb @XMM[8], @XMM[3] pxor @XMM[9], @XMM[6] - pshufb @XMM[8], @XMM[5] pxor @XMM[9], @XMM[7] + pshufb @XMM[8], @XMM[4] + pshufb @XMM[8], @XMM[5] pshufb @XMM[8], @XMM[6] pshufb @XMM[8], @XMM[7] _bsaes_encrypt8_bitslice: @@ -884,18 +878,18 @@ _bsaes_decrypt8: movdqa -0x30($const), @XMM[8] # .LM0ISR pxor @XMM[9], @XMM[0] # xor with round0 key pxor @XMM[9], @XMM[1] - pshufb @XMM[8], @XMM[0] pxor @XMM[9], @XMM[2] - pshufb @XMM[8], @XMM[1] pxor @XMM[9], @XMM[3] - pshufb @XMM[8], @XMM[2] + pshufb @XMM[8], @XMM[0] + pshufb @XMM[8], @XMM[1] pxor @XMM[9], @XMM[4] - pshufb @XMM[8], @XMM[3] pxor @XMM[9], @XMM[5] - pshufb @XMM[8], @XMM[4] + pshufb @XMM[8], @XMM[2] + pshufb @XMM[8], @XMM[3] pxor @XMM[9], @XMM[6] - pshufb @XMM[8], @XMM[5] pxor @XMM[9], @XMM[7] + pshufb @XMM[8], @XMM[4] + pshufb @XMM[8], @XMM[5] pshufb @XMM[8], @XMM[6] pshufb @XMM[8], @XMM[7] ___ @@ -1937,21 +1931,21 @@ $code.=<<___; movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR pxor @XMM[9], @XMM[0] # xor with round0 key pxor @XMM[9], @XMM[1] - pshufb @XMM[8], @XMM[0] pxor @XMM[9], @XMM[2] - pshufb @XMM[8], @XMM[1] pxor @XMM[9], @XMM[3] - pshufb @XMM[8], @XMM[2] + pshufb @XMM[8], @XMM[0] + pshufb @XMM[8], @XMM[1] pxor @XMM[9], @XMM[4] - pshufb @XMM[8], @XMM[3] pxor @XMM[9], @XMM[5] - pshufb @XMM[8], @XMM[4] + pshufb @XMM[8], @XMM[2] + pshufb @XMM[8], @XMM[3] pxor @XMM[9], @XMM[6] - pshufb @XMM[8], @XMM[5] pxor @XMM[9], @XMM[7] + pshufb @XMM[8], @XMM[4] + pshufb @XMM[8], @XMM[5] pshufb @XMM[8], @XMM[6] - lea .LBS0(%rip), %r11 # constants table pshufb @XMM[8], @XMM[7] + lea .LBS0(%rip), %r11 # constants table mov %ebx,%r10d # pass rounds call _bsaes_encrypt8_bitslice diff --git a/openssl/crypto/aes/asm/vpaes-ppc.pl b/openssl/crypto/aes/asm/vpaes-ppc.pl new file mode 100755 index 000000000..7fda60ed9 --- /dev/null +++ b/openssl/crypto/aes/asm/vpaes-ppc.pl @@ -0,0 +1,1512 @@ +#!/usr/bin/env perl + +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. + +# CBC encrypt/decrypt performance in cycles per byte processed with +# 128-bit key. +# +# aes-ppc.pl this +# G4e 35.5/52.1/(23.8) 11.9(*)/15.4 +# POWER6 42.7/54.3/(28.2) 63.0/92.8(**) +# POWER7 32.3/42.9/(18.4) 18.5/23.3 +# +# (*) This is ~10% worse than reported in paper. The reason is +# twofold. This module doesn't make any assumption about +# key schedule (or data for that matter) alignment and handles +# it in-line. Secondly it, being transliterated from +# vpaes-x86_64.pl, relies on "nested inversion" better suited +# for Intel CPUs. +# (**) Inadequate POWER6 performance is due to astronomic AltiVec +# latency, 9 cycles per simple logical operation. + +$flavour = shift; + +if ($flavour =~ /64/) { + $SIZE_T =8; + $LRSAVE =2*$SIZE_T; + $STU ="stdu"; + $POP ="ld"; + $PUSH ="std"; + $UCMP ="cmpld"; +} elsif ($flavour =~ /32/) { + $SIZE_T =4; + $LRSAVE =$SIZE_T; + $STU ="stwu"; + $POP ="lwz"; + $PUSH ="stw"; + $UCMP ="cmplw"; +} else { die "nonsense $flavour"; } + +$sp="r1"; +$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; + +$code.=<<___; +.machine "any" + +.text + +.align 7 # totally strategic alignment +_vpaes_consts: +Lk_mc_forward: # mc_forward + .long 0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c ?inv + .long 0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300 ?inv + .long 0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704 ?inv + .long 0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08 ?inv +Lk_mc_backward: # mc_backward + .long 0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e ?inv + .long 0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a ?inv + .long 0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506 ?inv + .long 0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102 ?inv +Lk_sr: # sr + .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f ?inv + .long 0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b ?inv + .long 0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07 ?inv + .long 0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603 ?inv + +## +## "Hot" constants +## +Lk_inv: # inv, inva + .long 0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704 ?rev + .long 0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03 ?rev +Lk_ipt: # input transform (lo, hi) + .long 0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca ?rev + .long 0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd ?rev +Lk_sbo: # sbou, sbot + .long 0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15 ?rev + .long 0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e ?rev +Lk_sb1: # sb1u, sb1t + .long 0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b ?rev + .long 0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5 ?rev +Lk_sb2: # sb2u, sb2t + .long 0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2 ?rev + .long 0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e ?rev + +## +## Decryption stuff +## +Lk_dipt: # decryption input transform + .long 0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15 ?rev + .long 0x00650560, 0xe683e386, 0x94f191f4, 0x72177712 ?rev +Lk_dsbo: # decryption sbox final output + .long 0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7 ?rev + .long 0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca ?rev +Lk_dsb9: # decryption sbox output *9*u, *9*t + .long 0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca ?rev + .long 0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72 ?rev +Lk_dsbd: # decryption sbox output *D*u, *D*t + .long 0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5 ?rev + .long 0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129 ?rev +Lk_dsbb: # decryption sbox output *B*u, *B*t + .long 0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660 ?rev + .long 0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3 ?rev +Lk_dsbe: # decryption sbox output *E*u, *E*t + .long 0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222 ?rev + .long 0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794 ?rev + +## +## Key schedule constants +## +Lk_dksd: # decryption key schedule: invskew x*D + .long 0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007 ?rev + .long 0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f ?rev +Lk_dksb: # decryption key schedule: invskew x*B + .long 0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603 ?rev + .long 0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9 ?rev +Lk_dkse: # decryption key schedule: invskew x*E + 0x63 + .long 0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553 ?rev + .long 0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd ?rev +Lk_dks9: # decryption key schedule: invskew x*9 + .long 0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a ?rev + .long 0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b ?rev + +Lk_rcon: # rcon + .long 0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70 ?asis +Lk_s63: + .long 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b ?asis + +Lk_opt: # output transform + .long 0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7 ?rev + .long 0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1 ?rev +Lk_deskew: # deskew tables: inverts the sbox's "skew" + .long 0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d ?rev + .long 0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128 ?rev +.align 5 +Lconsts: + mflr r0 + bcl 20,31,\$+4 + mflr r12 #vvvvv "distance between . and _vpaes_consts + addi r12,r12,-0x308 + mtlr r0 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 +.asciz "Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)" +.align 6 +___ + +my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm) = map("v$_",(26..31)); +{ +my ($inp,$out,$key) = map("r$_",(3..5)); + +my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_",(10..15)); +my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_",(16..19)); +my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_",(16..23)); + +$code.=<<___; +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and %xmm9-%xmm15 as specified below. +## +.align 4 +_vpaes_encrypt_preheat: + mflr r8 + bl Lconsts + mtlr r8 + li r11, 0xc0 # Lk_inv + li r10, 0xd0 + li r9, 0xe0 # Lk_ipt + li r8, 0xf0 + vxor v7, v7, v7 # 0x00..00 + vspltisb v8,4 # 0x04..04 + vspltisb v9,0x0f # 0x0f..0f + lvx $invlo, r12, r11 + li r11, 0x100 + lvx $invhi, r12, r10 + li r10, 0x110 + lvx $iptlo, r12, r9 + li r9, 0x120 + lvx $ipthi, r12, r8 + li r8, 0x130 + lvx $sbou, r12, r11 + li r11, 0x140 + lvx $sbot, r12, r10 + li r10, 0x150 + lvx $sb1u, r12, r9 + lvx $sb1t, r12, r8 + lvx $sb2u, r12, r11 + lvx $sb2t, r12, r10 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. +## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm6, %r9, %r10, %r11, %rax +## +## +.align 5 +_vpaes_encrypt_core: + lwz r8, 240($key) # pull rounds + li r9, 16 + lvx v5, 0, $key # vmovdqu (%r9), %xmm5 # round0 key + li r11, 0x10 + lvx v6, r9, $key + addi r9, r9, 16 + ?vperm v5, v5, v6, $keyperm # align round key + addi r10, r11, 0x40 + vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 + vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm1 + vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm3, %xmm2 + vxor v0, v0, v5 # vpxor %xmm5, %xmm1, %xmm0 + vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0 + mtctr r8 + b Lenc_entry + +.align 4 +Lenc_loop: + # middle of middle round + vperm v4, $sb1t, v7, v2 # vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + lvx v1, r12, r11 # vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + addi r11, r11, 16 + vperm v0, $sb1u, v7, v3 # vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + andi. r11, r11, 0x30 # and \$0x30, %r11 # ... mod 4 + vperm v5, $sb2t, v7, v2 # vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A + vperm v2, $sb2u, v7, v3 # vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + lvx v4, r12, r10 # vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + addi r10, r11, 0x40 + vperm v3, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + vxor v2, v2, v5 # vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + vperm v0, v0, v7, v4 # vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + vperm v4, v3, v7, v1 # vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + vxor v0, v0, v3 # vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + +Lenc_entry: + # top of round + vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i + vperm v5, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vand v0, v0, v9 + vxor v3, v3, v5 # vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + vmr v5, v6 + lvx v6, r9, $key # vmovdqu (%r9), %xmm5 + vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + addi r9, r9, 16 + vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io + ?vperm v5, v5, v6, $keyperm # align round key + vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + bdnz Lenc_loop + + # middle of last round + addi r10, r11, 0x80 + # vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + # vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + lvx v1, r12, r10 # vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + vperm v0, $sbot, v7, v3 # vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A + vperm v0, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm0 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + +.globl .vpaes_encrypt +.align 5 +.vpaes_encrypt: + $STU $sp,-$FRAME($sp) + li r10,`15+6*$SIZE_T` + li r11,`31+6*$SIZE_T` + mflr r6 + mfspr r7, 256 # save vrsave + stvx v20,r10,$sp + addi r10,r10,32 + stvx v21,r11,$sp + addi r11,r11,32 + stvx v22,r10,$sp + addi r10,r10,32 + stvx v23,r11,$sp + addi r11,r11,32 + stvx v24,r10,$sp + addi r10,r10,32 + stvx v25,r11,$sp + addi r11,r11,32 + stvx v26,r10,$sp + addi r10,r10,32 + stvx v27,r11,$sp + addi r11,r11,32 + stvx v28,r10,$sp + addi r10,r10,32 + stvx v29,r11,$sp + addi r11,r11,32 + stvx v30,r10,$sp + stvx v31,r11,$sp + stw r7,`$FRAME-4`($sp) # save vrsave + li r0, -1 + $PUSH r6,`$FRAME+$LRSAVE`($sp) + mtspr 256, r0 # preserve all AltiVec registers + + bl _vpaes_encrypt_preheat + + ?lvsl $inpperm, 0, $inp # prepare for unaligned access + lvx v0, 0, $inp + addi $inp, $inp, 15 # 15 is not a typo + ?lvsr $outperm, 0, $out + ?lvsl $keyperm, 0, $key # prepare for unaligned access + vnor $outmask, v7, v7 # 0xff..ff + lvx $inptail, 0, $inp # redundant in aligned case + ?vperm $outmask, v7, $outmask, $outperm + lvx $outhead, 0, $out + ?vperm v0, v0, $inptail, $inpperm + + bl _vpaes_encrypt_core + + vperm v0, v0, v0, $outperm # rotate right/left + vsel v1, $outhead, v0, $outmask + vmr $outhead, v0 + stvx v1, 0, $out + addi $out, $out, 15 # 15 is not a typo + ######## + + lvx v1, 0, $out # redundant in aligned case + vsel v1, $outhead, v1, $outmask + stvx v1, 0, $out + + li r10,`15+6*$SIZE_T` + li r11,`31+6*$SIZE_T` + mtlr r6 + mtspr 256, r7 # restore vrsave + lvx v20,r10,$sp + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,0x04,1,0x80,0,3,0 + .long 0 +.size .vpaes_encrypt,.-.vpaes_encrypt + +.align 4 +_vpaes_decrypt_preheat: + mflr r8 + bl Lconsts + mtlr r8 + li r11, 0xc0 # Lk_inv + li r10, 0xd0 + li r9, 0x160 # Ldipt + li r8, 0x170 + vxor v7, v7, v7 # 0x00..00 + vspltisb v8,4 # 0x04..04 + vspltisb v9,0x0f # 0x0f..0f + lvx $invlo, r12, r11 + li r11, 0x180 + lvx $invhi, r12, r10 + li r10, 0x190 + lvx $iptlo, r12, r9 + li r9, 0x1a0 + lvx $ipthi, r12, r8 + li r8, 0x1b0 + lvx $sbou, r12, r11 + li r11, 0x1c0 + lvx $sbot, r12, r10 + li r10, 0x1d0 + lvx $sb9u, r12, r9 + li r9, 0x1e0 + lvx $sb9t, r12, r8 + li r8, 0x1f0 + lvx $sbdu, r12, r11 + li r11, 0x200 + lvx $sbdt, r12, r10 + li r10, 0x210 + lvx $sbbu, r12, r9 + lvx $sbbt, r12, r8 + lvx $sbeu, r12, r11 + lvx $sbet, r12, r10 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + +## +## Decryption core +## +## Same API as encryption core. +## +.align 4 +_vpaes_decrypt_core: + lwz r8, 240($key) # pull rounds + li r9, 16 + lvx v5, 0, $key # vmovdqu (%r9), %xmm4 # round0 key + li r11, 0x30 + lvx v6, r9, $key + addi r9, r9, 16 + ?vperm v5, v5, v6, $keyperm # align round key + vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 + vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2 + vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm1, %xmm0 + vxor v0, v0, v5 # vpxor %xmm4, %xmm2, %xmm2 + vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0 + mtctr r8 + b Ldec_entry + +.align 4 +Ldec_loop: +# +# Inverse mix columns +# + lvx v0, r12, r11 # v5 and v0 are flipped + # vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + # vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + vperm v4, $sb9u, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + subi r11, r11, 16 + vperm v1, $sb9t, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + andi. r11, r11, 0x30 + vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 + # vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + # vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + vperm v4, $sbdu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch + vperm v1, $sbdt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + # vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + # vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + vperm v4, $sbbu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch + vperm v1, $sbbt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + # vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + # vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + vperm v4, $sbeu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch + vperm v1, $sbet, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + vxor v0, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + vxor v0, v0, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + +Ldec_entry: + # top of round + vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i + vperm v2, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vand v0, v0, v9 + vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + vmr v5, v6 + lvx v6, r9, $key # vmovdqu (%r9), %xmm0 + vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + addi r9, r9, 16 + vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io + ?vperm v5, v5, v6, $keyperm # align round key + vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + bdnz Ldec_loop + + # middle of last round + addi r10, r11, 0x80 + # vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + # vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + lvx v2, r12, r10 # vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + vperm v1, $sbot, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + vxor v4, v4, v5 # vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + vxor v0, v1, v4 # vpxor %xmm4, %xmm1, %xmm0 # 0 = A + vperm v0, v0, v7, v2 # vpshufb %xmm2, %xmm0, %xmm0 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + +.globl .vpaes_decrypt +.align 5 +.vpaes_decrypt: + $STU $sp,-$FRAME($sp) + li r10,`15+6*$SIZE_T` + li r11,`31+6*$SIZE_T` + mflr r6 + mfspr r7, 256 # save vrsave + stvx v20,r10,$sp + addi r10,r10,32 + stvx v21,r11,$sp + addi r11,r11,32 + stvx v22,r10,$sp + addi r10,r10,32 + stvx v23,r11,$sp + addi r11,r11,32 + stvx v24,r10,$sp + addi r10,r10,32 + stvx v25,r11,$sp + addi r11,r11,32 + stvx v26,r10,$sp + addi r10,r10,32 + stvx v27,r11,$sp + addi r11,r11,32 + stvx v28,r10,$sp + addi r10,r10,32 + stvx v29,r11,$sp + addi r11,r11,32 + stvx v30,r10,$sp + stvx v31,r11,$sp + stw r7,`$FRAME-4`($sp) # save vrsave + li r0, -1 + $PUSH r6,`$FRAME+$LRSAVE`($sp) + mtspr 256, r0 # preserve all AltiVec registers + + bl _vpaes_decrypt_preheat + + ?lvsl $inpperm, 0, $inp # prepare for unaligned access + lvx v0, 0, $inp + addi $inp, $inp, 15 # 15 is not a typo + ?lvsr $outperm, 0, $out + ?lvsl $keyperm, 0, $key + vnor $outmask, v7, v7 # 0xff..ff + lvx $inptail, 0, $inp # redundant in aligned case + ?vperm $outmask, v7, $outmask, $outperm + lvx $outhead, 0, $out + ?vperm v0, v0, $inptail, $inpperm + + bl _vpaes_decrypt_core + + vperm v0, v0, v0, $outperm # rotate right/left + vsel v1, $outhead, v0, $outmask + vmr $outhead, v0 + stvx v1, 0, $out + addi $out, $out, 15 # 15 is not a typo + ######## + + lvx v1, 0, $out # redundant in aligned case + vsel v1, $outhead, v1, $outmask + stvx v1, 0, $out + + li r10,`15+6*$SIZE_T` + li r11,`31+6*$SIZE_T` + mtlr r6 + mtspr 256, r7 # restore vrsave + lvx v20,r10,$sp + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,0x04,1,0x80,0,3,0 + .long 0 +.size .vpaes_decrypt,.-.vpaes_decrypt + +.globl .vpaes_cbc_encrypt +.align 5 +.vpaes_cbc_encrypt: + ${UCMP}i r5,16 + bltlr- + + $STU $sp,-`($FRAME+2*$SIZE_T)`($sp) + mflr r0 + li r10,`15+6*$SIZE_T` + li r11,`31+6*$SIZE_T` + mfspr r12, 256 + stvx v20,r10,$sp + addi r10,r10,32 + stvx v21,r11,$sp + addi r11,r11,32 + stvx v22,r10,$sp + addi r10,r10,32 + stvx v23,r11,$sp + addi r11,r11,32 + stvx v24,r10,$sp + addi r10,r10,32 + stvx v25,r11,$sp + addi r11,r11,32 + stvx v26,r10,$sp + addi r10,r10,32 + stvx v27,r11,$sp + addi r11,r11,32 + stvx v28,r10,$sp + addi r10,r10,32 + stvx v29,r11,$sp + addi r11,r11,32 + stvx v30,r10,$sp + stvx v31,r11,$sp + stw r12,`$FRAME-4`($sp) # save vrsave + $PUSH r30,`$FRAME+$SIZE_T*0`($sp) + $PUSH r31,`$FRAME+$SIZE_T*1`($sp) + li r9, -16 + $PUSH r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp) + + and r30, r5, r9 # copy length&-16 + mr r5, r6 # copy pointer to key + mr r31, r7 # copy pointer to iv + blt Lcbc_abort + cmpwi r8, 0 # test direction + li r6, -1 + mr r7, r12 # copy vrsave + mtspr 256, r6 # preserve all AltiVec registers + + lvx v24, 0, r31 # load [potentially unaligned] iv + li r9, 15 + ?lvsl $inpperm, 0, r31 + lvx v25, r9, r31 + ?vperm v24, v24, v25, $inpperm + + neg r8, $inp # prepare for unaligned access + vxor v7, v7, v7 + ?lvsl $keyperm, 0, $key + ?lvsr $outperm, 0, $out + ?lvsr $inpperm, 0, r8 # -$inp + vnor $outmask, v7, v7 # 0xff..ff + lvx $inptail, 0, $inp + ?vperm $outmask, v7, $outmask, $outperm + addi $inp, $inp, 15 # 15 is not a typo + lvx $outhead, 0, $out + + beq Lcbc_decrypt + + bl _vpaes_encrypt_preheat + li r0, 16 + +Lcbc_enc_loop: + vmr v0, $inptail + lvx $inptail, 0, $inp + addi $inp, $inp, 16 + ?vperm v0, v0, $inptail, $inpperm + vxor v0, v0, v24 # ^= iv + + bl _vpaes_encrypt_core + + vmr v24, v0 # put aside iv + sub. r30, r30, r0 # len -= 16 + vperm v0, v0, v0, $outperm # rotate right/left + vsel v1, $outhead, v0, $outmask + vmr $outhead, v0 + stvx v1, 0, $out + addi $out, $out, 16 + bne Lcbc_enc_loop + + b Lcbc_done + +.align 5 +Lcbc_decrypt: + bl _vpaes_decrypt_preheat + li r0, 16 + +Lcbc_dec_loop: + vmr v0, $inptail + lvx $inptail, 0, $inp + addi $inp, $inp, 16 + ?vperm v0, v0, $inptail, $inpperm + vmr v25, v0 # put aside input + + bl _vpaes_decrypt_core + + vxor v0, v0, v24 # ^= iv + vmr v24, v25 + sub. r30, r30, r0 # len -= 16 + vperm v0, v0, v0, $outperm # rotate right/left + vsel v1, $outhead, v0, $outmask + vmr $outhead, v0 + stvx v1, 0, $out + addi $out, $out, 16 + bne Lcbc_dec_loop + +Lcbc_done: + addi $out, $out, -1 + lvx v1, 0, $out # redundant in aligned case + vsel v1, $outhead, v1, $outmask + stvx v1, 0, $out + + neg r8, r31 # write [potentially unaligned] iv + ?lvsl $outperm, 0, r8 + li r6, 15 + vnor $outmask, v7, v7 # 0xff..ff + ?vperm $outmask, v7, $outmask, $outperm + lvx $outhead, 0, r31 + vperm v24, v24, v24, $outperm # rotate right/left + vsel v0, $outhead, v24, $outmask + lvx v1, r6, r31 + stvx v0, 0, r31 + vsel v1, v24, v1, $outmask + stvx v1, r6, r31 + + mtspr 256, r7 # restore vrsave + li r10,`15+6*$SIZE_T` + li r11,`31+6*$SIZE_T` + lvx v20,r10,$sp + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp +Lcbc_abort: + $POP r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp) + $POP r30,`$FRAME+$SIZE_T*0`($sp) + $POP r31,`$FRAME+$SIZE_T*1`($sp) + mtlr r0 + addi $sp,$sp,`$FRAME+$SIZE_T*2` + blr + .long 0 + .byte 0,12,0x04,1,0x80,2,6,0 + .long 0 +.size .vpaes_cbc_encrypt,.-.vpaes_cbc_encrypt +___ +} +{ +my ($inp,$bits,$out)=map("r$_",(3..5)); +my $dir="cr1"; +my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_",(10..13,24)); + +$code.=<<___; +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +.align 4 +_vpaes_key_preheat: + mflr r8 + bl Lconsts + mtlr r8 + li r11, 0xc0 # Lk_inv + li r10, 0xd0 + li r9, 0xe0 # L_ipt + li r8, 0xf0 + + vspltisb v8,4 # 0x04..04 + vxor v9,v9,v9 # 0x00..00 + lvx $invlo, r12, r11 # Lk_inv + li r11, 0x120 + lvx $invhi, r12, r10 + li r10, 0x130 + lvx $iptlo, r12, r9 # Lk_ipt + li r9, 0x220 + lvx $ipthi, r12, r8 + li r8, 0x230 + + lvx v14, r12, r11 # Lk_sb1 + li r11, 0x240 + lvx v15, r12, r10 + li r10, 0x250 + + lvx v16, r12, r9 # Lk_dksd + li r9, 0x260 + lvx v17, r12, r8 + li r8, 0x270 + lvx v18, r12, r11 # Lk_dksb + li r11, 0x280 + lvx v19, r12, r10 + li r10, 0x290 + lvx v20, r12, r9 # Lk_dkse + li r9, 0x2a0 + lvx v21, r12, r8 + li r8, 0x2b0 + lvx v22, r12, r11 # Lk_dks9 + lvx v23, r12, r10 + + lvx v24, r12, r9 # Lk_rcon + lvx v25, 0, r12 # Lk_mc_forward[0] + lvx v26, r12, r8 # Lks63 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + +.align 4 +_vpaes_schedule_core: + mflr r7 + + bl _vpaes_key_preheat # load the tables + + #lvx v0, 0, $inp # vmovdqu (%rdi), %xmm0 # load key (unaligned) + neg r8, $inp # prepare for unaligned access + lvx v0, 0, $inp + addi $inp, $inp, 15 # 15 is not typo + ?lvsr $inpperm, 0, r8 # -$inp + lvx v6, 0, $inp # v6 serves as inptail + addi $inp, $inp, 8 + ?vperm v0, v0, v6, $inpperm + + # input transform + vmr v3, v0 # vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + vmr v7, v0 # vmovdqa %xmm0, %xmm7 + + bne $dir, Lschedule_am_decrypting + + # encrypting, output zeroth round key after transform + li r8, 0x30 # mov \$0x30,%r8d + addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10 + + ?lvsr $outperm, 0, $out # prepare for unaligned access + vnor $outmask, v9, v9 # 0xff..ff + lvx $outhead, 0, $out + ?vperm $outmask, v9, $outmask, $outperm + + #stvx v0, 0, $out # vmovdqu %xmm0, (%rdx) + vperm v1, v0, v0, $outperm # rotate right/left + vsel v2, $outhead, v1, $outmask + vmr $outhead, v1 + stvx v2, 0, $out + b Lschedule_go + +Lschedule_am_decrypting: + srwi r8, $bits, 1 # shr \$1,%r8d + andi. r8, r8, 32 # and \$32,%r8d + xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32 + addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10 + # decrypting, output zeroth round key after shiftrows + lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1 + vperm v4, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3 + + neg r0, $out # prepare for unaligned access + ?lvsl $outperm, 0, r0 + addi $out, $out, 15 # 15 is not typo + vnor $outmask, v9, v9 # 0xff..ff + lvx $outhead, 0, $out + ?vperm $outmask, $outmask, v9, $outperm + + #stvx v4, 0, $out # vmovdqu %xmm3, (%rdx) + vperm v4, v4, v4, $outperm # rotate right/left + vsel v2, $outhead, v4, $outmask + vmr $outhead, v4 + stvx v2, 0, $out + xori r8, r8, 0x30 # xor \$0x30, %r8 + +Lschedule_go: + cmplwi $bits, 192 # cmp \$192, %esi + bgt Lschedule_256 + beq Lschedule_192 + # 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +Lschedule_128: + li r0, 10 # mov \$10, %esi + mtctr r0 + +Loop_schedule_128: + bl _vpaes_schedule_round + bdz Lschedule_mangle_last # dec %esi + bl _vpaes_schedule_mangle # write output + b Loop_schedule_128 + +## +## .aes_schedule_192 +## +## 192-bit specific part of key schedule. +## +## The main body of this schedule is the same as the 128-bit +## schedule, but with more smearing. The long, high side is +## stored in %xmm7 as before, and the short, low side is in +## the high bits of %xmm6. +## +## This schedule is somewhat nastier, however, because each +## round produces 192 bits of key material, or 1.5 round keys. +## Therefore, on each cycle we do 2 rounds and produce 3 round +## keys. +## +.align 4 +Lschedule_192: + li r0, 4 # mov \$4, %esi + lvx v0, 0, $inp + ?vperm v0, v6, v0, $inpperm + ?vsldoi v0, v3, v0, 8 # vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + bl _vpaes_schedule_transform # input transform + ?vsldoi v6, v0, v9, 8 + ?vsldoi v6, v9, v6, 8 # clobber "low" side with zeros + mtctr r0 + +Loop_schedule_192: + bl _vpaes_schedule_round + ?vsldoi v0, v6, v0, 8 # vpalignr \$8,%xmm6,%xmm0,%xmm0 + bl _vpaes_schedule_mangle # save key n + bl _vpaes_schedule_192_smear + bl _vpaes_schedule_mangle # save key n+1 + bl _vpaes_schedule_round + bdz Lschedule_mangle_last # dec %esi + bl _vpaes_schedule_mangle # save key n+2 + bl _vpaes_schedule_192_smear + b Loop_schedule_192 + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. +## +.align 4 +Lschedule_256: + li r0, 7 # mov \$7, %esi + addi $inp, $inp, 8 + lvx v0, 0, $inp # vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + ?vperm v0, v6, v0, $inpperm + bl _vpaes_schedule_transform # input transform + mtctr r0 + +Loop_schedule_256: + bl _vpaes_schedule_mangle # output low result + vmr v6, v0 # vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + # high round + bl _vpaes_schedule_round + bdz Lschedule_mangle_last # dec %esi + bl _vpaes_schedule_mangle + + # low round. swap xmm7 and xmm6 + ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0 + vmr v5, v7 # vmovdqa %xmm7, %xmm5 + vmr v7, v6 # vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + vmr v7, v5 # vmovdqa %xmm5, %xmm7 + + b Loop_schedule_256 +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +.align 4 +Lschedule_mangle_last: + # schedule last round key from xmm0 + li r11, 0x2e0 # lea .Lk_deskew(%rip),%r11 + li r9, 0x2f0 + bne $dir, Lschedule_mangle_last_dec + + # encrypting + lvx v1, r8, r10 # vmovdqa (%r8,%r10),%xmm1 + li r11, 0x2c0 # lea .Lk_opt(%rip), %r11 # prepare to output transform + li r9, 0x2d0 # prepare to output transform + vperm v0, v0, v0, v1 # vpshufb %xmm1, %xmm0, %xmm0 # output permute + + lvx $iptlo, r11, r12 # reload $ipt + lvx $ipthi, r9, r12 + addi $out, $out, 16 # add \$16, %rdx + vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform # output transform + + #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key + vperm v0, v0, v0, $outperm # rotate right/left + vsel v2, $outhead, v0, $outmask + vmr $outhead, v0 + stvx v2, 0, $out + + addi $out, $out, 15 # 15 is not typo + lvx v1, 0, $out # redundant in aligned case + vsel v1, $outhead, v1, $outmask + stvx v1, 0, $out + b Lschedule_mangle_done + +.align 4 +Lschedule_mangle_last_dec: + lvx $iptlo, r11, r12 # reload $ipt + lvx $ipthi, r9, r12 + addi $out, $out, -16 # add \$-16, %rdx + vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform # output transform + + #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key + vperm v0, v0, v0, $outperm # rotate right/left + vsel v2, $outhead, v0, $outmask + vmr $outhead, v0 + stvx v2, 0, $out + + addi $out, $out, -15 # -15 is not typo + lvx v1, 0, $out # redundant in aligned case + vsel v1, $outhead, v1, $outmask + stvx v1, 0, $out + +Lschedule_mangle_done: + mtlr r7 + # cleanup + vxor v0, v0, v0 # vpxor %xmm0, %xmm0, %xmm0 + vxor v1, v1, v1 # vpxor %xmm1, %xmm1, %xmm1 + vxor v2, v2, v2 # vpxor %xmm2, %xmm2, %xmm2 + vxor v3, v3, v3 # vpxor %xmm3, %xmm3, %xmm3 + vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4 + vxor v5, v5, v5 # vpxor %xmm5, %xmm5, %xmm5 + vxor v6, v6, v6 # vpxor %xmm6, %xmm6, %xmm6 + vxor v7, v7, v7 # vpxor %xmm7, %xmm7, %xmm7 + + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. +## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## +.align 4 +_vpaes_schedule_192_smear: + ?vspltw v0, v7, 3 + ?vsldoi v1, v9, v6, 12 # vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + ?vsldoi v0, v7, v0, 8 # vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + vxor v6, v6, v1 # vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + vxor v6, v6, v0 # vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + vmr v0, v6 + ?vsldoi v6, v6, v9, 8 + ?vsldoi v6, v9, v6, 8 # clobber low side with zeros + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. +## +.align 4 +_vpaes_schedule_round: + # extract rcon from xmm8 + #vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4 + ?vsldoi v1, $rcon, v9, 15 # vpalignr \$15, %xmm8, %xmm4, %xmm1 + ?vsldoi $rcon, $rcon, $rcon, 15 # vpalignr \$15, %xmm8, %xmm8, %xmm8 + vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7 + + # rotate + ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0 + ?vsldoi v0, v0, v0, 1 # vpalignr \$1, %xmm0, %xmm0, %xmm0 + + # fall through... + + # low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + # smear xmm7 + ?vsldoi v1, v9, v7, 12 # vpslldq \$4, %xmm7, %xmm1 + vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7 + vspltisb v1, 0x0f # 0x0f..0f + ?vsldoi v4, v9, v7, 8 # vpslldq \$8, %xmm7, %xmm4 + + # subbytes + vand v1, v1, v0 # vpand %xmm9, %xmm0, %xmm1 # 0 = k + vsrb v0, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i + vxor v7, v7, v4 # vpxor %xmm4, %xmm7, %xmm7 + vperm v2, $invhi, v9, v1 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + vxor v1, v1, v0 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vperm v3, $invlo, v9, v0 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + vperm v4, $invlo, v9, v1 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vxor v7, v7, v26 # vpxor .Lk_s63(%rip), %xmm7, %xmm7 + vperm v3, $invlo, v9, v3 # vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vperm v2, $invlo, v9, v4 # vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + vxor v3, v3, v1 # vpxor %xmm1, %xmm3, %xmm3 # 2 = io + vxor v2, v2, v0 # vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + vperm v4, v15, v9, v3 # vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + vperm v1, v14, v9, v2 # vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + vxor v1, v1, v4 # vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + # add in smeared stuff + vxor v0, v1, v7 # vpxor %xmm7, %xmm1, %xmm0 + vxor v7, v1, v7 # vmovdqa %xmm0, %xmm7 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... as in preheat +## Output in %xmm0 +## Clobbers %xmm2 +## +.align 4 +_vpaes_schedule_transform: + #vand v1, v0, v9 # vpand %xmm9, %xmm0, %xmm1 + vsrb v2, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 + # vmovdqa (%r11), %xmm2 # lo + vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2 + # vmovdqa 16(%r11), %xmm1 # hi + vperm v2, $ipthi, $ipthi, v2 # vpshufb %xmm0, %xmm1, %xmm0 + vxor v0, v0, v2 # vpxor %xmm2, %xmm0, %xmm0 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. +## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +.align 4 +_vpaes_schedule_mangle: + #vmr v4, v0 # vmovdqa %xmm0, %xmm4 # save xmm0 for later + # vmovdqa .Lk_mc_forward(%rip),%xmm5 + bne $dir, Lschedule_mangle_dec + + # encrypting + vxor v4, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm4 + addi $out, $out, 16 # add \$16, %rdx + vperm v4, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm4 + vperm v1, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm1 + vperm v3, v1, v1, v25 # vpshufb %xmm5, %xmm1, %xmm3 + vxor v4, v4, v1 # vpxor %xmm1, %xmm4, %xmm4 + lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1 + vxor v3, v3, v4 # vpxor %xmm4, %xmm3, %xmm3 + + vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3 + addi r8, r8, -16 # add \$-16, %r8 + andi. r8, r8, 0x30 # and \$0x30, %r8 + + #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx) + vperm v1, v3, v3, $outperm # rotate right/left + vsel v2, $outhead, v1, $outmask + vmr $outhead, v1 + stvx v2, 0, $out + blr + +.align 4 +Lschedule_mangle_dec: + # inverse mix columns + # lea .Lk_dksd(%rip),%r11 + vsrb v1, v0, v8 # vpsrlb \$4, %xmm4, %xmm1 # 1 = hi + #and v4, v0, v9 # vpand %xmm9, %xmm4, %xmm4 # 4 = lo + + # vmovdqa 0x00(%r11), %xmm2 + vperm v2, v16, v16, v0 # vpshufb %xmm4, %xmm2, %xmm2 + # vmovdqa 0x10(%r11), %xmm3 + vperm v3, v17, v17, v1 # vpshufb %xmm1, %xmm3, %xmm3 + vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 + vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3 + + # vmovdqa 0x20(%r11), %xmm2 + vperm v2, v18, v18, v0 # vpshufb %xmm4, %xmm2, %xmm2 + vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2 + # vmovdqa 0x30(%r11), %xmm3 + vperm v3, v19, v19, v1 # vpshufb %xmm1, %xmm3, %xmm3 + vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 + vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3 + + # vmovdqa 0x40(%r11), %xmm2 + vperm v2, v20, v20, v0 # vpshufb %xmm4, %xmm2, %xmm2 + vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2 + # vmovdqa 0x50(%r11), %xmm3 + vperm v3, v21, v21, v1 # vpshufb %xmm1, %xmm3, %xmm3 + vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 + + # vmovdqa 0x60(%r11), %xmm2 + vperm v2, v22, v22, v0 # vpshufb %xmm4, %xmm2, %xmm2 + vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3 + # vmovdqa 0x70(%r11), %xmm4 + vperm v4, v23, v23, v1 # vpshufb %xmm1, %xmm4, %xmm4 + lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1 + vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2 + vxor v3, v4, v2 # vpxor %xmm2, %xmm4, %xmm3 + + addi $out, $out, -16 # add \$-16, %rdx + + vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3 + addi r8, r8, -16 # add \$-16, %r8 + andi. r8, r8, 0x30 # and \$0x30, %r8 + + #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx) + vperm v1, v3, v3, $outperm # rotate right/left + vsel v2, $outhead, v1, $outmask + vmr $outhead, v1 + stvx v2, 0, $out + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + +.globl .vpaes_set_encrypt_key +.align 5 +.vpaes_set_encrypt_key: + $STU $sp,-$FRAME($sp) + li r10,`15+6*$SIZE_T` + li r11,`31+6*$SIZE_T` + mflr r0 + mfspr r6, 256 # save vrsave + stvx v20,r10,$sp + addi r10,r10,32 + stvx v21,r11,$sp + addi r11,r11,32 + stvx v22,r10,$sp + addi r10,r10,32 + stvx v23,r11,$sp + addi r11,r11,32 + stvx v24,r10,$sp + addi r10,r10,32 + stvx v25,r11,$sp + addi r11,r11,32 + stvx v26,r10,$sp + addi r10,r10,32 + stvx v27,r11,$sp + addi r11,r11,32 + stvx v28,r10,$sp + addi r10,r10,32 + stvx v29,r11,$sp + addi r11,r11,32 + stvx v30,r10,$sp + stvx v31,r11,$sp + stw r6,`$FRAME-4`($sp) # save vrsave + li r7, -1 + $PUSH r0, `$FRAME+$LRSAVE`($sp) + mtspr 256, r7 # preserve all AltiVec registers + + srwi r9, $bits, 5 # shr \$5,%eax + addi r9, r9, 6 # add \$5,%eax + stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + cmplw $dir, $bits, $bits # set encrypt direction + li r8, 0x30 # mov \$0x30,%r8d + bl _vpaes_schedule_core + + $POP r0, `$FRAME+$LRSAVE`($sp) + li r10,`15+6*$SIZE_T` + li r11,`31+6*$SIZE_T` + mtspr 256, r6 # restore vrsave + mtlr r0 + xor r3, r3, r3 + lvx v20,r10,$sp + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,0x04,1,0x80,0,3,0 + .long 0 +.size .vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key + +.globl .vpaes_set_decrypt_key +.align 4 +.vpaes_set_decrypt_key: + $STU $sp,-$FRAME($sp) + li r10,`15+6*$SIZE_T` + li r11,`31+6*$SIZE_T` + mflr r0 + mfspr r6, 256 # save vrsave + stvx v20,r10,$sp + addi r10,r10,32 + stvx v21,r11,$sp + addi r11,r11,32 + stvx v22,r10,$sp + addi r10,r10,32 + stvx v23,r11,$sp + addi r11,r11,32 + stvx v24,r10,$sp + addi r10,r10,32 + stvx v25,r11,$sp + addi r11,r11,32 + stvx v26,r10,$sp + addi r10,r10,32 + stvx v27,r11,$sp + addi r11,r11,32 + stvx v28,r10,$sp + addi r10,r10,32 + stvx v29,r11,$sp + addi r11,r11,32 + stvx v30,r10,$sp + stvx v31,r11,$sp + stw r6,`$FRAME-4`($sp) # save vrsave + li r7, -1 + $PUSH r0, `$FRAME+$LRSAVE`($sp) + mtspr 256, r7 # preserve all AltiVec registers + + srwi r9, $bits, 5 # shr \$5,%eax + addi r9, r9, 6 # add \$5,%eax + stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + slwi r9, r9, 4 # shl \$4,%eax + add $out, $out, r9 # lea (%rdx,%rax),%rdx + + cmplwi $dir, $bits, 0 # set decrypt direction + srwi r8, $bits, 1 # shr \$1,%r8d + andi. r8, r8, 32 # and \$32,%r8d + xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32 + bl _vpaes_schedule_core + + $POP r0, `$FRAME+$LRSAVE`($sp) + li r10,`15+6*$SIZE_T` + li r11,`31+6*$SIZE_T` + mtspr 256, r6 # restore vrsave + mtlr r0 + xor r3, r3, r3 + lvx v20,r10,$sp + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,0x04,1,0x80,0,3,0 + .long 0 +.size .vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key +___ +} + +my $consts=1; +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + # constants table endian-specific conversion + if ($consts && m/\.long\s+(.+)\s+(\?[a-z]*)$/o) { + my $conv=$2; + my @bytes=(); + + # convert to endian-agnostic format + foreach (split(/,\s+/,$1)) { + my $l = /^0/?oct:int; + push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff; + } + + # little-endian conversion + if ($flavour =~ /le$/o) { + SWITCH: for($conv) { + /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; }; + /\?rev/ && do { @bytes=reverse(@bytes); last; }; + } + } + + #emit + print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n"; + next; + } + $consts=0 if (m/Lconsts:/o); # end of table + + # instructions prefixed with '?' are endian-specific and need + # to be adjusted accordingly... + if ($flavour =~ /le$/o) { # little-endian + s/\?lvsr/lvsl/o or + s/\?lvsl/lvsr/o or + s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or + s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or + s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o; + } else { # big-endian + s/\?([a-z]+)/$1/o; + } + + print $_,"\n"; +} + +close STDOUT; diff --git a/openssl/crypto/aes/asm/vpaes-x86.pl b/openssl/crypto/aes/asm/vpaes-x86.pl index 1533e2c30..2ba149c3f 100644 --- a/openssl/crypto/aes/asm/vpaes-x86.pl +++ b/openssl/crypto/aes/asm/vpaes-x86.pl @@ -27,9 +27,10 @@ # # aes-586.pl vpaes-x86.pl # -# Core 2(**) 29.1/42.3/18.3 22.0/25.6(***) -# Nehalem 27.9/40.4/18.1 10.3/12.0 -# Atom 102./119./60.1 64.5/85.3(***) +# Core 2(**) 28.1/41.4/18.3 21.9/25.2(***) +# Nehalem 27.9/40.4/18.1 10.2/11.9 +# Atom 70.7/92.1/60.1 61.1/75.4(***) +# Silvermont 45.4/62.9/24.1 49.2/61.1(***) # # (*) "Hyper-threading" in the context refers rather to cache shared # among multiple cores, than to specifically Intel HTT. As vast @@ -40,8 +41,8 @@ # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. # # (***) Less impressive improvement on Core 2 and Atom is due to slow -# pshufb, yet it's respectable +32%/65% improvement on Core 2 -# and +58%/40% on Atom (as implied, over "hyper-threading-safe" +# pshufb, yet it's respectable +28%/64% improvement on Core 2 +# and +15% on Atom (as implied, over "hyper-threading-safe" # code path). # # @@ -183,35 +184,35 @@ $k_dsbo=0x2c0; # decryption sbox final output &movdqa ("xmm1","xmm6") &movdqa ("xmm2",&QWP($k_ipt,$const)); &pandn ("xmm1","xmm0"); - &movdqu ("xmm5",&QWP(0,$key)); - &psrld ("xmm1",4); &pand ("xmm0","xmm6"); + &movdqu ("xmm5",&QWP(0,$key)); &pshufb ("xmm2","xmm0"); &movdqa ("xmm0",&QWP($k_ipt+16,$const)); - &pshufb ("xmm0","xmm1"); &pxor ("xmm2","xmm5"); - &pxor ("xmm0","xmm2"); + &psrld ("xmm1",4); &add ($key,16); + &pshufb ("xmm0","xmm1"); &lea ($base,&DWP($k_mc_backward,$const)); + &pxor ("xmm0","xmm2"); &jmp (&label("enc_entry")); &set_label("enc_loop",16); # middle of middle round &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u - &pshufb ("xmm4","xmm2"); # 4 = sb1u - &pxor ("xmm4","xmm5"); # 4 = sb1u + k &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t + &pshufb ("xmm4","xmm2"); # 4 = sb1u &pshufb ("xmm0","xmm3"); # 0 = sb1t - &pxor ("xmm0","xmm4"); # 0 = A + &pxor ("xmm4","xmm5"); # 4 = sb1u + k &movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u - &pshufb ("xmm5","xmm2"); # 4 = sb2u + &pxor ("xmm0","xmm4"); # 0 = A &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[] + &pshufb ("xmm5","xmm2"); # 4 = sb2u &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t - &pshufb ("xmm2","xmm3"); # 2 = sb2t - &pxor ("xmm2","xmm5"); # 2 = 2A &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[] + &pshufb ("xmm2","xmm3"); # 2 = sb2t &movdqa ("xmm3","xmm0"); # 3 = A + &pxor ("xmm2","xmm5"); # 2 = 2A &pshufb ("xmm0","xmm1"); # 0 = B &add ($key,16); # next key &pxor ("xmm0","xmm2"); # 0 = 2A+B @@ -220,30 +221,30 @@ $k_dsbo=0x2c0; # decryption sbox final output &pxor ("xmm3","xmm0"); # 3 = 2A+B+D &pshufb ("xmm0","xmm1"); # 0 = 2B+C &and ($magic,0x30); # ... mod 4 - &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D &sub ($round,1); # nr-- + &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D &set_label("enc_entry"); # top of round &movdqa ("xmm1","xmm6"); # 1 : i + &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k &pandn ("xmm1","xmm0"); # 1 = i<<4 &psrld ("xmm1",4); # 1 = i &pand ("xmm0","xmm6"); # 0 = k - &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k &pshufb ("xmm5","xmm0"); # 2 = a/k - &pxor ("xmm0","xmm1"); # 0 = j &movdqa ("xmm3","xmm7"); # 3 : 1/i + &pxor ("xmm0","xmm1"); # 0 = j &pshufb ("xmm3","xmm1"); # 3 = 1/i - &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k &movdqa ("xmm4","xmm7"); # 4 : 1/j + &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k &pshufb ("xmm4","xmm0"); # 4 = 1/j - &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k &movdqa ("xmm2","xmm7"); # 2 : 1/iak + &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k &pshufb ("xmm2","xmm3"); # 2 = 1/iak - &pxor ("xmm2","xmm0"); # 2 = io &movdqa ("xmm3","xmm7"); # 3 : 1/jak - &movdqu ("xmm5",&QWP(0,$key)); + &pxor ("xmm2","xmm0"); # 2 = io &pshufb ("xmm3","xmm4"); # 3 = 1/jak + &movdqu ("xmm5",&QWP(0,$key)); &pxor ("xmm3","xmm1"); # 3 = jo &jnz (&label("enc_loop")); @@ -265,8 +266,8 @@ $k_dsbo=0x2c0; # decryption sbox final output ## Same API as encryption core. ## &function_begin_B("_vpaes_decrypt_core"); - &mov ($round,&DWP(240,$key)); &lea ($base,&DWP($k_dsbd,$const)); + &mov ($round,&DWP(240,$key)); &movdqa ("xmm1","xmm6"); &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base)); &pandn ("xmm1","xmm0"); @@ -292,62 +293,61 @@ $k_dsbo=0x2c0; # decryption sbox final output ## Inverse mix columns ## &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u + &movdqa ("xmm1",&QWP(-0x10,$base)); # 0 : sb9t &pshufb ("xmm4","xmm2"); # 4 = sb9u - &pxor ("xmm4","xmm0"); - &movdqa ("xmm0",&QWP(-0x10,$base)); # 0 : sb9t - &pshufb ("xmm0","xmm3"); # 0 = sb9t - &pxor ("xmm0","xmm4"); # 0 = ch - &add ($key,16); # next round key - - &pshufb ("xmm0","xmm5"); # MC ch + &pshufb ("xmm1","xmm3"); # 0 = sb9t + &pxor ("xmm0","xmm4"); &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu - &pshufb ("xmm4","xmm2"); # 4 = sbdu - &pxor ("xmm4","xmm0"); # 4 = ch - &movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt - &pshufb ("xmm0","xmm3"); # 0 = sbdt - &pxor ("xmm0","xmm4"); # 0 = ch - &sub ($round,1); # nr-- + &pxor ("xmm0","xmm1"); # 0 = ch + &movdqa ("xmm1",&QWP(0x10,$base)); # 0 : sbdt + &pshufb ("xmm4","xmm2"); # 4 = sbdu &pshufb ("xmm0","xmm5"); # MC ch + &pshufb ("xmm1","xmm3"); # 0 = sbdt + &pxor ("xmm0","xmm4"); # 4 = ch &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu - &pshufb ("xmm4","xmm2"); # 4 = sbbu - &pxor ("xmm4","xmm0"); # 4 = ch - &movdqa ("xmm0",&QWP(0x30,$base)); # 0 : sbbt - &pshufb ("xmm0","xmm3"); # 0 = sbbt - &pxor ("xmm0","xmm4"); # 0 = ch + &pxor ("xmm0","xmm1"); # 0 = ch + &movdqa ("xmm1",&QWP(0x30,$base)); # 0 : sbbt + &pshufb ("xmm4","xmm2"); # 4 = sbbu &pshufb ("xmm0","xmm5"); # MC ch + &pshufb ("xmm1","xmm3"); # 0 = sbbt + &pxor ("xmm0","xmm4"); # 4 = ch &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu - &pshufb ("xmm4","xmm2"); # 4 = sbeu - &pxor ("xmm4","xmm0"); # 4 = ch - &movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet - &pshufb ("xmm0","xmm3"); # 0 = sbet - &pxor ("xmm0","xmm4"); # 0 = ch + &pxor ("xmm0","xmm1"); # 0 = ch + &movdqa ("xmm1",&QWP(0x50,$base)); # 0 : sbet + &pshufb ("xmm4","xmm2"); # 4 = sbeu + &pshufb ("xmm0","xmm5"); # MC ch + &pshufb ("xmm1","xmm3"); # 0 = sbet + &pxor ("xmm0","xmm4"); # 4 = ch + &add ($key,16); # next round key &palignr("xmm5","xmm5",12); + &pxor ("xmm0","xmm1"); # 0 = ch + &sub ($round,1); # nr-- &set_label("dec_entry"); # top of round &movdqa ("xmm1","xmm6"); # 1 : i + &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k &pandn ("xmm1","xmm0"); # 1 = i<<4 - &psrld ("xmm1",4); # 1 = i &pand ("xmm0","xmm6"); # 0 = k - &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k + &psrld ("xmm1",4); # 1 = i &pshufb ("xmm2","xmm0"); # 2 = a/k - &pxor ("xmm0","xmm1"); # 0 = j &movdqa ("xmm3","xmm7"); # 3 : 1/i + &pxor ("xmm0","xmm1"); # 0 = j &pshufb ("xmm3","xmm1"); # 3 = 1/i - &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k &movdqa ("xmm4","xmm7"); # 4 : 1/j + &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k &pshufb ("xmm4","xmm0"); # 4 = 1/j &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k &movdqa ("xmm2","xmm7"); # 2 : 1/iak &pshufb ("xmm2","xmm3"); # 2 = 1/iak - &pxor ("xmm2","xmm0"); # 2 = io &movdqa ("xmm3","xmm7"); # 3 : 1/jak + &pxor ("xmm2","xmm0"); # 2 = io &pshufb ("xmm3","xmm4"); # 3 = 1/jak - &pxor ("xmm3","xmm1"); # 3 = jo &movdqu ("xmm0",&QWP(0,$key)); + &pxor ("xmm3","xmm1"); # 3 = jo &jnz (&label("dec_loop")); # middle of last round @@ -542,12 +542,12 @@ $k_dsbo=0x2c0; # decryption sbox final output ## %xmm0: b+c+d b+c b a ## &function_begin_B("_vpaes_schedule_192_smear"); - &pshufd ("xmm0","xmm6",0x80); # d c 0 0 -> c 0 0 0 - &pxor ("xmm6","xmm0"); # -> c+d c 0 0 + &pshufd ("xmm1","xmm6",0x80); # d c 0 0 -> c 0 0 0 &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a + &pxor ("xmm6","xmm1"); # -> c+d c 0 0 + &pxor ("xmm1","xmm1"); &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a &movdqa ("xmm0","xmm6"); - &pxor ("xmm1","xmm1"); &movhlps("xmm6","xmm1"); # clobber low side with zeros &ret (); &function_end_B("_vpaes_schedule_192_smear"); diff --git a/openssl/crypto/aes/asm/vpaes-x86_64.pl b/openssl/crypto/aes/asm/vpaes-x86_64.pl index bd7f45b85..f2ef318fa 100644 --- a/openssl/crypto/aes/asm/vpaes-x86_64.pl +++ b/openssl/crypto/aes/asm/vpaes-x86_64.pl @@ -27,9 +27,10 @@ # # aes-x86_64.pl vpaes-x86_64.pl # -# Core 2(**) 30.5/43.7/14.3 21.8/25.7(***) -# Nehalem 30.5/42.2/14.6 9.8/11.8 -# Atom 63.9/79.0/32.1 64.0/84.8(***) +# Core 2(**) 29.6/41.1/14.3 21.9/25.2(***) +# Nehalem 29.6/40.3/14.6 10.0/11.8 +# Atom 57.3/74.2/32.1 60.9/77.2(***) +# Silvermont 52.7/64.0/19.5 48.8/60.8(***) # # (*) "Hyper-threading" in the context refers rather to cache shared # among multiple cores, than to specifically Intel HTT. As vast @@ -40,7 +41,7 @@ # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. # # (***) Less impressive improvement on Core 2 and Atom is due to slow -# pshufb, yet it's respectable +40%/78% improvement on Core 2 +# pshufb, yet it's respectable +36%/62% improvement on Core 2 # (as implied, over "hyper-threading-safe" code path). # # @@ -95,8 +96,8 @@ _vpaes_encrypt_core: movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi pshufb %xmm1, %xmm0 pxor %xmm5, %xmm2 - pxor %xmm2, %xmm0 add \$16, %r9 + pxor %xmm2, %xmm0 lea .Lk_mc_backward(%rip),%r10 jmp .Lenc_entry @@ -104,19 +105,19 @@ _vpaes_encrypt_core: .Lenc_loop: # middle of middle round movdqa %xmm13, %xmm4 # 4 : sb1u - pshufb %xmm2, %xmm4 # 4 = sb1u - pxor %xmm5, %xmm4 # 4 = sb1u + k movdqa %xmm12, %xmm0 # 0 : sb1t + pshufb %xmm2, %xmm4 # 4 = sb1u pshufb %xmm3, %xmm0 # 0 = sb1t - pxor %xmm4, %xmm0 # 0 = A + pxor %xmm5, %xmm4 # 4 = sb1u + k movdqa %xmm15, %xmm5 # 4 : sb2u - pshufb %xmm2, %xmm5 # 4 = sb2u + pxor %xmm4, %xmm0 # 0 = A movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + pshufb %xmm2, %xmm5 # 4 = sb2u + movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] movdqa %xmm14, %xmm2 # 2 : sb2t pshufb %xmm3, %xmm2 # 2 = sb2t - pxor %xmm5, %xmm2 # 2 = 2A - movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] movdqa %xmm0, %xmm3 # 3 = A + pxor %xmm5, %xmm2 # 2 = 2A pshufb %xmm1, %xmm0 # 0 = B add \$16, %r9 # next key pxor %xmm2, %xmm0 # 0 = 2A+B @@ -125,30 +126,30 @@ _vpaes_encrypt_core: pxor %xmm0, %xmm3 # 3 = 2A+B+D pshufb %xmm1, %xmm0 # 0 = 2B+C and \$0x30, %r11 # ... mod 4 - pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D sub \$1,%rax # nr-- + pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D .Lenc_entry: # top of round movdqa %xmm9, %xmm1 # 1 : i + movdqa %xmm11, %xmm5 # 2 : a/k pandn %xmm0, %xmm1 # 1 = i<<4 psrld \$4, %xmm1 # 1 = i pand %xmm9, %xmm0 # 0 = k - movdqa %xmm11, %xmm5 # 2 : a/k pshufb %xmm0, %xmm5 # 2 = a/k - pxor %xmm1, %xmm0 # 0 = j movdqa %xmm10, %xmm3 # 3 : 1/i + pxor %xmm1, %xmm0 # 0 = j pshufb %xmm1, %xmm3 # 3 = 1/i - pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k movdqa %xmm10, %xmm4 # 4 : 1/j + pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k pshufb %xmm0, %xmm4 # 4 = 1/j - pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k movdqa %xmm10, %xmm2 # 2 : 1/iak + pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k pshufb %xmm3, %xmm2 # 2 = 1/iak - pxor %xmm0, %xmm2 # 2 = io movdqa %xmm10, %xmm3 # 3 : 1/jak - movdqu (%r9), %xmm5 + pxor %xmm0, %xmm2 # 2 = io pshufb %xmm4, %xmm3 # 3 = 1/jak + movdqu (%r9), %xmm5 pxor %xmm1, %xmm3 # 3 = jo jnz .Lenc_loop @@ -201,62 +202,61 @@ _vpaes_decrypt_core: ## Inverse mix columns ## movdqa -0x20(%r10),%xmm4 # 4 : sb9u + movdqa -0x10(%r10),%xmm1 # 0 : sb9t pshufb %xmm2, %xmm4 # 4 = sb9u - pxor %xmm0, %xmm4 - movdqa -0x10(%r10),%xmm0 # 0 : sb9t - pshufb %xmm3, %xmm0 # 0 = sb9t - pxor %xmm4, %xmm0 # 0 = ch - add \$16, %r9 # next round key - - pshufb %xmm5, %xmm0 # MC ch + pshufb %xmm3, %xmm1 # 0 = sb9t + pxor %xmm4, %xmm0 movdqa 0x00(%r10),%xmm4 # 4 : sbdu + pxor %xmm1, %xmm0 # 0 = ch + movdqa 0x10(%r10),%xmm1 # 0 : sbdt + pshufb %xmm2, %xmm4 # 4 = sbdu - pxor %xmm0, %xmm4 # 4 = ch - movdqa 0x10(%r10),%xmm0 # 0 : sbdt - pshufb %xmm3, %xmm0 # 0 = sbdt - pxor %xmm4, %xmm0 # 0 = ch - sub \$1,%rax # nr-- - pshufb %xmm5, %xmm0 # MC ch + pshufb %xmm3, %xmm1 # 0 = sbdt + pxor %xmm4, %xmm0 # 4 = ch movdqa 0x20(%r10),%xmm4 # 4 : sbbu + pxor %xmm1, %xmm0 # 0 = ch + movdqa 0x30(%r10),%xmm1 # 0 : sbbt + pshufb %xmm2, %xmm4 # 4 = sbbu - pxor %xmm0, %xmm4 # 4 = ch - movdqa 0x30(%r10),%xmm0 # 0 : sbbt - pshufb %xmm3, %xmm0 # 0 = sbbt - pxor %xmm4, %xmm0 # 0 = ch - pshufb %xmm5, %xmm0 # MC ch + pshufb %xmm3, %xmm1 # 0 = sbbt + pxor %xmm4, %xmm0 # 4 = ch movdqa 0x40(%r10),%xmm4 # 4 : sbeu - pshufb %xmm2, %xmm4 # 4 = sbeu - pxor %xmm0, %xmm4 # 4 = ch - movdqa 0x50(%r10),%xmm0 # 0 : sbet - pshufb %xmm3, %xmm0 # 0 = sbet - pxor %xmm4, %xmm0 # 0 = ch + pxor %xmm1, %xmm0 # 0 = ch + movdqa 0x50(%r10),%xmm1 # 0 : sbet + pshufb %xmm2, %xmm4 # 4 = sbeu + pshufb %xmm5, %xmm0 # MC ch + pshufb %xmm3, %xmm1 # 0 = sbet + pxor %xmm4, %xmm0 # 4 = ch + add \$16, %r9 # next round key palignr \$12, %xmm5, %xmm5 - + pxor %xmm1, %xmm0 # 0 = ch + sub \$1,%rax # nr-- + .Ldec_entry: # top of round movdqa %xmm9, %xmm1 # 1 : i pandn %xmm0, %xmm1 # 1 = i<<4 + movdqa %xmm11, %xmm2 # 2 : a/k psrld \$4, %xmm1 # 1 = i pand %xmm9, %xmm0 # 0 = k - movdqa %xmm11, %xmm2 # 2 : a/k pshufb %xmm0, %xmm2 # 2 = a/k - pxor %xmm1, %xmm0 # 0 = j movdqa %xmm10, %xmm3 # 3 : 1/i + pxor %xmm1, %xmm0 # 0 = j pshufb %xmm1, %xmm3 # 3 = 1/i - pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k movdqa %xmm10, %xmm4 # 4 : 1/j + pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k pshufb %xmm0, %xmm4 # 4 = 1/j pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k movdqa %xmm10, %xmm2 # 2 : 1/iak pshufb %xmm3, %xmm2 # 2 = 1/iak - pxor %xmm0, %xmm2 # 2 = io movdqa %xmm10, %xmm3 # 3 : 1/jak + pxor %xmm0, %xmm2 # 2 = io pshufb %xmm4, %xmm3 # 3 = 1/jak - pxor %xmm1, %xmm3 # 3 = jo movdqu (%r9), %xmm0 + pxor %xmm1, %xmm3 # 3 = jo jnz .Ldec_loop # middle of last round @@ -464,12 +464,12 @@ _vpaes_schedule_core: .type _vpaes_schedule_192_smear,\@abi-omnipotent .align 16 _vpaes_schedule_192_smear: - pshufd \$0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0 - pxor %xmm0, %xmm6 # -> c+d c 0 0 + pshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + pxor %xmm1, %xmm6 # -> c+d c 0 0 + pxor %xmm1, %xmm1 pxor %xmm0, %xmm6 # -> b+c+d b+c b a movdqa %xmm6, %xmm0 - pxor %xmm1, %xmm1 movhlps %xmm1, %xmm6 # clobber low side with zeros ret .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear -- cgit v1.2.3