Diffstat (limited to 'openssl/crypto/modes/asm')
-rwxr-xr-x  openssl/crypto/modes/asm/aesni-gcm-x86_64.pl  | 1057
-rw-r--r--  openssl/crypto/modes/asm/ghash-armv4.pl       |  232
-rw-r--r--  openssl/crypto/modes/asm/ghash-s390x.pl       |    6
-rw-r--r--  openssl/crypto/modes/asm/ghash-sparcv9.pl     |  247
-rw-r--r--  openssl/crypto/modes/asm/ghash-x86.pl         |  199
-rw-r--r--  openssl/crypto/modes/asm/ghash-x86_64.pl      | 1149
-rwxr-xr-x  openssl/crypto/modes/asm/ghashp8-ppc.pl       |  234
-rwxr-xr-x  openssl/crypto/modes/asm/ghashv8-armx.pl      |  241
8 files changed, 3100 insertions, 265 deletions
diff --git a/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl b/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl
new file mode 100755
index 000000000..7e4e04ea2
--- /dev/null
+++ b/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl
@@ -0,0 +1,1057 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+#
+# AES-NI-CTR+GHASH stitch.
+#
+# February 2013
+#
+# The OpenSSL GCM implementation is organized in such a way that its
+# performance is rather close to the sum of its streamed components,
+# in this context parallelized AES-NI CTR and modulo-scheduled
+# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitched implementation
+# was observed to perform significantly better than the sum of the
+# components on contemporary CPUs, the effort was deemed impossible
+# to justify. This module is based on a combination of Intel
+# submissions, [1] and [2], with a MOVBE twist suggested by Ilya
+# Albrekht and Max Locktyukhin of Intel Corp., who verified that it
+# reduces shuffle pressure with a notable relative improvement,
+# achieving 1.0 cycle per byte processed with a 128-bit key on
+# Haswell and 0.74 on Broadwell. [The quoted results are raw profiled
+# measurements for a favourable packet size, one divisible by 96.
+# Applications using the EVP interface will observe performance a few
+# percent worse.]
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
+
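To make the "stitch" concrete before diving into the Perl below, here is a rough C-level sketch of the idea, using hypothetical helper names (aes_encrypt_block, gf128_mul and friends are placeholders, not OpenSSL APIs): each six-block iteration keeps the AES-NI unit busy with counter-mode work while the carry-less multiplier folds ciphertext into the GHASH accumulator.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical placeholders, NOT OpenSSL APIs -- for illustration only. */
typedef struct { uint8_t b[16]; } block128;
block128 aes_encrypt_block(const void *ks, block128 ctr_blk); /* one AES block  */
block128 gf128_mul(block128 x, block128 h);                   /* GHASH multiply */
block128 xor_block(block128 a, block128 b);
block128 counter_block(const uint8_t iv[16], uint32_t ctr32);

/* Conceptual "stitched" bulk decryption: each iteration encrypts six counter
 * blocks with AES-NI while the carry-less multiplier folds six ciphertext
 * blocks into the GHASH accumulator Xi. */
static block128 stitched_ctr32_ghash(const void *ks, const uint8_t iv[16],
                                     uint32_t ctr, block128 H, block128 Xi,
                                     const block128 *in, block128 *out,
                                     size_t nblocks)
{
    for (size_t i = 0; i + 6 <= nblocks; i += 6) {
        for (int j = 0; j < 6; j++)         /* AES-NI pipe     */
            out[i + j] = xor_block(in[i + j],
                                   aes_encrypt_block(ks, counter_block(iv, ctr++)));
        for (int j = 0; j < 6; j++)         /* PCLMULQDQ pipe  */
            Xi = gf128_mul(xor_block(Xi, in[i + j]), H);
    }
    return Xi;
}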
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+}
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+if ($avx>1) {{{
+
+($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+
+($Ii,$T1,$T2,$Hkey,
+ $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));
+
+($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));
+
+($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");
+
+$code=<<___;
+.text
+
+.type _aesni_ctr32_ghash_6x,\@abi-omnipotent
+.align 32
+_aesni_ctr32_ghash_6x:
+ vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
+ sub \$6,$len
+ vpxor $Z0,$Z0,$Z0 # $Z0 = 0
+ vmovdqu 0x00-0x80($key),$rndkey
+ vpaddb $T2,$T1,$inout1
+ vpaddb $T2,$inout1,$inout2
+ vpaddb $T2,$inout2,$inout3
+ vpaddb $T2,$inout3,$inout4
+ vpaddb $T2,$inout4,$inout5
+ vpxor $rndkey,$T1,$inout0
+ vmovdqu $Z0,16+8(%rsp) # "$Z3" = 0
+ jmp .Loop6x
+
+.align 32
+.Loop6x:
+ add \$`6<<24`,$counter
+ jc .Lhandle_ctr32 # discard $inout[1-5]?
+ vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
+ vpaddb $T2,$inout5,$T1 # next counter value
+ vpxor $rndkey,$inout1,$inout1
+ vpxor $rndkey,$inout2,$inout2
+
+.Lresume_ctr32:
+ vmovdqu $T1,($ivp) # save next counter value
+ vpclmulqdq \$0x10,$Hkey,$Z3,$Z1
+ vpxor $rndkey,$inout3,$inout3
+ vmovups 0x10-0x80($key),$T2 # borrow $T2 for $rndkey
+ vpclmulqdq \$0x01,$Hkey,$Z3,$Z2
+ xor %r12,%r12
+ cmp $in0,$end0
+
+ vaesenc $T2,$inout0,$inout0
+ vmovdqu 0x30+8(%rsp),$Ii # I[4]
+ vpxor $rndkey,$inout4,$inout4
+ vpclmulqdq \$0x00,$Hkey,$Z3,$T1
+ vaesenc $T2,$inout1,$inout1
+ vpxor $rndkey,$inout5,$inout5
+ setnc %r12b
+ vpclmulqdq \$0x11,$Hkey,$Z3,$Z3
+ vaesenc $T2,$inout2,$inout2
+ vmovdqu 0x10-0x20($Xip),$Hkey # $Hkey^2
+ neg %r12
+ vaesenc $T2,$inout3,$inout3
+ vpxor $Z1,$Z2,$Z2
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Z1
+ vpxor $Z0,$Xi,$Xi # modulo-scheduled
+ vaesenc $T2,$inout4,$inout4
+ vpxor $Z1,$T1,$Z0
+ and \$0x60,%r12
+ vmovups 0x20-0x80($key),$rndkey
+ vpclmulqdq \$0x10,$Hkey,$Ii,$T1
+ vaesenc $T2,$inout5,$inout5
+
+ vpclmulqdq \$0x01,$Hkey,$Ii,$T2
+ lea ($in0,%r12),$in0
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled [vpxor $Z3,$Xi,$Xi]
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Hkey
+ vmovdqu 0x40+8(%rsp),$Ii # I[3]
+ vaesenc $rndkey,$inout1,$inout1
+ movbe 0x58($in0),%r13
+ vaesenc $rndkey,$inout2,$inout2
+ movbe 0x50($in0),%r12
+ vaesenc $rndkey,$inout3,$inout3
+ mov %r13,0x20+8(%rsp)
+ vaesenc $rndkey,$inout4,$inout4
+ mov %r12,0x28+8(%rsp)
+ vmovdqu 0x30-0x20($Xip),$Z1 # borrow $Z1 for $Hkey^3
+ vaesenc $rndkey,$inout5,$inout5
+
+ vmovups 0x30-0x80($key),$rndkey
+ vpxor $T1,$Z2,$Z2
+ vpclmulqdq \$0x00,$Z1,$Ii,$T1
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor $T2,$Z2,$Z2
+ vpclmulqdq \$0x10,$Z1,$Ii,$T2
+ vaesenc $rndkey,$inout1,$inout1
+ vpxor $Hkey,$Z3,$Z3
+ vpclmulqdq \$0x01,$Z1,$Ii,$Hkey
+ vaesenc $rndkey,$inout2,$inout2
+ vpclmulqdq \$0x11,$Z1,$Ii,$Z1
+ vmovdqu 0x50+8(%rsp),$Ii # I[2]
+ vaesenc $rndkey,$inout3,$inout3
+ vaesenc $rndkey,$inout4,$inout4
+ vpxor $T1,$Z0,$Z0
+ vmovdqu 0x40-0x20($Xip),$T1 # borrow $T1 for $Hkey^4
+ vaesenc $rndkey,$inout5,$inout5
+
+ vmovups 0x40-0x80($key),$rndkey
+ vpxor $T2,$Z2,$Z2
+ vpclmulqdq \$0x00,$T1,$Ii,$T2
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor $Hkey,$Z2,$Z2
+ vpclmulqdq \$0x10,$T1,$Ii,$Hkey
+ vaesenc $rndkey,$inout1,$inout1
+ movbe 0x48($in0),%r13
+ vpxor $Z1,$Z3,$Z3
+ vpclmulqdq \$0x01,$T1,$Ii,$Z1
+ vaesenc $rndkey,$inout2,$inout2
+ movbe 0x40($in0),%r12
+ vpclmulqdq \$0x11,$T1,$Ii,$T1
+ vmovdqu 0x60+8(%rsp),$Ii # I[1]
+ vaesenc $rndkey,$inout3,$inout3
+ mov %r13,0x30+8(%rsp)
+ vaesenc $rndkey,$inout4,$inout4
+ mov %r12,0x38+8(%rsp)
+ vpxor $T2,$Z0,$Z0
+ vmovdqu 0x60-0x20($Xip),$T2 # borrow $T2 for $Hkey^5
+ vaesenc $rndkey,$inout5,$inout5
+
+ vmovups 0x50-0x80($key),$rndkey
+ vpxor $Hkey,$Z2,$Z2
+ vpclmulqdq \$0x00,$T2,$Ii,$Hkey
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor $Z1,$Z2,$Z2
+ vpclmulqdq \$0x10,$T2,$Ii,$Z1
+ vaesenc $rndkey,$inout1,$inout1
+ movbe 0x38($in0),%r13
+ vpxor $T1,$Z3,$Z3
+ vpclmulqdq \$0x01,$T2,$Ii,$T1
+ vpxor 0x70+8(%rsp),$Xi,$Xi # accumulate I[0]
+ vaesenc $rndkey,$inout2,$inout2
+ movbe 0x30($in0),%r12
+ vpclmulqdq \$0x11,$T2,$Ii,$T2
+ vaesenc $rndkey,$inout3,$inout3
+ mov %r13,0x40+8(%rsp)
+ vaesenc $rndkey,$inout4,$inout4
+ mov %r12,0x48+8(%rsp)
+ vpxor $Hkey,$Z0,$Z0
+ vmovdqu 0x70-0x20($Xip),$Hkey # $Hkey^6
+ vaesenc $rndkey,$inout5,$inout5
+
+ vmovups 0x60-0x80($key),$rndkey
+ vpxor $Z1,$Z2,$Z2
+ vpclmulqdq \$0x10,$Hkey,$Xi,$Z1
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor $T1,$Z2,$Z2
+ vpclmulqdq \$0x01,$Hkey,$Xi,$T1
+ vaesenc $rndkey,$inout1,$inout1
+ movbe 0x28($in0),%r13
+ vpxor $T2,$Z3,$Z3
+ vpclmulqdq \$0x00,$Hkey,$Xi,$T2
+ vaesenc $rndkey,$inout2,$inout2
+ movbe 0x20($in0),%r12
+ vpclmulqdq \$0x11,$Hkey,$Xi,$Xi
+ vaesenc $rndkey,$inout3,$inout3
+ mov %r13,0x50+8(%rsp)
+ vaesenc $rndkey,$inout4,$inout4
+ mov %r12,0x58+8(%rsp)
+ vpxor $Z1,$Z2,$Z2
+ vaesenc $rndkey,$inout5,$inout5
+ vpxor $T1,$Z2,$Z2
+
+ vmovups 0x70-0x80($key),$rndkey
+ vpslldq \$8,$Z2,$Z1
+ vpxor $T2,$Z0,$Z0
+ vmovdqu 0x10($const),$Hkey # .Lpoly
+
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor $Xi,$Z3,$Z3
+ vaesenc $rndkey,$inout1,$inout1
+ vpxor $Z1,$Z0,$Z0
+ movbe 0x18($in0),%r13
+ vaesenc $rndkey,$inout2,$inout2
+ movbe 0x10($in0),%r12
+ vpalignr \$8,$Z0,$Z0,$Ii # 1st phase
+ vpclmulqdq \$0x10,$Hkey,$Z0,$Z0
+ mov %r13,0x60+8(%rsp)
+ vaesenc $rndkey,$inout3,$inout3
+ mov %r12,0x68+8(%rsp)
+ vaesenc $rndkey,$inout4,$inout4
+ vmovups 0x80-0x80($key),$T1 # borrow $T1 for $rndkey
+ vaesenc $rndkey,$inout5,$inout5
+
+ vaesenc $T1,$inout0,$inout0
+ vmovups 0x90-0x80($key),$rndkey
+ vaesenc $T1,$inout1,$inout1
+ vpsrldq \$8,$Z2,$Z2
+ vaesenc $T1,$inout2,$inout2
+ vpxor $Z2,$Z3,$Z3
+ vaesenc $T1,$inout3,$inout3
+ vpxor $Ii,$Z0,$Z0
+ movbe 0x08($in0),%r13
+ vaesenc $T1,$inout4,$inout4
+ movbe 0x00($in0),%r12
+ vaesenc $T1,$inout5,$inout5
+ vmovups 0xa0-0x80($key),$T1
+ cmp \$11,$rounds
+ jb .Lenc_tail # 128-bit key
+
+ vaesenc $rndkey,$inout0,$inout0
+ vaesenc $rndkey,$inout1,$inout1
+ vaesenc $rndkey,$inout2,$inout2
+ vaesenc $rndkey,$inout3,$inout3
+ vaesenc $rndkey,$inout4,$inout4
+ vaesenc $rndkey,$inout5,$inout5
+
+ vaesenc $T1,$inout0,$inout0
+ vaesenc $T1,$inout1,$inout1
+ vaesenc $T1,$inout2,$inout2
+ vaesenc $T1,$inout3,$inout3
+ vaesenc $T1,$inout4,$inout4
+ vmovups 0xb0-0x80($key),$rndkey
+ vaesenc $T1,$inout5,$inout5
+ vmovups 0xc0-0x80($key),$T1
+ je .Lenc_tail # 192-bit key
+
+ vaesenc $rndkey,$inout0,$inout0
+ vaesenc $rndkey,$inout1,$inout1
+ vaesenc $rndkey,$inout2,$inout2
+ vaesenc $rndkey,$inout3,$inout3
+ vaesenc $rndkey,$inout4,$inout4
+ vaesenc $rndkey,$inout5,$inout5
+
+ vaesenc $T1,$inout0,$inout0
+ vaesenc $T1,$inout1,$inout1
+ vaesenc $T1,$inout2,$inout2
+ vaesenc $T1,$inout3,$inout3
+ vaesenc $T1,$inout4,$inout4
+ vmovups 0xd0-0x80($key),$rndkey
+ vaesenc $T1,$inout5,$inout5
+ vmovups 0xe0-0x80($key),$T1
+ jmp .Lenc_tail # 256-bit key
+
+.align 32
+.Lhandle_ctr32:
+ vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
+ vpshufb $Ii,$T1,$Z2 # byte-swap counter
+ vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb
+ vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb
+ vpaddd $Z1,$Z2,$inout2
+ vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
+ vpaddd $Z1,$inout1,$inout3
+ vpshufb $Ii,$inout1,$inout1
+ vpaddd $Z1,$inout2,$inout4
+ vpshufb $Ii,$inout2,$inout2
+ vpxor $rndkey,$inout1,$inout1
+ vpaddd $Z1,$inout3,$inout5
+ vpshufb $Ii,$inout3,$inout3
+ vpxor $rndkey,$inout2,$inout2
+ vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value
+ vpshufb $Ii,$inout4,$inout4
+ vpshufb $Ii,$inout5,$inout5
+ vpshufb $Ii,$T1,$T1 # next counter value
+ jmp .Lresume_ctr32
+
+.align 32
+.Lenc_tail:
+ vaesenc $rndkey,$inout0,$inout0
+ vmovdqu $Z3,16+8(%rsp) # postpone vpxor $Z3,$Xi,$Xi
+ vpalignr \$8,$Z0,$Z0,$Xi # 2nd phase
+ vaesenc $rndkey,$inout1,$inout1
+ vpclmulqdq \$0x10,$Hkey,$Z0,$Z0
+ vpxor 0x00($inp),$T1,$T2
+ vaesenc $rndkey,$inout2,$inout2
+ vpxor 0x10($inp),$T1,$Ii
+ vaesenc $rndkey,$inout3,$inout3
+ vpxor 0x20($inp),$T1,$Z1
+ vaesenc $rndkey,$inout4,$inout4
+ vpxor 0x30($inp),$T1,$Z2
+ vaesenc $rndkey,$inout5,$inout5
+ vpxor 0x40($inp),$T1,$Z3
+ vpxor 0x50($inp),$T1,$Hkey
+ vmovdqu ($ivp),$T1 # load next counter value
+
+ vaesenclast $T2,$inout0,$inout0
+ vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
+ vaesenclast $Ii,$inout1,$inout1
+ vpaddb $T2,$T1,$Ii
+ mov %r13,0x70+8(%rsp)
+ lea 0x60($inp),$inp
+ vaesenclast $Z1,$inout2,$inout2
+ vpaddb $T2,$Ii,$Z1
+ mov %r12,0x78+8(%rsp)
+ lea 0x60($out),$out
+ vmovdqu 0x00-0x80($key),$rndkey
+ vaesenclast $Z2,$inout3,$inout3
+ vpaddb $T2,$Z1,$Z2
+ vaesenclast $Z3, $inout4,$inout4
+ vpaddb $T2,$Z2,$Z3
+ vaesenclast $Hkey,$inout5,$inout5
+ vpaddb $T2,$Z3,$Hkey
+
+ add \$0x60,$ret
+ sub \$0x6,$len
+ jc .L6x_done
+
+ vmovups $inout0,-0x60($out) # save output
+ vpxor $rndkey,$T1,$inout0
+ vmovups $inout1,-0x50($out)
+ vmovdqa $Ii,$inout1 # 0 latency
+ vmovups $inout2,-0x40($out)
+ vmovdqa $Z1,$inout2 # 0 latency
+ vmovups $inout3,-0x30($out)
+ vmovdqa $Z2,$inout3 # 0 latency
+ vmovups $inout4,-0x20($out)
+ vmovdqa $Z3,$inout4 # 0 latency
+ vmovups $inout5,-0x10($out)
+ vmovdqa $Hkey,$inout5 # 0 latency
+ vmovdqu 0x20+8(%rsp),$Z3 # I[5]
+ jmp .Loop6x
+
+.L6x_done:
+ vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled
+ vpxor $Z0,$Xi,$Xi # modulo-scheduled
+
+ ret
+.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
+___
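A note on the add $6<<24,$counter / jc .Lhandle_ctr32 pair at the top of .Loop6x: the counter block keeps a 32-bit big-endian counter in its last four bytes, and the fast path bumps it with a byte-wise vpaddb of .Lone_msb, which is only valid while the least-significant counter byte does not wrap. Loaded into a little-endian register that byte occupies bits 24-31, so adding 6<<24 sets the carry flag exactly when the slow path (byte-swap, vpaddd, swap back) is required. A stand-alone C version of the test, assuming a little-endian host as on x86_64:

#include <stdint.h>
#include <string.h>

/* Returns non-zero when stepping the 32-bit big-endian counter held in
 * ivec[12..15] by `step` would overflow its least-significant byte, i.e.
 * when byte-wise vpaddb increments are no longer safe. */
static int ctr32_needs_32bit_add(const unsigned char ivec[16], unsigned step)
{
    uint32_t c;
    memcpy(&c, ivec + 12, sizeof(c));   /* ivec[15] lands in bits 24..31 */
    return c + (step << 24) < c;        /* carry out of bit 31           */
}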
+######################################################################
+#
+# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
+# const AES_KEY *key, unsigned char iv[16],
+# struct { u128 Xi,H,Htbl[9]; } *Xip);
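For concreteness, a hedged caller sketch follows (the real caller is the generic GCM code in crypto/modes/gcm128.c; the wrapper below, the GCM_XIP name and the named struct are illustrative assumptions). The routines consume whole 96-byte chunks, return the number of bytes actually processed (0 below the minimum of 0x60 bytes for decryption and 0x60*3 for encryption), and update iv[16] with the next counter block; whatever is left over goes through the generic path.

#include <stddef.h>
#include <stdint.h>
#include <openssl/aes.h>

typedef struct { uint64_t hi, lo; } u128;          /* as in crypto/modes      */
typedef struct { u128 Xi, H, Htbl[9]; } GCM_XIP;   /* layout per comment above */

size_t aesni_gcm_encrypt(const void *inp, void *out, size_t len,
                         const AES_KEY *key, unsigned char ivec[16],
                         GCM_XIP *Xip);

/* Hypothetical wrapper: hand as much as possible to the stitched code and
 * report how many bytes remain for the caller's generic CTR+GHASH path. */
static size_t gcm_bulk_encrypt(const unsigned char *in, unsigned char *out,
                               size_t len, const AES_KEY *key,
                               unsigned char ivec[16], GCM_XIP *state)
{
    size_t done = 0;
    if (len >= 3 * 0x60)                /* minimal length accepted by the asm */
        done = aesni_gcm_encrypt(in, out, len, key, ivec, state);
    return len - done;                  /* bytes still to be processed        */
}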
+$code.=<<___;
+.globl aesni_gcm_decrypt
+.type aesni_gcm_decrypt,\@function,6
+.align 32
+aesni_gcm_decrypt:
+ xor $ret,$ret
+ cmp \$0x60,$len # minimal accepted length
+ jb .Lgcm_dec_abort
+
+ lea (%rsp),%rax # save stack pointer
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,-0xd8(%rax)
+ movaps %xmm7,-0xc8(%rax)
+ movaps %xmm8,-0xb8(%rax)
+ movaps %xmm9,-0xa8(%rax)
+ movaps %xmm10,-0x98(%rax)
+ movaps %xmm11,-0x88(%rax)
+ movaps %xmm12,-0x78(%rax)
+ movaps %xmm13,-0x68(%rax)
+ movaps %xmm14,-0x58(%rax)
+ movaps %xmm15,-0x48(%rax)
+.Lgcm_dec_body:
+___
+$code.=<<___;
+ vzeroupper
+
+ vmovdqu ($ivp),$T1 # input counter value
+ add \$-128,%rsp
+ mov 12($ivp),$counter
+ lea .Lbswap_mask(%rip),$const
+ lea -0x80($key),$in0 # borrow $in0
+ mov \$0xf80,$end0 # borrow $end0
+ vmovdqu ($Xip),$Xi # load Xi
+ and \$-128,%rsp # ensure stack alignment
+ vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
+ lea 0x80($key),$key # size optimization
+ lea 0x20+0x20($Xip),$Xip # size optimization
+ mov 0xf0-0x80($key),$rounds
+ vpshufb $Ii,$Xi,$Xi
+
+ and $end0,$in0
+ and %rsp,$end0
+ sub $in0,$end0
+ jc .Ldec_no_key_aliasing
+ cmp \$768,$end0
+ jnc .Ldec_no_key_aliasing
+ sub $end0,%rsp # avoid aliasing with key
+.Ldec_no_key_aliasing:
+
+ vmovdqu 0x50($inp),$Z3 # I[5]
+ lea ($inp),$in0
+ vmovdqu 0x40($inp),$Z0
+ lea -0xc0($inp,$len),$end0
+ vmovdqu 0x30($inp),$Z1
+ shr \$4,$len
+ xor $ret,$ret
+ vmovdqu 0x20($inp),$Z2
+ vpshufb $Ii,$Z3,$Z3 # passed to _aesni_ctr32_ghash_6x
+ vmovdqu 0x10($inp),$T2
+ vpshufb $Ii,$Z0,$Z0
+ vmovdqu ($inp),$Hkey
+ vpshufb $Ii,$Z1,$Z1
+ vmovdqu $Z0,0x30(%rsp)
+ vpshufb $Ii,$Z2,$Z2
+ vmovdqu $Z1,0x40(%rsp)
+ vpshufb $Ii,$T2,$T2
+ vmovdqu $Z2,0x50(%rsp)
+ vpshufb $Ii,$Hkey,$Hkey
+ vmovdqu $T2,0x60(%rsp)
+ vmovdqu $Hkey,0x70(%rsp)
+
+ call _aesni_ctr32_ghash_6x
+
+ vmovups $inout0,-0x60($out) # save output
+ vmovups $inout1,-0x50($out)
+ vmovups $inout2,-0x40($out)
+ vmovups $inout3,-0x30($out)
+ vmovups $inout4,-0x20($out)
+ vmovups $inout5,-0x10($out)
+
+ vpshufb ($const),$Xi,$Xi # .Lbswap_mask
+ vmovdqu $Xi,-0x40($Xip) # output Xi
+
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ movaps -0x68(%rax),%xmm13
+ movaps -0x58(%rax),%xmm14
+ movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp # restore %rsp
+.Lgcm_dec_abort:
+ mov $ret,%rax # return value
+ ret
+.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
+___
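One easy-to-miss detail of the prologue above is the .Ldec_no_key_aliasing sequence. Its apparent intent (an assumption, inferred from the "avoid aliasing with key" comment and the 0xf80 mask) is to keep the on-stack scratch area out of the L1 cache sets occupied by the key schedule: if the set-index bits of %rsp fall less than 768 bytes above those of the key, %rsp is lowered by the difference. A C restatement of the arithmetic:

#include <stdint.h>

/* Mirrors the and/and/sub/jc/cmp/jnc/sub sequence ending at
 * .Ldec_no_key_aliasing; 0xf80 masks the cache-set index bits. */
static uintptr_t avoid_key_aliasing(uintptr_t rsp, uintptr_t key)
{
    intptr_t diff = (intptr_t)(rsp & 0xf80) - (intptr_t)((key - 0x80) & 0xf80);
    if (diff >= 0 && diff < 768)
        rsp -= (uintptr_t)diff;         /* "sub $end0,%rsp" */
    return rsp;
}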
+
+$code.=<<___;
+.type _aesni_ctr32_6x,\@abi-omnipotent
+.align 32
+_aesni_ctr32_6x:
+ vmovdqu 0x00-0x80($key),$Z0 # borrow $Z0 for $rndkey
+ vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
+ lea -1($rounds),%r13
+ vmovups 0x10-0x80($key),$rndkey
+ lea 0x20-0x80($key),%r12
+ vpxor $Z0,$T1,$inout0
+ add \$`6<<24`,$counter
+ jc .Lhandle_ctr32_2
+ vpaddb $T2,$T1,$inout1
+ vpaddb $T2,$inout1,$inout2
+ vpxor $Z0,$inout1,$inout1
+ vpaddb $T2,$inout2,$inout3
+ vpxor $Z0,$inout2,$inout2
+ vpaddb $T2,$inout3,$inout4
+ vpxor $Z0,$inout3,$inout3
+ vpaddb $T2,$inout4,$inout5
+ vpxor $Z0,$inout4,$inout4
+ vpaddb $T2,$inout5,$T1
+ vpxor $Z0,$inout5,$inout5
+ jmp .Loop_ctr32
+
+.align 16
+.Loop_ctr32:
+ vaesenc $rndkey,$inout0,$inout0
+ vaesenc $rndkey,$inout1,$inout1
+ vaesenc $rndkey,$inout2,$inout2
+ vaesenc $rndkey,$inout3,$inout3
+ vaesenc $rndkey,$inout4,$inout4
+ vaesenc $rndkey,$inout5,$inout5
+ vmovups (%r12),$rndkey
+ lea 0x10(%r12),%r12
+ dec %r13d
+ jnz .Loop_ctr32
+
+ vmovdqu (%r12),$Hkey # last round key
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor 0x00($inp),$Hkey,$Z0
+ vaesenc $rndkey,$inout1,$inout1
+ vpxor 0x10($inp),$Hkey,$Z1
+ vaesenc $rndkey,$inout2,$inout2
+ vpxor 0x20($inp),$Hkey,$Z2
+ vaesenc $rndkey,$inout3,$inout3
+ vpxor 0x30($inp),$Hkey,$Xi
+ vaesenc $rndkey,$inout4,$inout4
+ vpxor 0x40($inp),$Hkey,$T2
+ vaesenc $rndkey,$inout5,$inout5
+ vpxor 0x50($inp),$Hkey,$Hkey
+ lea 0x60($inp),$inp
+
+ vaesenclast $Z0,$inout0,$inout0
+ vaesenclast $Z1,$inout1,$inout1
+ vaesenclast $Z2,$inout2,$inout2
+ vaesenclast $Xi,$inout3,$inout3
+ vaesenclast $T2,$inout4,$inout4
+ vaesenclast $Hkey,$inout5,$inout5
+ vmovups $inout0,0x00($out)
+ vmovups $inout1,0x10($out)
+ vmovups $inout2,0x20($out)
+ vmovups $inout3,0x30($out)
+ vmovups $inout4,0x40($out)
+ vmovups $inout5,0x50($out)
+ lea 0x60($out),$out
+
+ ret
+.align 32
+.Lhandle_ctr32_2:
+ vpshufb $Ii,$T1,$Z2 # byte-swap counter
+ vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb
+ vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb
+ vpaddd $Z1,$Z2,$inout2
+ vpaddd $Z1,$inout1,$inout3
+ vpshufb $Ii,$inout1,$inout1
+ vpaddd $Z1,$inout2,$inout4
+ vpshufb $Ii,$inout2,$inout2
+ vpxor $Z0,$inout1,$inout1
+ vpaddd $Z1,$inout3,$inout5
+ vpshufb $Ii,$inout3,$inout3
+ vpxor $Z0,$inout2,$inout2
+ vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value
+ vpshufb $Ii,$inout4,$inout4
+ vpxor $Z0,$inout3,$inout3
+ vpshufb $Ii,$inout5,$inout5
+ vpxor $Z0,$inout4,$inout4
+ vpshufb $Ii,$T1,$T1 # next counter value
+ vpxor $Z0,$inout5,$inout5
+ jmp .Loop_ctr32
+.size _aesni_ctr32_6x,.-_aesni_ctr32_6x
+
+.globl aesni_gcm_encrypt
+.type aesni_gcm_encrypt,\@function,6
+.align 32
+aesni_gcm_encrypt:
+ xor $ret,$ret
+ cmp \$0x60*3,$len # minimal accepted length
+ jb .Lgcm_enc_abort
+
+ lea (%rsp),%rax # save stack pointer
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,-0xd8(%rax)
+ movaps %xmm7,-0xc8(%rax)
+ movaps %xmm8,-0xb8(%rax)
+ movaps %xmm9,-0xa8(%rax)
+ movaps %xmm10,-0x98(%rax)
+ movaps %xmm11,-0x88(%rax)
+ movaps %xmm12,-0x78(%rax)
+ movaps %xmm13,-0x68(%rax)
+ movaps %xmm14,-0x58(%rax)
+ movaps %xmm15,-0x48(%rax)
+.Lgcm_enc_body:
+___
+$code.=<<___;
+ vzeroupper
+
+ vmovdqu ($ivp),$T1 # input counter value
+ add \$-128,%rsp
+ mov 12($ivp),$counter
+ lea .Lbswap_mask(%rip),$const
+ lea -0x80($key),$in0 # borrow $in0
+ mov \$0xf80,$end0 # borrow $end0
+ lea 0x80($key),$key # size optimization
+ vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
+ and \$-128,%rsp # ensure stack alignment
+ mov 0xf0-0x80($key),$rounds
+
+ and $end0,$in0
+ and %rsp,$end0
+ sub $in0,$end0
+ jc .Lenc_no_key_aliasing
+ cmp \$768,$end0
+ jnc .Lenc_no_key_aliasing
+ sub $end0,%rsp # avoid aliasing with key
+.Lenc_no_key_aliasing:
+
+ lea ($out),$in0
+ lea -0xc0($out,$len),$end0
+ shr \$4,$len
+
+ call _aesni_ctr32_6x
+ vpshufb $Ii,$inout0,$Xi # save bswapped output on stack
+ vpshufb $Ii,$inout1,$T2
+ vmovdqu $Xi,0x70(%rsp)
+ vpshufb $Ii,$inout2,$Z0
+ vmovdqu $T2,0x60(%rsp)
+ vpshufb $Ii,$inout3,$Z1
+ vmovdqu $Z0,0x50(%rsp)
+ vpshufb $Ii,$inout4,$Z2
+ vmovdqu $Z1,0x40(%rsp)
+ vpshufb $Ii,$inout5,$Z3 # passed to _aesni_ctr32_ghash_6x
+ vmovdqu $Z2,0x30(%rsp)
+
+ call _aesni_ctr32_6x
+
+ vmovdqu ($Xip),$Xi # load Xi
+ lea 0x20+0x20($Xip),$Xip # size optimization
+ sub \$12,$len
+ mov \$0x60*2,$ret
+ vpshufb $Ii,$Xi,$Xi
+
+ call _aesni_ctr32_ghash_6x
+ vmovdqu 0x20(%rsp),$Z3 # I[5]
+ vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
+ vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
+ vpunpckhqdq $Z3,$Z3,$T1
+ vmovdqu 0x20-0x20($Xip),$rndkey # borrow $rndkey for $HK
+ vmovups $inout0,-0x60($out) # save output
+ vpshufb $Ii,$inout0,$inout0 # but keep bswapped copy
+ vpxor $Z3,$T1,$T1
+ vmovups $inout1,-0x50($out)
+ vpshufb $Ii,$inout1,$inout1
+ vmovups $inout2,-0x40($out)
+ vpshufb $Ii,$inout2,$inout2
+ vmovups $inout3,-0x30($out)
+ vpshufb $Ii,$inout3,$inout3
+ vmovups $inout4,-0x20($out)
+ vpshufb $Ii,$inout4,$inout4
+ vmovups $inout5,-0x10($out)
+ vpshufb $Ii,$inout5,$inout5
+ vmovdqu $inout0,0x10(%rsp) # free $inout0
+___
+{ my ($HK,$T3)=($rndkey,$inout0);
+
+$code.=<<___;
+ vmovdqu 0x30(%rsp),$Z2 # I[4]
+ vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2
+ vpunpckhqdq $Z2,$Z2,$T2
+ vpclmulqdq \$0x00,$Hkey,$Z3,$Z1
+ vpxor $Z2,$T2,$T2
+ vpclmulqdq \$0x11,$Hkey,$Z3,$Z3
+ vpclmulqdq \$0x00,$HK,$T1,$T1
+
+ vmovdqu 0x40(%rsp),$T3 # I[3]
+ vpclmulqdq \$0x00,$Ii,$Z2,$Z0
+ vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3
+ vpxor $Z1,$Z0,$Z0
+ vpunpckhqdq $T3,$T3,$Z1
+ vpclmulqdq \$0x11,$Ii,$Z2,$Z2
+ vpxor $T3,$Z1,$Z1
+ vpxor $Z3,$Z2,$Z2
+ vpclmulqdq \$0x10,$HK,$T2,$T2
+ vmovdqu 0x50-0x20($Xip),$HK
+ vpxor $T1,$T2,$T2
+
+ vmovdqu 0x50(%rsp),$T1 # I[2]
+ vpclmulqdq \$0x00,$Hkey,$T3,$Z3
+ vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4
+ vpxor $Z0,$Z3,$Z3
+ vpunpckhqdq $T1,$T1,$Z0
+ vpclmulqdq \$0x11,$Hkey,$T3,$T3
+ vpxor $T1,$Z0,$Z0
+ vpxor $Z2,$T3,$T3
+ vpclmulqdq \$0x00,$HK,$Z1,$Z1
+ vpxor $T2,$Z1,$Z1
+
+ vmovdqu 0x60(%rsp),$T2 # I[1]
+ vpclmulqdq \$0x00,$Ii,$T1,$Z2
+ vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5
+ vpxor $Z3,$Z2,$Z2
+ vpunpckhqdq $T2,$T2,$Z3
+ vpclmulqdq \$0x11,$Ii,$T1,$T1
+ vpxor $T2,$Z3,$Z3
+ vpxor $T3,$T1,$T1
+ vpclmulqdq \$0x10,$HK,$Z0,$Z0
+ vmovdqu 0x80-0x20($Xip),$HK
+ vpxor $Z1,$Z0,$Z0
+
+ vpxor 0x70(%rsp),$Xi,$Xi # accumulate I[0]
+ vpclmulqdq \$0x00,$Hkey,$T2,$Z1
+ vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6
+ vpunpckhqdq $Xi,$Xi,$T3
+ vpxor $Z2,$Z1,$Z1
+ vpclmulqdq \$0x11,$Hkey,$T2,$T2
+ vpxor $Xi,$T3,$T3
+ vpxor $T1,$T2,$T2
+ vpclmulqdq \$0x00,$HK,$Z3,$Z3
+ vpxor $Z0,$Z3,$Z0
+
+ vpclmulqdq \$0x00,$Ii,$Xi,$Z2
+ vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
+ vpunpckhqdq $inout5,$inout5,$T1
+ vpclmulqdq \$0x11,$Ii,$Xi,$Xi
+ vpxor $inout5,$T1,$T1
+ vpxor $Z1,$Z2,$Z1
+ vpclmulqdq \$0x10,$HK,$T3,$T3
+ vmovdqu 0x20-0x20($Xip),$HK
+ vpxor $T2,$Xi,$Z3
+ vpxor $Z0,$T3,$Z2
+
+ vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2
+ vpxor $Z1,$Z3,$T3 # aggregated Karatsuba post-processing
+ vpclmulqdq \$0x00,$Hkey,$inout5,$Z0
+ vpxor $T3,$Z2,$Z2
+ vpunpckhqdq $inout4,$inout4,$T2
+ vpclmulqdq \$0x11,$Hkey,$inout5,$inout5
+ vpxor $inout4,$T2,$T2
+ vpslldq \$8,$Z2,$T3
+ vpclmulqdq \$0x00,$HK,$T1,$T1
+ vpxor $T3,$Z1,$Xi
+ vpsrldq \$8,$Z2,$Z2
+ vpxor $Z2,$Z3,$Z3
+
+ vpclmulqdq \$0x00,$Ii,$inout4,$Z1
+ vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3
+ vpxor $Z0,$Z1,$Z1
+ vpunpckhqdq $inout3,$inout3,$T3
+ vpclmulqdq \$0x11,$Ii,$inout4,$inout4
+ vpxor $inout3,$T3,$T3
+ vpxor $inout5,$inout4,$inout4
+ vpalignr \$8,$Xi,$Xi,$inout5 # 1st phase
+ vpclmulqdq \$0x10,$HK,$T2,$T2
+ vmovdqu 0x50-0x20($Xip),$HK
+ vpxor $T1,$T2,$T2
+
+ vpclmulqdq \$0x00,$Hkey,$inout3,$Z0
+ vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4
+ vpxor $Z1,$Z0,$Z0
+ vpunpckhqdq $inout2,$inout2,$T1
+ vpclmulqdq \$0x11,$Hkey,$inout3,$inout3
+ vpxor $inout2,$T1,$T1
+ vpxor $inout4,$inout3,$inout3
+ vxorps 0x10(%rsp),$Z3,$Z3 # accumulate $inout0
+ vpclmulqdq \$0x00,$HK,$T3,$T3
+ vpxor $T2,$T3,$T3
+
+ vpclmulqdq \$0x10,0x10($const),$Xi,$Xi
+ vxorps $inout5,$Xi,$Xi
+
+ vpclmulqdq \$0x00,$Ii,$inout2,$Z1
+ vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5
+ vpxor $Z0,$Z1,$Z1
+ vpunpckhqdq $inout1,$inout1,$T2
+ vpclmulqdq \$0x11,$Ii,$inout2,$inout2
+ vpxor $inout1,$T2,$T2
+ vpalignr \$8,$Xi,$Xi,$inout5 # 2nd phase
+ vpxor $inout3,$inout2,$inout2
+ vpclmulqdq \$0x10,$HK,$T1,$T1
+ vmovdqu 0x80-0x20($Xip),$HK
+ vpxor $T3,$T1,$T1
+
+ vxorps $Z3,$inout5,$inout5
+ vpclmulqdq \$0x10,0x10($const),$Xi,$Xi
+ vxorps $inout5,$Xi,$Xi
+
+ vpclmulqdq \$0x00,$Hkey,$inout1,$Z0
+ vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6
+ vpxor $Z1,$Z0,$Z0
+ vpunpckhqdq $Xi,$Xi,$T3
+ vpclmulqdq \$0x11,$Hkey,$inout1,$inout1
+ vpxor $Xi,$T3,$T3
+ vpxor $inout2,$inout1,$inout1
+ vpclmulqdq \$0x00,$HK,$T2,$T2
+ vpxor $T1,$T2,$T2
+
+ vpclmulqdq \$0x00,$Ii,$Xi,$Z1
+ vpclmulqdq \$0x11,$Ii,$Xi,$Z3
+ vpxor $Z0,$Z1,$Z1
+ vpclmulqdq \$0x10,$HK,$T3,$Z2
+ vpxor $inout1,$Z3,$Z3
+ vpxor $T2,$Z2,$Z2
+
+ vpxor $Z1,$Z3,$Z0 # aggregated Karatsuba post-processing
+ vpxor $Z0,$Z2,$Z2
+ vpslldq \$8,$Z2,$T1
+ vmovdqu 0x10($const),$Hkey # .Lpoly
+ vpsrldq \$8,$Z2,$Z2
+ vpxor $T1,$Z1,$Xi
+ vpxor $Z2,$Z3,$Z3
+
+ vpalignr \$8,$Xi,$Xi,$T2 # 1st phase
+ vpclmulqdq \$0x10,$Hkey,$Xi,$Xi
+ vpxor $T2,$Xi,$Xi
+
+ vpalignr \$8,$Xi,$Xi,$T2 # 2nd phase
+ vpclmulqdq \$0x10,$Hkey,$Xi,$Xi
+ vpxor $Z3,$T2,$T2
+ vpxor $T2,$Xi,$Xi
+___
+}
+$code.=<<___;
+ vpshufb ($const),$Xi,$Xi # .Lbswap_mask
+ vmovdqu $Xi,-0x40($Xip) # output Xi
+
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ movaps -0x68(%rax),%xmm13
+ movaps -0x58(%rax),%xmm14
+ movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp # restore %rsp
+.Lgcm_enc_abort:
+ mov $ret,%rax # return value
+ ret
+.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
+___
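Both aesni_gcm_encrypt above and the GHASH modules later in this patch ultimately compute the GF(2^128) product defined by the GCM specification. As a reference point, the following is a bit-serial multiply in the standard right-shift formulation of NIST SP 800-38D; it is a clarity sketch only, since the assembly works in a byte-reflected, "twisted" representation (hence the 0xc2 byte in .Lpoly below) while computing the same function.

#include <stdint.h>

/* 128-bit block in GCM bit order: hi holds bytes 0..7 (bit 0 is the most
 * significant bit of hi), lo holds bytes 8..15. */
typedef struct { uint64_t hi, lo; } be128;

/* Bit-serial reference multiply in GF(2^128), right-shift formulation with
 * R = 0xE1 || 0^120.  Slow but unambiguous. */
static be128 gf128_mul(be128 X, be128 Y)
{
    be128 Z = { 0, 0 }, V = Y;

    for (int i = 0; i < 128; i++) {
        uint64_t xbit = (i < 64) ? (X.hi >> (63 - i)) & 1
                                 : (X.lo >> (127 - i)) & 1;
        if (xbit) {
            Z.hi ^= V.hi;
            Z.lo ^= V.lo;
        }
        uint64_t lsb = V.lo & 1;                 /* bit 127 of V          */
        V.lo = (V.lo >> 1) | (V.hi << 63);       /* V >>= 1 (bit i->i+1)  */
        V.hi >>= 1;
        if (lsb)
            V.hi ^= 0xE100000000000000ULL;       /* conditionally add R   */
    }
    return Z;
}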
+
+$code.=<<___;
+.align 64
+.Lbswap_mask:
+ .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lpoly:
+ .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.Lone_msb:
+ .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Ltwo_lsb:
+ .byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.Lone_lsb:
+ .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.asciz "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___
+.extern __imp_RtlVirtualUnwind
+.type gcm_se_handler,\@abi-omnipotent
+.align 16
+gcm_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 120($context),%rax # pull context->Rax
+
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ mov %r15,240($context)
+ mov %r14,232($context)
+ mov %r13,224($context)
+ mov %r12,216($context)
+ mov %rbp,160($context)
+ mov %rbx,144($context)
+
+ lea -0xd8(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # & context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size gcm_se_handler,.-gcm_se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_aesni_gcm_decrypt
+ .rva .LSEH_end_aesni_gcm_decrypt
+ .rva .LSEH_gcm_dec_info
+
+ .rva .LSEH_begin_aesni_gcm_encrypt
+ .rva .LSEH_end_aesni_gcm_encrypt
+ .rva .LSEH_gcm_enc_info
+.section .xdata
+.align 8
+.LSEH_gcm_dec_info:
+ .byte 9,0,0,0
+ .rva gcm_se_handler
+ .rva .Lgcm_dec_body,.Lgcm_dec_abort
+.LSEH_gcm_enc_info:
+ .byte 9,0,0,0
+ .rva gcm_se_handler
+ .rva .Lgcm_enc_body,.Lgcm_enc_abort
+___
+}
+}}} else {{{
+$code=<<___; # assembler is too old
+.text
+
+.globl aesni_gcm_encrypt
+.type aesni_gcm_encrypt,\@abi-omnipotent
+aesni_gcm_encrypt:
+ xor %eax,%eax
+ ret
+.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
+
+.globl aesni_gcm_decrypt
+.type aesni_gcm_decrypt,\@abi-omnipotent
+aesni_gcm_decrypt:
+ xor %eax,%eax
+ ret
+.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
+___
+}}}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/openssl/crypto/modes/asm/ghash-armv4.pl b/openssl/crypto/modes/asm/ghash-armv4.pl
index d91586ee2..77fbf3446 100644
--- a/openssl/crypto/modes/asm/ghash-armv4.pl
+++ b/openssl/crypto/modes/asm/ghash-armv4.pl
@@ -35,6 +35,20 @@
# Add NEON implementation featuring polynomial multiplication, i.e. no
# lookup tables involved. On Cortex A8 it was measured to process one
# byte in 15 cycles or 55% faster than integer-only code.
+#
+# April 2014
+#
+# Switch to the multiplication algorithm suggested in the paper
+# referred to below and combine it with the reduction algorithm from
+# the x86 module. The performance improvement over the previous
+# version varies from 65% on Snapdragon S4 to 110% on Cortex-A9. In
+# absolute terms Cortex-A8 processes one byte in 8.45 cycles, A9 in
+# 10.2, and Snapdragon S4 in 9.33.
+#
+# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
+# Polynomial Multiplication on ARM Processors using the NEON Engine.
+#
+# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
# ====================================================================
# Note about "528B" variant. In ARM case it makes lesser sense to
@@ -303,117 +317,161 @@ $code.=<<___;
.size gcm_gmult_4bit,.-gcm_gmult_4bit
___
{
-my $cnt=$Htbl; # $Htbl is used once in the very beginning
-
-my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
-my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
-
-# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
-# in Zo. Or should I say "top bit", because GHASH is specified in
-# reverse bit order? Otherwise straightforward 128-bt H by one input
-# byte multiplication and modulo-reduction, times 16.
+my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
+my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
+my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));
-sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
-sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
-sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
+sub clmul64x64 {
+my ($r,$a,$b)=@_;
+$code.=<<___;
+ vext.8 $t0#lo, $a, $a, #1 @ A1
+ vmull.p8 $t0, $t0#lo, $b @ F = A1*B
+ vext.8 $r#lo, $b, $b, #1 @ B1
+ vmull.p8 $r, $a, $r#lo @ E = A*B1
+ vext.8 $t1#lo, $a, $a, #2 @ A2
+ vmull.p8 $t1, $t1#lo, $b @ H = A2*B
+ vext.8 $t3#lo, $b, $b, #2 @ B2
+ vmull.p8 $t3, $a, $t3#lo @ G = A*B2
+ vext.8 $t2#lo, $a, $a, #3 @ A3
+ veor $t0, $t0, $r @ L = E + F
+ vmull.p8 $t2, $t2#lo, $b @ J = A3*B
+ vext.8 $r#lo, $b, $b, #3 @ B3
+ veor $t1, $t1, $t3 @ M = G + H
+ vmull.p8 $r, $a, $r#lo @ I = A*B3
+ veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
+ vand $t0#hi, $t0#hi, $k48
+ vext.8 $t3#lo, $b, $b, #4 @ B4
+ veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
+ vand $t1#hi, $t1#hi, $k32
+ vmull.p8 $t3, $a, $t3#lo @ K = A*B4
+ veor $t2, $t2, $r @ N = I + J
+ veor $t0#lo, $t0#lo, $t0#hi
+ veor $t1#lo, $t1#lo, $t1#hi
+ veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
+ vand $t2#hi, $t2#hi, $k16
+ vext.8 $t0, $t0, $t0, #15
+ veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
+ vmov.i64 $t3#hi, #0
+ vext.8 $t1, $t1, $t1, #14
+ veor $t2#lo, $t2#lo, $t2#hi
+ vmull.p8 $r, $a, $b @ D = A*B
+ vext.8 $t3, $t3, $t3, #12
+ vext.8 $t2, $t2, $t2, #13
+ veor $t0, $t0, $t1
+ veor $t2, $t2, $t3
+ veor $r, $r, $t0
+ veor $r, $r, $t2
+___
+}
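The clmul64x64 macro above assembles a 64x64-bit carry-less product from eight vmull.p8 partial multiplications, following the Câmara et al. paper cited earlier. As a correctness reference (not the NEON decomposition itself), the same product can be written bit-serially:

#include <stdint.h>

/* Reference 64x64 -> 128-bit carry-less (polynomial) multiplication, the
 * operation clmul64x64 implements with vmull.p8 building blocks.
 * hi:lo is the 128-bit product of a and b over GF(2). */
static void clmul64_ref(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
    uint64_t h = 0, l = 0;

    for (int i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
            l ^= a << i;
            if (i)                      /* bits of a shifted past bit 63 */
                h ^= a >> (64 - i);
        }
    }
    *hi = h;
    *lo = l;
}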
$code.=<<___;
-#if __ARM_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
.fpu neon
+.global gcm_init_neon
+.type gcm_init_neon,%function
+.align 4
+gcm_init_neon:
+ vld1.64 $IN#hi,[r1,:64]! @ load H
+ vmov.i8 $t0,#0xe1
+ vld1.64 $IN#lo,[r1,:64]
+ vshl.i64 $t0#hi,#57
+ vshr.u64 $t0#lo,#63 @ t0=0xc2....01
+ vdup.8 $t1,$IN#hi[7]
+ vshr.u64 $Hlo,$IN#lo,#63
+ vshr.s8 $t1,#7 @ broadcast carry bit
+ vshl.i64 $IN,$IN,#1
+ vand $t0,$t0,$t1
+ vorr $IN#hi,$Hlo @ H<<<=1
+ veor $IN,$IN,$t0 @ twisted H
+ vstmia r0,{$IN}
+
+ ret @ bx lr
+.size gcm_init_neon,.-gcm_init_neon
+
.global gcm_gmult_neon
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
- sub $Htbl,#16 @ point at H in GCM128_CTX
- vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi
- vmov.i32 $mod,#0xe1 @ our irreducible polynomial
- vld1.64 `&Dlo("$IN")`,[$Xi,:64]!
- vshr.u64 $mod,#32
- vldmia $Htbl,{$Hhi-$Hlo} @ load H
- veor $zero,$zero
+ vld1.64 $IN#hi,[$Xi,:64]! @ load Xi
+ vld1.64 $IN#lo,[$Xi,:64]!
+ vmov.i64 $k48,#0x0000ffffffffffff
+ vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
+ vmov.i64 $k32,#0x00000000ffffffff
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
- veor $Qpost,$Qpost
- veor $R,$R
- mov $cnt,#16
- veor $Z,$Z
+ vmov.i64 $k16,#0x000000000000ffff
+ veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
mov $len,#16
- veor $Zo,$Zo
- vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
- b .Linner_neon
+ b .Lgmult_neon
.size gcm_gmult_neon,.-gcm_gmult_neon
.global gcm_ghash_neon
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
- vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi
- vmov.i32 $mod,#0xe1 @ our irreducible polynomial
- vld1.64 `&Dlo("$Z")`,[$Xi,:64]!
- vshr.u64 $mod,#32
- vldmia $Xi,{$Hhi-$Hlo} @ load H
- veor $zero,$zero
- nop
+ vld1.64 $Xl#hi,[$Xi,:64]! @ load Xi
+ vld1.64 $Xl#lo,[$Xi,:64]!
+ vmov.i64 $k48,#0x0000ffffffffffff
+ vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
+ vmov.i64 $k32,#0x00000000ffffffff
#ifdef __ARMEL__
- vrev64.8 $Z,$Z
+ vrev64.8 $Xl,$Xl
#endif
-.Louter_neon:
- vld1.64 `&Dhi($IN)`,[$inp]! @ load inp
- veor $Qpost,$Qpost
- vld1.64 `&Dlo($IN)`,[$inp]!
- veor $R,$R
- mov $cnt,#16
+ vmov.i64 $k16,#0x000000000000ffff
+ veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
+
+.Loop_neon:
+ vld1.64 $IN#hi,[$inp]! @ load inp
+ vld1.64 $IN#lo,[$inp]!
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
- veor $Zo,$Zo
- veor $IN,$Z @ inp^=Xi
- veor $Z,$Z
- vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
-.Linner_neon:
- subs $cnt,$cnt,#1
- vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i]
- vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i]
- vext.8 $IN,$zero,#1 @ IN>>=8
-
- veor $Z,$Qpost @ modulo-scheduled part
- vshl.i64 `&Dlo("$R")`,#48
- vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
- veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
-
- veor `&Dhi("$Z")`,`&Dlo("$R")`
- vuzp.8 $Qlo,$Qhi
- vsli.8 $Zo,$T,#1 @ compose the "carry" byte
- vext.8 $Z,$zero,#1 @ Z>>=8
-
- vmull.p8 $R,$Zo,$mod @ "carry"·0xe1
- vshr.u8 $Zo,$T,#7 @ save Z's bottom bit
- vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8
- veor $Z,$Qhi
- bne .Linner_neon
-
- veor $Z,$Qpost @ modulo-scheduled artefact
- vshl.i64 `&Dlo("$R")`,#48
- veor `&Dhi("$Z")`,`&Dlo("$R")`
-
- @ finalization, normalize Z:Zo
- vand $Zo,$mod @ suffices to mask the bit
- vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
- vshl.i64 $Z,#1
+ veor $IN,$Xl @ inp^=Xi
+.Lgmult_neon:
+___
+ &clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo
+$code.=<<___;
+ veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing
+___
+ &clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
+ &clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi
+$code.=<<___;
+ veor $Xm,$Xm,$Xl @ Karatsuba post-processing
+ veor $Xm,$Xm,$Xh
+ veor $Xl#hi,$Xl#hi,$Xm#lo
+ veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result
+
+ @ equivalent of reduction_avx from ghash-x86_64.pl
+ vshl.i64 $t1,$Xl,#57 @ 1st phase
+ vshl.i64 $t2,$Xl,#62
+ veor $t2,$t2,$t1 @
+ vshl.i64 $t1,$Xl,#63
+ veor $t2, $t2, $t1 @
+ veor $Xl#hi,$Xl#hi,$t2#lo @
+ veor $Xh#lo,$Xh#lo,$t2#hi
+
+ vshr.u64 $t2,$Xl,#1 @ 2nd phase
+ veor $Xh,$Xh,$Xl
+ veor $Xl,$Xl,$t2 @
+ vshr.u64 $t2,$t2,#6
+ vshr.u64 $Xl,$Xl,#1 @
+ veor $Xl,$Xl,$Xh @
+ veor $Xl,$Xl,$t2 @
+
subs $len,#16
- vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1
- bne .Louter_neon
+ bne .Loop_neon
#ifdef __ARMEL__
- vrev64.8 $Z,$Z
+ vrev64.8 $Xl,$Xl
#endif
sub $Xi,#16
- vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi
- vst1.64 `&Dlo("$Z")`,[$Xi,:64]
+ vst1.64 $Xl#hi,[$Xi,:64]! @ write out Xi
+ vst1.64 $Xl#lo,[$Xi,:64]
- bx lr
+ ret @ bx lr
.size gcm_ghash_neon,.-gcm_ghash_neon
#endif
___
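gcm_init_neon above does not store H itself but a "twisted" copy: H is shifted left by one bit and, when the bit shifted out was set, the 128-bit constant with 0xE1<<57 in the high word and 1 in the low word is XORed back in. The same pre-processing appears in gcm_init_vis3 and gcm_init_clmul elsewhere in this patch. A stand-alone C version of that arithmetic, leaving the byte-order handling of the surrounding code aside:

#include <stdint.h>

/* "Twisted H" pre-processing as performed by gcm_init_neon: shift H left by
 * one bit and, if the bit shifted out was set, XOR in the 128-bit constant
 * 0xC2000000000000000000000000000001 (0xE1 << 57 in the high word, 1 in the
 * low word; the "57 is not a typo" comments refer to this truncation). */
typedef struct { uint64_t hi, lo; } u128;

static u128 gcm_twist_H(u128 H)
{
    uint64_t carry = H.hi >> 63;                 /* bit shifted out       */
    u128 T;

    T.hi = (H.hi << 1) | (H.lo >> 63);           /* H <<= 1               */
    T.lo = H.lo << 1;
    if (carry) {
        T.hi ^= 0xE1ULL << 57;                   /* == 0xC200000000000000 */
        T.lo ^= 1;
    }
    return T;
}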
@@ -423,7 +481,13 @@ $code.=<<___;
.align 2
___
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
-print $code;
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
+ s/\bret\b/bx lr/go or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
+
+ print $_,"\n";
+}
close STDOUT; # enforce flush
diff --git a/openssl/crypto/modes/asm/ghash-s390x.pl b/openssl/crypto/modes/asm/ghash-s390x.pl
index 6a40d5d89..39096b423 100644
--- a/openssl/crypto/modes/asm/ghash-s390x.pl
+++ b/openssl/crypto/modes/asm/ghash-s390x.pl
@@ -186,13 +186,13 @@ $code.=<<___;
sllg $rem1,$Zlo,3
xgr $Zlo,$tmp
ngr $rem1,$x78
+ sllg $tmp,$Zhi,60
j .Lghash_inner
.align 16
.Lghash_inner:
srlg $Zlo,$Zlo,4
- sllg $tmp,$Zhi,60
- xg $Zlo,8($nlo,$Htbl)
srlg $Zhi,$Zhi,4
+ xg $Zlo,8($nlo,$Htbl)
llgc $xi,0($cnt,$Xi)
xg $Zhi,0($nlo,$Htbl)
sllg $nlo,$xi,4
@@ -213,9 +213,9 @@ $code.=<<___;
sllg $rem1,$Zlo,3
xgr $Zlo,$tmp
ngr $rem1,$x78
+ sllg $tmp,$Zhi,60
brct $cnt,.Lghash_inner
- sllg $tmp,$Zhi,60
srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
xg $Zlo,8($nlo,$Htbl)
diff --git a/openssl/crypto/modes/asm/ghash-sparcv9.pl b/openssl/crypto/modes/asm/ghash-sparcv9.pl
index 70e7b044a..0365e0f1f 100644
--- a/openssl/crypto/modes/asm/ghash-sparcv9.pl
+++ b/openssl/crypto/modes/asm/ghash-sparcv9.pl
@@ -36,6 +36,15 @@
# references to input data and Z.hi updates to achieve 12 cycles
# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
+#
+# October 2012
+#
+# Add a VIS3 lookup-table-free implementation using the polynomial
+# multiplication instructions xmulx[hi] and the extended additions
+# addxc[cc]. This is a 4.52/7.63x improvement on T3/T4, or in absolute
+# terms 7.90/2.14 cycles per byte. On T4 the multi-process benchmark
+# saturates at ~15.5x the single-process result on an 8-core
+# processor, or ~20.5GBps per 2.85GHz socket.
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
@@ -66,6 +75,10 @@ $Htbl="%i1";
$inp="%i2";
$len="%i3";
+$code.=<<___ if ($bits==64);
+.register %g2,#scratch
+.register %g3,#scratch
+___
$code.=<<___;
.section ".text",#alloc,#execinstr
@@ -321,10 +334,238 @@ gcm_gmult_4bit:
restore
.type gcm_gmult_4bit,#function
.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
-.asciz "GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+{{{
+# Straightforward 128x128-bit multiplication using the Karatsuba
+# algorithm followed by a pair of 64-bit reductions [with a shortcut
+# in the first one, which makes it possible to break the dependency
+# between the reductions and remove one multiplication from the
+# critical path]. While it might be suboptimal with regard to the
+# sheer number of multiplications, other methods [such as aggregate
+# reduction] would require more 64-bit registers, which we don't have
+# in a 32-bit application context.
+
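For concreteness, here is the Karatsuba identity the comment above relies on, with a bit-serial 64x64 carry-less multiply standing in for xmulx/xmulxhi (a reference sketch, not the VIS3 scheduling):

#include <stdint.h>

/* Reference 64x64 -> 128-bit carry-less multiply (stand-in for xmulx and
 * xmulxhi). */
static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
    uint64_t h = 0, l = 0;
    for (int i = 0; i < 64; i++)
        if ((b >> i) & 1) {
            l ^= a << i;
            if (i) h ^= a >> (64 - i);
        }
    *hi = h; *lo = l;
}

/* Karatsuba 128x128 -> 256-bit carry-less multiply using three 64x64
 * multiplies: with A = Ah:Al, B = Bh:Bl, over GF(2)
 *   A*B = (Ah*Bh) << 128  ^  (Ah*Bh ^ Al*Bl ^ (Ah^Al)*(Bh^Bl)) << 64  ^  Al*Bl
 * The result is returned as four 64-bit words, r[0] least significant. */
static void clmul128_karatsuba(uint64_t Ah, uint64_t Al,
                               uint64_t Bh, uint64_t Bl, uint64_t r[4])
{
    uint64_t lo_h, lo_l, hi_h, hi_l, mid_h, mid_l;

    clmul64(Al, Bl, &lo_h, &lo_l);               /* L  = Al*Bl            */
    clmul64(Ah, Bh, &hi_h, &hi_l);               /* H  = Ah*Bh            */
    clmul64(Ah ^ Al, Bh ^ Bl, &mid_h, &mid_l);   /* M' = (Ah^Al)*(Bh^Bl)  */

    mid_h ^= lo_h ^ hi_h;                        /* Karatsuba post-        */
    mid_l ^= lo_l ^ hi_l;                        /* processing: M = M'^L^H */

    r[0] = lo_l;
    r[1] = lo_h ^ mid_l;                         /* M is "slid" by 64 bits */
    r[2] = hi_l ^ mid_h;
    r[3] = hi_h;
}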
+($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));
+
+($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
+ (map("%o$_",(0..5,7)),map("%g$_",(1..5)));
+
+($shl,$shr)=map("%l$_",(0..7));
+
+# For details regarding "twisted H" see ghash-x86.pl.
+$code.=<<___;
+.globl gcm_init_vis3
+.align 32
+gcm_init_vis3:
+ save %sp,-$frame,%sp
+
+ ldx [%i1+0],$Hhi
+ ldx [%i1+8],$Hlo
+ mov 0xE1,$Xhi
+ mov 1,$Xlo
+ sllx $Xhi,57,$Xhi
+ srax $Hhi,63,$C0 ! broadcast carry
+ addcc $Hlo,$Hlo,$Hlo ! H<<=1
+ addxc $Hhi,$Hhi,$Hhi
+ and $C0,$Xlo,$Xlo
+ and $C0,$Xhi,$Xhi
+ xor $Xlo,$Hlo,$Hlo
+ xor $Xhi,$Hhi,$Hhi
+ stx $Hlo,[%i0+8] ! save twisted H
+ stx $Hhi,[%i0+0]
+
+ sethi %hi(0xA0406080),$V
+ sethi %hi(0x20C0E000),%l0
+ or $V,%lo(0xA0406080),$V
+ or %l0,%lo(0x20C0E000),%l0
+ sllx $V,32,$V
+ or %l0,$V,$V ! (0xE0·i)&0xff=0xA040608020C0E000
+ stx $V,[%i0+16]
+
+ ret
+ restore
+.type gcm_init_vis3,#function
+.size gcm_init_vis3,.-gcm_init_vis3
+
+.globl gcm_gmult_vis3
+.align 32
+gcm_gmult_vis3:
+ save %sp,-$frame,%sp
+
+ ldx [$Xip+8],$Xlo ! load Xi
+ ldx [$Xip+0],$Xhi
+ ldx [$Htable+8],$Hlo ! load twisted H
+ ldx [$Htable+0],$Hhi
+
+ mov 0xE1,%l7
+ sllx %l7,57,$xE1 ! 57 is not a typo
+ ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000
+
+ xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing
+ xmulx $Xlo,$Hlo,$C0
+ xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing
+ xmulx $C2,$Hhl,$C1
+ xmulxhi $Xlo,$Hlo,$Xlo
+ xmulxhi $C2,$Hhl,$C2
+ xmulxhi $Xhi,$Hhi,$C3
+ xmulx $Xhi,$Hhi,$Xhi
+
+ sll $C0,3,$sqr
+ srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)]
+ xor $C0,$sqr,$sqr
+ sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
+
+ xor $C0,$C1,$C1 ! Karatsuba post-processing
+ xor $Xlo,$C2,$C2
+ xor $sqr,$Xlo,$Xlo ! real destination is $C1
+ xor $C3,$C2,$C2
+ xor $Xlo,$C1,$C1
+ xor $Xhi,$C2,$C2
+ xor $Xhi,$C1,$C1
+
+ xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56
+ xor $C0,$C2,$C2
+ xmulx $C1,$xE1,$C0
+ xor $C1,$C3,$C3
+ xmulxhi $C1,$xE1,$C1
+
+ xor $Xlo,$C2,$C2
+ xor $C0,$C2,$C2
+ xor $C1,$C3,$C3
+
+ stx $C2,[$Xip+8] ! save Xi
+ stx $C3,[$Xip+0]
+
+ ret
+ restore
+.type gcm_gmult_vis3,#function
+.size gcm_gmult_vis3,.-gcm_gmult_vis3
+
+.globl gcm_ghash_vis3
+.align 32
+gcm_ghash_vis3:
+ save %sp,-$frame,%sp
+
+ ldx [$Xip+8],$C2 ! load Xi
+ ldx [$Xip+0],$C3
+ ldx [$Htable+8],$Hlo ! load twisted H
+ ldx [$Htable+0],$Hhi
+
+ mov 0xE1,%l7
+ sllx %l7,57,$xE1 ! 57 is not a typo
+ ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000
+
+ and $inp,7,$shl
+ andn $inp,7,$inp
+ sll $shl,3,$shl
+ prefetch [$inp+63], 20
+ sub %g0,$shl,$shr
+
+ xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing
+.Loop:
+ ldx [$inp+8],$Xlo
+ brz,pt $shl,1f
+ ldx [$inp+0],$Xhi
+
+ ldx [$inp+16],$C1 ! align data
+ srlx $Xlo,$shr,$C0
+ sllx $Xlo,$shl,$Xlo
+ sllx $Xhi,$shl,$Xhi
+ srlx $C1,$shr,$C1
+ or $C0,$Xhi,$Xhi
+ or $C1,$Xlo,$Xlo
+1:
+ add $inp,16,$inp
+ sub $len,16,$len
+ xor $C2,$Xlo,$Xlo
+ xor $C3,$Xhi,$Xhi
+ prefetch [$inp+63], 20
+
+ xmulx $Xlo,$Hlo,$C0
+ xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing
+ xmulx $C2,$Hhl,$C1
+ xmulxhi $Xlo,$Hlo,$Xlo
+ xmulxhi $C2,$Hhl,$C2
+ xmulxhi $Xhi,$Hhi,$C3
+ xmulx $Xhi,$Hhi,$Xhi
+
+ sll $C0,3,$sqr
+ srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)]
+ xor $C0,$sqr,$sqr
+ sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
+
+ xor $C0,$C1,$C1 ! Karatsuba post-processing
+ xor $Xlo,$C2,$C2
+ xor $sqr,$Xlo,$Xlo ! real destination is $C1
+ xor $C3,$C2,$C2
+ xor $Xlo,$C1,$C1
+ xor $Xhi,$C2,$C2
+ xor $Xhi,$C1,$C1
+
+ xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56
+ xor $C0,$C2,$C2
+ xmulx $C1,$xE1,$C0
+ xor $C1,$C3,$C3
+ xmulxhi $C1,$xE1,$C1
+
+ xor $Xlo,$C2,$C2
+ xor $C0,$C2,$C2
+ brnz,pt $len,.Loop
+ xor $C1,$C3,$C3
+
+ stx $C2,[$Xip+8] ! save Xi
+ stx $C3,[$Xip+0]
+
+ ret
+ restore
+.type gcm_ghash_vis3,#function
+.size gcm_ghash_vis3,.-gcm_ghash_vis3
+___
+}}}
+$code.=<<___;
+.asciz "GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-print $code;
+
+# The purpose of these subroutines is to explicitly encode VIS
+# instructions, so that one can compile the module without having to
+# specify VIS extensions on the compiler command line, e.g. -xarch=v9
+# vs. -xarch=v9a. The idea is to preserve the option of producing a
+# "universal" binary and to let the program detect at run-time whether
+# the current CPU is VIS capable.
+sub unvis3 {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
+my ($ref,$opf);
+my %visopf = ( "addxc" => 0x011,
+ "addxccc" => 0x013,
+ "xmulx" => 0x115,
+ "xmulxhi" => 0x116 );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+ if ($opf=$visopf{$mnemonic}) {
+ foreach ($rs1,$rs2,$rd) {
+ return $ref if (!/%([goli])([0-9])/);
+ $_=$bias{$1}+$2;
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+ s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
+ &unvis3($1,$2,$3,$4)
+ /ge;
+
+ print $_,"\n";
+}
+
close STDOUT;
diff --git a/openssl/crypto/modes/asm/ghash-x86.pl b/openssl/crypto/modes/asm/ghash-x86.pl
index 83c727e07..23a5527b3 100644
--- a/openssl/crypto/modes/asm/ghash-x86.pl
+++ b/openssl/crypto/modes/asm/ghash-x86.pl
@@ -12,25 +12,27 @@
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
-# code paths: vanilla x86 and vanilla MMX. Former will be executed on
-# 486 and Pentium, latter on all others. MMX GHASH features so called
+# code paths: vanilla x86 and vanilla SSE. Former will be executed on
+# 486 and Pentium, latter on all others. SSE GHASH features so called
# "528B" variant of "4-bit" method utilizing additional 256+16 bytes
# of per-key storage [+512 bytes shared table]. Performance results
# are for streamed GHASH subroutine and are expressed in cycles per
# processed byte, less is better:
#
-# gcc 2.95.3(*) MMX assembler x86 assembler
+# gcc 2.95.3(*) SSE assembler x86 assembler
#
# Pentium 105/111(**) - 50
# PIII 68 /75 12.2 24
# P4 125/125 17.8 84(***)
# Opteron 66 /70 10.1 30
# Core2 54 /67 8.4 18
+# Atom 105/105 16.8 53
+# VIA Nano 69 /71 13.0 27
#
# (*) gcc 3.4.x was observed to generate few percent slower code,
# which is one of reasons why 2.95.3 results were chosen,
# another reason is lack of 3.4.x results for older CPUs;
-# comparison with MMX results is not completely fair, because C
+# comparison with SSE results is not completely fair, because C
# results are for vanilla "256B" implementation, while
# assembler results are for "528B";-)
# (**) second number is result for code compiled with -fPIC flag,
@@ -40,8 +42,8 @@
#
# To summarize, it's >2-5 times faster than gcc-generated code. To
# anchor it to something else SHA1 assembler processes one byte in
-# 11-13 cycles on contemporary x86 cores. As for choice of MMX in
-# particular, see comment at the end of the file...
+# ~7 cycles on contemporary x86 cores. As for choice of MMX/SSE
+# in particular, see comment at the end of the file...
# May 2010
#
@@ -113,6 +115,16 @@
# similar manner resulted in almost 20% degradation on Sandy Bridge,
# where original 64-bit code processes one byte in 1.95 cycles.
+#####################################################################
+# For reference, AMD Bulldozer processes one byte in 1.98 cycles in
+# 32-bit mode and 1.89 in 64-bit.
+
+# February 2013
+#
+# Overhaul: aggregate Karatsuba post-processing, improve ILP in
+# reduction_alg9. Resulting performance is 1.96 cycles per byte on
+# Westmere, 1.95 - on Sandy/Ivy Bridge, 1.76 - on Bulldozer.
+
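"Aggregate Karatsuba post-processing" deserves unpacking: the post-processing is the mid ^= lo ^ hi fix-up of the three partial products of one Karatsuba multiplication, and "aggregate" means that, because the per-block products are ultimately XORed into a single accumulator, the fix-up and the 64-bit slide of the middle term (the pslldq/psrldq pair) can be applied once to the XOR of all partial terms rather than once per block. A small C illustration (with n = 1 it is the plain, non-aggregated combine):

#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128x;   /* stand-in for one XMM register */

/* Combine the Karatsuba terms of n independent carry-less products, where
 * L[i] = Al*Bl, H[i] = Ah*Bh and M[i] = (Ah^Al)*(Bh^Bl) for product i, into
 * the XOR of the n 256-bit products.  XOR-linearity lets the post-processing
 * and the 64-bit slide run once over the summed terms. */
static void karatsuba_combine_aggregated(const u128x L[], const u128x M[],
                                         const u128x H[], int n, u128x out[2])
{
    u128x l = {0, 0}, m = {0, 0}, h = {0, 0};

    for (int i = 0; i < n; i++) {
        l.hi ^= L[i].hi; l.lo ^= L[i].lo;
        m.hi ^= M[i].hi; m.lo ^= M[i].lo;
        h.hi ^= H[i].hi; h.lo ^= H[i].lo;
    }
    m.lo ^= l.lo ^ h.lo;            /* single post-processing pass        */
    m.hi ^= l.hi ^ h.hi;

    out[0].lo = l.lo;               /* bits   0.. 63                      */
    out[0].hi = l.hi ^ m.lo;        /* bits  64..127  (pslldq by 8 path)  */
    out[1].lo = h.lo ^ m.hi;        /* bits 128..191  (psrldq by 8 path)  */
    out[1].hi = h.hi;               /* bits 192..255                      */
}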
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
@@ -822,17 +834,18 @@ $len="ebx";
&static_label("bswap");
sub clmul64x64_T2 { # minimal "register" pressure
-my ($Xhi,$Xi,$Hkey)=@_;
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
&movdqa ($Xhi,$Xi); #
&pshufd ($T1,$Xi,0b01001110);
- &pshufd ($T2,$Hkey,0b01001110);
+ &pshufd ($T2,$Hkey,0b01001110) if (!defined($HK));
&pxor ($T1,$Xi); #
- &pxor ($T2,$Hkey);
+ &pxor ($T2,$Hkey) if (!defined($HK));
+ $HK=$T2 if (!defined($HK));
&pclmulqdq ($Xi,$Hkey,0x00); #######
&pclmulqdq ($Xhi,$Hkey,0x11); #######
- &pclmulqdq ($T1,$T2,0x00); #######
+ &pclmulqdq ($T1,$HK,0x00); #######
&xorps ($T1,$Xi); #
&xorps ($T1,$Xhi); #
@@ -879,31 +892,32 @@ if (1) { # Algorithm 9 with <<1 twist.
# below. Algorithm 9 was therefore chosen for
# further optimization...
-sub reduction_alg9 { # 17/13 times faster than Intel version
+sub reduction_alg9 { # 17/11 times faster than Intel version
my ($Xhi,$Xi) = @_;
# 1st phase
- &movdqa ($T1,$Xi); #
+ &movdqa ($T2,$Xi); #
+ &movdqa ($T1,$Xi);
+ &psllq ($Xi,5);
+ &pxor ($T1,$Xi); #
&psllq ($Xi,1);
&pxor ($Xi,$T1); #
- &psllq ($Xi,5); #
- &pxor ($Xi,$T1); #
&psllq ($Xi,57); #
- &movdqa ($T2,$Xi); #
+ &movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
- &psrldq ($T2,8); #
- &pxor ($Xi,$T1);
- &pxor ($Xhi,$T2); #
+ &psrldq ($T1,8); #
+ &pxor ($Xi,$T2);
+ &pxor ($Xhi,$T1); #
# 2nd phase
&movdqa ($T2,$Xi);
+ &psrlq ($Xi,1);
+ &pxor ($Xhi,$T2); #
+ &pxor ($T2,$Xi);
&psrlq ($Xi,5);
&pxor ($Xi,$T2); #
&psrlq ($Xi,1); #
- &pxor ($Xi,$T2); #
- &pxor ($T2,$Xhi);
- &psrlq ($Xi,1); #
- &pxor ($Xi,$T2); #
+ &pxor ($Xi,$Xhi) #
}
&function_begin_B("gcm_init_clmul");
@@ -937,8 +951,14 @@ my ($Xhi,$Xi) = @_;
&clmul64x64_T2 ($Xhi,$Xi,$Hkey);
&reduction_alg9 ($Xhi,$Xi);
+ &pshufd ($T1,$Hkey,0b01001110);
+ &pshufd ($T2,$Xi,0b01001110);
+ &pxor ($T1,$Hkey); # Karatsuba pre-processing
&movdqu (&QWP(0,$Htbl),$Hkey); # save H
+ &pxor ($T2,$Xi); # Karatsuba pre-processing
&movdqu (&QWP(16,$Htbl),$Xi); # save H^2
+ &palignr ($T2,$T1,8); # low part is H.lo^H.hi
+ &movdqu (&QWP(32,$Htbl),$T2); # save Karatsuba "salt"
&ret ();
&function_end_B("gcm_init_clmul");
@@ -956,8 +976,9 @@ my ($Xhi,$Xi) = @_;
&movdqa ($T3,&QWP(0,$const));
&movups ($Hkey,&QWP(0,$Htbl));
&pshufb ($Xi,$T3);
+ &movups ($T2,&QWP(32,$Htbl));
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
&reduction_alg9 ($Xhi,$Xi);
&pshufb ($Xi,$T3);
@@ -994,79 +1015,109 @@ my ($Xhi,$Xi) = @_;
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
&pshufb ($T1,$T3);
&pshufb ($Xn,$T3);
+ &movdqu ($T3,&QWP(32,$Htbl));
&pxor ($Xi,$T1); # Ii+Xi
- &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
+ &pshufd ($T1,$Xn,0b01001110); # H*Ii+1
+ &movdqa ($Xhn,$Xn);
+ &pxor ($T1,$Xn); #
+ &lea ($inp,&DWP(32,$inp)); # i+=2
+
+ &pclmulqdq ($Xn,$Hkey,0x00); #######
+ &pclmulqdq ($Xhn,$Hkey,0x11); #######
+ &pclmulqdq ($T1,$T3,0x00); #######
&movups ($Hkey,&QWP(16,$Htbl)); # load H^2
+ &nop ();
- &lea ($inp,&DWP(32,$inp)); # i+=2
&sub ($len,0x20);
&jbe (&label("even_tail"));
+ &jmp (&label("mod_loop"));
-&set_label("mod_loop");
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
- &movdqu ($T1,&QWP(0,$inp)); # Ii
- &movups ($Hkey,&QWP(0,$Htbl)); # load H
+&set_label("mod_loop",32);
+ &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
+ &movdqa ($Xhi,$Xi);
+ &pxor ($T2,$Xi); #
+ &nop ();
- &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
- &pxor ($Xhi,$Xhn);
+ &pclmulqdq ($Xi,$Hkey,0x00); #######
+ &pclmulqdq ($Xhi,$Hkey,0x11); #######
+ &pclmulqdq ($T2,$T3,0x10); #######
+ &movups ($Hkey,&QWP(0,$Htbl)); # load H
- &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
- &pshufb ($T1,$T3);
- &pshufb ($Xn,$T3);
+ &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
+ &movdqa ($T3,&QWP(0,$const));
+ &xorps ($Xhi,$Xhn);
+ &movdqu ($Xhn,&QWP(0,$inp)); # Ii
+ &pxor ($T1,$Xi); # aggregated Karatsuba post-processing
+ &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
+ &pxor ($T1,$Xhi); #
- &movdqa ($T3,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
- &movdqa ($Xhn,$Xn);
- &pxor ($Xhi,$T1); # "Ii+Xi", consume early
+ &pshufb ($Xhn,$T3);
+ &pxor ($T2,$T1); #
- &movdqa ($T1,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase
+ &movdqa ($T1,$T2); #
+ &psrldq ($T2,8);
+ &pslldq ($T1,8); #
+ &pxor ($Xhi,$T2);
+ &pxor ($Xi,$T1); #
+ &pshufb ($Xn,$T3);
+ &pxor ($Xhi,$Xhn); # "Ii+Xi", consume early
+
+ &movdqa ($Xhn,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
+ &movdqa ($T2,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase
+ &movdqa ($T1,$Xi);
+ &psllq ($Xi,5);
+ &pxor ($T1,$Xi); #
&psllq ($Xi,1);
&pxor ($Xi,$T1); #
- &psllq ($Xi,5); #
- &pxor ($Xi,$T1); #
&pclmulqdq ($Xn,$Hkey,0x00); #######
+ &movups ($T3,&QWP(32,$Htbl));
&psllq ($Xi,57); #
- &movdqa ($T2,$Xi); #
+ &movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
- &psrldq ($T2,8); #
- &pxor ($Xi,$T1);
- &pshufd ($T1,$T3,0b01001110);
+ &psrldq ($T1,8); #
+ &pxor ($Xi,$T2);
+ &pxor ($Xhi,$T1); #
+ &pshufd ($T1,$Xhn,0b01001110);
+ &movdqa ($T2,$Xi); # 2nd phase
+ &psrlq ($Xi,1);
+ &pxor ($T1,$Xhn);
&pxor ($Xhi,$T2); #
- &pxor ($T1,$T3);
- &pshufd ($T3,$Hkey,0b01001110);
- &pxor ($T3,$Hkey); #
-
&pclmulqdq ($Xhn,$Hkey,0x11); #######
- &movdqa ($T2,$Xi); # 2nd phase
+ &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
+ &pxor ($T2,$Xi);
&psrlq ($Xi,5);
&pxor ($Xi,$T2); #
&psrlq ($Xi,1); #
- &pxor ($Xi,$T2); #
- &pxor ($T2,$Xhi);
- &psrlq ($Xi,1); #
- &pxor ($Xi,$T2); #
-
+ &pxor ($Xi,$Xhi) #
&pclmulqdq ($T1,$T3,0x00); #######
- &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
- &xorps ($T1,$Xn); #
- &xorps ($T1,$Xhn); #
-
- &movdqa ($T3,$T1); #
- &psrldq ($T1,8);
- &pslldq ($T3,8); #
- &pxor ($Xhn,$T1);
- &pxor ($Xn,$T3); #
- &movdqa ($T3,&QWP(0,$const));
&lea ($inp,&DWP(32,$inp));
&sub ($len,0x20);
&ja (&label("mod_loop"));
&set_label("even_tail");
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
+ &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
+ &movdqa ($Xhi,$Xi);
+ &pxor ($T2,$Xi); #
- &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
- &pxor ($Xhi,$Xhn);
+ &pclmulqdq ($Xi,$Hkey,0x00); #######
+ &pclmulqdq ($Xhi,$Hkey,0x11); #######
+ &pclmulqdq ($T2,$T3,0x10); #######
+ &movdqa ($T3,&QWP(0,$const));
+
+ &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
+ &xorps ($Xhi,$Xhn);
+ &pxor ($T1,$Xi); # aggregated Karatsuba post-processing
+ &pxor ($T1,$Xhi); #
+
+ &pxor ($T2,$T1); #
+
+ &movdqa ($T1,$T2); #
+ &psrldq ($T2,8);
+ &pslldq ($T1,8); #
+ &pxor ($Xhi,$T2);
+ &pxor ($Xi,$T1); #
&reduction_alg9 ($Xhi,$Xi);
@@ -1273,13 +1324,6 @@ my ($Xhi,$Xi)=@_;
&set_label("bswap",64);
&data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
&data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial
-}} # $sse2
-
-&set_label("rem_4bit",64);
- &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
- &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
- &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
- &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
&set_label("rem_8bit",64);
&data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
&data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
@@ -1313,6 +1357,13 @@ my ($Xhi,$Xi)=@_;
&data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
&data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
&data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
+}} # $sse2
+
+&set_label("rem_4bit",64);
+ &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
+ &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
+ &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
+ &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
}}} # !$x86only
&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
diff --git a/openssl/crypto/modes/asm/ghash-x86_64.pl b/openssl/crypto/modes/asm/ghash-x86_64.pl
index 38d779edb..6e656ca13 100644
--- a/openssl/crypto/modes/asm/ghash-x86_64.pl
+++ b/openssl/crypto/modes/asm/ghash-x86_64.pl
@@ -22,6 +22,8 @@
# P4 28.6 14.0 +100%
# Opteron 19.3 7.7 +150%
# Core2 17.8 8.1(**) +120%
+# Atom 31.6 16.8 +88%
+# VIA Nano 21.8 10.1 +115%
#
# (*) comparison is not completely fair, because C results are
# for vanilla "256B" implementation, while assembler results
@@ -39,6 +41,44 @@
# providing access to a Westmere-based system on behalf of Intel
# Open Source Technology Centre.
+# December 2012
+#
+# Overhaul: aggregate Karatsuba post-processing, improve ILP in
+# reduction_alg9, increase reduction aggregate factor to 4x. As for
+# the latter: ghash-x86.pl argues that increasing the aggregate factor
+# there makes little sense. Then why increase it here? The critical
+# path consists of 3 independent pclmulqdq instructions, Karatsuba
+# post-processing and reduction. "On top" of this we lay down the
+# aggregated multiplication operations, triplets of independent
+# pclmulqdq's. As the issue rate for pclmulqdq is limited, it makes
+# little sense to aggregate more multiplications than it takes to
+# perform the remaining non-multiplication operations. 2x is a
+# near-optimal coefficient for contemporary Intel CPUs (hence the
+# modest improvement), but not for Bulldozer, whose logical SIMD
+# operations are twice as slow as Intel's, making the critical path
+# longer. A CPU with a higher pclmulqdq issue rate would also benefit
+# from a higher aggregate factor...
+#
+# Westmere 1.78(+13%)
+# Sandy Bridge 1.80(+8%)
+# Ivy Bridge 1.80(+7%)
+# Haswell 0.55(+93%) (if system doesn't support AVX)
+# Broadwell 0.45(+110%)(if system doesn't support AVX)
+# Bulldozer 1.49(+27%)
+# Silvermont 2.88(+13%)
+
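The 4x aggregation reasoned about above relies on the identity Xi+4 = [H^4*(Ii+Xi) + H^3*Ii+1 + H^2*Ii+2 + H*Ii+3] mod P, which the 4x code path below also spells out in a comment. As a quick illustration only (this sketch is not part of the module, and it deliberately ignores GCM's bit-reflected block representation, treating blocks as plain 128-bit field elements), a short Python check confirms that the aggregated form equals the straightforward Horner evaluation over GF(2^128):

    import random

    POLY = (1 << 128) | 0x87          # x^128 + x^7 + x^2 + x + 1

    def gf_mul(a, b):
        # carry-less multiply, then reduce modulo the field polynomial
        r = 0
        for i in range(b.bit_length()):
            if (b >> i) & 1:
                r ^= a << i
        for i in range(r.bit_length() - 1, 127, -1):
            if (r >> i) & 1:
                r ^= POLY << (i - 128)
        return r

    def ghash_serial(h, x, blocks):
        # plain Horner evaluation: Xi = (Xi ^ Ii) * H, one block at a time
        for b in blocks:
            x = gf_mul(x ^ b, h)
        return x

    def ghash_4x(h, x, blocks):
        # aggregated form: H^4*(Xi^I0) ^ H^3*I1 ^ H^2*I2 ^ H*I3
        h2 = gf_mul(h, h)
        h3 = gf_mul(h2, h)
        h4 = gf_mul(h3, h)
        i0, i1, i2, i3 = blocks
        return (gf_mul(x ^ i0, h4) ^ gf_mul(i1, h3) ^
                gf_mul(i2, h2) ^ gf_mul(i3, h))

    rnd = random.Random(1)
    h, x = rnd.getrandbits(128), rnd.getrandbits(128)
    blocks = [rnd.getrandbits(128) for _ in range(4)]
    assert ghash_serial(h, x, blocks) == ghash_4x(h, x, blocks)

Because multiplication distributes over XOR and the reduction is linear, the same rearrangement extends to the 8x form used by the AVX path further down.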
+# March 2013
+#
+# ... the 8x aggregate factor AVX code path uses the reduction
+# algorithm suggested by Shay Gueron[1]. Even though contemporary
+# AVX-capable CPUs such as Sandy and Ivy Bridge can execute it, the
+# code performs sub-optimally on them in comparison to the
+# above-mentioned version. But thanks to Ilya Albrekht and Max
+# Locktyukhin of Intel Corp. we knew that it performs at 0.41 cycles
+# per byte on the Haswell processor, and at 0.29 on Broadwell.
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+
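Similarly, the "Karatsuba pre-/post-processing" used throughout this file splits each 128-bit operand into 64-bit halves so that a 128x128-bit carry-less product costs three pclmulqdq's (high*high, low*low, and the product of the XOR-ed halves) instead of four. A minimal Python sketch of that identity over GF(2)[x] (no reduction; again purely illustrative, not part of the module):

    import random

    def clmul(a, b):
        # plain carry-less (XOR) multiplication over GF(2)[x]
        r = 0
        for i in range(b.bit_length()):
            if (b >> i) & 1:
                r ^= a << i
        return r

    def mul128_karatsuba(a, b):
        mask64 = (1 << 64) - 1
        a1, a0 = a >> 64, a & mask64
        b1, b0 = b >> 64, b & mask64
        hi  = clmul(a1, b1)                       # pclmulqdq $0x11
        lo  = clmul(a0, b0)                       # pclmulqdq $0x00
        mid = clmul(a1 ^ a0, b1 ^ b0) ^ hi ^ lo   # "post-processing"
        return (hi << 128) ^ (mid << 64) ^ lo

    rnd = random.Random(2)
    a, b = rnd.getrandbits(128), rnd.getrandbits(128)
    assert mul128_karatsuba(a, b) == clmul(a, b)

The "aggregated" post-processing in the code below performs the final mid/hi/lo folding once on the XOR-accumulated products rather than once per multiplication, which is valid because all of these operations are linear over GF(2).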
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -50,9 +90,30 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+}
+
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
+$do4xaggr=1;
+
# common register layout
$nlo="%rax";
$nhi="%rbx";
@@ -160,6 +221,7 @@ ___
$code=<<___;
.text
+.extern OPENSSL_ia32cap_P
.globl gcm_gmult_4bit
.type gcm_gmult_4bit,\@function,2
@@ -352,19 +414,27 @@ ___
($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
sub clmul64x64_T2 { # minimal register pressure
-my ($Xhi,$Xi,$Hkey,$modulo)=@_;
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
-$code.=<<___ if (!defined($modulo));
+if (!defined($HK)) { $HK = $T2;
+$code.=<<___;
movdqa $Xi,$Xhi #
pshufd \$0b01001110,$Xi,$T1
pshufd \$0b01001110,$Hkey,$T2
pxor $Xi,$T1 #
pxor $Hkey,$T2
___
+} else {
+$code.=<<___;
+ movdqa $Xi,$Xhi #
+ pshufd \$0b01001110,$Xi,$T1
+ pxor $Xi,$T1 #
+___
+}
$code.=<<___;
pclmulqdq \$0x00,$Hkey,$Xi #######
pclmulqdq \$0x11,$Hkey,$Xhi #######
- pclmulqdq \$0x00,$T2,$T1 #######
+ pclmulqdq \$0x00,$HK,$T1 #######
pxor $Xi,$T1 #
pxor $Xhi,$T1 #
@@ -376,42 +446,53 @@ $code.=<<___;
___
}
-sub reduction_alg9 { # 17/13 times faster than Intel version
+sub reduction_alg9 { # 17/11 times faster than Intel version
my ($Xhi,$Xi) = @_;
$code.=<<___;
# 1st phase
- movdqa $Xi,$T1 #
+ movdqa $Xi,$T2 #
+ movdqa $Xi,$T1
+ psllq \$5,$Xi
+ pxor $Xi,$T1 #
psllq \$1,$Xi
pxor $T1,$Xi #
- psllq \$5,$Xi #
- pxor $T1,$Xi #
psllq \$57,$Xi #
- movdqa $Xi,$T2 #
+ movdqa $Xi,$T1 #
pslldq \$8,$Xi
- psrldq \$8,$T2 #
- pxor $T1,$Xi
- pxor $T2,$Xhi #
+ psrldq \$8,$T1 #
+ pxor $T2,$Xi
+ pxor $T1,$Xhi #
# 2nd phase
movdqa $Xi,$T2
+ psrlq \$1,$Xi
+ pxor $T2,$Xhi #
+ pxor $Xi,$T2
psrlq \$5,$Xi
pxor $T2,$Xi #
psrlq \$1,$Xi #
- pxor $T2,$Xi #
- pxor $Xhi,$T2
- psrlq \$1,$Xi #
- pxor $T2,$Xi #
+ pxor $Xhi,$Xi #
___
}
{ my ($Htbl,$Xip)=@_4args;
+ my $HK="%xmm6";
$code.=<<___;
.globl gcm_init_clmul
.type gcm_init_clmul,\@abi-omnipotent
.align 16
gcm_init_clmul:
+.L_init_clmul:
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_gcm_init_clmul:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
+ .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
+___
+$code.=<<___;
movdqu ($Xip),$Hkey
pshufd \$0b01001110,$Hkey,$Hkey # dword swap
@@ -430,13 +511,47 @@ gcm_init_clmul:
pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
# calculate H^2
+ pshufd \$0b01001110,$Hkey,$HK
movdqa $Hkey,$Xi
+ pxor $Hkey,$HK
___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK);
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
- movdqu $Hkey,($Htbl) # save H
- movdqu $Xi,16($Htbl) # save H^2
+ pshufd \$0b01001110,$Hkey,$T1
+ pshufd \$0b01001110,$Xi,$T2
+ pxor $Hkey,$T1 # Karatsuba pre-processing
+ movdqu $Hkey,0x00($Htbl) # save H
+ pxor $Xi,$T2 # Karatsuba pre-processing
+ movdqu $Xi,0x10($Htbl) # save H^2
+ palignr \$8,$T1,$T2 # low part is H.lo^H.hi...
+ movdqu $T2,0x20($Htbl) # save Karatsuba "salt"
+___
+if ($do4xaggr) {
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3
+ &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+ movdqa $Xi,$T3
+___
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4
+ &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+ pshufd \$0b01001110,$T3,$T1
+ pshufd \$0b01001110,$Xi,$T2
+ pxor $T3,$T1 # Karatsuba pre-processing
+ movdqu $T3,0x30($Htbl) # save H^3
+ pxor $Xi,$T2 # Karatsuba pre-processing
+ movdqu $Xi,0x40($Htbl) # save H^4
+ palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi...
+ movdqu $T2,0x50($Htbl) # save Karatsuba "salt"
+___
+}
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ lea 0x18(%rsp),%rsp
+.LSEH_end_gcm_init_clmul:
+___
+$code.=<<___;
ret
.size gcm_init_clmul,.-gcm_init_clmul
___
@@ -449,13 +564,38 @@ $code.=<<___;
.type gcm_gmult_clmul,\@abi-omnipotent
.align 16
gcm_gmult_clmul:
+.L_gmult_clmul:
movdqu ($Xip),$Xi
movdqa .Lbswap_mask(%rip),$T3
movdqu ($Htbl),$Hkey
+ movdqu 0x20($Htbl),$T2
pshufb $T3,$Xi
___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
- &reduction_alg9 ($Xhi,$Xi);
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
+$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
+	# experimental alternative. The special thing about it is that
+	# there is no dependency between the two multiplications...
+ mov \$`0xE1<<1`,%eax
+ mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
+ mov \$0x07,%r11d
+ movq %rax,$T1
+ movq %r10,$T2
+ movq %r11,$T3 # borrow $T3
+ pand $Xi,$T3
+ pshufb $T3,$T2 # ($Xi&7)·0xE0
+ movq %rax,$T3
+ pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1)
+ pxor $Xi,$T2
+ pslldq \$15,$T2
+ paddd $T2,$T2 # <<(64+56+1)
+ pxor $T2,$Xi
+ pclmulqdq \$0x01,$T3,$Xi
+ movdqa .Lbswap_mask(%rip),$T3 # reload $T3
+ psrldq \$1,$T1
+ pxor $T1,$Xhi
+ pslldq \$7,$Xi
+ pxor $Xhi,$Xi
+___
$code.=<<___;
pshufb $T3,$Xi
movdqu $Xi,($Xip)
@@ -465,129 +605,327 @@ ___
}
{ my ($Xip,$Htbl,$inp,$len)=@_4args;
- my $Xn="%xmm6";
- my $Xhn="%xmm7";
- my $Hkey2="%xmm8";
- my $T1n="%xmm9";
- my $T2n="%xmm10";
+ my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
+ my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
$code.=<<___;
.globl gcm_ghash_clmul
.type gcm_ghash_clmul,\@abi-omnipotent
-.align 16
+.align 32
gcm_ghash_clmul:
+.L_ghash_clmul:
___
$code.=<<___ if ($win64);
+ lea -0x88(%rsp),%rax
.LSEH_begin_gcm_ghash_clmul:
# I can't trust assembler to use specific encoding:-(
- .byte 0x48,0x83,0xec,0x58 #sub \$0x58,%rsp
- .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
- .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp)
- .byte 0x44,0x0f,0x29,0x44,0x24,0x20 #movaps %xmm8,0x20(%rsp)
- .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 #movaps %xmm9,0x30(%rsp)
- .byte 0x44,0x0f,0x29,0x54,0x24,0x40 #movaps %xmm10,0x40(%rsp)
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
+ .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
+ .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
+ .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
+ .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
+ .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
+ .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
+ .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
+ .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
___
$code.=<<___;
movdqa .Lbswap_mask(%rip),$T3
movdqu ($Xip),$Xi
movdqu ($Htbl),$Hkey
+ movdqu 0x20($Htbl),$HK
pshufb $T3,$Xi
sub \$0x10,$len
jz .Lodd_tail
- movdqu 16($Htbl),$Hkey2
+ movdqu 0x10($Htbl),$Hkey2
+___
+if ($do4xaggr) {
+my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
+
+$code.=<<___;
+ mov OPENSSL_ia32cap_P+4(%rip),%eax
+ cmp \$0x30,$len
+ jb .Lskip4x
+
+ and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE
+ cmp \$`1<<22`,%eax # check for MOVBE without XSAVE
+ je .Lskip4x
+
+ sub \$0x30,$len
+ mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
+ movdqu 0x30($Htbl),$Hkey3
+ movdqu 0x40($Htbl),$Hkey4
+
+ #######
+ # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
+ #
+ movdqu 0x30($inp),$Xln
+ movdqu 0x20($inp),$Xl
+ pshufb $T3,$Xln
+ pshufb $T3,$Xl
+ movdqa $Xln,$Xhn
+ pshufd \$0b01001110,$Xln,$Xmn
+ pxor $Xln,$Xmn
+ pclmulqdq \$0x00,$Hkey,$Xln
+ pclmulqdq \$0x11,$Hkey,$Xhn
+ pclmulqdq \$0x00,$HK,$Xmn
+
+ movdqa $Xl,$Xh
+ pshufd \$0b01001110,$Xl,$Xm
+ pxor $Xl,$Xm
+ pclmulqdq \$0x00,$Hkey2,$Xl
+ pclmulqdq \$0x11,$Hkey2,$Xh
+ pclmulqdq \$0x10,$HK,$Xm
+ xorps $Xl,$Xln
+ xorps $Xh,$Xhn
+ movups 0x50($Htbl),$HK
+ xorps $Xm,$Xmn
+
+ movdqu 0x10($inp),$Xl
+ movdqu 0($inp),$T1
+ pshufb $T3,$Xl
+ pshufb $T3,$T1
+ movdqa $Xl,$Xh
+ pshufd \$0b01001110,$Xl,$Xm
+ pxor $T1,$Xi
+ pxor $Xl,$Xm
+ pclmulqdq \$0x00,$Hkey3,$Xl
+ movdqa $Xi,$Xhi
+ pshufd \$0b01001110,$Xi,$T1
+ pxor $Xi,$T1
+ pclmulqdq \$0x11,$Hkey3,$Xh
+ pclmulqdq \$0x00,$HK,$Xm
+ xorps $Xl,$Xln
+ xorps $Xh,$Xhn
+
+ lea 0x40($inp),$inp
+ sub \$0x40,$len
+ jc .Ltail4x
+
+ jmp .Lmod4_loop
+.align 32
+.Lmod4_loop:
+ pclmulqdq \$0x00,$Hkey4,$Xi
+ xorps $Xm,$Xmn
+ movdqu 0x30($inp),$Xl
+ pshufb $T3,$Xl
+ pclmulqdq \$0x11,$Hkey4,$Xhi
+ xorps $Xln,$Xi
+ movdqu 0x20($inp),$Xln
+ movdqa $Xl,$Xh
+ pclmulqdq \$0x10,$HK,$T1
+ pshufd \$0b01001110,$Xl,$Xm
+ xorps $Xhn,$Xhi
+ pxor $Xl,$Xm
+ pshufb $T3,$Xln
+ movups 0x20($Htbl),$HK
+ xorps $Xmn,$T1
+ pclmulqdq \$0x00,$Hkey,$Xl
+ pshufd \$0b01001110,$Xln,$Xmn
+
+ pxor $Xi,$T1 # aggregated Karatsuba post-processing
+ movdqa $Xln,$Xhn
+ pxor $Xhi,$T1 #
+ pxor $Xln,$Xmn
+ movdqa $T1,$T2 #
+ pclmulqdq \$0x11,$Hkey,$Xh
+ pslldq \$8,$T1
+ psrldq \$8,$T2 #
+ pxor $T1,$Xi
+ movdqa .L7_mask(%rip),$T1
+ pxor $T2,$Xhi #
+ movq %rax,$T2
+
+ pand $Xi,$T1 # 1st phase
+ pshufb $T1,$T2 #
+ pxor $Xi,$T2 #
+ pclmulqdq \$0x00,$HK,$Xm
+ psllq \$57,$T2 #
+ movdqa $T2,$T1 #
+ pslldq \$8,$T2
+ pclmulqdq \$0x00,$Hkey2,$Xln
+ psrldq \$8,$T1 #
+ pxor $T2,$Xi
+ pxor $T1,$Xhi #
+ movdqu 0($inp),$T1
+
+ movdqa $Xi,$T2 # 2nd phase
+ psrlq \$1,$Xi
+ pclmulqdq \$0x11,$Hkey2,$Xhn
+ xorps $Xl,$Xln
+ movdqu 0x10($inp),$Xl
+ pshufb $T3,$Xl
+ pclmulqdq \$0x10,$HK,$Xmn
+ xorps $Xh,$Xhn
+ movups 0x50($Htbl),$HK
+ pshufb $T3,$T1
+ pxor $T2,$Xhi #
+ pxor $Xi,$T2
+ psrlq \$5,$Xi
+
+ movdqa $Xl,$Xh
+ pxor $Xm,$Xmn
+ pshufd \$0b01001110,$Xl,$Xm
+ pxor $T2,$Xi #
+ pxor $T1,$Xhi
+ pxor $Xl,$Xm
+ pclmulqdq \$0x00,$Hkey3,$Xl
+ psrlq \$1,$Xi #
+ pxor $Xhi,$Xi #
+ movdqa $Xi,$Xhi
+ pclmulqdq \$0x11,$Hkey3,$Xh
+ xorps $Xl,$Xln
+ pshufd \$0b01001110,$Xi,$T1
+ pxor $Xi,$T1
+
+ pclmulqdq \$0x00,$HK,$Xm
+ xorps $Xh,$Xhn
+
+ lea 0x40($inp),$inp
+ sub \$0x40,$len
+ jnc .Lmod4_loop
+
+.Ltail4x:
+ pclmulqdq \$0x00,$Hkey4,$Xi
+ pclmulqdq \$0x11,$Hkey4,$Xhi
+ pclmulqdq \$0x10,$HK,$T1
+ xorps $Xm,$Xmn
+ xorps $Xln,$Xi
+ xorps $Xhn,$Xhi
+ pxor $Xi,$Xhi # aggregated Karatsuba post-processing
+ pxor $Xmn,$T1
+
+ pxor $Xhi,$T1 #
+ pxor $Xi,$Xhi
+
+ movdqa $T1,$T2 #
+ psrldq \$8,$T1
+ pslldq \$8,$T2 #
+ pxor $T1,$Xhi
+ pxor $T2,$Xi #
+___
+ &reduction_alg9($Xhi,$Xi);
+$code.=<<___;
+ add \$0x40,$len
+ jz .Ldone
+ movdqu 0x20($Htbl),$HK
+ sub \$0x10,$len
+ jz .Lodd_tail
+.Lskip4x:
+___
+}
+$code.=<<___;
#######
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
# [(H*Ii+1) + (H*Xi+1)] mod P =
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
#
movdqu ($inp),$T1 # Ii
- movdqu 16($inp),$Xn # Ii+1
+ movdqu 16($inp),$Xln # Ii+1
pshufb $T3,$T1
- pshufb $T3,$Xn
+ pshufb $T3,$Xln
pxor $T1,$Xi # Ii+Xi
-___
- &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
-$code.=<<___;
- movdqa $Xi,$Xhi #
- pshufd \$0b01001110,$Xi,$T1
- pshufd \$0b01001110,$Hkey2,$T2
- pxor $Xi,$T1 #
- pxor $Hkey2,$T2
+
+ movdqa $Xln,$Xhn
+ pshufd \$0b01001110,$Xln,$Xmn
+ pxor $Xln,$Xmn
+ pclmulqdq \$0x00,$Hkey,$Xln
+ pclmulqdq \$0x11,$Hkey,$Xhn
+ pclmulqdq \$0x00,$HK,$Xmn
lea 32($inp),$inp # i+=2
+ nop
sub \$0x20,$len
jbe .Leven_tail
+ nop
+ jmp .Lmod_loop
+.align 32
.Lmod_loop:
-___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
-$code.=<<___;
- movdqu ($inp),$T1 # Ii
- pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
- pxor $Xhn,$Xhi
+ movdqa $Xi,$Xhi
+ movdqa $Xmn,$T1
+ pshufd \$0b01001110,$Xi,$Xmn #
+ pxor $Xi,$Xmn #
- movdqu 16($inp),$Xn # Ii+1
- pshufb $T3,$T1
- pshufb $T3,$Xn
+ pclmulqdq \$0x00,$Hkey2,$Xi
+ pclmulqdq \$0x11,$Hkey2,$Xhi
+ pclmulqdq \$0x10,$HK,$Xmn
- movdqa $Xn,$Xhn #
- pshufd \$0b01001110,$Xn,$T1n
- pshufd \$0b01001110,$Hkey,$T2n
- pxor $Xn,$T1n #
- pxor $Hkey,$T2n
- pxor $T1,$Xhi # "Ii+Xi", consume early
+ pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
+ pxor $Xhn,$Xhi
+ movdqu ($inp),$T2 # Ii
+ pxor $Xi,$T1 # aggregated Karatsuba post-processing
+ pshufb $T3,$T2
+ movdqu 16($inp),$Xln # Ii+1
+
+ pxor $Xhi,$T1
+ pxor $T2,$Xhi # "Ii+Xi", consume early
+ pxor $T1,$Xmn
+ pshufb $T3,$Xln
+ movdqa $Xmn,$T1 #
+ psrldq \$8,$T1
+ pslldq \$8,$Xmn #
+ pxor $T1,$Xhi
+ pxor $Xmn,$Xi #
+
+ movdqa $Xln,$Xhn #
- movdqa $Xi,$T1 # 1st phase
+ movdqa $Xi,$T2 # 1st phase
+ movdqa $Xi,$T1
+ psllq \$5,$Xi
+ pxor $Xi,$T1 #
+ pclmulqdq \$0x00,$Hkey,$Xln #######
psllq \$1,$Xi
pxor $T1,$Xi #
- psllq \$5,$Xi #
- pxor $T1,$Xi #
- pclmulqdq \$0x00,$Hkey,$Xn #######
psllq \$57,$Xi #
- movdqa $Xi,$T2 #
+ movdqa $Xi,$T1 #
pslldq \$8,$Xi
- psrldq \$8,$T2 #
- pxor $T1,$Xi
- pxor $T2,$Xhi #
+ psrldq \$8,$T1 #
+ pxor $T2,$Xi
+ pshufd \$0b01001110,$Xhn,$Xmn
+ pxor $T1,$Xhi #
+ pxor $Xhn,$Xmn #
- pclmulqdq \$0x11,$Hkey,$Xhn #######
movdqa $Xi,$T2 # 2nd phase
+ psrlq \$1,$Xi
+ pclmulqdq \$0x11,$Hkey,$Xhn #######
+ pxor $T2,$Xhi #
+ pxor $Xi,$T2
psrlq \$5,$Xi
pxor $T2,$Xi #
+ lea 32($inp),$inp
psrlq \$1,$Xi #
- pxor $T2,$Xi #
- pxor $Xhi,$T2
- psrlq \$1,$Xi #
- pxor $T2,$Xi #
+ pclmulqdq \$0x00,$HK,$Xmn #######
+ pxor $Xhi,$Xi #
- pclmulqdq \$0x00,$T2n,$T1n #######
- movdqa $Xi,$Xhi #
- pshufd \$0b01001110,$Xi,$T1
- pshufd \$0b01001110,$Hkey2,$T2
- pxor $Xi,$T1 #
- pxor $Hkey2,$T2
-
- pxor $Xn,$T1n #
- pxor $Xhn,$T1n #
- movdqa $T1n,$T2n #
- psrldq \$8,$T1n
- pslldq \$8,$T2n #
- pxor $T1n,$Xhn
- pxor $T2n,$Xn #
-
- lea 32($inp),$inp
sub \$0x20,$len
ja .Lmod_loop
.Leven_tail:
-___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
-$code.=<<___;
- pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
+ movdqa $Xi,$Xhi
+ movdqa $Xmn,$T1
+ pshufd \$0b01001110,$Xi,$Xmn #
+ pxor $Xi,$Xmn #
+
+ pclmulqdq \$0x00,$Hkey2,$Xi
+ pclmulqdq \$0x11,$Hkey2,$Xhi
+ pclmulqdq \$0x10,$HK,$Xmn
+
+ pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
pxor $Xhn,$Xhi
+ pxor $Xi,$T1
+ pxor $Xhi,$T1
+ pxor $T1,$Xmn
+ movdqa $Xmn,$T1 #
+ psrldq \$8,$T1
+ pslldq \$8,$Xmn #
+ pxor $T1,$Xhi
+ pxor $Xmn,$Xi #
___
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
@@ -599,7 +937,7 @@ $code.=<<___;
pshufb $T3,$T1
pxor $T1,$Xi # Ii+Xi
___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi)
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
.Ldone:
@@ -612,21 +950,607 @@ $code.=<<___ if ($win64);
movaps 0x20(%rsp),%xmm8
movaps 0x30(%rsp),%xmm9
movaps 0x40(%rsp),%xmm10
- add \$0x58,%rsp
+ movaps 0x50(%rsp),%xmm11
+ movaps 0x60(%rsp),%xmm12
+ movaps 0x70(%rsp),%xmm13
+ movaps 0x80(%rsp),%xmm14
+ movaps 0x90(%rsp),%xmm15
+ lea 0xa8(%rsp),%rsp
+.LSEH_end_gcm_ghash_clmul:
___
$code.=<<___;
ret
-.LSEH_end_gcm_ghash_clmul:
.size gcm_ghash_clmul,.-gcm_ghash_clmul
___
}
+
+$code.=<<___;
+.globl gcm_init_avx
+.type gcm_init_avx,\@abi-omnipotent
+.align 32
+gcm_init_avx:
+___
+if ($avx) {
+my ($Htbl,$Xip)=@_4args;
+my $HK="%xmm6";
+
+$code.=<<___ if ($win64);
+.LSEH_begin_gcm_init_avx:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
+ .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
+___
+$code.=<<___;
+ vzeroupper
+
+ vmovdqu ($Xip),$Hkey
+ vpshufd \$0b01001110,$Hkey,$Hkey # dword swap
+
+ # <<1 twist
+ vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
+ vpsrlq \$63,$Hkey,$T1
+ vpsllq \$1,$Hkey,$Hkey
+ vpxor $T3,$T3,$T3 #
+ vpcmpgtd $T2,$T3,$T3 # broadcast carry bit
+ vpslldq \$8,$T1,$T1
+ vpor $T1,$Hkey,$Hkey # H<<=1
+
+ # magic reduction
+ vpand .L0x1c2_polynomial(%rip),$T3,$T3
+ vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial
+
+ vpunpckhqdq $Hkey,$Hkey,$HK
+ vmovdqa $Hkey,$Xi
+ vpxor $Hkey,$HK,$HK
+ mov \$4,%r10 # up to H^8
+ jmp .Linit_start_avx
+___
+
+sub clmul64x64_avx {
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
+
+if (!defined($HK)) { $HK = $T2;
+$code.=<<___;
+ vpunpckhqdq $Xi,$Xi,$T1
+ vpunpckhqdq $Hkey,$Hkey,$T2
+ vpxor $Xi,$T1,$T1 #
+ vpxor $Hkey,$T2,$T2
+___
+} else {
+$code.=<<___;
+ vpunpckhqdq $Xi,$Xi,$T1
+ vpxor $Xi,$T1,$T1 #
+___
+}
+$code.=<<___;
+ vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi #######
+ vpclmulqdq \$0x00,$Hkey,$Xi,$Xi #######
+ vpclmulqdq \$0x00,$HK,$T1,$T1 #######
+ vpxor $Xi,$Xhi,$T2 #
+ vpxor $T2,$T1,$T1 #
+
+ vpslldq \$8,$T1,$T2 #
+ vpsrldq \$8,$T1,$T1
+ vpxor $T2,$Xi,$Xi #
+ vpxor $T1,$Xhi,$Xhi
+___
+}
+
+sub reduction_avx {
+my ($Xhi,$Xi) = @_;
+
+$code.=<<___;
+ vpsllq \$57,$Xi,$T1 # 1st phase
+ vpsllq \$62,$Xi,$T2
+ vpxor $T1,$T2,$T2 #
+ vpsllq \$63,$Xi,$T1
+ vpxor $T1,$T2,$T2 #
+ vpslldq \$8,$T2,$T1 #
+ vpsrldq \$8,$T2,$T2
+ vpxor $T1,$Xi,$Xi #
+ vpxor $T2,$Xhi,$Xhi
+
+ vpsrlq \$1,$Xi,$T2 # 2nd phase
+ vpxor $Xi,$Xhi,$Xhi
+ vpxor $T2,$Xi,$Xi #
+ vpsrlq \$5,$T2,$T2
+ vpxor $T2,$Xi,$Xi #
+ vpsrlq \$1,$Xi,$Xi #
+ vpxor $Xhi,$Xi,$Xi #
+___
+}
$code.=<<___;
+.align 32
+.Linit_loop_avx:
+ vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi...
+ vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt"
+___
+ &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7
+ &reduction_avx ($Xhi,$Xi);
+$code.=<<___;
+.Linit_start_avx:
+ vmovdqa $Xi,$T3
+___
+ &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8
+ &reduction_avx ($Xhi,$Xi);
+$code.=<<___;
+ vpshufd \$0b01001110,$T3,$T1
+ vpshufd \$0b01001110,$Xi,$T2
+ vpxor $T3,$T1,$T1 # Karatsuba pre-processing
+ vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7
+ vpxor $Xi,$T2,$T2 # Karatsuba pre-processing
+ vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8
+ lea 0x30($Htbl),$Htbl
+ sub \$1,%r10
+ jnz .Linit_loop_avx
+
+ vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped
+ vmovdqu $T3,-0x10($Htbl)
+
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ lea 0x18(%rsp),%rsp
+.LSEH_end_gcm_init_avx:
+___
+$code.=<<___;
+ ret
+.size gcm_init_avx,.-gcm_init_avx
+___
+} else {
+$code.=<<___;
+ jmp .L_init_clmul
+.size gcm_init_avx,.-gcm_init_avx
+___
+}
+
+$code.=<<___;
+.globl gcm_gmult_avx
+.type gcm_gmult_avx,\@abi-omnipotent
+.align 32
+gcm_gmult_avx:
+ jmp .L_gmult_clmul
+.size gcm_gmult_avx,.-gcm_gmult_avx
+___
+
+$code.=<<___;
+.globl gcm_ghash_avx
+.type gcm_ghash_avx,\@abi-omnipotent
+.align 32
+gcm_ghash_avx:
+___
+if ($avx) {
+my ($Xip,$Htbl,$inp,$len)=@_4args;
+my ($Xlo,$Xhi,$Xmi,
+ $Zlo,$Zhi,$Zmi,
+ $Hkey,$HK,$T1,$T2,
+ $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));
+
+$code.=<<___ if ($win64);
+ lea -0x88(%rsp),%rax
+.LSEH_begin_gcm_ghash_avx:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
+ .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
+ .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
+ .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
+ .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
+ .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
+ .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
+ .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
+ .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
+___
+$code.=<<___;
+ vzeroupper
+
+ vmovdqu ($Xip),$Xi # load $Xi
+ lea .L0x1c2_polynomial(%rip),%r10
+ lea 0x40($Htbl),$Htbl # size optimization
+ vmovdqu .Lbswap_mask(%rip),$bswap
+ vpshufb $bswap,$Xi,$Xi
+ cmp \$0x80,$len
+ jb .Lshort_avx
+ sub \$0x80,$len
+
+ vmovdqu 0x70($inp),$Ii # I[7]
+ vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
+ vpshufb $bswap,$Ii,$Ii
+ vmovdqu 0x20-0x40($Htbl),$HK
+
+ vpunpckhqdq $Ii,$Ii,$T2
+ vmovdqu 0x60($inp),$Ij # I[6]
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Ii,$T2,$T2
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
+ vpunpckhqdq $Ij,$Ij,$T1
+ vmovdqu 0x50($inp),$Ii # I[5]
+ vpclmulqdq \$0x00,$HK,$T2,$Xmi
+ vpxor $Ij,$T1,$T1
+
+ vpshufb $bswap,$Ii,$Ii
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
+ vpxor $Ii,$T2,$T2
+ vmovdqu 0x40($inp),$Ij # I[4]
+ vpclmulqdq \$0x10,$HK,$T1,$Zmi
+ vmovdqu 0x50-0x40($Htbl),$HK
+
+ vpshufb $bswap,$Ij,$Ij
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Xhi,$Zhi,$Zhi
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T2,$Xmi
+ vpxor $Ij,$T1,$T1
+
+ vmovdqu 0x30($inp),$Ii # I[3]
+ vpxor $Zlo,$Xlo,$Xlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpxor $Zhi,$Xhi,$Xhi
+ vpshufb $bswap,$Ii,$Ii
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
+ vpxor $Zmi,$Xmi,$Xmi
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpclmulqdq \$0x10,$HK,$T1,$Zmi
+ vmovdqu 0x80-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+
+ vmovdqu 0x20($inp),$Ij # I[2]
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Xhi,$Zhi,$Zhi
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
+ vpxor $Xmi,$Zmi,$Zmi
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpclmulqdq \$0x00,$HK,$T2,$Xmi
+ vpxor $Ij,$T1,$T1
+
+ vmovdqu 0x10($inp),$Ii # I[1]
+ vpxor $Zlo,$Xlo,$Xlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpxor $Zhi,$Xhi,$Xhi
+ vpshufb $bswap,$Ii,$Ii
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
+ vpxor $Zmi,$Xmi,$Xmi
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpclmulqdq \$0x10,$HK,$T1,$Zmi
+ vmovdqu 0xb0-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+
+ vmovdqu ($inp),$Ij # I[0]
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Xhi,$Zhi,$Zhi
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x10,$HK,$T2,$Xmi
+
+ lea 0x80($inp),$inp
+ cmp \$0x80,$len
+ jb .Ltail_avx
+
+ vpxor $Xi,$Ij,$Ij # accumulate $Xi
+ sub \$0x80,$len
+ jmp .Loop8x_avx
+
+.align 32
+.Loop8x_avx:
+ vpunpckhqdq $Ij,$Ij,$T1
+ vmovdqu 0x70($inp),$Ii # I[7]
+ vpxor $Xlo,$Zlo,$Zlo
+ vpxor $Ij,$T1,$T1
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xi
+ vpshufb $bswap,$Ii,$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xo
+ vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Tred
+ vmovdqu 0x20-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+
+ vmovdqu 0x60($inp),$Ij # I[6]
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Zlo,$Xi,$Xi # collect result
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vxorps $Zhi,$Xo,$Xo
+ vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpclmulqdq \$0x00,$HK, $T2,$Xmi
+ vpxor $Zmi,$Tred,$Tred
+ vxorps $Ij,$T1,$T1
+
+ vmovdqu 0x50($inp),$Ii # I[5]
+ vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpxor $Xo,$Tred,$Tred
+ vpslldq \$8,$Tred,$T2
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vpsrldq \$8,$Tred,$Tred
+ vpxor $T2, $Xi, $Xi
+ vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
+ vpshufb $bswap,$Ii,$Ii
+ vxorps $Tred,$Xo, $Xo
+ vpxor $Xhi,$Zhi,$Zhi
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpclmulqdq \$0x10,$HK, $T1,$Zmi
+ vmovdqu 0x50-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+ vpxor $Xmi,$Zmi,$Zmi
+
+ vmovdqu 0x40($inp),$Ij # I[4]
+ vpalignr \$8,$Xi,$Xi,$Tred # 1st phase
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpshufb $bswap,$Ij,$Ij
+ vpxor $Zlo,$Xlo,$Xlo
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Zhi,$Xhi,$Xhi
+ vpclmulqdq \$0x00,$HK, $T2,$Xmi
+ vxorps $Ij,$T1,$T1
+ vpxor $Zmi,$Xmi,$Xmi
+
+ vmovdqu 0x30($inp),$Ii # I[3]
+ vpclmulqdq \$0x10,(%r10),$Xi,$Xi
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpshufb $bswap,$Ii,$Ii
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x10,$HK, $T1,$Zmi
+ vmovdqu 0x80-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+ vpxor $Xmi,$Zmi,$Zmi
+
+ vmovdqu 0x20($inp),$Ij # I[2]
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpshufb $bswap,$Ij,$Ij
+ vpxor $Zlo,$Xlo,$Xlo
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Zhi,$Xhi,$Xhi
+ vpclmulqdq \$0x00,$HK, $T2,$Xmi
+ vpxor $Ij,$T1,$T1
+ vpxor $Zmi,$Xmi,$Xmi
+ vxorps $Tred,$Xi,$Xi
+
+ vmovdqu 0x10($inp),$Ii # I[1]
+ vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpshufb $bswap,$Ii,$Ii
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
+ vpclmulqdq \$0x10,(%r10),$Xi,$Xi
+ vxorps $Xo,$Tred,$Tred
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x10,$HK, $T1,$Zmi
+ vmovdqu 0xb0-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+ vpxor $Xmi,$Zmi,$Zmi
+
+ vmovdqu ($inp),$Ij # I[0]
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
+ vpxor $Tred,$Ij,$Ij
+ vpclmulqdq \$0x10,$HK, $T2,$Xmi
+ vpxor $Xi,$Ij,$Ij # accumulate $Xi
+
+ lea 0x80($inp),$inp
+ sub \$0x80,$len
+ jnc .Loop8x_avx
+
+ add \$0x80,$len
+ jmp .Ltail_no_xor_avx
+
+.align 32
+.Lshort_avx:
+ vmovdqu -0x10($inp,$len),$Ii # very last word
+ lea ($inp,$len),$inp
+ vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
+ vmovdqu 0x20-0x40($Htbl),$HK
+ vpshufb $bswap,$Ii,$Ij
+
+ vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo,
+ vmovdqa $Xhi,$Zhi # $Zhi and
+ vmovdqa $Xmi,$Zmi # $Zmi
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x20($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vpsrldq \$8,$HK,$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x30($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vmovdqu 0x50-0x40($Htbl),$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x40($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vpsrldq \$8,$HK,$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x50($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vmovdqu 0x80-0x40($Htbl),$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x60($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vpsrldq \$8,$HK,$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x70($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vmovq 0xb8-0x40($Htbl),$HK
+ sub \$0x10,$len
+ jmp .Ltail_avx
+
+.align 32
+.Ltail_avx:
+ vpxor $Xi,$Ij,$Ij # accumulate $Xi
+.Ltail_no_xor_avx:
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+
+ vmovdqu (%r10),$Tred
+
+ vpxor $Xlo,$Zlo,$Xi
+ vpxor $Xhi,$Zhi,$Xo
+ vpxor $Xmi,$Zmi,$Zmi
+
+ vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing
+ vpxor $Xo, $Zmi,$Zmi
+ vpslldq \$8, $Zmi,$T2
+ vpsrldq \$8, $Zmi,$Zmi
+ vpxor $T2, $Xi, $Xi
+ vpxor $Zmi,$Xo, $Xo
+
+ vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase
+ vpalignr \$8,$Xi,$Xi,$Xi
+ vpxor $T2,$Xi,$Xi
+
+ vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase
+ vpalignr \$8,$Xi,$Xi,$Xi
+ vpxor $Xo,$Xi,$Xi
+ vpxor $T2,$Xi,$Xi
+
+ cmp \$0,$len
+ jne .Lshort_avx
+
+ vpshufb $bswap,$Xi,$Xi
+ vmovdqu $Xi,($Xip)
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ movaps 0x40(%rsp),%xmm10
+ movaps 0x50(%rsp),%xmm11
+ movaps 0x60(%rsp),%xmm12
+ movaps 0x70(%rsp),%xmm13
+ movaps 0x80(%rsp),%xmm14
+ movaps 0x90(%rsp),%xmm15
+ lea 0xa8(%rsp),%rsp
+.LSEH_end_gcm_ghash_avx:
+___
+$code.=<<___;
+ ret
+.size gcm_ghash_avx,.-gcm_ghash_avx
+___
+} else {
+$code.=<<___;
+ jmp .L_ghash_clmul
+.size gcm_ghash_avx,.-gcm_ghash_avx
+___
+}
+
+$code.=<<___;
.align 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.L7_mask:
+ .long 7,0,7,0
+.L7_mask_poly:
+ .long 7,0,`0xE1<<1`,0
.align 64
.type .Lrem_4bit,\@object
.Lrem_4bit:
@@ -774,10 +1698,24 @@ se_handler:
.rva .LSEH_end_gcm_ghash_4bit
.rva .LSEH_info_gcm_ghash_4bit
+ .rva .LSEH_begin_gcm_init_clmul
+ .rva .LSEH_end_gcm_init_clmul
+ .rva .LSEH_info_gcm_init_clmul
+
.rva .LSEH_begin_gcm_ghash_clmul
.rva .LSEH_end_gcm_ghash_clmul
.rva .LSEH_info_gcm_ghash_clmul
+___
+$code.=<<___ if ($avx);
+ .rva .LSEH_begin_gcm_init_avx
+ .rva .LSEH_end_gcm_init_avx
+ .rva .LSEH_info_gcm_init_clmul
+ .rva .LSEH_begin_gcm_ghash_avx
+ .rva .LSEH_end_gcm_ghash_avx
+ .rva .LSEH_info_gcm_ghash_clmul
+___
+$code.=<<___;
.section .xdata
.align 8
.LSEH_info_gcm_gmult_4bit:
@@ -788,14 +1726,23 @@ se_handler:
.byte 9,0,0,0
.rva se_handler
.rva .Lghash_prologue,.Lghash_epilogue # HandlerData
+.LSEH_info_gcm_init_clmul:
+ .byte 0x01,0x08,0x03,0x00
+ .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
+ .byte 0x04,0x22,0x00,0x00 #sub rsp,0x18
.LSEH_info_gcm_ghash_clmul:
- .byte 0x01,0x1f,0x0b,0x00
- .byte 0x1f,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
- .byte 0x19,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
- .byte 0x13,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
- .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
- .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
- .byte 0x04,0xa2,0x00,0x00 #sub rsp,0x58
+ .byte 0x01,0x33,0x16,0x00
+ .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15
+ .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14
+ .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13
+ .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12
+ .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11
+ .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
+ .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
+ .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
+ .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
+ .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
+ .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
___
}
diff --git a/openssl/crypto/modes/asm/ghashp8-ppc.pl b/openssl/crypto/modes/asm/ghashp8-ppc.pl
new file mode 100755
index 000000000..e76a58c34
--- /dev/null
+++ b/openssl/crypto/modes/asm/ghashp8-ppc.pl
@@ -0,0 +1,234 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# GHASH for PowerISA v2.07.
+#
+# July 2014
+#
+# Accurate performance measurements are problematic, because it's
+# always a virtualized setup with a possibly throttled processor.
+# Relative comparison is therefore more informative. This initial
+# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
+# faster than "4-bit" integer-only compiler-generated 64-bit code.
+# "Initial version" means that there is room for futher improvement.
+
+$flavour=shift;
+$output =shift;
+
+if ($flavour =~ /64/) {
+ $SIZE_T=8;
+ $LRSAVE=2*$SIZE_T;
+ $STU="stdu";
+ $POP="ld";
+ $PUSH="std";
+} elsif ($flavour =~ /32/) {
+ $SIZE_T=4;
+ $LRSAVE=$SIZE_T;
+ $STU="stwu";
+ $POP="lwz";
+ $PUSH="stw";
+} else { die "nonsense $flavour"; }
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
+
+my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block
+
+my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
+my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
+my $vrsave="r12";
+
+$code=<<___;
+.machine "any"
+
+.text
+
+.globl .gcm_init_p8
+.align 5
+.gcm_init_p8:
+ lis r0,0xfff0
+ li r8,0x10
+ mfspr $vrsave,256
+ li r9,0x20
+ mtspr 256,r0
+ li r10,0x30
+ lvx_u $H,0,r4 # load H
+
+ vspltisb $xC2,-16 # 0xf0
+ vspltisb $t0,1 # one
+ vaddubm $xC2,$xC2,$xC2 # 0xe0
+ vxor $zero,$zero,$zero
+ vor $xC2,$xC2,$t0 # 0xe1
+ vsldoi $xC2,$xC2,$zero,15 # 0xe1...
+ vsldoi $t1,$zero,$t0,1 # ...1
+ vaddubm $xC2,$xC2,$xC2 # 0xc2...
+ vspltisb $t2,7
+ vor $xC2,$xC2,$t1 # 0xc2....01
+ vspltb $t1,$H,0 # most significant byte
+ vsl $H,$H,$t0 # H<<=1
+ vsrab $t1,$t1,$t2 # broadcast carry bit
+ vand $t1,$t1,$xC2
+ vxor $H,$H,$t1 # twisted H
+
+ vsldoi $H,$H,$H,8 # twist even more ...
+ vsldoi $xC2,$zero,$xC2,8 # 0xc2.0
+ vsldoi $Hl,$zero,$H,8 # ... and split
+ vsldoi $Hh,$H,$zero,8
+
+ stvx_u $xC2,0,r3 # save pre-computed table
+ stvx_u $Hl,r8,r3
+ stvx_u $H, r9,r3
+ stvx_u $Hh,r10,r3
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
+.size .gcm_init_p8,.-.gcm_init_p8
+
+.globl .gcm_gmult_p8
+.align 5
+.gcm_gmult_p8:
+ lis r0,0xfff8
+ li r8,0x10
+ mfspr $vrsave,256
+ li r9,0x20
+ mtspr 256,r0
+ li r10,0x30
+ lvx_u $IN,0,$Xip # load Xi
+
+ lvx_u $Hl,r8,$Htbl # load pre-computed table
+ le?lvsl $lemask,r0,r0
+ lvx_u $H, r9,$Htbl
+ le?vspltisb $t0,0x07
+ lvx_u $Hh,r10,$Htbl
+ le?vxor $lemask,$lemask,$t0
+ lvx_u $xC2,0,$Htbl
+ le?vperm $IN,$IN,$IN,$lemask
+ vxor $zero,$zero,$zero
+
+ vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
+ vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
+ vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
+
+ vpmsumd $t2,$Xl,$xC2 # 1st phase
+
+ vsldoi $t0,$Xm,$zero,8
+ vsldoi $t1,$zero,$Xm,8
+ vxor $Xl,$Xl,$t0
+ vxor $Xh,$Xh,$t1
+
+ vsldoi $Xl,$Xl,$Xl,8
+ vxor $Xl,$Xl,$t2
+
+ vsldoi $t1,$Xl,$Xl,8 # 2nd phase
+ vpmsumd $Xl,$Xl,$xC2
+ vxor $t1,$t1,$Xh
+ vxor $Xl,$Xl,$t1
+
+ le?vperm $Xl,$Xl,$Xl,$lemask
+ stvx_u $Xl,0,$Xip # write out Xi
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
+.size .gcm_gmult_p8,.-.gcm_gmult_p8
+
+.globl .gcm_ghash_p8
+.align 5
+.gcm_ghash_p8:
+ lis r0,0xfff8
+ li r8,0x10
+ mfspr $vrsave,256
+ li r9,0x20
+ mtspr 256,r0
+ li r10,0x30
+ lvx_u $Xl,0,$Xip # load Xi
+
+ lvx_u $Hl,r8,$Htbl # load pre-computed table
+ le?lvsl $lemask,r0,r0
+ lvx_u $H, r9,$Htbl
+ le?vspltisb $t0,0x07
+ lvx_u $Hh,r10,$Htbl
+ le?vxor $lemask,$lemask,$t0
+ lvx_u $xC2,0,$Htbl
+ le?vperm $Xl,$Xl,$Xl,$lemask
+ vxor $zero,$zero,$zero
+
+ lvx_u $IN,0,$inp
+ addi $inp,$inp,16
+ subi $len,$len,16
+ le?vperm $IN,$IN,$IN,$lemask
+ vxor $IN,$IN,$Xl
+ b Loop
+
+.align 5
+Loop:
+ subic $len,$len,16
+ vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
+ subfe. r0,r0,r0 # borrow?-1:0
+ vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
+ and r0,r0,$len
+ vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
+ add $inp,$inp,r0
+
+ vpmsumd $t2,$Xl,$xC2 # 1st phase
+
+ vsldoi $t0,$Xm,$zero,8
+ vsldoi $t1,$zero,$Xm,8
+ vxor $Xl,$Xl,$t0
+ vxor $Xh,$Xh,$t1
+
+ vsldoi $Xl,$Xl,$Xl,8
+ vxor $Xl,$Xl,$t2
+ lvx_u $IN,0,$inp
+ addi $inp,$inp,16
+
+ vsldoi $t1,$Xl,$Xl,8 # 2nd phase
+ vpmsumd $Xl,$Xl,$xC2
+ le?vperm $IN,$IN,$IN,$lemask
+ vxor $t1,$t1,$Xh
+ vxor $IN,$IN,$t1
+ vxor $IN,$IN,$Xl
+ beq Loop # did $len-=16 borrow?
+
+ vxor $Xl,$Xl,$t1
+ le?vperm $Xl,$Xl,$Xl,$lemask
+ stvx_u $Xl,0,$Xip # write out Xi
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
+.size .gcm_ghash_p8,.-.gcm_ghash_p8
+
+.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+foreach (split("\n",$code)) {
+ if ($flavour =~ /le$/o) { # little-endian
+ s/le\?//o or
+ s/be\?/#be#/o;
+ } else {
+ s/le\?/#le#/o or
+ s/be\?//o;
+ }
+ print $_,"\n";
+}
+
+close STDOUT; # enforce flush
diff --git a/openssl/crypto/modes/asm/ghashv8-armx.pl b/openssl/crypto/modes/asm/ghashv8-armx.pl
new file mode 100755
index 000000000..54a1ac4db
--- /dev/null
+++ b/openssl/crypto/modes/asm/ghashv8-armx.pl
@@ -0,0 +1,241 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
+#
+# June 2014
+#
+# Initial version was developed in tight cooperation with Ard
+# Biesheuvel <ard.biesheuvel@linaro.org> out of bits-n-pieces from
+# other assembly modules. Just like aesv8-armx.pl, this module
+# supports both AArch32 and AArch64 execution modes.
+#
+# Current performance in cycles per processed byte:
+#
+# PMULL[2] 32-bit NEON(*)
+# Apple A7 1.76 5.62
+# Cortex-A53 1.45 8.39
+# Cortex-A57 2.22 7.61
+#
+# (*) presented for reference/comparison purposes;
+
+$flavour = shift;
+open STDOUT,">".shift;
+
+$Xi="x0"; # argument block
+$Htbl="x1";
+$inp="x2";
+$len="x3";
+
+$inc="x12";
+
+{
+my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
+my ($t0,$t1,$t2,$t3,$H,$Hhl)=map("q$_",(8..14));
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+___
+$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
+$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/);
+
+$code.=<<___;
+.global gcm_init_v8
+.type gcm_init_v8,%function
+.align 4
+gcm_init_v8:
+ vld1.64 {$t1},[x1] @ load H
+ vmov.i8 $t0,#0xe1
+ vext.8 $IN,$t1,$t1,#8
+ vshl.i64 $t0,$t0,#57
+ vshr.u64 $t2,$t0,#63
+ vext.8 $t0,$t2,$t0,#8 @ t0=0xc2....01
+ vdup.32 $t1,${t1}[1]
+ vshr.u64 $t3,$IN,#63
+ vshr.s32 $t1,$t1,#31 @ broadcast carry bit
+ vand $t3,$t3,$t0
+ vshl.i64 $IN,$IN,#1
+ vext.8 $t3,$t3,$t3,#8
+ vand $t0,$t0,$t1
+ vorr $IN,$IN,$t3 @ H<<<=1
+ veor $IN,$IN,$t0 @ twisted H
+ vst1.64 {$IN},[x0]
+
+ ret
+.size gcm_init_v8,.-gcm_init_v8
+
+.global gcm_gmult_v8
+.type gcm_gmult_v8,%function
+.align 4
+gcm_gmult_v8:
+ vld1.64 {$t1},[$Xi] @ load Xi
+ vmov.i8 $t3,#0xe1
+ vld1.64 {$H},[$Htbl] @ load twisted H
+ vshl.u64 $t3,$t3,#57
+#ifndef __ARMEB__
+ vrev64.8 $t1,$t1
+#endif
+ vext.8 $Hhl,$H,$H,#8
+ mov $len,#0
+ vext.8 $IN,$t1,$t1,#8
+ mov $inc,#0
+ veor $Hhl,$Hhl,$H @ Karatsuba pre-processing
+ mov $inp,$Xi
+ b .Lgmult_v8
+.size gcm_gmult_v8,.-gcm_gmult_v8
+
+.global gcm_ghash_v8
+.type gcm_ghash_v8,%function
+.align 4
+gcm_ghash_v8:
+ vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
+ subs $len,$len,#16
+ vmov.i8 $t3,#0xe1
+ mov $inc,#16
+ vld1.64 {$H},[$Htbl] @ load twisted H
+ cclr $inc,eq
+ vext.8 $Xl,$Xl,$Xl,#8
+ vshl.u64 $t3,$t3,#57
+ vld1.64 {$t1},[$inp],$inc @ load [rotated] inp
+ vext.8 $Hhl,$H,$H,#8
+#ifndef __ARMEB__
+ vrev64.8 $Xl,$Xl
+ vrev64.8 $t1,$t1
+#endif
+ veor $Hhl,$Hhl,$H @ Karatsuba pre-processing
+ vext.8 $IN,$t1,$t1,#8
+ b .Loop_v8
+
+.align 4
+.Loop_v8:
+ vext.8 $t2,$Xl,$Xl,#8
+ veor $IN,$IN,$Xl @ inp^=Xi
+ veor $t1,$t1,$t2 @ $t1 is rotated inp^Xi
+
+.Lgmult_v8:
+ vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
+ veor $t1,$t1,$IN @ Karatsuba pre-processing
+ vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
+ subs $len,$len,#16
+ vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+ cclr $inc,eq
+
+ vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
+ veor $t2,$Xl,$Xh
+ veor $Xm,$Xm,$t1
+ vld1.64 {$t1},[$inp],$inc @ load [rotated] inp
+ veor $Xm,$Xm,$t2
+ vpmull.p64 $t2,$Xl,$t3 @ 1st phase
+
+ vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
+ vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
+#ifndef __ARMEB__
+ vrev64.8 $t1,$t1
+#endif
+ veor $Xl,$Xm,$t2
+ vext.8 $IN,$t1,$t1,#8
+
+ vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
+ vpmull.p64 $Xl,$Xl,$t3
+ veor $t2,$t2,$Xh
+ veor $Xl,$Xl,$t2
+ b.hs .Loop_v8
+
+#ifndef __ARMEB__
+ vrev64.8 $Xl,$Xl
+#endif
+ vext.8 $Xl,$Xl,$Xl,#8
+ vst1.64 {$Xl},[$Xi] @ write out Xi
+
+ ret
+.size gcm_ghash_v8,.-gcm_ghash_v8
+___
+}
+$code.=<<___;
+.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+if ($flavour =~ /64/) { ######## 64-bit code
+ sub unvmov {
+ my $arg=shift;
+
+ $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
+ sprintf "ins v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
+ }
+ foreach(split("\n",$code)) {
+ s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
+ s/vmov\.i8/movi/o or # fix up legacy mnemonics
+ s/vmov\s+(.*)/unvmov($1)/geo or
+ s/vext\.8/ext/o or
+ s/vshr\.s/sshr\.s/o or
+ s/vshr/ushr/o or
+ s/^(\s+)v/$1/o or # strip off v prefix
+ s/\bbx\s+lr\b/ret/o;
+
+ s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
+ s/@\s/\/\//o; # old->new style commentary
+
+	# fix up remaining legacy suffixes
+ s/\.[ui]?8(\s)/$1/o;
+ s/\.[uis]?32//o and s/\.16b/\.4s/go;
+ m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument
+ m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments
+ s/\.[uisp]?64//o and s/\.16b/\.2d/go;
+ s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
+
+ print $_,"\n";
+ }
+} else { ######## 32-bit code
+ sub unvdup32 {
+ my $arg=shift;
+
+ $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
+ sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
+ }
+ sub unvpmullp64 {
+ my ($mnemonic,$arg)=@_;
+
+ if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
+ my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
+ |(($2&7)<<17)|(($2&8)<<4)
+ |(($3&7)<<1) |(($3&8)<<2);
+ $word |= 0x00010001 if ($mnemonic =~ "2");
+	    # Emit raw bytes, since ARMv7 instructions are always encoded
+	    # little-endian. The correct solution is to use the .inst
+	    # directive, but older assemblers don't implement it:-(
+ sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+ $word&0xff,($word>>8)&0xff,
+ ($word>>16)&0xff,($word>>24)&0xff,
+ $mnemonic,$arg;
+ }
+ }
+
+ foreach(split("\n",$code)) {
+ s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
+ s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
+ s/\/\/\s?/@ /o; # new->old style commentary
+
+	# fix up remaining new-style suffixes
+ s/\],#[0-9]+/]!/o;
+
+ s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
+ s/vdup\.32\s+(.*)/unvdup32($1)/geo or
+ s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
+ s/^(\s+)b\./$1b/o or
+ s/^(\s+)ret/$1bx\tlr/o;
+
+ print $_,"\n";
+ }
+}
+
+close STDOUT; # enforce flush