diff options
Diffstat (limited to 'openssl/crypto/modes/asm')
-rwxr-xr-x | openssl/crypto/modes/asm/aesni-gcm-x86_64.pl | 1057 | ||||
-rw-r--r-- | openssl/crypto/modes/asm/ghash-armv4.pl | 232 | ||||
-rw-r--r-- | openssl/crypto/modes/asm/ghash-s390x.pl | 6 | ||||
-rw-r--r-- | openssl/crypto/modes/asm/ghash-sparcv9.pl | 247 | ||||
-rw-r--r-- | openssl/crypto/modes/asm/ghash-x86.pl | 199 | ||||
-rw-r--r-- | openssl/crypto/modes/asm/ghash-x86_64.pl | 1149 | ||||
-rwxr-xr-x | openssl/crypto/modes/asm/ghashp8-ppc.pl | 234 | ||||
-rwxr-xr-x | openssl/crypto/modes/asm/ghashv8-armx.pl | 241 |
8 files changed, 3100 insertions, 265 deletions
diff --git a/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl b/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl new file mode 100755 index 000000000..7e4e04ea2 --- /dev/null +++ b/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl @@ -0,0 +1,1057 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# +# AES-NI-CTR+GHASH stitch. +# +# February 2013 +# +# OpenSSL GCM implementation is organized in such way that its +# performance is rather close to the sum of its streamed components, +# in the context parallelized AES-NI CTR and modulo-scheduled +# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation +# was observed to perform significantly better than the sum of the +# components on contemporary CPUs, the effort was deemed impossible to +# justify. This module is based on combination of Intel submissions, +# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max +# Locktyukhin of Intel Corp. who verified that it reduces shuffles +# pressure with notable relative improvement, achieving 1.0 cycle per +# byte processed with 128-bit key on Haswell processor, and 0.74 - +# on Broadwell. [Mentioned results are raw profiled measurements for +# favourable packet size, one divisible by 96. Applications using the +# EVP interface will observe a few percent worse performance.] +# +# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest +# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); +} + +if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); +} + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +if ($avx>1) {{{ + +($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); + +($Ii,$T1,$T2,$Hkey, + $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8)); + +($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15)); + +($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15"); + +$code=<<___; +.text + +.type _aesni_ctr32_ghash_6x,\@abi-omnipotent +.align 32 +_aesni_ctr32_ghash_6x: + vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb + sub \$6,$len + vpxor $Z0,$Z0,$Z0 # $Z0 = 0 + vmovdqu 0x00-0x80($key),$rndkey + vpaddb $T2,$T1,$inout1 + vpaddb $T2,$inout1,$inout2 + vpaddb $T2,$inout2,$inout3 + vpaddb $T2,$inout3,$inout4 + vpaddb $T2,$inout4,$inout5 + vpxor $rndkey,$T1,$inout0 + vmovdqu $Z0,16+8(%rsp) # "$Z3" = 0 + jmp .Loop6x + +.align 32 +.Loop6x: + add \$`6<<24`,$counter + jc .Lhandle_ctr32 # discard $inout[1-5]? + vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1 + vpaddb $T2,$inout5,$T1 # next counter value + vpxor $rndkey,$inout1,$inout1 + vpxor $rndkey,$inout2,$inout2 + +.Lresume_ctr32: + vmovdqu $T1,($ivp) # save next counter value + vpclmulqdq \$0x10,$Hkey,$Z3,$Z1 + vpxor $rndkey,$inout3,$inout3 + vmovups 0x10-0x80($key),$T2 # borrow $T2 for $rndkey + vpclmulqdq \$0x01,$Hkey,$Z3,$Z2 + xor %r12,%r12 + cmp $in0,$end0 + + vaesenc $T2,$inout0,$inout0 + vmovdqu 0x30+8(%rsp),$Ii # I[4] + vpxor $rndkey,$inout4,$inout4 + vpclmulqdq \$0x00,$Hkey,$Z3,$T1 + vaesenc $T2,$inout1,$inout1 + vpxor $rndkey,$inout5,$inout5 + setnc %r12b + vpclmulqdq \$0x11,$Hkey,$Z3,$Z3 + vaesenc $T2,$inout2,$inout2 + vmovdqu 0x10-0x20($Xip),$Hkey # $Hkey^2 + neg %r12 + vaesenc $T2,$inout3,$inout3 + vpxor $Z1,$Z2,$Z2 + vpclmulqdq \$0x00,$Hkey,$Ii,$Z1 + vpxor $Z0,$Xi,$Xi # modulo-scheduled + vaesenc $T2,$inout4,$inout4 + vpxor $Z1,$T1,$Z0 + and \$0x60,%r12 + vmovups 0x20-0x80($key),$rndkey + vpclmulqdq \$0x10,$Hkey,$Ii,$T1 + vaesenc $T2,$inout5,$inout5 + + vpclmulqdq \$0x01,$Hkey,$Ii,$T2 + lea ($in0,%r12),$in0 + vaesenc $rndkey,$inout0,$inout0 + vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled [vpxor $Z3,$Xi,$Xi] + vpclmulqdq \$0x11,$Hkey,$Ii,$Hkey + vmovdqu 0x40+8(%rsp),$Ii # I[3] + vaesenc $rndkey,$inout1,$inout1 + movbe 0x58($in0),%r13 + vaesenc $rndkey,$inout2,$inout2 + movbe 0x50($in0),%r12 + vaesenc $rndkey,$inout3,$inout3 + mov %r13,0x20+8(%rsp) + vaesenc $rndkey,$inout4,$inout4 + mov %r12,0x28+8(%rsp) + vmovdqu 0x30-0x20($Xip),$Z1 # borrow $Z1 for $Hkey^3 + vaesenc $rndkey,$inout5,$inout5 + + vmovups 0x30-0x80($key),$rndkey + vpxor $T1,$Z2,$Z2 + vpclmulqdq \$0x00,$Z1,$Ii,$T1 + vaesenc $rndkey,$inout0,$inout0 + vpxor $T2,$Z2,$Z2 + vpclmulqdq \$0x10,$Z1,$Ii,$T2 + vaesenc $rndkey,$inout1,$inout1 + vpxor $Hkey,$Z3,$Z3 + vpclmulqdq \$0x01,$Z1,$Ii,$Hkey + vaesenc $rndkey,$inout2,$inout2 + vpclmulqdq \$0x11,$Z1,$Ii,$Z1 + vmovdqu 0x50+8(%rsp),$Ii # I[2] + vaesenc $rndkey,$inout3,$inout3 + vaesenc $rndkey,$inout4,$inout4 + vpxor $T1,$Z0,$Z0 + vmovdqu 0x40-0x20($Xip),$T1 # borrow $T1 for $Hkey^4 + vaesenc $rndkey,$inout5,$inout5 + + vmovups 0x40-0x80($key),$rndkey + vpxor $T2,$Z2,$Z2 + vpclmulqdq \$0x00,$T1,$Ii,$T2 + vaesenc $rndkey,$inout0,$inout0 + vpxor $Hkey,$Z2,$Z2 + vpclmulqdq \$0x10,$T1,$Ii,$Hkey + vaesenc $rndkey,$inout1,$inout1 + movbe 0x48($in0),%r13 + vpxor $Z1,$Z3,$Z3 + vpclmulqdq \$0x01,$T1,$Ii,$Z1 + vaesenc $rndkey,$inout2,$inout2 + movbe 0x40($in0),%r12 + vpclmulqdq \$0x11,$T1,$Ii,$T1 + vmovdqu 0x60+8(%rsp),$Ii # I[1] + vaesenc $rndkey,$inout3,$inout3 + mov %r13,0x30+8(%rsp) + vaesenc $rndkey,$inout4,$inout4 + mov %r12,0x38+8(%rsp) + vpxor $T2,$Z0,$Z0 + vmovdqu 0x60-0x20($Xip),$T2 # borrow $T2 for $Hkey^5 + vaesenc $rndkey,$inout5,$inout5 + + vmovups 0x50-0x80($key),$rndkey + vpxor $Hkey,$Z2,$Z2 + vpclmulqdq \$0x00,$T2,$Ii,$Hkey + vaesenc $rndkey,$inout0,$inout0 + vpxor $Z1,$Z2,$Z2 + vpclmulqdq \$0x10,$T2,$Ii,$Z1 + vaesenc $rndkey,$inout1,$inout1 + movbe 0x38($in0),%r13 + vpxor $T1,$Z3,$Z3 + vpclmulqdq \$0x01,$T2,$Ii,$T1 + vpxor 0x70+8(%rsp),$Xi,$Xi # accumulate I[0] + vaesenc $rndkey,$inout2,$inout2 + movbe 0x30($in0),%r12 + vpclmulqdq \$0x11,$T2,$Ii,$T2 + vaesenc $rndkey,$inout3,$inout3 + mov %r13,0x40+8(%rsp) + vaesenc $rndkey,$inout4,$inout4 + mov %r12,0x48+8(%rsp) + vpxor $Hkey,$Z0,$Z0 + vmovdqu 0x70-0x20($Xip),$Hkey # $Hkey^6 + vaesenc $rndkey,$inout5,$inout5 + + vmovups 0x60-0x80($key),$rndkey + vpxor $Z1,$Z2,$Z2 + vpclmulqdq \$0x10,$Hkey,$Xi,$Z1 + vaesenc $rndkey,$inout0,$inout0 + vpxor $T1,$Z2,$Z2 + vpclmulqdq \$0x01,$Hkey,$Xi,$T1 + vaesenc $rndkey,$inout1,$inout1 + movbe 0x28($in0),%r13 + vpxor $T2,$Z3,$Z3 + vpclmulqdq \$0x00,$Hkey,$Xi,$T2 + vaesenc $rndkey,$inout2,$inout2 + movbe 0x20($in0),%r12 + vpclmulqdq \$0x11,$Hkey,$Xi,$Xi + vaesenc $rndkey,$inout3,$inout3 + mov %r13,0x50+8(%rsp) + vaesenc $rndkey,$inout4,$inout4 + mov %r12,0x58+8(%rsp) + vpxor $Z1,$Z2,$Z2 + vaesenc $rndkey,$inout5,$inout5 + vpxor $T1,$Z2,$Z2 + + vmovups 0x70-0x80($key),$rndkey + vpslldq \$8,$Z2,$Z1 + vpxor $T2,$Z0,$Z0 + vmovdqu 0x10($const),$Hkey # .Lpoly + + vaesenc $rndkey,$inout0,$inout0 + vpxor $Xi,$Z3,$Z3 + vaesenc $rndkey,$inout1,$inout1 + vpxor $Z1,$Z0,$Z0 + movbe 0x18($in0),%r13 + vaesenc $rndkey,$inout2,$inout2 + movbe 0x10($in0),%r12 + vpalignr \$8,$Z0,$Z0,$Ii # 1st phase + vpclmulqdq \$0x10,$Hkey,$Z0,$Z0 + mov %r13,0x60+8(%rsp) + vaesenc $rndkey,$inout3,$inout3 + mov %r12,0x68+8(%rsp) + vaesenc $rndkey,$inout4,$inout4 + vmovups 0x80-0x80($key),$T1 # borrow $T1 for $rndkey + vaesenc $rndkey,$inout5,$inout5 + + vaesenc $T1,$inout0,$inout0 + vmovups 0x90-0x80($key),$rndkey + vaesenc $T1,$inout1,$inout1 + vpsrldq \$8,$Z2,$Z2 + vaesenc $T1,$inout2,$inout2 + vpxor $Z2,$Z3,$Z3 + vaesenc $T1,$inout3,$inout3 + vpxor $Ii,$Z0,$Z0 + movbe 0x08($in0),%r13 + vaesenc $T1,$inout4,$inout4 + movbe 0x00($in0),%r12 + vaesenc $T1,$inout5,$inout5 + vmovups 0xa0-0x80($key),$T1 + cmp \$11,$rounds + jb .Lenc_tail # 128-bit key + + vaesenc $rndkey,$inout0,$inout0 + vaesenc $rndkey,$inout1,$inout1 + vaesenc $rndkey,$inout2,$inout2 + vaesenc $rndkey,$inout3,$inout3 + vaesenc $rndkey,$inout4,$inout4 + vaesenc $rndkey,$inout5,$inout5 + + vaesenc $T1,$inout0,$inout0 + vaesenc $T1,$inout1,$inout1 + vaesenc $T1,$inout2,$inout2 + vaesenc $T1,$inout3,$inout3 + vaesenc $T1,$inout4,$inout4 + vmovups 0xb0-0x80($key),$rndkey + vaesenc $T1,$inout5,$inout5 + vmovups 0xc0-0x80($key),$T1 + je .Lenc_tail # 192-bit key + + vaesenc $rndkey,$inout0,$inout0 + vaesenc $rndkey,$inout1,$inout1 + vaesenc $rndkey,$inout2,$inout2 + vaesenc $rndkey,$inout3,$inout3 + vaesenc $rndkey,$inout4,$inout4 + vaesenc $rndkey,$inout5,$inout5 + + vaesenc $T1,$inout0,$inout0 + vaesenc $T1,$inout1,$inout1 + vaesenc $T1,$inout2,$inout2 + vaesenc $T1,$inout3,$inout3 + vaesenc $T1,$inout4,$inout4 + vmovups 0xd0-0x80($key),$rndkey + vaesenc $T1,$inout5,$inout5 + vmovups 0xe0-0x80($key),$T1 + jmp .Lenc_tail # 256-bit key + +.align 32 +.Lhandle_ctr32: + vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask + vpshufb $Ii,$T1,$Z2 # byte-swap counter + vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb + vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb + vpaddd $Z1,$Z2,$inout2 + vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1 + vpaddd $Z1,$inout1,$inout3 + vpshufb $Ii,$inout1,$inout1 + vpaddd $Z1,$inout2,$inout4 + vpshufb $Ii,$inout2,$inout2 + vpxor $rndkey,$inout1,$inout1 + vpaddd $Z1,$inout3,$inout5 + vpshufb $Ii,$inout3,$inout3 + vpxor $rndkey,$inout2,$inout2 + vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value + vpshufb $Ii,$inout4,$inout4 + vpshufb $Ii,$inout5,$inout5 + vpshufb $Ii,$T1,$T1 # next counter value + jmp .Lresume_ctr32 + +.align 32 +.Lenc_tail: + vaesenc $rndkey,$inout0,$inout0 + vmovdqu $Z3,16+8(%rsp) # postpone vpxor $Z3,$Xi,$Xi + vpalignr \$8,$Z0,$Z0,$Xi # 2nd phase + vaesenc $rndkey,$inout1,$inout1 + vpclmulqdq \$0x10,$Hkey,$Z0,$Z0 + vpxor 0x00($inp),$T1,$T2 + vaesenc $rndkey,$inout2,$inout2 + vpxor 0x10($inp),$T1,$Ii + vaesenc $rndkey,$inout3,$inout3 + vpxor 0x20($inp),$T1,$Z1 + vaesenc $rndkey,$inout4,$inout4 + vpxor 0x30($inp),$T1,$Z2 + vaesenc $rndkey,$inout5,$inout5 + vpxor 0x40($inp),$T1,$Z3 + vpxor 0x50($inp),$T1,$Hkey + vmovdqu ($ivp),$T1 # load next counter value + + vaesenclast $T2,$inout0,$inout0 + vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb + vaesenclast $Ii,$inout1,$inout1 + vpaddb $T2,$T1,$Ii + mov %r13,0x70+8(%rsp) + lea 0x60($inp),$inp + vaesenclast $Z1,$inout2,$inout2 + vpaddb $T2,$Ii,$Z1 + mov %r12,0x78+8(%rsp) + lea 0x60($out),$out + vmovdqu 0x00-0x80($key),$rndkey + vaesenclast $Z2,$inout3,$inout3 + vpaddb $T2,$Z1,$Z2 + vaesenclast $Z3, $inout4,$inout4 + vpaddb $T2,$Z2,$Z3 + vaesenclast $Hkey,$inout5,$inout5 + vpaddb $T2,$Z3,$Hkey + + add \$0x60,$ret + sub \$0x6,$len + jc .L6x_done + + vmovups $inout0,-0x60($out) # save output + vpxor $rndkey,$T1,$inout0 + vmovups $inout1,-0x50($out) + vmovdqa $Ii,$inout1 # 0 latency + vmovups $inout2,-0x40($out) + vmovdqa $Z1,$inout2 # 0 latency + vmovups $inout3,-0x30($out) + vmovdqa $Z2,$inout3 # 0 latency + vmovups $inout4,-0x20($out) + vmovdqa $Z3,$inout4 # 0 latency + vmovups $inout5,-0x10($out) + vmovdqa $Hkey,$inout5 # 0 latency + vmovdqu 0x20+8(%rsp),$Z3 # I[5] + jmp .Loop6x + +.L6x_done: + vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled + vpxor $Z0,$Xi,$Xi # modulo-scheduled + + ret +.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x +___ +###################################################################### +# +# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len, +# const AES_KEY *key, unsigned char iv[16], +# struct { u128 Xi,H,Htbl[9]; } *Xip); +$code.=<<___; +.globl aesni_gcm_decrypt +.type aesni_gcm_decrypt,\@function,6 +.align 32 +aesni_gcm_decrypt: + xor $ret,$ret + cmp \$0x60,$len # minimal accepted length + jb .Lgcm_dec_abort + + lea (%rsp),%rax # save stack pointer + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,-0xd8(%rax) + movaps %xmm7,-0xc8(%rax) + movaps %xmm8,-0xb8(%rax) + movaps %xmm9,-0xa8(%rax) + movaps %xmm10,-0x98(%rax) + movaps %xmm11,-0x88(%rax) + movaps %xmm12,-0x78(%rax) + movaps %xmm13,-0x68(%rax) + movaps %xmm14,-0x58(%rax) + movaps %xmm15,-0x48(%rax) +.Lgcm_dec_body: +___ +$code.=<<___; + vzeroupper + + vmovdqu ($ivp),$T1 # input counter value + add \$-128,%rsp + mov 12($ivp),$counter + lea .Lbswap_mask(%rip),$const + lea -0x80($key),$in0 # borrow $in0 + mov \$0xf80,$end0 # borrow $end0 + vmovdqu ($Xip),$Xi # load Xi + and \$-128,%rsp # ensure stack alignment + vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask + lea 0x80($key),$key # size optimization + lea 0x20+0x20($Xip),$Xip # size optimization + mov 0xf0-0x80($key),$rounds + vpshufb $Ii,$Xi,$Xi + + and $end0,$in0 + and %rsp,$end0 + sub $in0,$end0 + jc .Ldec_no_key_aliasing + cmp \$768,$end0 + jnc .Ldec_no_key_aliasing + sub $end0,%rsp # avoid aliasing with key +.Ldec_no_key_aliasing: + + vmovdqu 0x50($inp),$Z3 # I[5] + lea ($inp),$in0 + vmovdqu 0x40($inp),$Z0 + lea -0xc0($inp,$len),$end0 + vmovdqu 0x30($inp),$Z1 + shr \$4,$len + xor $ret,$ret + vmovdqu 0x20($inp),$Z2 + vpshufb $Ii,$Z3,$Z3 # passed to _aesni_ctr32_ghash_6x + vmovdqu 0x10($inp),$T2 + vpshufb $Ii,$Z0,$Z0 + vmovdqu ($inp),$Hkey + vpshufb $Ii,$Z1,$Z1 + vmovdqu $Z0,0x30(%rsp) + vpshufb $Ii,$Z2,$Z2 + vmovdqu $Z1,0x40(%rsp) + vpshufb $Ii,$T2,$T2 + vmovdqu $Z2,0x50(%rsp) + vpshufb $Ii,$Hkey,$Hkey + vmovdqu $T2,0x60(%rsp) + vmovdqu $Hkey,0x70(%rsp) + + call _aesni_ctr32_ghash_6x + + vmovups $inout0,-0x60($out) # save output + vmovups $inout1,-0x50($out) + vmovups $inout2,-0x40($out) + vmovups $inout3,-0x30($out) + vmovups $inout4,-0x20($out) + vmovups $inout5,-0x10($out) + + vpshufb ($const),$Xi,$Xi # .Lbswap_mask + vmovdqu $Xi,-0x40($Xip) # output Xi + + vzeroupper +___ +$code.=<<___ if ($win64); + movaps -0xd8(%rax),%xmm6 + movaps -0xd8(%rax),%xmm7 + movaps -0xb8(%rax),%xmm8 + movaps -0xa8(%rax),%xmm9 + movaps -0x98(%rax),%xmm10 + movaps -0x88(%rax),%xmm11 + movaps -0x78(%rax),%xmm12 + movaps -0x68(%rax),%xmm13 + movaps -0x58(%rax),%xmm14 + movaps -0x48(%rax),%xmm15 +___ +$code.=<<___; + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp # restore %rsp +.Lgcm_dec_abort: + mov $ret,%rax # return value + ret +.size aesni_gcm_decrypt,.-aesni_gcm_decrypt +___ + +$code.=<<___; +.type _aesni_ctr32_6x,\@abi-omnipotent +.align 32 +_aesni_ctr32_6x: + vmovdqu 0x00-0x80($key),$Z0 # borrow $Z0 for $rndkey + vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb + lea -1($rounds),%r13 + vmovups 0x10-0x80($key),$rndkey + lea 0x20-0x80($key),%r12 + vpxor $Z0,$T1,$inout0 + add \$`6<<24`,$counter + jc .Lhandle_ctr32_2 + vpaddb $T2,$T1,$inout1 + vpaddb $T2,$inout1,$inout2 + vpxor $Z0,$inout1,$inout1 + vpaddb $T2,$inout2,$inout3 + vpxor $Z0,$inout2,$inout2 + vpaddb $T2,$inout3,$inout4 + vpxor $Z0,$inout3,$inout3 + vpaddb $T2,$inout4,$inout5 + vpxor $Z0,$inout4,$inout4 + vpaddb $T2,$inout5,$T1 + vpxor $Z0,$inout5,$inout5 + jmp .Loop_ctr32 + +.align 16 +.Loop_ctr32: + vaesenc $rndkey,$inout0,$inout0 + vaesenc $rndkey,$inout1,$inout1 + vaesenc $rndkey,$inout2,$inout2 + vaesenc $rndkey,$inout3,$inout3 + vaesenc $rndkey,$inout4,$inout4 + vaesenc $rndkey,$inout5,$inout5 + vmovups (%r12),$rndkey + lea 0x10(%r12),%r12 + dec %r13d + jnz .Loop_ctr32 + + vmovdqu (%r12),$Hkey # last round key + vaesenc $rndkey,$inout0,$inout0 + vpxor 0x00($inp),$Hkey,$Z0 + vaesenc $rndkey,$inout1,$inout1 + vpxor 0x10($inp),$Hkey,$Z1 + vaesenc $rndkey,$inout2,$inout2 + vpxor 0x20($inp),$Hkey,$Z2 + vaesenc $rndkey,$inout3,$inout3 + vpxor 0x30($inp),$Hkey,$Xi + vaesenc $rndkey,$inout4,$inout4 + vpxor 0x40($inp),$Hkey,$T2 + vaesenc $rndkey,$inout5,$inout5 + vpxor 0x50($inp),$Hkey,$Hkey + lea 0x60($inp),$inp + + vaesenclast $Z0,$inout0,$inout0 + vaesenclast $Z1,$inout1,$inout1 + vaesenclast $Z2,$inout2,$inout2 + vaesenclast $Xi,$inout3,$inout3 + vaesenclast $T2,$inout4,$inout4 + vaesenclast $Hkey,$inout5,$inout5 + vmovups $inout0,0x00($out) + vmovups $inout1,0x10($out) + vmovups $inout2,0x20($out) + vmovups $inout3,0x30($out) + vmovups $inout4,0x40($out) + vmovups $inout5,0x50($out) + lea 0x60($out),$out + + ret +.align 32 +.Lhandle_ctr32_2: + vpshufb $Ii,$T1,$Z2 # byte-swap counter + vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb + vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb + vpaddd $Z1,$Z2,$inout2 + vpaddd $Z1,$inout1,$inout3 + vpshufb $Ii,$inout1,$inout1 + vpaddd $Z1,$inout2,$inout4 + vpshufb $Ii,$inout2,$inout2 + vpxor $Z0,$inout1,$inout1 + vpaddd $Z1,$inout3,$inout5 + vpshufb $Ii,$inout3,$inout3 + vpxor $Z0,$inout2,$inout2 + vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value + vpshufb $Ii,$inout4,$inout4 + vpxor $Z0,$inout3,$inout3 + vpshufb $Ii,$inout5,$inout5 + vpxor $Z0,$inout4,$inout4 + vpshufb $Ii,$T1,$T1 # next counter value + vpxor $Z0,$inout5,$inout5 + jmp .Loop_ctr32 +.size _aesni_ctr32_6x,.-_aesni_ctr32_6x + +.globl aesni_gcm_encrypt +.type aesni_gcm_encrypt,\@function,6 +.align 32 +aesni_gcm_encrypt: + xor $ret,$ret + cmp \$0x60*3,$len # minimal accepted length + jb .Lgcm_enc_abort + + lea (%rsp),%rax # save stack pointer + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,-0xd8(%rax) + movaps %xmm7,-0xc8(%rax) + movaps %xmm8,-0xb8(%rax) + movaps %xmm9,-0xa8(%rax) + movaps %xmm10,-0x98(%rax) + movaps %xmm11,-0x88(%rax) + movaps %xmm12,-0x78(%rax) + movaps %xmm13,-0x68(%rax) + movaps %xmm14,-0x58(%rax) + movaps %xmm15,-0x48(%rax) +.Lgcm_enc_body: +___ +$code.=<<___; + vzeroupper + + vmovdqu ($ivp),$T1 # input counter value + add \$-128,%rsp + mov 12($ivp),$counter + lea .Lbswap_mask(%rip),$const + lea -0x80($key),$in0 # borrow $in0 + mov \$0xf80,$end0 # borrow $end0 + lea 0x80($key),$key # size optimization + vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask + and \$-128,%rsp # ensure stack alignment + mov 0xf0-0x80($key),$rounds + + and $end0,$in0 + and %rsp,$end0 + sub $in0,$end0 + jc .Lenc_no_key_aliasing + cmp \$768,$end0 + jnc .Lenc_no_key_aliasing + sub $end0,%rsp # avoid aliasing with key +.Lenc_no_key_aliasing: + + lea ($out),$in0 + lea -0xc0($out,$len),$end0 + shr \$4,$len + + call _aesni_ctr32_6x + vpshufb $Ii,$inout0,$Xi # save bswapped output on stack + vpshufb $Ii,$inout1,$T2 + vmovdqu $Xi,0x70(%rsp) + vpshufb $Ii,$inout2,$Z0 + vmovdqu $T2,0x60(%rsp) + vpshufb $Ii,$inout3,$Z1 + vmovdqu $Z0,0x50(%rsp) + vpshufb $Ii,$inout4,$Z2 + vmovdqu $Z1,0x40(%rsp) + vpshufb $Ii,$inout5,$Z3 # passed to _aesni_ctr32_ghash_6x + vmovdqu $Z2,0x30(%rsp) + + call _aesni_ctr32_6x + + vmovdqu ($Xip),$Xi # load Xi + lea 0x20+0x20($Xip),$Xip # size optimization + sub \$12,$len + mov \$0x60*2,$ret + vpshufb $Ii,$Xi,$Xi + + call _aesni_ctr32_ghash_6x + vmovdqu 0x20(%rsp),$Z3 # I[5] + vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask + vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1 + vpunpckhqdq $Z3,$Z3,$T1 + vmovdqu 0x20-0x20($Xip),$rndkey # borrow $rndkey for $HK + vmovups $inout0,-0x60($out) # save output + vpshufb $Ii,$inout0,$inout0 # but keep bswapped copy + vpxor $Z3,$T1,$T1 + vmovups $inout1,-0x50($out) + vpshufb $Ii,$inout1,$inout1 + vmovups $inout2,-0x40($out) + vpshufb $Ii,$inout2,$inout2 + vmovups $inout3,-0x30($out) + vpshufb $Ii,$inout3,$inout3 + vmovups $inout4,-0x20($out) + vpshufb $Ii,$inout4,$inout4 + vmovups $inout5,-0x10($out) + vpshufb $Ii,$inout5,$inout5 + vmovdqu $inout0,0x10(%rsp) # free $inout0 +___ +{ my ($HK,$T3)=($rndkey,$inout0); + +$code.=<<___; + vmovdqu 0x30(%rsp),$Z2 # I[4] + vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2 + vpunpckhqdq $Z2,$Z2,$T2 + vpclmulqdq \$0x00,$Hkey,$Z3,$Z1 + vpxor $Z2,$T2,$T2 + vpclmulqdq \$0x11,$Hkey,$Z3,$Z3 + vpclmulqdq \$0x00,$HK,$T1,$T1 + + vmovdqu 0x40(%rsp),$T3 # I[3] + vpclmulqdq \$0x00,$Ii,$Z2,$Z0 + vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3 + vpxor $Z1,$Z0,$Z0 + vpunpckhqdq $T3,$T3,$Z1 + vpclmulqdq \$0x11,$Ii,$Z2,$Z2 + vpxor $T3,$Z1,$Z1 + vpxor $Z3,$Z2,$Z2 + vpclmulqdq \$0x10,$HK,$T2,$T2 + vmovdqu 0x50-0x20($Xip),$HK + vpxor $T1,$T2,$T2 + + vmovdqu 0x50(%rsp),$T1 # I[2] + vpclmulqdq \$0x00,$Hkey,$T3,$Z3 + vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4 + vpxor $Z0,$Z3,$Z3 + vpunpckhqdq $T1,$T1,$Z0 + vpclmulqdq \$0x11,$Hkey,$T3,$T3 + vpxor $T1,$Z0,$Z0 + vpxor $Z2,$T3,$T3 + vpclmulqdq \$0x00,$HK,$Z1,$Z1 + vpxor $T2,$Z1,$Z1 + + vmovdqu 0x60(%rsp),$T2 # I[1] + vpclmulqdq \$0x00,$Ii,$T1,$Z2 + vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5 + vpxor $Z3,$Z2,$Z2 + vpunpckhqdq $T2,$T2,$Z3 + vpclmulqdq \$0x11,$Ii,$T1,$T1 + vpxor $T2,$Z3,$Z3 + vpxor $T3,$T1,$T1 + vpclmulqdq \$0x10,$HK,$Z0,$Z0 + vmovdqu 0x80-0x20($Xip),$HK + vpxor $Z1,$Z0,$Z0 + + vpxor 0x70(%rsp),$Xi,$Xi # accumulate I[0] + vpclmulqdq \$0x00,$Hkey,$T2,$Z1 + vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6 + vpunpckhqdq $Xi,$Xi,$T3 + vpxor $Z2,$Z1,$Z1 + vpclmulqdq \$0x11,$Hkey,$T2,$T2 + vpxor $Xi,$T3,$T3 + vpxor $T1,$T2,$T2 + vpclmulqdq \$0x00,$HK,$Z3,$Z3 + vpxor $Z0,$Z3,$Z0 + + vpclmulqdq \$0x00,$Ii,$Xi,$Z2 + vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1 + vpunpckhqdq $inout5,$inout5,$T1 + vpclmulqdq \$0x11,$Ii,$Xi,$Xi + vpxor $inout5,$T1,$T1 + vpxor $Z1,$Z2,$Z1 + vpclmulqdq \$0x10,$HK,$T3,$T3 + vmovdqu 0x20-0x20($Xip),$HK + vpxor $T2,$Xi,$Z3 + vpxor $Z0,$T3,$Z2 + + vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2 + vpxor $Z1,$Z3,$T3 # aggregated Karatsuba post-processing + vpclmulqdq \$0x00,$Hkey,$inout5,$Z0 + vpxor $T3,$Z2,$Z2 + vpunpckhqdq $inout4,$inout4,$T2 + vpclmulqdq \$0x11,$Hkey,$inout5,$inout5 + vpxor $inout4,$T2,$T2 + vpslldq \$8,$Z2,$T3 + vpclmulqdq \$0x00,$HK,$T1,$T1 + vpxor $T3,$Z1,$Xi + vpsrldq \$8,$Z2,$Z2 + vpxor $Z2,$Z3,$Z3 + + vpclmulqdq \$0x00,$Ii,$inout4,$Z1 + vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3 + vpxor $Z0,$Z1,$Z1 + vpunpckhqdq $inout3,$inout3,$T3 + vpclmulqdq \$0x11,$Ii,$inout4,$inout4 + vpxor $inout3,$T3,$T3 + vpxor $inout5,$inout4,$inout4 + vpalignr \$8,$Xi,$Xi,$inout5 # 1st phase + vpclmulqdq \$0x10,$HK,$T2,$T2 + vmovdqu 0x50-0x20($Xip),$HK + vpxor $T1,$T2,$T2 + + vpclmulqdq \$0x00,$Hkey,$inout3,$Z0 + vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4 + vpxor $Z1,$Z0,$Z0 + vpunpckhqdq $inout2,$inout2,$T1 + vpclmulqdq \$0x11,$Hkey,$inout3,$inout3 + vpxor $inout2,$T1,$T1 + vpxor $inout4,$inout3,$inout3 + vxorps 0x10(%rsp),$Z3,$Z3 # accumulate $inout0 + vpclmulqdq \$0x00,$HK,$T3,$T3 + vpxor $T2,$T3,$T3 + + vpclmulqdq \$0x10,0x10($const),$Xi,$Xi + vxorps $inout5,$Xi,$Xi + + vpclmulqdq \$0x00,$Ii,$inout2,$Z1 + vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5 + vpxor $Z0,$Z1,$Z1 + vpunpckhqdq $inout1,$inout1,$T2 + vpclmulqdq \$0x11,$Ii,$inout2,$inout2 + vpxor $inout1,$T2,$T2 + vpalignr \$8,$Xi,$Xi,$inout5 # 2nd phase + vpxor $inout3,$inout2,$inout2 + vpclmulqdq \$0x10,$HK,$T1,$T1 + vmovdqu 0x80-0x20($Xip),$HK + vpxor $T3,$T1,$T1 + + vxorps $Z3,$inout5,$inout5 + vpclmulqdq \$0x10,0x10($const),$Xi,$Xi + vxorps $inout5,$Xi,$Xi + + vpclmulqdq \$0x00,$Hkey,$inout1,$Z0 + vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6 + vpxor $Z1,$Z0,$Z0 + vpunpckhqdq $Xi,$Xi,$T3 + vpclmulqdq \$0x11,$Hkey,$inout1,$inout1 + vpxor $Xi,$T3,$T3 + vpxor $inout2,$inout1,$inout1 + vpclmulqdq \$0x00,$HK,$T2,$T2 + vpxor $T1,$T2,$T2 + + vpclmulqdq \$0x00,$Ii,$Xi,$Z1 + vpclmulqdq \$0x11,$Ii,$Xi,$Z3 + vpxor $Z0,$Z1,$Z1 + vpclmulqdq \$0x10,$HK,$T3,$Z2 + vpxor $inout1,$Z3,$Z3 + vpxor $T2,$Z2,$Z2 + + vpxor $Z1,$Z3,$Z0 # aggregated Karatsuba post-processing + vpxor $Z0,$Z2,$Z2 + vpslldq \$8,$Z2,$T1 + vmovdqu 0x10($const),$Hkey # .Lpoly + vpsrldq \$8,$Z2,$Z2 + vpxor $T1,$Z1,$Xi + vpxor $Z2,$Z3,$Z3 + + vpalignr \$8,$Xi,$Xi,$T2 # 1st phase + vpclmulqdq \$0x10,$Hkey,$Xi,$Xi + vpxor $T2,$Xi,$Xi + + vpalignr \$8,$Xi,$Xi,$T2 # 2nd phase + vpclmulqdq \$0x10,$Hkey,$Xi,$Xi + vpxor $Z3,$T2,$T2 + vpxor $T2,$Xi,$Xi +___ +} +$code.=<<___; + vpshufb ($const),$Xi,$Xi # .Lbswap_mask + vmovdqu $Xi,-0x40($Xip) # output Xi + + vzeroupper +___ +$code.=<<___ if ($win64); + movaps -0xd8(%rax),%xmm6 + movaps -0xc8(%rax),%xmm7 + movaps -0xb8(%rax),%xmm8 + movaps -0xa8(%rax),%xmm9 + movaps -0x98(%rax),%xmm10 + movaps -0x88(%rax),%xmm11 + movaps -0x78(%rax),%xmm12 + movaps -0x68(%rax),%xmm13 + movaps -0x58(%rax),%xmm14 + movaps -0x48(%rax),%xmm15 +___ +$code.=<<___; + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp # restore %rsp +.Lgcm_enc_abort: + mov $ret,%rax # return value + ret +.size aesni_gcm_encrypt,.-aesni_gcm_encrypt +___ + +$code.=<<___; +.align 64 +.Lbswap_mask: + .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.Lpoly: + .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.Lone_msb: + .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +.Ltwo_lsb: + .byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.Lone_lsb: + .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.asciz "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>" +.align 64 +___ +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___ +.extern __imp_RtlVirtualUnwind +.type gcm_se_handler,\@abi-omnipotent +.align 16 +gcm_se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<prologue label + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 120($context),%rax # pull context->Rax + + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + mov %r15,240($context) + mov %r14,232($context) + mov %r13,224($context) + mov %r12,216($context) + mov %rbp,160($context) + mov %rbx,144($context) + + lea -0xd8(%rax),%rsi # %xmm save area + lea 512($context),%rdi # & context.Xmm6 + mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) + .long 0xa548f3fc # cld; rep movsq + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size gcm_se_handler,.-gcm_se_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_aesni_gcm_decrypt + .rva .LSEH_end_aesni_gcm_decrypt + .rva .LSEH_gcm_dec_info + + .rva .LSEH_begin_aesni_gcm_encrypt + .rva .LSEH_end_aesni_gcm_encrypt + .rva .LSEH_gcm_enc_info +.section .xdata +.align 8 +.LSEH_gcm_dec_info: + .byte 9,0,0,0 + .rva gcm_se_handler + .rva .Lgcm_dec_body,.Lgcm_dec_abort +.LSEH_gcm_enc_info: + .byte 9,0,0,0 + .rva gcm_se_handler + .rva .Lgcm_enc_body,.Lgcm_enc_abort +___ +} +}}} else {{{ +$code=<<___; # assembler is too old +.text + +.globl aesni_gcm_encrypt +.type aesni_gcm_encrypt,\@abi-omnipotent +aesni_gcm_encrypt: + xor %eax,%eax + ret +.size aesni_gcm_encrypt,.-aesni_gcm_encrypt + +.globl aesni_gcm_decrypt +.type aesni_gcm_decrypt,\@abi-omnipotent +aesni_gcm_decrypt: + xor %eax,%eax + ret +.size aesni_gcm_decrypt,.-aesni_gcm_decrypt +___ +}}} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +print $code; + +close STDOUT; diff --git a/openssl/crypto/modes/asm/ghash-armv4.pl b/openssl/crypto/modes/asm/ghash-armv4.pl index d91586ee2..77fbf3446 100644 --- a/openssl/crypto/modes/asm/ghash-armv4.pl +++ b/openssl/crypto/modes/asm/ghash-armv4.pl @@ -35,6 +35,20 @@ # Add NEON implementation featuring polynomial multiplication, i.e. no # lookup tables involved. On Cortex A8 it was measured to process one # byte in 15 cycles or 55% faster than integer-only code. +# +# April 2014 +# +# Switch to multiplication algorithm suggested in paper referred +# below and combine it with reduction algorithm from x86 module. +# Performance improvement over previous version varies from 65% on +# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8 +# processes one byte in 8.45 cycles, A9 - in 10.2, Snapdragon S4 - +# in 9.33. +# +# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software +# Polynomial Multiplication on ARM Processors using the NEON Engine. +# +# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf # ==================================================================== # Note about "528B" variant. In ARM case it makes lesser sense to @@ -303,117 +317,161 @@ $code.=<<___; .size gcm_gmult_4bit,.-gcm_gmult_4bit ___ { -my $cnt=$Htbl; # $Htbl is used once in the very beginning - -my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7)); -my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15)); - -# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit -# in Zo. Or should I say "top bit", because GHASH is specified in -# reverse bit order? Otherwise straightforward 128-bt H by one input -# byte multiplication and modulo-reduction, times 16. +my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); +my ($t0,$t1,$t2,$t3)=map("q$_",(8..12)); +my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31)); -sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } -sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } -sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; } +sub clmul64x64 { +my ($r,$a,$b)=@_; +$code.=<<___; + vext.8 $t0#lo, $a, $a, #1 @ A1 + vmull.p8 $t0, $t0#lo, $b @ F = A1*B + vext.8 $r#lo, $b, $b, #1 @ B1 + vmull.p8 $r, $a, $r#lo @ E = A*B1 + vext.8 $t1#lo, $a, $a, #2 @ A2 + vmull.p8 $t1, $t1#lo, $b @ H = A2*B + vext.8 $t3#lo, $b, $b, #2 @ B2 + vmull.p8 $t3, $a, $t3#lo @ G = A*B2 + vext.8 $t2#lo, $a, $a, #3 @ A3 + veor $t0, $t0, $r @ L = E + F + vmull.p8 $t2, $t2#lo, $b @ J = A3*B + vext.8 $r#lo, $b, $b, #3 @ B3 + veor $t1, $t1, $t3 @ M = G + H + vmull.p8 $r, $a, $r#lo @ I = A*B3 + veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8 + vand $t0#hi, $t0#hi, $k48 + vext.8 $t3#lo, $b, $b, #4 @ B4 + veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16 + vand $t1#hi, $t1#hi, $k32 + vmull.p8 $t3, $a, $t3#lo @ K = A*B4 + veor $t2, $t2, $r @ N = I + J + veor $t0#lo, $t0#lo, $t0#hi + veor $t1#lo, $t1#lo, $t1#hi + veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24 + vand $t2#hi, $t2#hi, $k16 + vext.8 $t0, $t0, $t0, #15 + veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32 + vmov.i64 $t3#hi, #0 + vext.8 $t1, $t1, $t1, #14 + veor $t2#lo, $t2#lo, $t2#hi + vmull.p8 $r, $a, $b @ D = A*B + vext.8 $t3, $t3, $t3, #12 + vext.8 $t2, $t2, $t2, #13 + veor $t0, $t0, $t1 + veor $t2, $t2, $t3 + veor $r, $r, $t0 + veor $r, $r, $t2 +___ +} $code.=<<___; -#if __ARM_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a .fpu neon +.global gcm_init_neon +.type gcm_init_neon,%function +.align 4 +gcm_init_neon: + vld1.64 $IN#hi,[r1,:64]! @ load H + vmov.i8 $t0,#0xe1 + vld1.64 $IN#lo,[r1,:64] + vshl.i64 $t0#hi,#57 + vshr.u64 $t0#lo,#63 @ t0=0xc2....01 + vdup.8 $t1,$IN#hi[7] + vshr.u64 $Hlo,$IN#lo,#63 + vshr.s8 $t1,#7 @ broadcast carry bit + vshl.i64 $IN,$IN,#1 + vand $t0,$t0,$t1 + vorr $IN#hi,$Hlo @ H<<<=1 + veor $IN,$IN,$t0 @ twisted H + vstmia r0,{$IN} + + ret @ bx lr +.size gcm_init_neon,.-gcm_init_neon + .global gcm_gmult_neon .type gcm_gmult_neon,%function .align 4 gcm_gmult_neon: - sub $Htbl,#16 @ point at H in GCM128_CTX - vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi - vmov.i32 $mod,#0xe1 @ our irreducible polynomial - vld1.64 `&Dlo("$IN")`,[$Xi,:64]! - vshr.u64 $mod,#32 - vldmia $Htbl,{$Hhi-$Hlo} @ load H - veor $zero,$zero + vld1.64 $IN#hi,[$Xi,:64]! @ load Xi + vld1.64 $IN#lo,[$Xi,:64]! + vmov.i64 $k48,#0x0000ffffffffffff + vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H + vmov.i64 $k32,#0x00000000ffffffff #ifdef __ARMEL__ vrev64.8 $IN,$IN #endif - veor $Qpost,$Qpost - veor $R,$R - mov $cnt,#16 - veor $Z,$Z + vmov.i64 $k16,#0x000000000000ffff + veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing mov $len,#16 - veor $Zo,$Zo - vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte - b .Linner_neon + b .Lgmult_neon .size gcm_gmult_neon,.-gcm_gmult_neon .global gcm_ghash_neon .type gcm_ghash_neon,%function .align 4 gcm_ghash_neon: - vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi - vmov.i32 $mod,#0xe1 @ our irreducible polynomial - vld1.64 `&Dlo("$Z")`,[$Xi,:64]! - vshr.u64 $mod,#32 - vldmia $Xi,{$Hhi-$Hlo} @ load H - veor $zero,$zero - nop + vld1.64 $Xl#hi,[$Xi,:64]! @ load Xi + vld1.64 $Xl#lo,[$Xi,:64]! + vmov.i64 $k48,#0x0000ffffffffffff + vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H + vmov.i64 $k32,#0x00000000ffffffff #ifdef __ARMEL__ - vrev64.8 $Z,$Z + vrev64.8 $Xl,$Xl #endif -.Louter_neon: - vld1.64 `&Dhi($IN)`,[$inp]! @ load inp - veor $Qpost,$Qpost - vld1.64 `&Dlo($IN)`,[$inp]! - veor $R,$R - mov $cnt,#16 + vmov.i64 $k16,#0x000000000000ffff + veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing + +.Loop_neon: + vld1.64 $IN#hi,[$inp]! @ load inp + vld1.64 $IN#lo,[$inp]! #ifdef __ARMEL__ vrev64.8 $IN,$IN #endif - veor $Zo,$Zo - veor $IN,$Z @ inp^=Xi - veor $Z,$Z - vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte -.Linner_neon: - subs $cnt,$cnt,#1 - vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i] - vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i] - vext.8 $IN,$zero,#1 @ IN>>=8 - - veor $Z,$Qpost @ modulo-scheduled part - vshl.i64 `&Dlo("$R")`,#48 - vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte - veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")` - - veor `&Dhi("$Z")`,`&Dlo("$R")` - vuzp.8 $Qlo,$Qhi - vsli.8 $Zo,$T,#1 @ compose the "carry" byte - vext.8 $Z,$zero,#1 @ Z>>=8 - - vmull.p8 $R,$Zo,$mod @ "carry"·0xe1 - vshr.u8 $Zo,$T,#7 @ save Z's bottom bit - vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8 - veor $Z,$Qhi - bne .Linner_neon - - veor $Z,$Qpost @ modulo-scheduled artefact - vshl.i64 `&Dlo("$R")`,#48 - veor `&Dhi("$Z")`,`&Dlo("$R")` - - @ finalization, normalize Z:Zo - vand $Zo,$mod @ suffices to mask the bit - vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63 - vshl.i64 $Z,#1 + veor $IN,$Xl @ inp^=Xi +.Lgmult_neon: +___ + &clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo +$code.=<<___; + veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing +___ + &clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi) + &clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi +$code.=<<___; + veor $Xm,$Xm,$Xl @ Karatsuba post-processing + veor $Xm,$Xm,$Xh + veor $Xl#hi,$Xl#hi,$Xm#lo + veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result + + @ equivalent of reduction_avx from ghash-x86_64.pl + vshl.i64 $t1,$Xl,#57 @ 1st phase + vshl.i64 $t2,$Xl,#62 + veor $t2,$t2,$t1 @ + vshl.i64 $t1,$Xl,#63 + veor $t2, $t2, $t1 @ + veor $Xl#hi,$Xl#hi,$t2#lo @ + veor $Xh#lo,$Xh#lo,$t2#hi + + vshr.u64 $t2,$Xl,#1 @ 2nd phase + veor $Xh,$Xh,$Xl + veor $Xl,$Xl,$t2 @ + vshr.u64 $t2,$t2,#6 + vshr.u64 $Xl,$Xl,#1 @ + veor $Xl,$Xl,$Xh @ + veor $Xl,$Xl,$t2 @ + subs $len,#16 - vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1 - bne .Louter_neon + bne .Loop_neon #ifdef __ARMEL__ - vrev64.8 $Z,$Z + vrev64.8 $Xl,$Xl #endif sub $Xi,#16 - vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi - vst1.64 `&Dlo("$Z")`,[$Xi,:64] + vst1.64 $Xl#hi,[$Xi,:64]! @ write out Xi + vst1.64 $Xl#lo,[$Xi,:64] - bx lr + ret @ bx lr .size gcm_ghash_neon,.-gcm_ghash_neon #endif ___ @@ -423,7 +481,13 @@ $code.=<<___; .align 2 ___ -$code =~ s/\`([^\`]*)\`/eval $1/gem; -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 -print $code; +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or + s/\bret\b/bx lr/go or + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} close STDOUT; # enforce flush diff --git a/openssl/crypto/modes/asm/ghash-s390x.pl b/openssl/crypto/modes/asm/ghash-s390x.pl index 6a40d5d89..39096b423 100644 --- a/openssl/crypto/modes/asm/ghash-s390x.pl +++ b/openssl/crypto/modes/asm/ghash-s390x.pl @@ -186,13 +186,13 @@ $code.=<<___; sllg $rem1,$Zlo,3 xgr $Zlo,$tmp ngr $rem1,$x78 + sllg $tmp,$Zhi,60 j .Lghash_inner .align 16 .Lghash_inner: srlg $Zlo,$Zlo,4 - sllg $tmp,$Zhi,60 - xg $Zlo,8($nlo,$Htbl) srlg $Zhi,$Zhi,4 + xg $Zlo,8($nlo,$Htbl) llgc $xi,0($cnt,$Xi) xg $Zhi,0($nlo,$Htbl) sllg $nlo,$xi,4 @@ -213,9 +213,9 @@ $code.=<<___; sllg $rem1,$Zlo,3 xgr $Zlo,$tmp ngr $rem1,$x78 + sllg $tmp,$Zhi,60 brct $cnt,.Lghash_inner - sllg $tmp,$Zhi,60 srlg $Zlo,$Zlo,4 srlg $Zhi,$Zhi,4 xg $Zlo,8($nlo,$Htbl) diff --git a/openssl/crypto/modes/asm/ghash-sparcv9.pl b/openssl/crypto/modes/asm/ghash-sparcv9.pl index 70e7b044a..0365e0f1f 100644 --- a/openssl/crypto/modes/asm/ghash-sparcv9.pl +++ b/openssl/crypto/modes/asm/ghash-sparcv9.pl @@ -36,6 +36,15 @@ # references to input data and Z.hi updates to achieve 12 cycles # timing. To anchor to something else, sha1-sparcv9.pl spends 11.6 # cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1. +# +# October 2012 +# +# Add VIS3 lookup-table-free implementation using polynomial +# multiplication xmulx[hi] and extended addition addxc[cc] +# instructions. 4.52/7.63x improvement on T3/T4 or in absolute +# terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark +# saturates at ~15.5x single-process result on 8-core processor, +# or ~20.5GBps per 2.85GHz socket. $bits=32; for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } @@ -66,6 +75,10 @@ $Htbl="%i1"; $inp="%i2"; $len="%i3"; +$code.=<<___ if ($bits==64); +.register %g2,#scratch +.register %g3,#scratch +___ $code.=<<___; .section ".text",#alloc,#execinstr @@ -321,10 +334,238 @@ gcm_gmult_4bit: restore .type gcm_gmult_4bit,#function .size gcm_gmult_4bit,(.-gcm_gmult_4bit) -.asciz "GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" +___ + +{{{ +# Straightforward 128x128-bit multiplication using Karatsuba algorithm +# followed by pair of 64-bit reductions [with a shortcut in first one, +# which allowed to break dependency between reductions and remove one +# multiplication from critical path]. While it might be suboptimal +# with regard to sheer number of multiplications, other methods [such +# as aggregate reduction] would require more 64-bit registers, which +# we don't have in 32-bit application context. + +($Xip,$Htable,$inp,$len)=map("%i$_",(0..3)); + +($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)= + (map("%o$_",(0..5,7)),map("%g$_",(1..5))); + +($shl,$shr)=map("%l$_",(0..7)); + +# For details regarding "twisted H" see ghash-x86.pl. +$code.=<<___; +.globl gcm_init_vis3 +.align 32 +gcm_init_vis3: + save %sp,-$frame,%sp + + ldx [%i1+0],$Hhi + ldx [%i1+8],$Hlo + mov 0xE1,$Xhi + mov 1,$Xlo + sllx $Xhi,57,$Xhi + srax $Hhi,63,$C0 ! broadcast carry + addcc $Hlo,$Hlo,$Hlo ! H<<=1 + addxc $Hhi,$Hhi,$Hhi + and $C0,$Xlo,$Xlo + and $C0,$Xhi,$Xhi + xor $Xlo,$Hlo,$Hlo + xor $Xhi,$Hhi,$Hhi + stx $Hlo,[%i0+8] ! save twisted H + stx $Hhi,[%i0+0] + + sethi %hi(0xA0406080),$V + sethi %hi(0x20C0E000),%l0 + or $V,%lo(0xA0406080),$V + or %l0,%lo(0x20C0E000),%l0 + sllx $V,32,$V + or %l0,$V,$V ! (0xE0·i)&0xff=0xA040608020C0E000 + stx $V,[%i0+16] + + ret + restore +.type gcm_init_vis3,#function +.size gcm_init_vis3,.-gcm_init_vis3 + +.globl gcm_gmult_vis3 +.align 32 +gcm_gmult_vis3: + save %sp,-$frame,%sp + + ldx [$Xip+8],$Xlo ! load Xi + ldx [$Xip+0],$Xhi + ldx [$Htable+8],$Hlo ! load twisted H + ldx [$Htable+0],$Hhi + + mov 0xE1,%l7 + sllx %l7,57,$xE1 ! 57 is not a typo + ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000 + + xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing + xmulx $Xlo,$Hlo,$C0 + xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing + xmulx $C2,$Hhl,$C1 + xmulxhi $Xlo,$Hlo,$Xlo + xmulxhi $C2,$Hhl,$C2 + xmulxhi $Xhi,$Hhi,$C3 + xmulx $Xhi,$Hhi,$Xhi + + sll $C0,3,$sqr + srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)] + xor $C0,$sqr,$sqr + sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f] + + xor $C0,$C1,$C1 ! Karatsuba post-processing + xor $Xlo,$C2,$C2 + xor $sqr,$Xlo,$Xlo ! real destination is $C1 + xor $C3,$C2,$C2 + xor $Xlo,$C1,$C1 + xor $Xhi,$C2,$C2 + xor $Xhi,$C1,$C1 + + xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56 + xor $C0,$C2,$C2 + xmulx $C1,$xE1,$C0 + xor $C1,$C3,$C3 + xmulxhi $C1,$xE1,$C1 + + xor $Xlo,$C2,$C2 + xor $C0,$C2,$C2 + xor $C1,$C3,$C3 + + stx $C2,[$Xip+8] ! save Xi + stx $C3,[$Xip+0] + + ret + restore +.type gcm_gmult_vis3,#function +.size gcm_gmult_vis3,.-gcm_gmult_vis3 + +.globl gcm_ghash_vis3 +.align 32 +gcm_ghash_vis3: + save %sp,-$frame,%sp + + ldx [$Xip+8],$C2 ! load Xi + ldx [$Xip+0],$C3 + ldx [$Htable+8],$Hlo ! load twisted H + ldx [$Htable+0],$Hhi + + mov 0xE1,%l7 + sllx %l7,57,$xE1 ! 57 is not a typo + ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000 + + and $inp,7,$shl + andn $inp,7,$inp + sll $shl,3,$shl + prefetch [$inp+63], 20 + sub %g0,$shl,$shr + + xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing +.Loop: + ldx [$inp+8],$Xlo + brz,pt $shl,1f + ldx [$inp+0],$Xhi + + ldx [$inp+16],$C1 ! align data + srlx $Xlo,$shr,$C0 + sllx $Xlo,$shl,$Xlo + sllx $Xhi,$shl,$Xhi + srlx $C1,$shr,$C1 + or $C0,$Xhi,$Xhi + or $C1,$Xlo,$Xlo +1: + add $inp,16,$inp + sub $len,16,$len + xor $C2,$Xlo,$Xlo + xor $C3,$Xhi,$Xhi + prefetch [$inp+63], 20 + + xmulx $Xlo,$Hlo,$C0 + xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing + xmulx $C2,$Hhl,$C1 + xmulxhi $Xlo,$Hlo,$Xlo + xmulxhi $C2,$Hhl,$C2 + xmulxhi $Xhi,$Hhi,$C3 + xmulx $Xhi,$Hhi,$Xhi + + sll $C0,3,$sqr + srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)] + xor $C0,$sqr,$sqr + sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f] + + xor $C0,$C1,$C1 ! Karatsuba post-processing + xor $Xlo,$C2,$C2 + xor $sqr,$Xlo,$Xlo ! real destination is $C1 + xor $C3,$C2,$C2 + xor $Xlo,$C1,$C1 + xor $Xhi,$C2,$C2 + xor $Xhi,$C1,$C1 + + xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56 + xor $C0,$C2,$C2 + xmulx $C1,$xE1,$C0 + xor $C1,$C3,$C3 + xmulxhi $C1,$xE1,$C1 + + xor $Xlo,$C2,$C2 + xor $C0,$C2,$C2 + brnz,pt $len,.Loop + xor $C1,$C3,$C3 + + stx $C2,[$Xip+8] ! save Xi + stx $C3,[$Xip+0] + + ret + restore +.type gcm_ghash_vis3,#function +.size gcm_ghash_vis3,.-gcm_ghash_vis3 +___ +}}} +$code.=<<___; +.asciz "GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>" .align 4 ___ -$code =~ s/\`([^\`]*)\`/eval $1/gem; -print $code; + +# Purpose of these subroutines is to explicitly encode VIS instructions, +# so that one can compile the module without having to specify VIS +# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. +# Idea is to reserve for option to produce "universal" binary and let +# programmer detect if current CPU is VIS capable at run-time. +sub unvis3 { +my ($mnemonic,$rs1,$rs2,$rd)=@_; +my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 ); +my ($ref,$opf); +my %visopf = ( "addxc" => 0x011, + "addxccc" => 0x013, + "xmulx" => 0x115, + "xmulxhi" => 0x116 ); + + $ref = "$mnemonic\t$rs1,$rs2,$rd"; + + if ($opf=$visopf{$mnemonic}) { + foreach ($rs1,$rs2,$rd) { + return $ref if (!/%([goli])([0-9])/); + $_=$bias{$1}+$2; + } + + return sprintf ".word\t0x%08x !%s", + 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2, + $ref; + } else { + return $ref; + } +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/ + &unvis3($1,$2,$3,$4) + /ge; + + print $_,"\n"; +} + close STDOUT; diff --git a/openssl/crypto/modes/asm/ghash-x86.pl b/openssl/crypto/modes/asm/ghash-x86.pl index 83c727e07..23a5527b3 100644 --- a/openssl/crypto/modes/asm/ghash-x86.pl +++ b/openssl/crypto/modes/asm/ghash-x86.pl @@ -12,25 +12,27 @@ # The module implements "4-bit" GCM GHASH function and underlying # single multiplication operation in GF(2^128). "4-bit" means that it # uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two -# code paths: vanilla x86 and vanilla MMX. Former will be executed on -# 486 and Pentium, latter on all others. MMX GHASH features so called +# code paths: vanilla x86 and vanilla SSE. Former will be executed on +# 486 and Pentium, latter on all others. SSE GHASH features so called # "528B" variant of "4-bit" method utilizing additional 256+16 bytes # of per-key storage [+512 bytes shared table]. Performance results # are for streamed GHASH subroutine and are expressed in cycles per # processed byte, less is better: # -# gcc 2.95.3(*) MMX assembler x86 assembler +# gcc 2.95.3(*) SSE assembler x86 assembler # # Pentium 105/111(**) - 50 # PIII 68 /75 12.2 24 # P4 125/125 17.8 84(***) # Opteron 66 /70 10.1 30 # Core2 54 /67 8.4 18 +# Atom 105/105 16.8 53 +# VIA Nano 69 /71 13.0 27 # # (*) gcc 3.4.x was observed to generate few percent slower code, # which is one of reasons why 2.95.3 results were chosen, # another reason is lack of 3.4.x results for older CPUs; -# comparison with MMX results is not completely fair, because C +# comparison with SSE results is not completely fair, because C # results are for vanilla "256B" implementation, while # assembler results are for "528B";-) # (**) second number is result for code compiled with -fPIC flag, @@ -40,8 +42,8 @@ # # To summarize, it's >2-5 times faster than gcc-generated code. To # anchor it to something else SHA1 assembler processes one byte in -# 11-13 cycles on contemporary x86 cores. As for choice of MMX in -# particular, see comment at the end of the file... +# ~7 cycles on contemporary x86 cores. As for choice of MMX/SSE +# in particular, see comment at the end of the file... # May 2010 # @@ -113,6 +115,16 @@ # similar manner resulted in almost 20% degradation on Sandy Bridge, # where original 64-bit code processes one byte in 1.95 cycles. +##################################################################### +# For reference, AMD Bulldozer processes one byte in 1.98 cycles in +# 32-bit mode and 1.89 in 64-bit. + +# February 2013 +# +# Overhaul: aggregate Karatsuba post-processing, improve ILP in +# reduction_alg9. Resulting performance is 1.96 cycles per byte on +# Westmere, 1.95 - on Sandy/Ivy Bridge, 1.76 - on Bulldozer. + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; @@ -822,17 +834,18 @@ $len="ebx"; &static_label("bswap"); sub clmul64x64_T2 { # minimal "register" pressure -my ($Xhi,$Xi,$Hkey)=@_; +my ($Xhi,$Xi,$Hkey,$HK)=@_; &movdqa ($Xhi,$Xi); # &pshufd ($T1,$Xi,0b01001110); - &pshufd ($T2,$Hkey,0b01001110); + &pshufd ($T2,$Hkey,0b01001110) if (!defined($HK)); &pxor ($T1,$Xi); # - &pxor ($T2,$Hkey); + &pxor ($T2,$Hkey) if (!defined($HK)); + $HK=$T2 if (!defined($HK)); &pclmulqdq ($Xi,$Hkey,0x00); ####### &pclmulqdq ($Xhi,$Hkey,0x11); ####### - &pclmulqdq ($T1,$T2,0x00); ####### + &pclmulqdq ($T1,$HK,0x00); ####### &xorps ($T1,$Xi); # &xorps ($T1,$Xhi); # @@ -879,31 +892,32 @@ if (1) { # Algorithm 9 with <<1 twist. # below. Algorithm 9 was therefore chosen for # further optimization... -sub reduction_alg9 { # 17/13 times faster than Intel version +sub reduction_alg9 { # 17/11 times faster than Intel version my ($Xhi,$Xi) = @_; # 1st phase - &movdqa ($T1,$Xi); # + &movdqa ($T2,$Xi); # + &movdqa ($T1,$Xi); + &psllq ($Xi,5); + &pxor ($T1,$Xi); # &psllq ($Xi,1); &pxor ($Xi,$T1); # - &psllq ($Xi,5); # - &pxor ($Xi,$T1); # &psllq ($Xi,57); # - &movdqa ($T2,$Xi); # + &movdqa ($T1,$Xi); # &pslldq ($Xi,8); - &psrldq ($T2,8); # - &pxor ($Xi,$T1); - &pxor ($Xhi,$T2); # + &psrldq ($T1,8); # + &pxor ($Xi,$T2); + &pxor ($Xhi,$T1); # # 2nd phase &movdqa ($T2,$Xi); + &psrlq ($Xi,1); + &pxor ($Xhi,$T2); # + &pxor ($T2,$Xi); &psrlq ($Xi,5); &pxor ($Xi,$T2); # &psrlq ($Xi,1); # - &pxor ($Xi,$T2); # - &pxor ($T2,$Xhi); - &psrlq ($Xi,1); # - &pxor ($Xi,$T2); # + &pxor ($Xi,$Xhi) # } &function_begin_B("gcm_init_clmul"); @@ -937,8 +951,14 @@ my ($Xhi,$Xi) = @_; &clmul64x64_T2 ($Xhi,$Xi,$Hkey); &reduction_alg9 ($Xhi,$Xi); + &pshufd ($T1,$Hkey,0b01001110); + &pshufd ($T2,$Xi,0b01001110); + &pxor ($T1,$Hkey); # Karatsuba pre-processing &movdqu (&QWP(0,$Htbl),$Hkey); # save H + &pxor ($T2,$Xi); # Karatsuba pre-processing &movdqu (&QWP(16,$Htbl),$Xi); # save H^2 + &palignr ($T2,$T1,8); # low part is H.lo^H.hi + &movdqu (&QWP(32,$Htbl),$T2); # save Karatsuba "salt" &ret (); &function_end_B("gcm_init_clmul"); @@ -956,8 +976,9 @@ my ($Xhi,$Xi) = @_; &movdqa ($T3,&QWP(0,$const)); &movups ($Hkey,&QWP(0,$Htbl)); &pshufb ($Xi,$T3); + &movups ($T2,&QWP(32,$Htbl)); - &clmul64x64_T2 ($Xhi,$Xi,$Hkey); + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2); &reduction_alg9 ($Xhi,$Xi); &pshufb ($Xi,$T3); @@ -994,79 +1015,109 @@ my ($Xhi,$Xi) = @_; &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 &pshufb ($T1,$T3); &pshufb ($Xn,$T3); + &movdqu ($T3,&QWP(32,$Htbl)); &pxor ($Xi,$T1); # Ii+Xi - &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1 + &pshufd ($T1,$Xn,0b01001110); # H*Ii+1 + &movdqa ($Xhn,$Xn); + &pxor ($T1,$Xn); # + &lea ($inp,&DWP(32,$inp)); # i+=2 + + &pclmulqdq ($Xn,$Hkey,0x00); ####### + &pclmulqdq ($Xhn,$Hkey,0x11); ####### + &pclmulqdq ($T1,$T3,0x00); ####### &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 + &nop (); - &lea ($inp,&DWP(32,$inp)); # i+=2 &sub ($len,0x20); &jbe (&label("even_tail")); + &jmp (&label("mod_loop")); -&set_label("mod_loop"); - &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) - &movdqu ($T1,&QWP(0,$inp)); # Ii - &movups ($Hkey,&QWP(0,$Htbl)); # load H +&set_label("mod_loop",32); + &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi) + &movdqa ($Xhi,$Xi); + &pxor ($T2,$Xi); # + &nop (); - &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) - &pxor ($Xhi,$Xhn); + &pclmulqdq ($Xi,$Hkey,0x00); ####### + &pclmulqdq ($Xhi,$Hkey,0x11); ####### + &pclmulqdq ($T2,$T3,0x10); ####### + &movups ($Hkey,&QWP(0,$Htbl)); # load H - &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 - &pshufb ($T1,$T3); - &pshufb ($Xn,$T3); + &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &movdqa ($T3,&QWP(0,$const)); + &xorps ($Xhi,$Xhn); + &movdqu ($Xhn,&QWP(0,$inp)); # Ii + &pxor ($T1,$Xi); # aggregated Karatsuba post-processing + &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 + &pxor ($T1,$Xhi); # - &movdqa ($T3,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1 - &movdqa ($Xhn,$Xn); - &pxor ($Xhi,$T1); # "Ii+Xi", consume early + &pshufb ($Xhn,$T3); + &pxor ($T2,$T1); # - &movdqa ($T1,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase + &movdqa ($T1,$T2); # + &psrldq ($T2,8); + &pslldq ($T1,8); # + &pxor ($Xhi,$T2); + &pxor ($Xi,$T1); # + &pshufb ($Xn,$T3); + &pxor ($Xhi,$Xhn); # "Ii+Xi", consume early + + &movdqa ($Xhn,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1 + &movdqa ($T2,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase + &movdqa ($T1,$Xi); + &psllq ($Xi,5); + &pxor ($T1,$Xi); # &psllq ($Xi,1); &pxor ($Xi,$T1); # - &psllq ($Xi,5); # - &pxor ($Xi,$T1); # &pclmulqdq ($Xn,$Hkey,0x00); ####### + &movups ($T3,&QWP(32,$Htbl)); &psllq ($Xi,57); # - &movdqa ($T2,$Xi); # + &movdqa ($T1,$Xi); # &pslldq ($Xi,8); - &psrldq ($T2,8); # - &pxor ($Xi,$T1); - &pshufd ($T1,$T3,0b01001110); + &psrldq ($T1,8); # + &pxor ($Xi,$T2); + &pxor ($Xhi,$T1); # + &pshufd ($T1,$Xhn,0b01001110); + &movdqa ($T2,$Xi); # 2nd phase + &psrlq ($Xi,1); + &pxor ($T1,$Xhn); &pxor ($Xhi,$T2); # - &pxor ($T1,$T3); - &pshufd ($T3,$Hkey,0b01001110); - &pxor ($T3,$Hkey); # - &pclmulqdq ($Xhn,$Hkey,0x11); ####### - &movdqa ($T2,$Xi); # 2nd phase + &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 + &pxor ($T2,$Xi); &psrlq ($Xi,5); &pxor ($Xi,$T2); # &psrlq ($Xi,1); # - &pxor ($Xi,$T2); # - &pxor ($T2,$Xhi); - &psrlq ($Xi,1); # - &pxor ($Xi,$T2); # - + &pxor ($Xi,$Xhi) # &pclmulqdq ($T1,$T3,0x00); ####### - &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 - &xorps ($T1,$Xn); # - &xorps ($T1,$Xhn); # - - &movdqa ($T3,$T1); # - &psrldq ($T1,8); - &pslldq ($T3,8); # - &pxor ($Xhn,$T1); - &pxor ($Xn,$T3); # - &movdqa ($T3,&QWP(0,$const)); &lea ($inp,&DWP(32,$inp)); &sub ($len,0x20); &ja (&label("mod_loop")); &set_label("even_tail"); - &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) + &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi) + &movdqa ($Xhi,$Xi); + &pxor ($T2,$Xi); # - &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) - &pxor ($Xhi,$Xhn); + &pclmulqdq ($Xi,$Hkey,0x00); ####### + &pclmulqdq ($Xhi,$Hkey,0x11); ####### + &pclmulqdq ($T2,$T3,0x10); ####### + &movdqa ($T3,&QWP(0,$const)); + + &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &xorps ($Xhi,$Xhn); + &pxor ($T1,$Xi); # aggregated Karatsuba post-processing + &pxor ($T1,$Xhi); # + + &pxor ($T2,$T1); # + + &movdqa ($T1,$T2); # + &psrldq ($T2,8); + &pslldq ($T1,8); # + &pxor ($Xhi,$T2); + &pxor ($Xi,$T1); # &reduction_alg9 ($Xhi,$Xi); @@ -1273,13 +1324,6 @@ my ($Xhi,$Xi)=@_; &set_label("bswap",64); &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial -}} # $sse2 - -&set_label("rem_4bit",64); - &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S); - &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S); - &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S); - &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S); &set_label("rem_8bit",64); &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E); &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E); @@ -1313,6 +1357,13 @@ my ($Xhi,$Xi)=@_; &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E); &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE); &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE); +}} # $sse2 + +&set_label("rem_4bit",64); + &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S); + &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S); + &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S); + &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S); }}} # !$x86only &asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>"); diff --git a/openssl/crypto/modes/asm/ghash-x86_64.pl b/openssl/crypto/modes/asm/ghash-x86_64.pl index 38d779edb..6e656ca13 100644 --- a/openssl/crypto/modes/asm/ghash-x86_64.pl +++ b/openssl/crypto/modes/asm/ghash-x86_64.pl @@ -22,6 +22,8 @@ # P4 28.6 14.0 +100% # Opteron 19.3 7.7 +150% # Core2 17.8 8.1(**) +120% +# Atom 31.6 16.8 +88% +# VIA Nano 21.8 10.1 +115% # # (*) comparison is not completely fair, because C results are # for vanilla "256B" implementation, while assembler results @@ -39,6 +41,44 @@ # providing access to a Westmere-based system on behalf of Intel # Open Source Technology Centre. +# December 2012 +# +# Overhaul: aggregate Karatsuba post-processing, improve ILP in +# reduction_alg9, increase reduction aggregate factor to 4x. As for +# the latter. ghash-x86.pl discusses that it makes lesser sense to +# increase aggregate factor. Then why increase here? Critical path +# consists of 3 independent pclmulqdq instructions, Karatsuba post- +# processing and reduction. "On top" of this we lay down aggregated +# multiplication operations, triplets of independent pclmulqdq's. As +# issue rate for pclmulqdq is limited, it makes lesser sense to +# aggregate more multiplications than it takes to perform remaining +# non-multiplication operations. 2x is near-optimal coefficient for +# contemporary Intel CPUs (therefore modest improvement coefficient), +# but not for Bulldozer. Latter is because logical SIMD operations +# are twice as slow in comparison to Intel, so that critical path is +# longer. A CPU with higher pclmulqdq issue rate would also benefit +# from higher aggregate factor... +# +# Westmere 1.78(+13%) +# Sandy Bridge 1.80(+8%) +# Ivy Bridge 1.80(+7%) +# Haswell 0.55(+93%) (if system doesn't support AVX) +# Broadwell 0.45(+110%)(if system doesn't support AVX) +# Bulldozer 1.49(+27%) +# Silvermont 2.88(+13%) + +# March 2013 +# +# ... 8x aggregate factor AVX code path is using reduction algorithm +# suggested by Shay Gueron[1]. Even though contemporary AVX-capable +# CPUs such as Sandy and Ivy Bridge can execute it, the code performs +# sub-optimally in comparison to above mentioned version. But thanks +# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that +# it performs in 0.41 cycles per byte on Haswell processor, and in +# 0.29 on Broadwell. +# +# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest + $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } @@ -50,9 +90,30 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); +} + +if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); +} + open OUT,"| \"$^X\" $xlate $flavour $output"; *STDOUT=*OUT; +$do4xaggr=1; + # common register layout $nlo="%rax"; $nhi="%rbx"; @@ -160,6 +221,7 @@ ___ $code=<<___; .text +.extern OPENSSL_ia32cap_P .globl gcm_gmult_4bit .type gcm_gmult_4bit,\@function,2 @@ -352,19 +414,27 @@ ___ ($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5"); sub clmul64x64_T2 { # minimal register pressure -my ($Xhi,$Xi,$Hkey,$modulo)=@_; +my ($Xhi,$Xi,$Hkey,$HK)=@_; -$code.=<<___ if (!defined($modulo)); +if (!defined($HK)) { $HK = $T2; +$code.=<<___; movdqa $Xi,$Xhi # pshufd \$0b01001110,$Xi,$T1 pshufd \$0b01001110,$Hkey,$T2 pxor $Xi,$T1 # pxor $Hkey,$T2 ___ +} else { +$code.=<<___; + movdqa $Xi,$Xhi # + pshufd \$0b01001110,$Xi,$T1 + pxor $Xi,$T1 # +___ +} $code.=<<___; pclmulqdq \$0x00,$Hkey,$Xi ####### pclmulqdq \$0x11,$Hkey,$Xhi ####### - pclmulqdq \$0x00,$T2,$T1 ####### + pclmulqdq \$0x00,$HK,$T1 ####### pxor $Xi,$T1 # pxor $Xhi,$T1 # @@ -376,42 +446,53 @@ $code.=<<___; ___ } -sub reduction_alg9 { # 17/13 times faster than Intel version +sub reduction_alg9 { # 17/11 times faster than Intel version my ($Xhi,$Xi) = @_; $code.=<<___; # 1st phase - movdqa $Xi,$T1 # + movdqa $Xi,$T2 # + movdqa $Xi,$T1 + psllq \$5,$Xi + pxor $Xi,$T1 # psllq \$1,$Xi pxor $T1,$Xi # - psllq \$5,$Xi # - pxor $T1,$Xi # psllq \$57,$Xi # - movdqa $Xi,$T2 # + movdqa $Xi,$T1 # pslldq \$8,$Xi - psrldq \$8,$T2 # - pxor $T1,$Xi - pxor $T2,$Xhi # + psrldq \$8,$T1 # + pxor $T2,$Xi + pxor $T1,$Xhi # # 2nd phase movdqa $Xi,$T2 + psrlq \$1,$Xi + pxor $T2,$Xhi # + pxor $Xi,$T2 psrlq \$5,$Xi pxor $T2,$Xi # psrlq \$1,$Xi # - pxor $T2,$Xi # - pxor $Xhi,$T2 - psrlq \$1,$Xi # - pxor $T2,$Xi # + pxor $Xhi,$Xi # ___ } { my ($Htbl,$Xip)=@_4args; + my $HK="%xmm6"; $code.=<<___; .globl gcm_init_clmul .type gcm_init_clmul,\@abi-omnipotent .align 16 gcm_init_clmul: +.L_init_clmul: +___ +$code.=<<___ if ($win64); +.LSEH_begin_gcm_init_clmul: + # I can't trust assembler to use specific encoding:-( + .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp + .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) +___ +$code.=<<___; movdqu ($Xip),$Hkey pshufd \$0b01001110,$Hkey,$Hkey # dword swap @@ -430,13 +511,47 @@ gcm_init_clmul: pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial # calculate H^2 + pshufd \$0b01001110,$Hkey,$HK movdqa $Hkey,$Xi + pxor $Hkey,$HK ___ - &clmul64x64_T2 ($Xhi,$Xi,$Hkey); + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); &reduction_alg9 ($Xhi,$Xi); $code.=<<___; - movdqu $Hkey,($Htbl) # save H - movdqu $Xi,16($Htbl) # save H^2 + pshufd \$0b01001110,$Hkey,$T1 + pshufd \$0b01001110,$Xi,$T2 + pxor $Hkey,$T1 # Karatsuba pre-processing + movdqu $Hkey,0x00($Htbl) # save H + pxor $Xi,$T2 # Karatsuba pre-processing + movdqu $Xi,0x10($Htbl) # save H^2 + palignr \$8,$T1,$T2 # low part is H.lo^H.hi... + movdqu $T2,0x20($Htbl) # save Karatsuba "salt" +___ +if ($do4xaggr) { + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3 + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + movdqa $Xi,$T3 +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4 + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + pshufd \$0b01001110,$T3,$T1 + pshufd \$0b01001110,$Xi,$T2 + pxor $T3,$T1 # Karatsuba pre-processing + movdqu $T3,0x30($Htbl) # save H^3 + pxor $Xi,$T2 # Karatsuba pre-processing + movdqu $Xi,0x40($Htbl) # save H^4 + palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi... + movdqu $T2,0x50($Htbl) # save Karatsuba "salt" +___ +} +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + lea 0x18(%rsp),%rsp +.LSEH_end_gcm_init_clmul: +___ +$code.=<<___; ret .size gcm_init_clmul,.-gcm_init_clmul ___ @@ -449,13 +564,38 @@ $code.=<<___; .type gcm_gmult_clmul,\@abi-omnipotent .align 16 gcm_gmult_clmul: +.L_gmult_clmul: movdqu ($Xip),$Xi movdqa .Lbswap_mask(%rip),$T3 movdqu ($Htbl),$Hkey + movdqu 0x20($Htbl),$T2 pshufb $T3,$Xi ___ - &clmul64x64_T2 ($Xhi,$Xi,$Hkey); - &reduction_alg9 ($Xhi,$Xi); + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2); +$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0)); + # experimental alternative. special thing about is that there + # no dependency between the two multiplications... + mov \$`0xE1<<1`,%eax + mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff + mov \$0x07,%r11d + movq %rax,$T1 + movq %r10,$T2 + movq %r11,$T3 # borrow $T3 + pand $Xi,$T3 + pshufb $T3,$T2 # ($Xi&7)·0xE0 + movq %rax,$T3 + pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1) + pxor $Xi,$T2 + pslldq \$15,$T2 + paddd $T2,$T2 # <<(64+56+1) + pxor $T2,$Xi + pclmulqdq \$0x01,$T3,$Xi + movdqa .Lbswap_mask(%rip),$T3 # reload $T3 + psrldq \$1,$T1 + pxor $T1,$Xhi + pslldq \$7,$Xi + pxor $Xhi,$Xi +___ $code.=<<___; pshufb $T3,$Xi movdqu $Xi,($Xip) @@ -465,129 +605,327 @@ ___ } { my ($Xip,$Htbl,$inp,$len)=@_4args; - my $Xn="%xmm6"; - my $Xhn="%xmm7"; - my $Hkey2="%xmm8"; - my $T1n="%xmm9"; - my $T2n="%xmm10"; + my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7)); + my ($T1,$T2,$T3)=map("%xmm$_",(8..10)); $code.=<<___; .globl gcm_ghash_clmul .type gcm_ghash_clmul,\@abi-omnipotent -.align 16 +.align 32 gcm_ghash_clmul: +.L_ghash_clmul: ___ $code.=<<___ if ($win64); + lea -0x88(%rsp),%rax .LSEH_begin_gcm_ghash_clmul: # I can't trust assembler to use specific encoding:-( - .byte 0x48,0x83,0xec,0x58 #sub \$0x58,%rsp - .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) - .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) - .byte 0x44,0x0f,0x29,0x44,0x24,0x20 #movaps %xmm8,0x20(%rsp) - .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 #movaps %xmm9,0x30(%rsp) - .byte 0x44,0x0f,0x29,0x54,0x24,0x40 #movaps %xmm10,0x40(%rsp) + .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp + .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax) + .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax) + .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax) + .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax) + .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax) + .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax) + .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax) + .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax) + .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax) + .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax) ___ $code.=<<___; movdqa .Lbswap_mask(%rip),$T3 movdqu ($Xip),$Xi movdqu ($Htbl),$Hkey + movdqu 0x20($Htbl),$HK pshufb $T3,$Xi sub \$0x10,$len jz .Lodd_tail - movdqu 16($Htbl),$Hkey2 + movdqu 0x10($Htbl),$Hkey2 +___ +if ($do4xaggr) { +my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15)); + +$code.=<<___; + mov OPENSSL_ia32cap_P+4(%rip),%eax + cmp \$0x30,$len + jb .Lskip4x + + and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE + cmp \$`1<<22`,%eax # check for MOVBE without XSAVE + je .Lskip4x + + sub \$0x30,$len + mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff + movdqu 0x30($Htbl),$Hkey3 + movdqu 0x40($Htbl),$Hkey4 + + ####### + # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P + # + movdqu 0x30($inp),$Xln + movdqu 0x20($inp),$Xl + pshufb $T3,$Xln + pshufb $T3,$Xl + movdqa $Xln,$Xhn + pshufd \$0b01001110,$Xln,$Xmn + pxor $Xln,$Xmn + pclmulqdq \$0x00,$Hkey,$Xln + pclmulqdq \$0x11,$Hkey,$Xhn + pclmulqdq \$0x00,$HK,$Xmn + + movdqa $Xl,$Xh + pshufd \$0b01001110,$Xl,$Xm + pxor $Xl,$Xm + pclmulqdq \$0x00,$Hkey2,$Xl + pclmulqdq \$0x11,$Hkey2,$Xh + pclmulqdq \$0x10,$HK,$Xm + xorps $Xl,$Xln + xorps $Xh,$Xhn + movups 0x50($Htbl),$HK + xorps $Xm,$Xmn + + movdqu 0x10($inp),$Xl + movdqu 0($inp),$T1 + pshufb $T3,$Xl + pshufb $T3,$T1 + movdqa $Xl,$Xh + pshufd \$0b01001110,$Xl,$Xm + pxor $T1,$Xi + pxor $Xl,$Xm + pclmulqdq \$0x00,$Hkey3,$Xl + movdqa $Xi,$Xhi + pshufd \$0b01001110,$Xi,$T1 + pxor $Xi,$T1 + pclmulqdq \$0x11,$Hkey3,$Xh + pclmulqdq \$0x00,$HK,$Xm + xorps $Xl,$Xln + xorps $Xh,$Xhn + + lea 0x40($inp),$inp + sub \$0x40,$len + jc .Ltail4x + + jmp .Lmod4_loop +.align 32 +.Lmod4_loop: + pclmulqdq \$0x00,$Hkey4,$Xi + xorps $Xm,$Xmn + movdqu 0x30($inp),$Xl + pshufb $T3,$Xl + pclmulqdq \$0x11,$Hkey4,$Xhi + xorps $Xln,$Xi + movdqu 0x20($inp),$Xln + movdqa $Xl,$Xh + pclmulqdq \$0x10,$HK,$T1 + pshufd \$0b01001110,$Xl,$Xm + xorps $Xhn,$Xhi + pxor $Xl,$Xm + pshufb $T3,$Xln + movups 0x20($Htbl),$HK + xorps $Xmn,$T1 + pclmulqdq \$0x00,$Hkey,$Xl + pshufd \$0b01001110,$Xln,$Xmn + + pxor $Xi,$T1 # aggregated Karatsuba post-processing + movdqa $Xln,$Xhn + pxor $Xhi,$T1 # + pxor $Xln,$Xmn + movdqa $T1,$T2 # + pclmulqdq \$0x11,$Hkey,$Xh + pslldq \$8,$T1 + psrldq \$8,$T2 # + pxor $T1,$Xi + movdqa .L7_mask(%rip),$T1 + pxor $T2,$Xhi # + movq %rax,$T2 + + pand $Xi,$T1 # 1st phase + pshufb $T1,$T2 # + pxor $Xi,$T2 # + pclmulqdq \$0x00,$HK,$Xm + psllq \$57,$T2 # + movdqa $T2,$T1 # + pslldq \$8,$T2 + pclmulqdq \$0x00,$Hkey2,$Xln + psrldq \$8,$T1 # + pxor $T2,$Xi + pxor $T1,$Xhi # + movdqu 0($inp),$T1 + + movdqa $Xi,$T2 # 2nd phase + psrlq \$1,$Xi + pclmulqdq \$0x11,$Hkey2,$Xhn + xorps $Xl,$Xln + movdqu 0x10($inp),$Xl + pshufb $T3,$Xl + pclmulqdq \$0x10,$HK,$Xmn + xorps $Xh,$Xhn + movups 0x50($Htbl),$HK + pshufb $T3,$T1 + pxor $T2,$Xhi # + pxor $Xi,$T2 + psrlq \$5,$Xi + + movdqa $Xl,$Xh + pxor $Xm,$Xmn + pshufd \$0b01001110,$Xl,$Xm + pxor $T2,$Xi # + pxor $T1,$Xhi + pxor $Xl,$Xm + pclmulqdq \$0x00,$Hkey3,$Xl + psrlq \$1,$Xi # + pxor $Xhi,$Xi # + movdqa $Xi,$Xhi + pclmulqdq \$0x11,$Hkey3,$Xh + xorps $Xl,$Xln + pshufd \$0b01001110,$Xi,$T1 + pxor $Xi,$T1 + + pclmulqdq \$0x00,$HK,$Xm + xorps $Xh,$Xhn + + lea 0x40($inp),$inp + sub \$0x40,$len + jnc .Lmod4_loop + +.Ltail4x: + pclmulqdq \$0x00,$Hkey4,$Xi + pclmulqdq \$0x11,$Hkey4,$Xhi + pclmulqdq \$0x10,$HK,$T1 + xorps $Xm,$Xmn + xorps $Xln,$Xi + xorps $Xhn,$Xhi + pxor $Xi,$Xhi # aggregated Karatsuba post-processing + pxor $Xmn,$T1 + + pxor $Xhi,$T1 # + pxor $Xi,$Xhi + + movdqa $T1,$T2 # + psrldq \$8,$T1 + pslldq \$8,$T2 # + pxor $T1,$Xhi + pxor $T2,$Xi # +___ + &reduction_alg9($Xhi,$Xi); +$code.=<<___; + add \$0x40,$len + jz .Ldone + movdqu 0x20($Htbl),$HK + sub \$0x10,$len + jz .Lodd_tail +.Lskip4x: +___ +} +$code.=<<___; ####### # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = # [(H*Ii+1) + (H*Xi+1)] mod P = # [(H*Ii+1) + H^2*(Ii+Xi)] mod P # movdqu ($inp),$T1 # Ii - movdqu 16($inp),$Xn # Ii+1 + movdqu 16($inp),$Xln # Ii+1 pshufb $T3,$T1 - pshufb $T3,$Xn + pshufb $T3,$Xln pxor $T1,$Xi # Ii+Xi -___ - &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1 -$code.=<<___; - movdqa $Xi,$Xhi # - pshufd \$0b01001110,$Xi,$T1 - pshufd \$0b01001110,$Hkey2,$T2 - pxor $Xi,$T1 # - pxor $Hkey2,$T2 + + movdqa $Xln,$Xhn + pshufd \$0b01001110,$Xln,$Xmn + pxor $Xln,$Xmn + pclmulqdq \$0x00,$Hkey,$Xln + pclmulqdq \$0x11,$Hkey,$Xhn + pclmulqdq \$0x00,$HK,$Xmn lea 32($inp),$inp # i+=2 + nop sub \$0x20,$len jbe .Leven_tail + nop + jmp .Lmod_loop +.align 32 .Lmod_loop: -___ - &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi) -$code.=<<___; - movdqu ($inp),$T1 # Ii - pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi) - pxor $Xhn,$Xhi + movdqa $Xi,$Xhi + movdqa $Xmn,$T1 + pshufd \$0b01001110,$Xi,$Xmn # + pxor $Xi,$Xmn # - movdqu 16($inp),$Xn # Ii+1 - pshufb $T3,$T1 - pshufb $T3,$Xn + pclmulqdq \$0x00,$Hkey2,$Xi + pclmulqdq \$0x11,$Hkey2,$Xhi + pclmulqdq \$0x10,$HK,$Xmn - movdqa $Xn,$Xhn # - pshufd \$0b01001110,$Xn,$T1n - pshufd \$0b01001110,$Hkey,$T2n - pxor $Xn,$T1n # - pxor $Hkey,$T2n - pxor $T1,$Xhi # "Ii+Xi", consume early + pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) + pxor $Xhn,$Xhi + movdqu ($inp),$T2 # Ii + pxor $Xi,$T1 # aggregated Karatsuba post-processing + pshufb $T3,$T2 + movdqu 16($inp),$Xln # Ii+1 + + pxor $Xhi,$T1 + pxor $T2,$Xhi # "Ii+Xi", consume early + pxor $T1,$Xmn + pshufb $T3,$Xln + movdqa $Xmn,$T1 # + psrldq \$8,$T1 + pslldq \$8,$Xmn # + pxor $T1,$Xhi + pxor $Xmn,$Xi # + + movdqa $Xln,$Xhn # - movdqa $Xi,$T1 # 1st phase + movdqa $Xi,$T2 # 1st phase + movdqa $Xi,$T1 + psllq \$5,$Xi + pxor $Xi,$T1 # + pclmulqdq \$0x00,$Hkey,$Xln ####### psllq \$1,$Xi pxor $T1,$Xi # - psllq \$5,$Xi # - pxor $T1,$Xi # - pclmulqdq \$0x00,$Hkey,$Xn ####### psllq \$57,$Xi # - movdqa $Xi,$T2 # + movdqa $Xi,$T1 # pslldq \$8,$Xi - psrldq \$8,$T2 # - pxor $T1,$Xi - pxor $T2,$Xhi # + psrldq \$8,$T1 # + pxor $T2,$Xi + pshufd \$0b01001110,$Xhn,$Xmn + pxor $T1,$Xhi # + pxor $Xhn,$Xmn # - pclmulqdq \$0x11,$Hkey,$Xhn ####### movdqa $Xi,$T2 # 2nd phase + psrlq \$1,$Xi + pclmulqdq \$0x11,$Hkey,$Xhn ####### + pxor $T2,$Xhi # + pxor $Xi,$T2 psrlq \$5,$Xi pxor $T2,$Xi # + lea 32($inp),$inp psrlq \$1,$Xi # - pxor $T2,$Xi # - pxor $Xhi,$T2 - psrlq \$1,$Xi # - pxor $T2,$Xi # + pclmulqdq \$0x00,$HK,$Xmn ####### + pxor $Xhi,$Xi # - pclmulqdq \$0x00,$T2n,$T1n ####### - movdqa $Xi,$Xhi # - pshufd \$0b01001110,$Xi,$T1 - pshufd \$0b01001110,$Hkey2,$T2 - pxor $Xi,$T1 # - pxor $Hkey2,$T2 - - pxor $Xn,$T1n # - pxor $Xhn,$T1n # - movdqa $T1n,$T2n # - psrldq \$8,$T1n - pslldq \$8,$T2n # - pxor $T1n,$Xhn - pxor $T2n,$Xn # - - lea 32($inp),$inp sub \$0x20,$len ja .Lmod_loop .Leven_tail: -___ - &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi) -$code.=<<___; - pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi) + movdqa $Xi,$Xhi + movdqa $Xmn,$T1 + pshufd \$0b01001110,$Xi,$Xmn # + pxor $Xi,$Xmn # + + pclmulqdq \$0x00,$Hkey2,$Xi + pclmulqdq \$0x11,$Hkey2,$Xhi + pclmulqdq \$0x10,$HK,$Xmn + + pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) pxor $Xhn,$Xhi + pxor $Xi,$T1 + pxor $Xhi,$T1 + pxor $T1,$Xmn + movdqa $Xmn,$T1 # + psrldq \$8,$T1 + pslldq \$8,$Xmn # + pxor $T1,$Xhi + pxor $Xmn,$Xi # ___ &reduction_alg9 ($Xhi,$Xi); $code.=<<___; @@ -599,7 +937,7 @@ $code.=<<___; pshufb $T3,$T1 pxor $T1,$Xi # Ii+Xi ___ - &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi) &reduction_alg9 ($Xhi,$Xi); $code.=<<___; .Ldone: @@ -612,21 +950,607 @@ $code.=<<___ if ($win64); movaps 0x20(%rsp),%xmm8 movaps 0x30(%rsp),%xmm9 movaps 0x40(%rsp),%xmm10 - add \$0x58,%rsp + movaps 0x50(%rsp),%xmm11 + movaps 0x60(%rsp),%xmm12 + movaps 0x70(%rsp),%xmm13 + movaps 0x80(%rsp),%xmm14 + movaps 0x90(%rsp),%xmm15 + lea 0xa8(%rsp),%rsp +.LSEH_end_gcm_ghash_clmul: ___ $code.=<<___; ret -.LSEH_end_gcm_ghash_clmul: .size gcm_ghash_clmul,.-gcm_ghash_clmul ___ } + +$code.=<<___; +.globl gcm_init_avx +.type gcm_init_avx,\@abi-omnipotent +.align 32 +gcm_init_avx: +___ +if ($avx) { +my ($Htbl,$Xip)=@_4args; +my $HK="%xmm6"; + +$code.=<<___ if ($win64); +.LSEH_begin_gcm_init_avx: + # I can't trust assembler to use specific encoding:-( + .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp + .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) +___ +$code.=<<___; + vzeroupper + + vmovdqu ($Xip),$Hkey + vpshufd \$0b01001110,$Hkey,$Hkey # dword swap + + # <<1 twist + vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword + vpsrlq \$63,$Hkey,$T1 + vpsllq \$1,$Hkey,$Hkey + vpxor $T3,$T3,$T3 # + vpcmpgtd $T2,$T3,$T3 # broadcast carry bit + vpslldq \$8,$T1,$T1 + vpor $T1,$Hkey,$Hkey # H<<=1 + + # magic reduction + vpand .L0x1c2_polynomial(%rip),$T3,$T3 + vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial + + vpunpckhqdq $Hkey,$Hkey,$HK + vmovdqa $Hkey,$Xi + vpxor $Hkey,$HK,$HK + mov \$4,%r10 # up to H^8 + jmp .Linit_start_avx +___ + +sub clmul64x64_avx { +my ($Xhi,$Xi,$Hkey,$HK)=@_; + +if (!defined($HK)) { $HK = $T2; +$code.=<<___; + vpunpckhqdq $Xi,$Xi,$T1 + vpunpckhqdq $Hkey,$Hkey,$T2 + vpxor $Xi,$T1,$T1 # + vpxor $Hkey,$T2,$T2 +___ +} else { +$code.=<<___; + vpunpckhqdq $Xi,$Xi,$T1 + vpxor $Xi,$T1,$T1 # +___ +} +$code.=<<___; + vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi ####### + vpclmulqdq \$0x00,$Hkey,$Xi,$Xi ####### + vpclmulqdq \$0x00,$HK,$T1,$T1 ####### + vpxor $Xi,$Xhi,$T2 # + vpxor $T2,$T1,$T1 # + + vpslldq \$8,$T1,$T2 # + vpsrldq \$8,$T1,$T1 + vpxor $T2,$Xi,$Xi # + vpxor $T1,$Xhi,$Xhi +___ +} + +sub reduction_avx { +my ($Xhi,$Xi) = @_; + +$code.=<<___; + vpsllq \$57,$Xi,$T1 # 1st phase + vpsllq \$62,$Xi,$T2 + vpxor $T1,$T2,$T2 # + vpsllq \$63,$Xi,$T1 + vpxor $T1,$T2,$T2 # + vpslldq \$8,$T2,$T1 # + vpsrldq \$8,$T2,$T2 + vpxor $T1,$Xi,$Xi # + vpxor $T2,$Xhi,$Xhi + + vpsrlq \$1,$Xi,$T2 # 2nd phase + vpxor $Xi,$Xhi,$Xhi + vpxor $T2,$Xi,$Xi # + vpsrlq \$5,$T2,$T2 + vpxor $T2,$Xi,$Xi # + vpsrlq \$1,$Xi,$Xi # + vpxor $Xhi,$Xi,$Xi # +___ +} $code.=<<___; +.align 32 +.Linit_loop_avx: + vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi... + vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt" +___ + &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7 + &reduction_avx ($Xhi,$Xi); +$code.=<<___; +.Linit_start_avx: + vmovdqa $Xi,$T3 +___ + &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8 + &reduction_avx ($Xhi,$Xi); +$code.=<<___; + vpshufd \$0b01001110,$T3,$T1 + vpshufd \$0b01001110,$Xi,$T2 + vpxor $T3,$T1,$T1 # Karatsuba pre-processing + vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7 + vpxor $Xi,$T2,$T2 # Karatsuba pre-processing + vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8 + lea 0x30($Htbl),$Htbl + sub \$1,%r10 + jnz .Linit_loop_avx + + vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped + vmovdqu $T3,-0x10($Htbl) + + vzeroupper +___ +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + lea 0x18(%rsp),%rsp +.LSEH_end_gcm_init_avx: +___ +$code.=<<___; + ret +.size gcm_init_avx,.-gcm_init_avx +___ +} else { +$code.=<<___; + jmp .L_init_clmul +.size gcm_init_avx,.-gcm_init_avx +___ +} + +$code.=<<___; +.globl gcm_gmult_avx +.type gcm_gmult_avx,\@abi-omnipotent +.align 32 +gcm_gmult_avx: + jmp .L_gmult_clmul +.size gcm_gmult_avx,.-gcm_gmult_avx +___ + +$code.=<<___; +.globl gcm_ghash_avx +.type gcm_ghash_avx,\@abi-omnipotent +.align 32 +gcm_ghash_avx: +___ +if ($avx) { +my ($Xip,$Htbl,$inp,$len)=@_4args; +my ($Xlo,$Xhi,$Xmi, + $Zlo,$Zhi,$Zmi, + $Hkey,$HK,$T1,$T2, + $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15)); + +$code.=<<___ if ($win64); + lea -0x88(%rsp),%rax +.LSEH_begin_gcm_ghash_avx: + # I can't trust assembler to use specific encoding:-( + .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp + .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax) + .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax) + .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax) + .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax) + .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax) + .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax) + .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax) + .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax) + .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax) + .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax) +___ +$code.=<<___; + vzeroupper + + vmovdqu ($Xip),$Xi # load $Xi + lea .L0x1c2_polynomial(%rip),%r10 + lea 0x40($Htbl),$Htbl # size optimization + vmovdqu .Lbswap_mask(%rip),$bswap + vpshufb $bswap,$Xi,$Xi + cmp \$0x80,$len + jb .Lshort_avx + sub \$0x80,$len + + vmovdqu 0x70($inp),$Ii # I[7] + vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 + vpshufb $bswap,$Ii,$Ii + vmovdqu 0x20-0x40($Htbl),$HK + + vpunpckhqdq $Ii,$Ii,$T2 + vmovdqu 0x60($inp),$Ij # I[6] + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpxor $Ii,$T2,$T2 + vpshufb $bswap,$Ij,$Ij + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 + vpunpckhqdq $Ij,$Ij,$T1 + vmovdqu 0x50($inp),$Ii # I[5] + vpclmulqdq \$0x00,$HK,$T2,$Xmi + vpxor $Ij,$T1,$T1 + + vpshufb $bswap,$Ii,$Ii + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpunpckhqdq $Ii,$Ii,$T2 + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 + vpxor $Ii,$T2,$T2 + vmovdqu 0x40($inp),$Ij # I[4] + vpclmulqdq \$0x10,$HK,$T1,$Zmi + vmovdqu 0x50-0x40($Htbl),$HK + + vpshufb $bswap,$Ij,$Ij + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpxor $Xhi,$Zhi,$Zhi + vpunpckhqdq $Ij,$Ij,$T1 + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T2,$Xmi + vpxor $Ij,$T1,$T1 + + vmovdqu 0x30($inp),$Ii # I[3] + vpxor $Zlo,$Xlo,$Xlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpxor $Zhi,$Xhi,$Xhi + vpshufb $bswap,$Ii,$Ii + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 + vpxor $Zmi,$Xmi,$Xmi + vpunpckhqdq $Ii,$Ii,$T2 + vpclmulqdq \$0x10,$HK,$T1,$Zmi + vmovdqu 0x80-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + + vmovdqu 0x20($inp),$Ij # I[2] + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpxor $Xhi,$Zhi,$Zhi + vpshufb $bswap,$Ij,$Ij + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 + vpxor $Xmi,$Zmi,$Zmi + vpunpckhqdq $Ij,$Ij,$T1 + vpclmulqdq \$0x00,$HK,$T2,$Xmi + vpxor $Ij,$T1,$T1 + + vmovdqu 0x10($inp),$Ii # I[1] + vpxor $Zlo,$Xlo,$Xlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpxor $Zhi,$Xhi,$Xhi + vpshufb $bswap,$Ii,$Ii + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 + vpxor $Zmi,$Xmi,$Xmi + vpunpckhqdq $Ii,$Ii,$T2 + vpclmulqdq \$0x10,$HK,$T1,$Zmi + vmovdqu 0xb0-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + + vmovdqu ($inp),$Ij # I[0] + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpxor $Xhi,$Zhi,$Zhi + vpshufb $bswap,$Ij,$Ij + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8 + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x10,$HK,$T2,$Xmi + + lea 0x80($inp),$inp + cmp \$0x80,$len + jb .Ltail_avx + + vpxor $Xi,$Ij,$Ij # accumulate $Xi + sub \$0x80,$len + jmp .Loop8x_avx + +.align 32 +.Loop8x_avx: + vpunpckhqdq $Ij,$Ij,$T1 + vmovdqu 0x70($inp),$Ii # I[7] + vpxor $Xlo,$Zlo,$Zlo + vpxor $Ij,$T1,$T1 + vpclmulqdq \$0x00,$Hkey,$Ij,$Xi + vpshufb $bswap,$Ii,$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xo + vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 + vpunpckhqdq $Ii,$Ii,$T2 + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Tred + vmovdqu 0x20-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + + vmovdqu 0x60($inp),$Ij # I[6] + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpxor $Zlo,$Xi,$Xi # collect result + vpshufb $bswap,$Ij,$Ij + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vxorps $Zhi,$Xo,$Xo + vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 + vpunpckhqdq $Ij,$Ij,$T1 + vpclmulqdq \$0x00,$HK, $T2,$Xmi + vpxor $Zmi,$Tred,$Tred + vxorps $Ij,$T1,$T1 + + vmovdqu 0x50($inp),$Ii # I[5] + vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpxor $Xo,$Tred,$Tred + vpslldq \$8,$Tred,$T2 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vpsrldq \$8,$Tred,$Tred + vpxor $T2, $Xi, $Xi + vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 + vpshufb $bswap,$Ii,$Ii + vxorps $Tred,$Xo, $Xo + vpxor $Xhi,$Zhi,$Zhi + vpunpckhqdq $Ii,$Ii,$T2 + vpclmulqdq \$0x10,$HK, $T1,$Zmi + vmovdqu 0x50-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + vpxor $Xmi,$Zmi,$Zmi + + vmovdqu 0x40($inp),$Ij # I[4] + vpalignr \$8,$Xi,$Xi,$Tred # 1st phase + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpshufb $bswap,$Ij,$Ij + vpxor $Zlo,$Xlo,$Xlo + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Zhi,$Xhi,$Xhi + vpclmulqdq \$0x00,$HK, $T2,$Xmi + vxorps $Ij,$T1,$T1 + vpxor $Zmi,$Xmi,$Xmi + + vmovdqu 0x30($inp),$Ii # I[3] + vpclmulqdq \$0x10,(%r10),$Xi,$Xi + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpshufb $bswap,$Ii,$Ii + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 + vpunpckhqdq $Ii,$Ii,$T2 + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x10,$HK, $T1,$Zmi + vmovdqu 0x80-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + vpxor $Xmi,$Zmi,$Zmi + + vmovdqu 0x20($inp),$Ij # I[2] + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpshufb $bswap,$Ij,$Ij + vpxor $Zlo,$Xlo,$Xlo + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Zhi,$Xhi,$Xhi + vpclmulqdq \$0x00,$HK, $T2,$Xmi + vpxor $Ij,$T1,$T1 + vpxor $Zmi,$Xmi,$Xmi + vxorps $Tred,$Xi,$Xi + + vmovdqu 0x10($inp),$Ii # I[1] + vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpshufb $bswap,$Ii,$Ii + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 + vpclmulqdq \$0x10,(%r10),$Xi,$Xi + vxorps $Xo,$Tred,$Tred + vpunpckhqdq $Ii,$Ii,$T2 + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x10,$HK, $T1,$Zmi + vmovdqu 0xb0-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + vpxor $Xmi,$Zmi,$Zmi + + vmovdqu ($inp),$Ij # I[0] + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpshufb $bswap,$Ij,$Ij + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8 + vpxor $Tred,$Ij,$Ij + vpclmulqdq \$0x10,$HK, $T2,$Xmi + vpxor $Xi,$Ij,$Ij # accumulate $Xi + + lea 0x80($inp),$inp + sub \$0x80,$len + jnc .Loop8x_avx + + add \$0x80,$len + jmp .Ltail_no_xor_avx + +.align 32 +.Lshort_avx: + vmovdqu -0x10($inp,$len),$Ii # very last word + lea ($inp,$len),$inp + vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 + vmovdqu 0x20-0x40($Htbl),$HK + vpshufb $bswap,$Ii,$Ij + + vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo, + vmovdqa $Xhi,$Zhi # $Zhi and + vmovdqa $Xmi,$Zmi # $Zmi + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x20($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vpsrldq \$8,$HK,$HK + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x30($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vmovdqu 0x50-0x40($Htbl),$HK + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x40($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vpsrldq \$8,$HK,$HK + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x50($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vmovdqu 0x80-0x40($Htbl),$HK + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x60($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vpsrldq \$8,$HK,$HK + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x70($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vmovq 0xb8-0x40($Htbl),$HK + sub \$0x10,$len + jmp .Ltail_avx + +.align 32 +.Ltail_avx: + vpxor $Xi,$Ij,$Ij # accumulate $Xi +.Ltail_no_xor_avx: + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + + vmovdqu (%r10),$Tred + + vpxor $Xlo,$Zlo,$Xi + vpxor $Xhi,$Zhi,$Xo + vpxor $Xmi,$Zmi,$Zmi + + vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing + vpxor $Xo, $Zmi,$Zmi + vpslldq \$8, $Zmi,$T2 + vpsrldq \$8, $Zmi,$Zmi + vpxor $T2, $Xi, $Xi + vpxor $Zmi,$Xo, $Xo + + vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase + vpalignr \$8,$Xi,$Xi,$Xi + vpxor $T2,$Xi,$Xi + + vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase + vpalignr \$8,$Xi,$Xi,$Xi + vpxor $Xo,$Xi,$Xi + vpxor $T2,$Xi,$Xi + + cmp \$0,$len + jne .Lshort_avx + + vpshufb $bswap,$Xi,$Xi + vmovdqu $Xi,($Xip) + vzeroupper +___ +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + movaps 0x10(%rsp),%xmm7 + movaps 0x20(%rsp),%xmm8 + movaps 0x30(%rsp),%xmm9 + movaps 0x40(%rsp),%xmm10 + movaps 0x50(%rsp),%xmm11 + movaps 0x60(%rsp),%xmm12 + movaps 0x70(%rsp),%xmm13 + movaps 0x80(%rsp),%xmm14 + movaps 0x90(%rsp),%xmm15 + lea 0xa8(%rsp),%rsp +.LSEH_end_gcm_ghash_avx: +___ +$code.=<<___; + ret +.size gcm_ghash_avx,.-gcm_ghash_avx +___ +} else { +$code.=<<___; + jmp .L_ghash_clmul +.size gcm_ghash_avx,.-gcm_ghash_avx +___ +} + +$code.=<<___; .align 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .L0x1c2_polynomial: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.L7_mask: + .long 7,0,7,0 +.L7_mask_poly: + .long 7,0,`0xE1<<1`,0 .align 64 .type .Lrem_4bit,\@object .Lrem_4bit: @@ -774,10 +1698,24 @@ se_handler: .rva .LSEH_end_gcm_ghash_4bit .rva .LSEH_info_gcm_ghash_4bit + .rva .LSEH_begin_gcm_init_clmul + .rva .LSEH_end_gcm_init_clmul + .rva .LSEH_info_gcm_init_clmul + .rva .LSEH_begin_gcm_ghash_clmul .rva .LSEH_end_gcm_ghash_clmul .rva .LSEH_info_gcm_ghash_clmul +___ +$code.=<<___ if ($avx); + .rva .LSEH_begin_gcm_init_avx + .rva .LSEH_end_gcm_init_avx + .rva .LSEH_info_gcm_init_clmul + .rva .LSEH_begin_gcm_ghash_avx + .rva .LSEH_end_gcm_ghash_avx + .rva .LSEH_info_gcm_ghash_clmul +___ +$code.=<<___; .section .xdata .align 8 .LSEH_info_gcm_gmult_4bit: @@ -788,14 +1726,23 @@ se_handler: .byte 9,0,0,0 .rva se_handler .rva .Lghash_prologue,.Lghash_epilogue # HandlerData +.LSEH_info_gcm_init_clmul: + .byte 0x01,0x08,0x03,0x00 + .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 + .byte 0x04,0x22,0x00,0x00 #sub rsp,0x18 .LSEH_info_gcm_ghash_clmul: - .byte 0x01,0x1f,0x0b,0x00 - .byte 0x1f,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 - .byte 0x19,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 - .byte 0x13,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 - .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 - .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 - .byte 0x04,0xa2,0x00,0x00 #sub rsp,0x58 + .byte 0x01,0x33,0x16,0x00 + .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15 + .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14 + .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13 + .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12 + .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11 + .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 + .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 + .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 + .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 + .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 + .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8 ___ } diff --git a/openssl/crypto/modes/asm/ghashp8-ppc.pl b/openssl/crypto/modes/asm/ghashp8-ppc.pl new file mode 100755 index 000000000..e76a58c34 --- /dev/null +++ b/openssl/crypto/modes/asm/ghashp8-ppc.pl @@ -0,0 +1,234 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# GHASH for for PowerISA v2.07. +# +# July 2014 +# +# Accurate performance measurements are problematic, because it's +# always virtualized setup with possibly throttled processor. +# Relative comparison is therefore more informative. This initial +# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x +# faster than "4-bit" integer-only compiler-generated 64-bit code. +# "Initial version" means that there is room for futher improvement. + +$flavour=shift; +$output =shift; + +if ($flavour =~ /64/) { + $SIZE_T=8; + $LRSAVE=2*$SIZE_T; + $STU="stdu"; + $POP="ld"; + $PUSH="std"; +} elsif ($flavour =~ /32/) { + $SIZE_T=4; + $LRSAVE=$SIZE_T; + $STU="stwu"; + $POP="lwz"; + $PUSH="stw"; +} else { die "nonsense $flavour"; } + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!"; + +my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block + +my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3)); +my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12)); +my $vrsave="r12"; + +$code=<<___; +.machine "any" + +.text + +.globl .gcm_init_p8 +.align 5 +.gcm_init_p8: + lis r0,0xfff0 + li r8,0x10 + mfspr $vrsave,256 + li r9,0x20 + mtspr 256,r0 + li r10,0x30 + lvx_u $H,0,r4 # load H + + vspltisb $xC2,-16 # 0xf0 + vspltisb $t0,1 # one + vaddubm $xC2,$xC2,$xC2 # 0xe0 + vxor $zero,$zero,$zero + vor $xC2,$xC2,$t0 # 0xe1 + vsldoi $xC2,$xC2,$zero,15 # 0xe1... + vsldoi $t1,$zero,$t0,1 # ...1 + vaddubm $xC2,$xC2,$xC2 # 0xc2... + vspltisb $t2,7 + vor $xC2,$xC2,$t1 # 0xc2....01 + vspltb $t1,$H,0 # most significant byte + vsl $H,$H,$t0 # H<<=1 + vsrab $t1,$t1,$t2 # broadcast carry bit + vand $t1,$t1,$xC2 + vxor $H,$H,$t1 # twisted H + + vsldoi $H,$H,$H,8 # twist even more ... + vsldoi $xC2,$zero,$xC2,8 # 0xc2.0 + vsldoi $Hl,$zero,$H,8 # ... and split + vsldoi $Hh,$H,$zero,8 + + stvx_u $xC2,0,r3 # save pre-computed table + stvx_u $Hl,r8,r3 + stvx_u $H, r9,r3 + stvx_u $Hh,r10,r3 + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 +.size .gcm_init_p8,.-.gcm_init_p8 + +.globl .gcm_gmult_p8 +.align 5 +.gcm_gmult_p8: + lis r0,0xfff8 + li r8,0x10 + mfspr $vrsave,256 + li r9,0x20 + mtspr 256,r0 + li r10,0x30 + lvx_u $IN,0,$Xip # load Xi + + lvx_u $Hl,r8,$Htbl # load pre-computed table + le?lvsl $lemask,r0,r0 + lvx_u $H, r9,$Htbl + le?vspltisb $t0,0x07 + lvx_u $Hh,r10,$Htbl + le?vxor $lemask,$lemask,$t0 + lvx_u $xC2,0,$Htbl + le?vperm $IN,$IN,$IN,$lemask + vxor $zero,$zero,$zero + + vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo + vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi + vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi + + vpmsumd $t2,$Xl,$xC2 # 1st phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + + vsldoi $t1,$Xl,$Xl,8 # 2nd phase + vpmsumd $Xl,$Xl,$xC2 + vxor $t1,$t1,$Xh + vxor $Xl,$Xl,$t1 + + le?vperm $Xl,$Xl,$Xl,$lemask + stvx_u $Xl,0,$Xip # write out Xi + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 +.size .gcm_gmult_p8,.-.gcm_gmult_p8 + +.globl .gcm_ghash_p8 +.align 5 +.gcm_ghash_p8: + lis r0,0xfff8 + li r8,0x10 + mfspr $vrsave,256 + li r9,0x20 + mtspr 256,r0 + li r10,0x30 + lvx_u $Xl,0,$Xip # load Xi + + lvx_u $Hl,r8,$Htbl # load pre-computed table + le?lvsl $lemask,r0,r0 + lvx_u $H, r9,$Htbl + le?vspltisb $t0,0x07 + lvx_u $Hh,r10,$Htbl + le?vxor $lemask,$lemask,$t0 + lvx_u $xC2,0,$Htbl + le?vperm $Xl,$Xl,$Xl,$lemask + vxor $zero,$zero,$zero + + lvx_u $IN,0,$inp + addi $inp,$inp,16 + subi $len,$len,16 + le?vperm $IN,$IN,$IN,$lemask + vxor $IN,$IN,$Xl + b Loop + +.align 5 +Loop: + subic $len,$len,16 + vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo + subfe. r0,r0,r0 # borrow?-1:0 + vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi + and r0,r0,$len + vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi + add $inp,$inp,r0 + + vpmsumd $t2,$Xl,$xC2 # 1st phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + lvx_u $IN,0,$inp + addi $inp,$inp,16 + + vsldoi $t1,$Xl,$Xl,8 # 2nd phase + vpmsumd $Xl,$Xl,$xC2 + le?vperm $IN,$IN,$IN,$lemask + vxor $t1,$t1,$Xh + vxor $IN,$IN,$t1 + vxor $IN,$IN,$Xl + beq Loop # did $len-=16 borrow? + + vxor $Xl,$Xl,$t1 + le?vperm $Xl,$Xl,$Xl,$lemask + stvx_u $Xl,0,$Xip # write out Xi + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,4,0 + .long 0 +.size .gcm_ghash_p8,.-.gcm_ghash_p8 + +.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 +___ + +foreach (split("\n",$code)) { + if ($flavour =~ /le$/o) { # little-endian + s/le\?//o or + s/be\?/#be#/o; + } else { + s/le\?/#le#/o or + s/be\?//o; + } + print $_,"\n"; +} + +close STDOUT; # enforce flush diff --git a/openssl/crypto/modes/asm/ghashv8-armx.pl b/openssl/crypto/modes/asm/ghashv8-armx.pl new file mode 100755 index 000000000..54a1ac4db --- /dev/null +++ b/openssl/crypto/modes/asm/ghashv8-armx.pl @@ -0,0 +1,241 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication. +# +# June 2014 +# +# Initial version was developed in tight cooperation with Ard +# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from +# other assembly modules. Just like aesv8-armx.pl this module +# supports both AArch32 and AArch64 execution modes. +# +# Current performance in cycles per processed byte: +# +# PMULL[2] 32-bit NEON(*) +# Apple A7 1.76 5.62 +# Cortex-A53 1.45 8.39 +# Cortex-A57 2.22 7.61 +# +# (*) presented for reference/comparison purposes; + +$flavour = shift; +open STDOUT,">".shift; + +$Xi="x0"; # argument block +$Htbl="x1"; +$inp="x2"; +$len="x3"; + +$inc="x12"; + +{ +my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); +my ($t0,$t1,$t2,$t3,$H,$Hhl)=map("q$_",(8..14)); + +$code=<<___; +#include "arm_arch.h" + +.text +___ +$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/); +$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/); + +$code.=<<___; +.global gcm_init_v8 +.type gcm_init_v8,%function +.align 4 +gcm_init_v8: + vld1.64 {$t1},[x1] @ load H + vmov.i8 $t0,#0xe1 + vext.8 $IN,$t1,$t1,#8 + vshl.i64 $t0,$t0,#57 + vshr.u64 $t2,$t0,#63 + vext.8 $t0,$t2,$t0,#8 @ t0=0xc2....01 + vdup.32 $t1,${t1}[1] + vshr.u64 $t3,$IN,#63 + vshr.s32 $t1,$t1,#31 @ broadcast carry bit + vand $t3,$t3,$t0 + vshl.i64 $IN,$IN,#1 + vext.8 $t3,$t3,$t3,#8 + vand $t0,$t0,$t1 + vorr $IN,$IN,$t3 @ H<<<=1 + veor $IN,$IN,$t0 @ twisted H + vst1.64 {$IN},[x0] + + ret +.size gcm_init_v8,.-gcm_init_v8 + +.global gcm_gmult_v8 +.type gcm_gmult_v8,%function +.align 4 +gcm_gmult_v8: + vld1.64 {$t1},[$Xi] @ load Xi + vmov.i8 $t3,#0xe1 + vld1.64 {$H},[$Htbl] @ load twisted H + vshl.u64 $t3,$t3,#57 +#ifndef __ARMEB__ + vrev64.8 $t1,$t1 +#endif + vext.8 $Hhl,$H,$H,#8 + mov $len,#0 + vext.8 $IN,$t1,$t1,#8 + mov $inc,#0 + veor $Hhl,$Hhl,$H @ Karatsuba pre-processing + mov $inp,$Xi + b .Lgmult_v8 +.size gcm_gmult_v8,.-gcm_gmult_v8 + +.global gcm_ghash_v8 +.type gcm_ghash_v8,%function +.align 4 +gcm_ghash_v8: + vld1.64 {$Xl},[$Xi] @ load [rotated] Xi + subs $len,$len,#16 + vmov.i8 $t3,#0xe1 + mov $inc,#16 + vld1.64 {$H},[$Htbl] @ load twisted H + cclr $inc,eq + vext.8 $Xl,$Xl,$Xl,#8 + vshl.u64 $t3,$t3,#57 + vld1.64 {$t1},[$inp],$inc @ load [rotated] inp + vext.8 $Hhl,$H,$H,#8 +#ifndef __ARMEB__ + vrev64.8 $Xl,$Xl + vrev64.8 $t1,$t1 +#endif + veor $Hhl,$Hhl,$H @ Karatsuba pre-processing + vext.8 $IN,$t1,$t1,#8 + b .Loop_v8 + +.align 4 +.Loop_v8: + vext.8 $t2,$Xl,$Xl,#8 + veor $IN,$IN,$Xl @ inp^=Xi + veor $t1,$t1,$t2 @ $t1 is rotated inp^Xi + +.Lgmult_v8: + vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo + veor $t1,$t1,$IN @ Karatsuba pre-processing + vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi + subs $len,$len,#16 + vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) + cclr $inc,eq + + vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t1 + vld1.64 {$t1},[$inp],$inc @ load [rotated] inp + veor $Xm,$Xm,$t2 + vpmull.p64 $t2,$Xl,$t3 @ 1st phase + + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl +#ifndef __ARMEB__ + vrev64.8 $t1,$t1 +#endif + veor $Xl,$Xm,$t2 + vext.8 $IN,$t1,$t1,#8 + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase + vpmull.p64 $Xl,$Xl,$t3 + veor $t2,$t2,$Xh + veor $Xl,$Xl,$t2 + b.hs .Loop_v8 + +#ifndef __ARMEB__ + vrev64.8 $Xl,$Xl +#endif + vext.8 $Xl,$Xl,$Xl,#8 + vst1.64 {$Xl},[$Xi] @ write out Xi + + ret +.size gcm_ghash_v8,.-gcm_ghash_v8 +___ +} +$code.=<<___; +.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 +___ + +if ($flavour =~ /64/) { ######## 64-bit code + sub unvmov { + my $arg=shift; + + $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o && + sprintf "ins v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1; + } + foreach(split("\n",$code)) { + s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or + s/vmov\.i8/movi/o or # fix up legacy mnemonics + s/vmov\s+(.*)/unvmov($1)/geo or + s/vext\.8/ext/o or + s/vshr\.s/sshr\.s/o or + s/vshr/ushr/o or + s/^(\s+)v/$1/o or # strip off v prefix + s/\bbx\s+lr\b/ret/o; + + s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers + s/@\s/\/\//o; # old->new style commentary + + # fix up remainig legacy suffixes + s/\.[ui]?8(\s)/$1/o; + s/\.[uis]?32//o and s/\.16b/\.4s/go; + m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument + m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments + s/\.[uisp]?64//o and s/\.16b/\.2d/go; + s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; + + print $_,"\n"; + } +} else { ######## 32-bit code + sub unvdup32 { + my $arg=shift; + + $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && + sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; + } + sub unvpmullp64 { + my ($mnemonic,$arg)=@_; + + if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) { + my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19) + |(($2&7)<<17)|(($2&8)<<4) + |(($3&7)<<1) |(($3&8)<<2); + $word |= 0x00010001 if ($mnemonic =~ "2"); + # since ARMv7 instructions are always encoded little-endian. + # correct solution is to use .inst directive, but older + # assemblers don't implement it:-( + sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", + $word&0xff,($word>>8)&0xff, + ($word>>16)&0xff,($word>>24)&0xff, + $mnemonic,$arg; + } + } + + foreach(split("\n",$code)) { + s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers + s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers + s/\/\/\s?/@ /o; # new->old style commentary + + # fix up remainig new-style suffixes + s/\],#[0-9]+/]!/o; + + s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or + s/vdup\.32\s+(.*)/unvdup32($1)/geo or + s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or + s/^(\s+)b\./$1b/o or + s/^(\s+)ret/$1bx\tlr/o; + + print $_,"\n"; + } +} + +close STDOUT; # enforce flush |