aboutsummaryrefslogtreecommitdiff
path: root/openssl/crypto/aes/asm/aesni-mb-x86_64.pl
diff options
context:
space:
mode:
Diffstat (limited to 'openssl/crypto/aes/asm/aesni-mb-x86_64.pl')
-rwxr-xr-xopenssl/crypto/aes/asm/aesni-mb-x86_64.pl1395
1 files changed, 1395 insertions, 0 deletions
diff --git a/openssl/crypto/aes/asm/aesni-mb-x86_64.pl b/openssl/crypto/aes/asm/aesni-mb-x86_64.pl
new file mode 100755
index 000000000..33b1aed3c
--- /dev/null
+++ b/openssl/crypto/aes/asm/aesni-mb-x86_64.pl
@@ -0,0 +1,1395 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# Multi-buffer AES-NI procedures process several independent buffers
+# in parallel by interleaving independent instructions.
+#
+# Cycles per byte for interleave factor 4:
+#
+# asymptotic measured
+# ---------------------------
+# Westmere 5.00/4=1.25 5.13/4=1.28
+# Atom 15.0/4=3.75 ?15.7/4=3.93
+# Sandy Bridge 5.06/4=1.27 5.18/4=1.29
+# Ivy Bridge 5.06/4=1.27 5.14/4=1.29
+# Haswell 4.44/4=1.11 4.44/4=1.11
+# Bulldozer 5.75/4=1.44 5.76/4=1.44
+#
+# Cycles per byte for interleave factor 8 (not implemented for
+# pre-AVX processors, where higher interleave factor incidentally
+# doesn't result in improvement):
+#
+# asymptotic measured
+# ---------------------------
+# Sandy Bridge 5.06/8=0.64 7.10/8=0.89(*)
+# Ivy Bridge 5.06/8=0.64 7.14/8=0.89(*)
+# Haswell 5.00/8=0.63 5.00/8=0.63
+# Bulldozer 5.75/8=0.72 5.77/8=0.72
+#
+# (*) Sandy/Ivy Bridge are known to handle high interleave factors
+# suboptimally;
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+$avx=0;
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+}
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+# void aesni_multi_cbc_encrypt (
+# struct { void *inp,*out; int blocks; double iv[2]; } inp[8];
+# const AES_KEY *key,
+# int num); /* 1 or 2 */
+#
+$inp="%rdi"; # 1st arg
+$key="%rsi"; # 2nd arg
+$num="%edx";
+
+@inptr=map("%r$_",(8..11));
+@outptr=map("%r$_",(12..15));
+
+($rndkey0,$rndkey1)=("%xmm0","%xmm1");
+@out=map("%xmm$_",(2..5));
+@inp=map("%xmm$_",(6..9));
+($counters,$mask,$zero)=map("%xmm$_",(10..12));
+
+($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx");
+
+$code.=<<___;
+.text
+
+.extern OPENSSL_ia32cap_P
+
+.globl aesni_multi_cbc_encrypt
+.type aesni_multi_cbc_encrypt,\@function,3
+.align 32
+aesni_multi_cbc_encrypt:
+___
+$code.=<<___ if ($avx);
+ cmp \$2,$num
+ jb .Lenc_non_avx
+ mov OPENSSL_ia32cap_P+4(%rip),%ecx
+ test \$`1<<28`,%ecx # AVX bit
+ jnz _avx_cbc_enc_shortcut
+ jmp .Lenc_non_avx
+.align 16
+.Lenc_non_avx:
+___
+$code.=<<___;
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,0x60(%rsp)
+ movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
+ movaps %xmm14,-0x58(%rax)
+ movaps %xmm15,-0x48(%rax)
+___
+$code.=<<___;
+ # stack layout
+ #
+ # +0 output sink
+ # +16 input sink [original %rsp and $num]
+ # +32 counters
+
+ sub \$48,%rsp
+ and \$-64,%rsp
+ mov %rax,16(%rsp) # original %rsp
+
+.Lenc4x_body:
+ movdqu ($key),$zero # 0-round key
+ lea 0x78($key),$key # size optimization
+ lea 40*2($inp),$inp
+
+.Lenc4x_loop_grande:
+ mov $num,24(%rsp) # original $num
+ xor $num,$num
+___
+for($i=0;$i<4;$i++) {
+ $code.=<<___;
+ mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks
+ mov `40*$i+0-40*2`($inp),@inptr[$i]
+ cmp $num,$one
+ mov `40*$i+8-40*2`($inp),@outptr[$i]
+ cmovg $one,$num # find maximum
+ test $one,$one
+ movdqu `40*$i+24-40*2`($inp),@out[$i] # load IV
+ mov $one,`32+4*$i`(%rsp) # initialize counters
+ cmovle %rsp,@inptr[$i] # cancel input
+___
+}
+$code.=<<___;
+ test $num,$num
+ jz .Lenc4x_done
+
+ movups 0x10-0x78($key),$rndkey1
+ pxor $zero,@out[0]
+ movups 0x20-0x78($key),$rndkey0
+ pxor $zero,@out[1]
+ mov 0xf0-0x78($key),$rounds
+ pxor $zero,@out[2]
+ movdqu (@inptr[0]),@inp[0] # load inputs
+ pxor $zero,@out[3]
+ movdqu (@inptr[1]),@inp[1]
+ pxor @inp[0],@out[0]
+ movdqu (@inptr[2]),@inp[2]
+ pxor @inp[1],@out[1]
+ movdqu (@inptr[3]),@inp[3]
+ pxor @inp[2],@out[2]
+ pxor @inp[3],@out[3]
+ movdqa 32(%rsp),$counters # load counters
+ xor $offset,$offset
+ jmp .Loop_enc4x
+
+.align 32
+.Loop_enc4x:
+ add \$16,$offset
+ lea 16(%rsp),$sink # sink pointer
+ mov \$1,$one # constant of 1
+ sub $offset,$sink
+
+ aesenc $rndkey1,@out[0]
+ prefetcht0 31(@inptr[0],$offset) # prefetch input
+ prefetcht0 31(@inptr[1],$offset)
+ aesenc $rndkey1,@out[1]
+ prefetcht0 31(@inptr[2],$offset)
+ prefetcht0 31(@inptr[2],$offset)
+ aesenc $rndkey1,@out[2]
+ aesenc $rndkey1,@out[3]
+ movups 0x30-0x78($key),$rndkey1
+___
+for($i=0;$i<4;$i++) {
+my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
+$code.=<<___;
+ cmp `32+4*$i`(%rsp),$one
+ aesenc $rndkey,@out[0]
+ aesenc $rndkey,@out[1]
+ aesenc $rndkey,@out[2]
+ cmovge $sink,@inptr[$i] # cancel input
+ cmovg $sink,@outptr[$i] # sink output
+ aesenc $rndkey,@out[3]
+ movups `0x40+16*$i-0x78`($key),$rndkey
+___
+}
+$code.=<<___;
+ movdqa $counters,$mask
+ aesenc $rndkey0,@out[0]
+ prefetcht0 15(@outptr[0],$offset) # prefetch output
+ prefetcht0 15(@outptr[1],$offset)
+ aesenc $rndkey0,@out[1]
+ prefetcht0 15(@outptr[2],$offset)
+ prefetcht0 15(@outptr[3],$offset)
+ aesenc $rndkey0,@out[2]
+ aesenc $rndkey0,@out[3]
+ movups 0x80-0x78($key),$rndkey0
+ pxor $zero,$zero
+
+ aesenc $rndkey1,@out[0]
+ pcmpgtd $zero,$mask
+ movdqu -0x78($key),$zero # reload 0-round key
+ aesenc $rndkey1,@out[1]
+ paddd $mask,$counters # decrement counters
+ movdqa $counters,32(%rsp) # update counters
+ aesenc $rndkey1,@out[2]
+ aesenc $rndkey1,@out[3]
+ movups 0x90-0x78($key),$rndkey1
+
+ cmp \$11,$rounds
+
+ aesenc $rndkey0,@out[0]
+ aesenc $rndkey0,@out[1]
+ aesenc $rndkey0,@out[2]
+ aesenc $rndkey0,@out[3]
+ movups 0xa0-0x78($key),$rndkey0
+
+ jb .Lenc4x_tail
+
+ aesenc $rndkey1,@out[0]
+ aesenc $rndkey1,@out[1]
+ aesenc $rndkey1,@out[2]
+ aesenc $rndkey1,@out[3]
+ movups 0xb0-0x78($key),$rndkey1
+
+ aesenc $rndkey0,@out[0]
+ aesenc $rndkey0,@out[1]
+ aesenc $rndkey0,@out[2]
+ aesenc $rndkey0,@out[3]
+ movups 0xc0-0x78($key),$rndkey0
+
+ je .Lenc4x_tail
+
+ aesenc $rndkey1,@out[0]
+ aesenc $rndkey1,@out[1]
+ aesenc $rndkey1,@out[2]
+ aesenc $rndkey1,@out[3]
+ movups 0xd0-0x78($key),$rndkey1
+
+ aesenc $rndkey0,@out[0]
+ aesenc $rndkey0,@out[1]
+ aesenc $rndkey0,@out[2]
+ aesenc $rndkey0,@out[3]
+ movups 0xe0-0x78($key),$rndkey0
+ jmp .Lenc4x_tail
+
+.align 32
+.Lenc4x_tail:
+ aesenc $rndkey1,@out[0]
+ aesenc $rndkey1,@out[1]
+ aesenc $rndkey1,@out[2]
+ aesenc $rndkey1,@out[3]
+ movdqu (@inptr[0],$offset),@inp[0]
+ movdqu 0x10-0x78($key),$rndkey1
+
+ aesenclast $rndkey0,@out[0]
+ movdqu (@inptr[1],$offset),@inp[1]
+ pxor $zero,@inp[0]
+ aesenclast $rndkey0,@out[1]
+ movdqu (@inptr[2],$offset),@inp[2]
+ pxor $zero,@inp[1]
+ aesenclast $rndkey0,@out[2]
+ movdqu (@inptr[3],$offset),@inp[3]
+ pxor $zero,@inp[2]
+ aesenclast $rndkey0,@out[3]
+ movdqu 0x20-0x78($key),$rndkey0
+ pxor $zero,@inp[3]
+
+ movups @out[0],-16(@outptr[0],$offset)
+ pxor @inp[0],@out[0]
+ movups @out[1],-16(@outptr[1],$offset)
+ pxor @inp[1],@out[1]
+ movups @out[2],-16(@outptr[2],$offset)
+ pxor @inp[2],@out[2]
+ movups @out[3],-16(@outptr[3],$offset)
+ pxor @inp[3],@out[3]
+
+ dec $num
+ jnz .Loop_enc4x
+
+ mov 16(%rsp),%rax # original %rsp
+ mov 24(%rsp),$num
+
+ #pxor @inp[0],@out[0]
+ #pxor @inp[1],@out[1]
+ #movdqu @out[0],`40*0+24-40*2`($inp) # output iv FIX ME!
+ #pxor @inp[2],@out[2]
+ #movdqu @out[1],`40*1+24-40*2`($inp)
+ #pxor @inp[3],@out[3]
+ #movdqu @out[2],`40*2+24-40*2`($inp) # won't fix, let caller
+ #movdqu @out[3],`40*3+24-40*2`($inp) # figure this out...
+
+ lea `40*4`($inp),$inp
+ dec $num
+ jnz .Lenc4x_loop_grande
+
+.Lenc4x_done:
+___
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ #movaps -0x68(%rax),%xmm13
+ #movaps -0x58(%rax),%xmm14
+ #movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp
+.Lenc4x_epilogue:
+ ret
+.size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
+
+.globl aesni_multi_cbc_decrypt
+.type aesni_multi_cbc_decrypt,\@function,3
+.align 32
+aesni_multi_cbc_decrypt:
+___
+$code.=<<___ if ($avx);
+ cmp \$2,$num
+ jb .Ldec_non_avx
+ mov OPENSSL_ia32cap_P+4(%rip),%ecx
+ test \$`1<<28`,%ecx # AVX bit
+ jnz _avx_cbc_dec_shortcut
+ jmp .Ldec_non_avx
+.align 16
+.Ldec_non_avx:
+___
+$code.=<<___;
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,0x60(%rsp)
+ movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
+ movaps %xmm14,-0x58(%rax)
+ movaps %xmm15,-0x48(%rax)
+___
+$code.=<<___;
+ # stack layout
+ #
+ # +0 output sink
+ # +16 input sink [original %rsp and $num]
+ # +32 counters
+
+ sub \$48,%rsp
+ and \$-64,%rsp
+ mov %rax,16(%rsp) # original %rsp
+
+.Ldec4x_body:
+ movdqu ($key),$zero # 0-round key
+ lea 0x78($key),$key # size optimization
+ lea 40*2($inp),$inp
+
+.Ldec4x_loop_grande:
+ mov $num,24(%rsp) # original $num
+ xor $num,$num
+___
+for($i=0;$i<4;$i++) {
+ $code.=<<___;
+ mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks
+ mov `40*$i+0-40*2`($inp),@inptr[$i]
+ cmp $num,$one
+ mov `40*$i+8-40*2`($inp),@outptr[$i]
+ cmovg $one,$num # find maximum
+ test $one,$one
+ movdqu `40*$i+24-40*2`($inp),@inp[$i] # load IV
+ mov $one,`32+4*$i`(%rsp) # initialize counters
+ cmovle %rsp,@inptr[$i] # cancel input
+___
+}
+$code.=<<___;
+ test $num,$num
+ jz .Ldec4x_done
+
+ movups 0x10-0x78($key),$rndkey1
+ movups 0x20-0x78($key),$rndkey0
+ mov 0xf0-0x78($key),$rounds
+ movdqu (@inptr[0]),@out[0] # load inputs
+ movdqu (@inptr[1]),@out[1]
+ pxor $zero,@out[0]
+ movdqu (@inptr[2]),@out[2]
+ pxor $zero,@out[1]
+ movdqu (@inptr[3]),@out[3]
+ pxor $zero,@out[2]
+ pxor $zero,@out[3]
+ movdqa 32(%rsp),$counters # load counters
+ xor $offset,$offset
+ jmp .Loop_dec4x
+
+.align 32
+.Loop_dec4x:
+ add \$16,$offset
+ lea 16(%rsp),$sink # sink pointer
+ mov \$1,$one # constant of 1
+ sub $offset,$sink
+
+ aesdec $rndkey1,@out[0]
+ prefetcht0 31(@inptr[0],$offset) # prefetch input
+ prefetcht0 31(@inptr[1],$offset)
+ aesdec $rndkey1,@out[1]
+ prefetcht0 31(@inptr[2],$offset)
+ prefetcht0 31(@inptr[3],$offset)
+ aesdec $rndkey1,@out[2]
+ aesdec $rndkey1,@out[3]
+ movups 0x30-0x78($key),$rndkey1
+___
+for($i=0;$i<4;$i++) {
+my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
+$code.=<<___;
+ cmp `32+4*$i`(%rsp),$one
+ aesdec $rndkey,@out[0]
+ aesdec $rndkey,@out[1]
+ aesdec $rndkey,@out[2]
+ cmovge $sink,@inptr[$i] # cancel input
+ cmovg $sink,@outptr[$i] # sink output
+ aesdec $rndkey,@out[3]
+ movups `0x40+16*$i-0x78`($key),$rndkey
+___
+}
+$code.=<<___;
+ movdqa $counters,$mask
+ aesdec $rndkey0,@out[0]
+ prefetcht0 15(@outptr[0],$offset) # prefetch output
+ prefetcht0 15(@outptr[1],$offset)
+ aesdec $rndkey0,@out[1]
+ prefetcht0 15(@outptr[2],$offset)
+ prefetcht0 15(@outptr[3],$offset)
+ aesdec $rndkey0,@out[2]
+ aesdec $rndkey0,@out[3]
+ movups 0x80-0x78($key),$rndkey0
+ pxor $zero,$zero
+
+ aesdec $rndkey1,@out[0]
+ pcmpgtd $zero,$mask
+ movdqu -0x78($key),$zero # reload 0-round key
+ aesdec $rndkey1,@out[1]
+ paddd $mask,$counters # decrement counters
+ movdqa $counters,32(%rsp) # update counters
+ aesdec $rndkey1,@out[2]
+ aesdec $rndkey1,@out[3]
+ movups 0x90-0x78($key),$rndkey1
+
+ cmp \$11,$rounds
+
+ aesdec $rndkey0,@out[0]
+ aesdec $rndkey0,@out[1]
+ aesdec $rndkey0,@out[2]
+ aesdec $rndkey0,@out[3]
+ movups 0xa0-0x78($key),$rndkey0
+
+ jb .Ldec4x_tail
+
+ aesdec $rndkey1,@out[0]
+ aesdec $rndkey1,@out[1]
+ aesdec $rndkey1,@out[2]
+ aesdec $rndkey1,@out[3]
+ movups 0xb0-0x78($key),$rndkey1
+
+ aesdec $rndkey0,@out[0]
+ aesdec $rndkey0,@out[1]
+ aesdec $rndkey0,@out[2]
+ aesdec $rndkey0,@out[3]
+ movups 0xc0-0x78($key),$rndkey0
+
+ je .Ldec4x_tail
+
+ aesdec $rndkey1,@out[0]
+ aesdec $rndkey1,@out[1]
+ aesdec $rndkey1,@out[2]
+ aesdec $rndkey1,@out[3]
+ movups 0xd0-0x78($key),$rndkey1
+
+ aesdec $rndkey0,@out[0]
+ aesdec $rndkey0,@out[1]
+ aesdec $rndkey0,@out[2]
+ aesdec $rndkey0,@out[3]
+ movups 0xe0-0x78($key),$rndkey0
+ jmp .Ldec4x_tail
+
+.align 32
+.Ldec4x_tail:
+ aesdec $rndkey1,@out[0]
+ aesdec $rndkey1,@out[1]
+ aesdec $rndkey1,@out[2]
+ pxor $rndkey0,@inp[0]
+ pxor $rndkey0,@inp[1]
+ aesdec $rndkey1,@out[3]
+ movdqu 0x10-0x78($key),$rndkey1
+ pxor $rndkey0,@inp[2]
+ pxor $rndkey0,@inp[3]
+ movdqu 0x20-0x78($key),$rndkey0
+
+ aesdeclast @inp[0],@out[0]
+ aesdeclast @inp[1],@out[1]
+ movdqu -16(@inptr[0],$offset),@inp[0] # load next IV
+ movdqu -16(@inptr[1],$offset),@inp[1]
+ aesdeclast @inp[2],@out[2]
+ aesdeclast @inp[3],@out[3]
+ movdqu -16(@inptr[2],$offset),@inp[2]
+ movdqu -16(@inptr[3],$offset),@inp[3]
+
+ movups @out[0],-16(@outptr[0],$offset)
+ movdqu (@inptr[0],$offset),@out[0]
+ movups @out[1],-16(@outptr[1],$offset)
+ movdqu (@inptr[1],$offset),@out[1]
+ pxor $zero,@out[0]
+ movups @out[2],-16(@outptr[2],$offset)
+ movdqu (@inptr[2],$offset),@out[2]
+ pxor $zero,@out[1]
+ movups @out[3],-16(@outptr[3],$offset)
+ movdqu (@inptr[3],$offset),@out[3]
+ pxor $zero,@out[2]
+ pxor $zero,@out[3]
+
+ dec $num
+ jnz .Loop_dec4x
+
+ mov 16(%rsp),%rax # original %rsp
+ mov 24(%rsp),$num
+
+ lea `40*4`($inp),$inp
+ dec $num
+ jnz .Ldec4x_loop_grande
+
+.Ldec4x_done:
+___
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ #movaps -0x68(%rax),%xmm13
+ #movaps -0x58(%rax),%xmm14
+ #movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp
+.Ldec4x_epilogue:
+ ret
+.size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
+___
+
+ if ($avx) {{{
+my @ptr=map("%r$_",(8..15));
+my $offload=$sink;
+
+my @out=map("%xmm$_",(2..9));
+my @inp=map("%xmm$_",(10..13));
+my ($counters,$zero)=("%xmm14","%xmm15");
+
+$code.=<<___;
+.type aesni_multi_cbc_encrypt_avx,\@function,3
+.align 32
+aesni_multi_cbc_encrypt_avx:
+_avx_cbc_enc_shortcut:
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,-0x78(%rax)
+ movaps %xmm13,-0x68(%rax)
+ movaps %xmm14,-0x58(%rax)
+ movaps %xmm15,-0x48(%rax)
+___
+$code.=<<___;
+ # stack layout
+ #
+ # +0 output sink
+ # +16 input sink [original %rsp and $num]
+ # +32 counters
+ # +64 distances between inputs and outputs
+ # +128 off-load area for @inp[0..3]
+
+ sub \$192,%rsp
+ and \$-128,%rsp
+ mov %rax,16(%rsp) # original %rsp
+
+.Lenc8x_body:
+ vzeroupper
+ vmovdqu ($key),$zero # 0-round key
+ lea 0x78($key),$key # size optimization
+ lea 40*4($inp),$inp
+ shr \$1,$num
+
+.Lenc8x_loop_grande:
+ #mov $num,24(%rsp) # original $num
+ xor $num,$num
+___
+for($i=0;$i<8;$i++) {
+ my $temp = $i ? $offload : $offset;
+ $code.=<<___;
+ mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks
+ mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer
+ cmp $num,$one
+ mov `40*$i+8-40*4`($inp),$temp # output pointer
+ cmovg $one,$num # find maximum
+ test $one,$one
+ vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV
+ mov $one,`32+4*$i`(%rsp) # initialize counters
+ cmovle %rsp,@ptr[$i] # cancel input
+ sub @ptr[$i],$temp # distance between input and output
+ mov $temp,`64+8*$i`(%rsp) # initialize distances
+___
+}
+$code.=<<___;
+ test $num,$num
+ jz .Lenc8x_done
+
+ vmovups 0x10-0x78($key),$rndkey1
+ vmovups 0x20-0x78($key),$rndkey0
+ mov 0xf0-0x78($key),$rounds
+
+ vpxor (@ptr[0]),$zero,@inp[0] # load inputs and xor with 0-round
+ lea 128(%rsp),$offload # offload area
+ vpxor (@ptr[1]),$zero,@inp[1]
+ vpxor (@ptr[2]),$zero,@inp[2]
+ vpxor (@ptr[3]),$zero,@inp[3]
+ vpxor @inp[0],@out[0],@out[0]
+ vpxor (@ptr[4]),$zero,@inp[0]
+ vpxor @inp[1],@out[1],@out[1]
+ vpxor (@ptr[5]),$zero,@inp[1]
+ vpxor @inp[2],@out[2],@out[2]
+ vpxor (@ptr[6]),$zero,@inp[2]
+ vpxor @inp[3],@out[3],@out[3]
+ vpxor (@ptr[7]),$zero,@inp[3]
+ vpxor @inp[0],@out[4],@out[4]
+ mov \$1,$one # constant of 1
+ vpxor @inp[1],@out[5],@out[5]
+ vpxor @inp[2],@out[6],@out[6]
+ vpxor @inp[3],@out[7],@out[7]
+ jmp .Loop_enc8x
+
+.align 32
+.Loop_enc8x:
+___
+for($i=0;$i<8;$i++) {
+my $rndkey=($i&1)?$rndkey0:$rndkey1;
+$code.=<<___;
+ vaesenc $rndkey,@out[0],@out[0]
+ cmp 32+4*$i(%rsp),$one
+___
+$code.=<<___ if ($i);
+ mov 64+8*$i(%rsp),$offset
+___
+$code.=<<___;
+ vaesenc $rndkey,@out[1],@out[1]
+ prefetcht0 31(@ptr[$i]) # prefetch input
+ vaesenc $rndkey,@out[2],@out[2]
+___
+$code.=<<___ if ($i>1);
+ prefetcht0 15(@ptr[$i-2]) # prefetch output
+___
+$code.=<<___;
+ vaesenc $rndkey,@out[3],@out[3]
+ lea (@ptr[$i],$offset),$offset
+ cmovge %rsp,@ptr[$i] # cancel input
+ vaesenc $rndkey,@out[4],@out[4]
+ cmovg %rsp,$offset # sink output
+ vaesenc $rndkey,@out[5],@out[5]
+ sub @ptr[$i],$offset
+ vaesenc $rndkey,@out[6],@out[6]
+ vpxor 16(@ptr[$i]),$zero,@inp[$i%4] # load input and xor with 0-round
+ mov $offset,64+8*$i(%rsp)
+ vaesenc $rndkey,@out[7],@out[7]
+ vmovups `16*(3+$i)-0x78`($key),$rndkey
+ lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
+___
+$code.=<<___ if ($i<4)
+ vmovdqu @inp[$i%4],`16*$i`($offload) # off-load
+___
+}
+$code.=<<___;
+ vmovdqu 32(%rsp),$counters
+ prefetcht0 15(@ptr[$i-2]) # prefetch output
+ prefetcht0 15(@ptr[$i-1])
+ cmp \$11,$rounds
+ jb .Lenc8x_tail
+
+ vaesenc $rndkey1,@out[0],@out[0]
+ vaesenc $rndkey1,@out[1],@out[1]
+ vaesenc $rndkey1,@out[2],@out[2]
+ vaesenc $rndkey1,@out[3],@out[3]
+ vaesenc $rndkey1,@out[4],@out[4]
+ vaesenc $rndkey1,@out[5],@out[5]
+ vaesenc $rndkey1,@out[6],@out[6]
+ vaesenc $rndkey1,@out[7],@out[7]
+ vmovups 0xb0-0x78($key),$rndkey1
+
+ vaesenc $rndkey0,@out[0],@out[0]
+ vaesenc $rndkey0,@out[1],@out[1]
+ vaesenc $rndkey0,@out[2],@out[2]
+ vaesenc $rndkey0,@out[3],@out[3]
+ vaesenc $rndkey0,@out[4],@out[4]
+ vaesenc $rndkey0,@out[5],@out[5]
+ vaesenc $rndkey0,@out[6],@out[6]
+ vaesenc $rndkey0,@out[7],@out[7]
+ vmovups 0xc0-0x78($key),$rndkey0
+ je .Lenc8x_tail
+
+ vaesenc $rndkey1,@out[0],@out[0]
+ vaesenc $rndkey1,@out[1],@out[1]
+ vaesenc $rndkey1,@out[2],@out[2]
+ vaesenc $rndkey1,@out[3],@out[3]
+ vaesenc $rndkey1,@out[4],@out[4]
+ vaesenc $rndkey1,@out[5],@out[5]
+ vaesenc $rndkey1,@out[6],@out[6]
+ vaesenc $rndkey1,@out[7],@out[7]
+ vmovups 0xd0-0x78($key),$rndkey1
+
+ vaesenc $rndkey0,@out[0],@out[0]
+ vaesenc $rndkey0,@out[1],@out[1]
+ vaesenc $rndkey0,@out[2],@out[2]
+ vaesenc $rndkey0,@out[3],@out[3]
+ vaesenc $rndkey0,@out[4],@out[4]
+ vaesenc $rndkey0,@out[5],@out[5]
+ vaesenc $rndkey0,@out[6],@out[6]
+ vaesenc $rndkey0,@out[7],@out[7]
+ vmovups 0xe0-0x78($key),$rndkey0
+
+.Lenc8x_tail:
+ vaesenc $rndkey1,@out[0],@out[0]
+ vpxor $zero,$zero,$zero
+ vaesenc $rndkey1,@out[1],@out[1]
+ vaesenc $rndkey1,@out[2],@out[2]
+ vpcmpgtd $zero,$counters,$zero
+ vaesenc $rndkey1,@out[3],@out[3]
+ vaesenc $rndkey1,@out[4],@out[4]
+ vpaddd $counters,$zero,$zero # decrement counters
+ vmovdqu 48(%rsp),$counters
+ vaesenc $rndkey1,@out[5],@out[5]
+ mov 64(%rsp),$offset # pre-load 1st offset
+ vaesenc $rndkey1,@out[6],@out[6]
+ vaesenc $rndkey1,@out[7],@out[7]
+ vmovups 0x10-0x78($key),$rndkey1
+
+ vaesenclast $rndkey0,@out[0],@out[0]
+ vmovdqa $zero,32(%rsp) # update counters
+ vpxor $zero,$zero,$zero
+ vaesenclast $rndkey0,@out[1],@out[1]
+ vaesenclast $rndkey0,@out[2],@out[2]
+ vpcmpgtd $zero,$counters,$zero
+ vaesenclast $rndkey0,@out[3],@out[3]
+ vaesenclast $rndkey0,@out[4],@out[4]
+ vpaddd $zero,$counters,$counters # decrement counters
+ vmovdqu -0x78($key),$zero # 0-round
+ vaesenclast $rndkey0,@out[5],@out[5]
+ vaesenclast $rndkey0,@out[6],@out[6]
+ vmovdqa $counters,48(%rsp) # update counters
+ vaesenclast $rndkey0,@out[7],@out[7]
+ vmovups 0x20-0x78($key),$rndkey0
+
+ vmovups @out[0],-16(@ptr[0]) # write output
+ sub $offset,@ptr[0] # switch to input
+ vpxor 0x00($offload),@out[0],@out[0]
+ vmovups @out[1],-16(@ptr[1])
+ sub `64+1*8`(%rsp),@ptr[1]
+ vpxor 0x10($offload),@out[1],@out[1]
+ vmovups @out[2],-16(@ptr[2])
+ sub `64+2*8`(%rsp),@ptr[2]
+ vpxor 0x20($offload),@out[2],@out[2]
+ vmovups @out[3],-16(@ptr[3])
+ sub `64+3*8`(%rsp),@ptr[3]
+ vpxor 0x30($offload),@out[3],@out[3]
+ vmovups @out[4],-16(@ptr[4])
+ sub `64+4*8`(%rsp),@ptr[4]
+ vpxor @inp[0],@out[4],@out[4]
+ vmovups @out[5],-16(@ptr[5])
+ sub `64+5*8`(%rsp),@ptr[5]
+ vpxor @inp[1],@out[5],@out[5]
+ vmovups @out[6],-16(@ptr[6])
+ sub `64+6*8`(%rsp),@ptr[6]
+ vpxor @inp[2],@out[6],@out[6]
+ vmovups @out[7],-16(@ptr[7])
+ sub `64+7*8`(%rsp),@ptr[7]
+ vpxor @inp[3],@out[7],@out[7]
+
+ dec $num
+ jnz .Loop_enc8x
+
+ mov 16(%rsp),%rax # original %rsp
+ #mov 24(%rsp),$num
+ #lea `40*8`($inp),$inp
+ #dec $num
+ #jnz .Lenc8x_loop_grande
+
+.Lenc8x_done:
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ movaps -0x68(%rax),%xmm13
+ movaps -0x58(%rax),%xmm14
+ movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp
+.Lenc8x_epilogue:
+ ret
+.size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
+
+.type aesni_multi_cbc_decrypt_avx,\@function,3
+.align 32
+aesni_multi_cbc_decrypt_avx:
+_avx_cbc_dec_shortcut:
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,-0x78(%rax)
+ movaps %xmm13,-0x68(%rax)
+ movaps %xmm14,-0x58(%rax)
+ movaps %xmm15,-0x48(%rax)
+___
+$code.=<<___;
+ # stack layout
+ #
+ # +0 output sink
+ # +16 input sink [original %rsp and $num]
+ # +32 counters
+ # +64 distances between inputs and outputs
+ # +128 off-load area for @inp[0..3]
+ # +192 IV/input offload
+
+ sub \$256,%rsp
+ and \$-256,%rsp
+ sub \$192,%rsp
+ mov %rax,16(%rsp) # original %rsp
+
+.Ldec8x_body:
+ vzeroupper
+ vmovdqu ($key),$zero # 0-round key
+ lea 0x78($key),$key # size optimization
+ lea 40*4($inp),$inp
+ shr \$1,$num
+
+.Ldec8x_loop_grande:
+ #mov $num,24(%rsp) # original $num
+ xor $num,$num
+___
+for($i=0;$i<8;$i++) {
+ my $temp = $i ? $offload : $offset;
+ $code.=<<___;
+ mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks
+ mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer
+ cmp $num,$one
+ mov `40*$i+8-40*4`($inp),$temp # output pointer
+ cmovg $one,$num # find maximum
+ test $one,$one
+ vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV
+ mov $one,`32+4*$i`(%rsp) # initialize counters
+ cmovle %rsp,@ptr[$i] # cancel input
+ sub @ptr[$i],$temp # distance between input and output
+ mov $temp,`64+8*$i`(%rsp) # initialize distances
+ vmovdqu @out[$i],`192+16*$i`(%rsp) # offload IV
+___
+}
+$code.=<<___;
+ test $num,$num
+ jz .Ldec8x_done
+
+ vmovups 0x10-0x78($key),$rndkey1
+ vmovups 0x20-0x78($key),$rndkey0
+ mov 0xf0-0x78($key),$rounds
+ lea 192+128(%rsp),$offload # offload area
+
+ vmovdqu (@ptr[0]),@out[0] # load inputs
+ vmovdqu (@ptr[1]),@out[1]
+ vmovdqu (@ptr[2]),@out[2]
+ vmovdqu (@ptr[3]),@out[3]
+ vmovdqu (@ptr[4]),@out[4]
+ vmovdqu (@ptr[5]),@out[5]
+ vmovdqu (@ptr[6]),@out[6]
+ vmovdqu (@ptr[7]),@out[7]
+ vmovdqu @out[0],0x00($offload) # offload inputs
+ vpxor $zero,@out[0],@out[0] # xor inputs with 0-round
+ vmovdqu @out[1],0x10($offload)
+ vpxor $zero,@out[1],@out[1]
+ vmovdqu @out[2],0x20($offload)
+ vpxor $zero,@out[2],@out[2]
+ vmovdqu @out[3],0x30($offload)
+ vpxor $zero,@out[3],@out[3]
+ vmovdqu @out[4],0x40($offload)
+ vpxor $zero,@out[4],@out[4]
+ vmovdqu @out[5],0x50($offload)
+ vpxor $zero,@out[5],@out[5]
+ vmovdqu @out[6],0x60($offload)
+ vpxor $zero,@out[6],@out[6]
+ vmovdqu @out[7],0x70($offload)
+ vpxor $zero,@out[7],@out[7]
+ xor \$0x80,$offload
+ mov \$1,$one # constant of 1
+ jmp .Loop_dec8x
+
+.align 32
+.Loop_dec8x:
+___
+for($i=0;$i<8;$i++) {
+my $rndkey=($i&1)?$rndkey0:$rndkey1;
+$code.=<<___;
+ vaesdec $rndkey,@out[0],@out[0]
+ cmp 32+4*$i(%rsp),$one
+___
+$code.=<<___ if ($i);
+ mov 64+8*$i(%rsp),$offset
+___
+$code.=<<___;
+ vaesdec $rndkey,@out[1],@out[1]
+ prefetcht0 31(@ptr[$i]) # prefetch input
+ vaesdec $rndkey,@out[2],@out[2]
+___
+$code.=<<___ if ($i>1);
+ prefetcht0 15(@ptr[$i-2]) # prefetch output
+___
+$code.=<<___;
+ vaesdec $rndkey,@out[3],@out[3]
+ lea (@ptr[$i],$offset),$offset
+ cmovge %rsp,@ptr[$i] # cancel input
+ vaesdec $rndkey,@out[4],@out[4]
+ cmovg %rsp,$offset # sink output
+ vaesdec $rndkey,@out[5],@out[5]
+ sub @ptr[$i],$offset
+ vaesdec $rndkey,@out[6],@out[6]
+ vmovdqu 16(@ptr[$i]),@inp[$i%4] # load input
+ mov $offset,64+8*$i(%rsp)
+ vaesdec $rndkey,@out[7],@out[7]
+ vmovups `16*(3+$i)-0x78`($key),$rndkey
+ lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
+___
+$code.=<<___ if ($i<4);
+ vmovdqu @inp[$i%4],`128+16*$i`(%rsp) # off-load
+___
+}
+$code.=<<___;
+ vmovdqu 32(%rsp),$counters
+ prefetcht0 15(@ptr[$i-2]) # prefetch output
+ prefetcht0 15(@ptr[$i-1])
+ cmp \$11,$rounds
+ jb .Ldec8x_tail
+
+ vaesdec $rndkey1,@out[0],@out[0]
+ vaesdec $rndkey1,@out[1],@out[1]
+ vaesdec $rndkey1,@out[2],@out[2]
+ vaesdec $rndkey1,@out[3],@out[3]
+ vaesdec $rndkey1,@out[4],@out[4]
+ vaesdec $rndkey1,@out[5],@out[5]
+ vaesdec $rndkey1,@out[6],@out[6]
+ vaesdec $rndkey1,@out[7],@out[7]
+ vmovups 0xb0-0x78($key),$rndkey1
+
+ vaesdec $rndkey0,@out[0],@out[0]
+ vaesdec $rndkey0,@out[1],@out[1]
+ vaesdec $rndkey0,@out[2],@out[2]
+ vaesdec $rndkey0,@out[3],@out[3]
+ vaesdec $rndkey0,@out[4],@out[4]
+ vaesdec $rndkey0,@out[5],@out[5]
+ vaesdec $rndkey0,@out[6],@out[6]
+ vaesdec $rndkey0,@out[7],@out[7]
+ vmovups 0xc0-0x78($key),$rndkey0
+ je .Ldec8x_tail
+
+ vaesdec $rndkey1,@out[0],@out[0]
+ vaesdec $rndkey1,@out[1],@out[1]
+ vaesdec $rndkey1,@out[2],@out[2]
+ vaesdec $rndkey1,@out[3],@out[3]
+ vaesdec $rndkey1,@out[4],@out[4]
+ vaesdec $rndkey1,@out[5],@out[5]
+ vaesdec $rndkey1,@out[6],@out[6]
+ vaesdec $rndkey1,@out[7],@out[7]
+ vmovups 0xd0-0x78($key),$rndkey1
+
+ vaesdec $rndkey0,@out[0],@out[0]
+ vaesdec $rndkey0,@out[1],@out[1]
+ vaesdec $rndkey0,@out[2],@out[2]
+ vaesdec $rndkey0,@out[3],@out[3]
+ vaesdec $rndkey0,@out[4],@out[4]
+ vaesdec $rndkey0,@out[5],@out[5]
+ vaesdec $rndkey0,@out[6],@out[6]
+ vaesdec $rndkey0,@out[7],@out[7]
+ vmovups 0xe0-0x78($key),$rndkey0
+
+.Ldec8x_tail:
+ vaesdec $rndkey1,@out[0],@out[0]
+ vpxor $zero,$zero,$zero
+ vaesdec $rndkey1,@out[1],@out[1]
+ vaesdec $rndkey1,@out[2],@out[2]
+ vpcmpgtd $zero,$counters,$zero
+ vaesdec $rndkey1,@out[3],@out[3]
+ vaesdec $rndkey1,@out[4],@out[4]
+ vpaddd $counters,$zero,$zero # decrement counters
+ vmovdqu 48(%rsp),$counters
+ vaesdec $rndkey1,@out[5],@out[5]
+ mov 64(%rsp),$offset # pre-load 1st offset
+ vaesdec $rndkey1,@out[6],@out[6]
+ vaesdec $rndkey1,@out[7],@out[7]
+ vmovups 0x10-0x78($key),$rndkey1
+
+ vaesdeclast $rndkey0,@out[0],@out[0]
+ vmovdqa $zero,32(%rsp) # update counters
+ vpxor $zero,$zero,$zero
+ vaesdeclast $rndkey0,@out[1],@out[1]
+ vpxor 0x00($offload),@out[0],@out[0] # xor with IV
+ vaesdeclast $rndkey0,@out[2],@out[2]
+ vpxor 0x10($offload),@out[1],@out[1]
+ vpcmpgtd $zero,$counters,$zero
+ vaesdeclast $rndkey0,@out[3],@out[3]
+ vpxor 0x20($offload),@out[2],@out[2]
+ vaesdeclast $rndkey0,@out[4],@out[4]
+ vpxor 0x30($offload),@out[3],@out[3]
+ vpaddd $zero,$counters,$counters # decrement counters
+ vmovdqu -0x78($key),$zero # 0-round
+ vaesdeclast $rndkey0,@out[5],@out[5]
+ vpxor 0x40($offload),@out[4],@out[4]
+ vaesdeclast $rndkey0,@out[6],@out[6]
+ vpxor 0x50($offload),@out[5],@out[5]
+ vmovdqa $counters,48(%rsp) # update counters
+ vaesdeclast $rndkey0,@out[7],@out[7]
+ vpxor 0x60($offload),@out[6],@out[6]
+ vmovups 0x20-0x78($key),$rndkey0
+
+ vmovups @out[0],-16(@ptr[0]) # write output
+ sub $offset,@ptr[0] # switch to input
+ vmovdqu 128+0(%rsp),@out[0]
+ vpxor 0x70($offload),@out[7],@out[7]
+ vmovups @out[1],-16(@ptr[1])
+ sub `64+1*8`(%rsp),@ptr[1]
+ vmovdqu @out[0],0x00($offload)
+ vpxor $zero,@out[0],@out[0]
+ vmovdqu 128+16(%rsp),@out[1]
+ vmovups @out[2],-16(@ptr[2])
+ sub `64+2*8`(%rsp),@ptr[2]
+ vmovdqu @out[1],0x10($offload)
+ vpxor $zero,@out[1],@out[1]
+ vmovdqu 128+32(%rsp),@out[2]
+ vmovups @out[3],-16(@ptr[3])
+ sub `64+3*8`(%rsp),@ptr[3]
+ vmovdqu @out[2],0x20($offload)
+ vpxor $zero,@out[2],@out[2]
+ vmovdqu 128+48(%rsp),@out[3]
+ vmovups @out[4],-16(@ptr[4])
+ sub `64+4*8`(%rsp),@ptr[4]
+ vmovdqu @out[3],0x30($offload)
+ vpxor $zero,@out[3],@out[3]
+ vmovdqu @inp[0],0x40($offload)
+ vpxor @inp[0],$zero,@out[4]
+ vmovups @out[5],-16(@ptr[5])
+ sub `64+5*8`(%rsp),@ptr[5]
+ vmovdqu @inp[1],0x50($offload)
+ vpxor @inp[1],$zero,@out[5]
+ vmovups @out[6],-16(@ptr[6])
+ sub `64+6*8`(%rsp),@ptr[6]
+ vmovdqu @inp[2],0x60($offload)
+ vpxor @inp[2],$zero,@out[6]
+ vmovups @out[7],-16(@ptr[7])
+ sub `64+7*8`(%rsp),@ptr[7]
+ vmovdqu @inp[3],0x70($offload)
+ vpxor @inp[3],$zero,@out[7]
+
+ xor \$128,$offload
+ dec $num
+ jnz .Loop_dec8x
+
+ mov 16(%rsp),%rax # original %rsp
+ #mov 24(%rsp),$num
+ #lea `40*8`($inp),$inp
+ #dec $num
+ #jnz .Ldec8x_loop_grande
+
+.Ldec8x_done:
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ movaps -0x68(%rax),%xmm13
+ movaps -0x58(%rax),%xmm14
+ movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp
+.Ldec8x_epilogue:
+ ret
+.size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
+___
+ }}}
+
+if ($win64) {
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<.Lprologue
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=.Lepilogue
+ jae .Lin_prologue
+
+ mov 16(%rax),%rax # pull saved stack pointer
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore cotnext->R12
+ mov %r13,224($context) # restore cotnext->R13
+ mov %r14,232($context) # restore cotnext->R14
+ mov %r15,240($context) # restore cotnext->R15
+
+ lea -56-10*16(%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+
+.Lin_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_aesni_multi_cbc_encrypt
+ .rva .LSEH_end_aesni_multi_cbc_encrypt
+ .rva .LSEH_info_aesni_multi_cbc_encrypt
+ .rva .LSEH_begin_aesni_multi_cbc_decrypt
+ .rva .LSEH_end_aesni_multi_cbc_decrypt
+ .rva .LSEH_info_aesni_multi_cbc_decrypt
+___
+$code.=<<___ if ($avx);
+ .rva .LSEH_begin_aesni_multi_cbc_encrypt_avx
+ .rva .LSEH_end_aesni_multi_cbc_encrypt_avx
+ .rva .LSEH_info_aesni_multi_cbc_encrypt_avx
+ .rva .LSEH_begin_aesni_multi_cbc_decrypt_avx
+ .rva .LSEH_end_aesni_multi_cbc_decrypt_avx
+ .rva .LSEH_info_aesni_multi_cbc_decrypt_avx
+___
+$code.=<<___;
+.section .xdata
+.align 8
+.LSEH_info_aesni_multi_cbc_encrypt:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lenc4x_body,.Lenc4x_epilogue # HandlerData[]
+.LSEH_info_aesni_multi_cbc_decrypt:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Ldec4x_body,.Ldec4x_epilogue # HandlerData[]
+___
+$code.=<<___ if ($avx);
+.LSEH_info_aesni_multi_cbc_encrypt_avx:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lenc8x_body,.Lenc8x_epilogue # HandlerData[]
+.LSEH_info_aesni_multi_cbc_decrypt_avx:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Ldec8x_body,.Ldec8x_epilogue # HandlerData[]
+___
+}
+####################################################################
+
+sub rex {
+ local *opcode=shift;
+ my ($dst,$src)=@_;
+ my $rex=0;
+
+ $rex|=0x04 if($dst>=8);
+ $rex|=0x01 if($src>=8);
+ push @opcode,$rex|0x40 if($rex);
+}
+
+sub aesni {
+ my $line=shift;
+ my @opcode=(0x66);
+
+ if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ rex(\@opcode,$4,$3);
+ push @opcode,0x0f,0x3a,0xdf;
+ push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M
+ my $c=$2;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ return ".byte\t".join(',',@opcode);
+ }
+ elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesimc" => 0xdb,
+ "aesenc" => 0xdc, "aesenclast" => 0xdd,
+ "aesdec" => 0xde, "aesdeclast" => 0xdf
+ );
+ return undef if (!defined($opcodelet{$1}));
+ rex(\@opcode,$3,$2);
+ push @opcode,0x0f,0x38,$opcodelet{$1};
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ return ".byte\t".join(',',@opcode);
+ }
+ elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesenc" => 0xdc, "aesenclast" => 0xdd,
+ "aesdec" => 0xde, "aesdeclast" => 0xdf
+ );
+ return undef if (!defined($opcodelet{$1}));
+ my $off = $2;
+ push @opcode,0x44 if ($3>=8);
+ push @opcode,0x0f,0x38,$opcodelet{$1};
+ push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M
+ push @opcode,($off=~/^0/?oct($off):$off)&0xff;
+ return ".byte\t".join(',',@opcode);
+ }
+ return $line;
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+
+print $code;
+close STDOUT;