8 files changed, 3100 insertions, 265 deletions
diff --git a/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl b/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl
new file mode 100755
index 000000000..7e4e04ea2
--- /dev/null
+++ b/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl
@@ -0,0 +1,1057 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+#
+# AES-NI-CTR+GHASH stitch.
+#
+# February 2013
+#
+# OpenSSL GCM implementation is organized in such way that its
+# performance is rather close to the sum of its streamed components,
+# in the context parallelized AES-NI CTR and modulo-scheduled
+# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
+# was observed to perform significantly better than the sum of the
+# components on contemporary CPUs, the effort was deemed impossible to
+# justify. This module is based on combination of Intel submissions,
+# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
+# Locktyukhin of Intel Corp. who verified that it reduces shuffles
+# pressure with notable relative improvement, achieving 1.0 cycle per
+# byte processed with 128-bit key on Haswell processor, and 0.74 -
+# on Broadwell. [Mentioned results are raw profiled measurements for
+# favourable packet size, one divisible by 96. Applications using the
+# EVP interface will observe a few percent worse performance.]
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+	$avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+	$avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+	$avx = ($1>=10) + ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+	$avx = ($2>=3.0) + ($2>3.0);
+}
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+if ($avx>1) {{{
+
+($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+
+($Ii,$T1,$T2,$Hkey,
+ $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));
+
+($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));
+
+($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");
+
+$code=<<___;
+.text
+
+.type	_aesni_ctr32_ghash_6x,\@abi-omnipotent
+.align	32
+_aesni_ctr32_ghash_6x:
+	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb
+	sub		\$6,$len
+	vpxor		$Z0,$Z0,$Z0		# $Z0   = 0
+	vmovdqu		0x00-0x80($key),$rndkey
+	vpaddb		$T2,$T1,$inout1
+	vpaddb		$T2,$inout1,$inout2
+	vpaddb		$T2,$inout2,$inout3
+	vpaddb		$T2,$inout3,$inout4
+	vpaddb		$T2,$inout4,$inout5
+	vpxor		$rndkey,$T1,$inout0
+	vmovdqu		$Z0,16+8(%rsp)		# "$Z3" = 0
+	jmp		.Loop6x
+
+.align	32
+.Loop6x:
+	add		\$`6<<24`,$counter
+	jc		.Lhandle_ctr32		# discard $inout[1-5]?
+	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
+	  vpaddb	$T2,$inout5,$T1		# next counter value
+	  vpxor		$rndkey,$inout1,$inout1
+	  vpxor		$rndkey,$inout2,$inout2
+
+.Lresume_ctr32:
+	vmovdqu		$T1,($ivp)		# save next counter value
+	vpclmulqdq	\$0x10,$Hkey,$Z3,$Z1
+	  vpxor		$rndkey,$inout3,$inout3
+	  vmovups	0x10-0x80($key),$T2	# borrow $T2 for $rndkey
+	vpclmulqdq	\$0x01,$Hkey,$Z3,$Z2
+	xor		%r12,%r12
+	cmp		$in0,$end0
+
+	  vaesenc	$T2,$inout0,$inout0
+	vmovdqu		0x30+8(%rsp),$Ii	# I[4]
+	  vpxor		$rndkey,$inout4,$inout4
+	vpclmulqdq	\$0x00,$Hkey,$Z3,$T1
+	  vaesenc	$T2,$inout1,$inout1
+	  vpxor		$rndkey,$inout5,$inout5
+	setnc		%r12b
+	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3
+	  vaesenc	$T2,$inout2,$inout2
+	vmovdqu		0x10-0x20($Xip),$Hkey	# $Hkey^2
+	neg		%r12
+	  vaesenc	$T2,$inout3,$inout3
+	 vpxor		$Z1,$Z2,$Z2
+	vpclmulqdq	\$0x00,$Hkey,$Ii,$Z1
+	 vpxor		$Z0,$Xi,$Xi		# modulo-scheduled
+	  vaesenc	$T2,$inout4,$inout4
+	 vpxor		$Z1,$T1,$Z0
+	and		\$0x60,%r12
+	  vmovups	0x20-0x80($key),$rndkey
+	vpclmulqdq	\$0x10,$Hkey,$Ii,$T1
+	  vaesenc	$T2,$inout5,$inout5
+
+	vpclmulqdq	\$0x01,$Hkey,$Ii,$T2
+	lea		($in0,%r12),$in0
+	  vaesenc	$rndkey,$inout0,$inout0
+	 vpxor		16+8(%rsp),$Xi,$Xi	# modulo-scheduled [vpxor $Z3,$Xi,$Xi]
+	vpclmulqdq	\$0x11,$Hkey,$Ii,$Hkey
+	 vmovdqu	0x40+8(%rsp),$Ii	# I[3]
+	  vaesenc	$rndkey,$inout1,$inout1
+	movbe		0x58($in0),%r13
+	  vaesenc	$rndkey,$inout2,$inout2
+	movbe		0x50($in0),%r12
+	  vaesenc	$rndkey,$inout3,$inout3
+	mov		%r13,0x20+8(%rsp)
+	  vaesenc	$rndkey,$inout4,$inout4
+	mov		%r12,0x28+8(%rsp)
+	vmovdqu		0x30-0x20($Xip),$Z1	# borrow $Z1 for $Hkey^3
+	  vaesenc	$rndkey,$inout5,$inout5
+
+	  vmovups	0x30-0x80($key),$rndkey
+	 vpxor		$T1,$Z2,$Z2
+	vpclmulqdq	\$0x00,$Z1,$Ii,$T1
+	  vaesenc	$rndkey,$inout0,$inout0
+	 vpxor		$T2,$Z2,$Z2
+	vpclmulqdq	\$0x10,$Z1,$Ii,$T2
+	  vaesenc	$rndkey,$inout1,$inout1
+	 vpxor		$Hkey,$Z3,$Z3
+	vpclmulqdq	\$0x01,$Z1,$Ii,$Hkey
+	  vaesenc	$rndkey,$inout2,$inout2
+	vpclmulqdq	\$0x11,$Z1,$Ii,$Z1
+	 vmovdqu	0x50+8(%rsp),$Ii	# I[2]
+	  vaesenc	$rndkey,$inout3,$inout3
+	  vaesenc	$rndkey,$inout4,$inout4
+	 vpxor		$T1,$Z0,$Z0
+	vmovdqu		0x40-0x20($Xip),$T1	# borrow $T1 for $Hkey^4
+	  vaesenc	$rndkey,$inout5,$inout5
+
+	  vmovups	0x40-0x80($key),$rndkey
+	 vpxor		$T2,$Z2,$Z2
+	vpclmulqdq	\$0x00,$T1,$Ii,$T2
+	  vaesenc	$rndkey,$inout0,$inout0
+	 vpxor		$Hkey,$Z2,$Z2
+	vpclmulqdq	\$0x10,$T1,$Ii,$Hkey
+	  vaesenc	$rndkey,$inout1,$inout1
+	movbe		0x48($in0),%r13
+	 vpxor		$Z1,$Z3,$Z3
+	vpclmulqdq	\$0x01,$T1,$Ii,$Z1
+	  vaesenc	$rndkey,$inout2,$inout2
+	movbe		0x40($in0),%r12
+	vpclmulqdq	\$0x11,$T1,$Ii,$T1
+	 vmovdqu	0x60+8(%rsp),$Ii	# I[1]
+	  vaesenc	$rndkey,$inout3,$inout3
+	mov		%r13,0x30+8(%rsp)
+	  vaesenc	$rndkey,$inout4,$inout4
+	mov		%r12,0x38+8(%rsp)
+	 vpxor		$T2,$Z0,$Z0
+	vmovdqu		0x60-0x20($Xip),$T2	# borrow $T2 for $Hkey^5
+	  vaesenc	$rndkey,$inout5,$inout5
+
+	  vmovups	0x50-0x80($key),$rndkey
+	 vpxor		$Hkey,$Z2,$Z2
+	vpclmulqdq	\$0x00,$T2,$Ii,$Hkey
+	  vaesenc	$rndkey,$inout0,$inout0
+	 vpxor		$Z1,$Z2,$Z2
+	vpclmulqdq	\$0x10,$T2,$Ii,$Z1
+	  vaesenc	$rndkey,$inout1,$inout1
+	movbe		0x38($in0),%r13
+	 vpxor		$T1,$Z3,$Z3
+	vpclmulqdq	\$0x01,$T2,$Ii,$T1
+	 vpxor		0x70+8(%rsp),$Xi,$Xi	# accumulate I[0]
+	  vaesenc	$rndkey,$inout2,$inout2
+	movbe		0x30($in0),%r12
+	vpclmulqdq	\$0x11,$T2,$Ii,$T2
+	  vaesenc	$rndkey,$inout3,$inout3
+	mov		%r13,0x40+8(%rsp)
+	  vaesenc	$rndkey,$inout4,$inout4
+	mov		%r12,0x48+8(%rsp)
+	 vpxor		$Hkey,$Z0,$Z0
+	 vmovdqu	0x70-0x20($Xip),$Hkey	# $Hkey^6
+	  vaesenc	$rndkey,$inout5,$inout5
+
+	  vmovups	0x60-0x80($key),$rndkey
+	 vpxor		$Z1,$Z2,$Z2
+	vpclmulqdq	\$0x10,$Hkey,$Xi,$Z1
+	  vaesenc	$rndkey,$inout0,$inout0
+	 vpxor		$T1,$Z2,$Z2
+	vpclmulqdq	\$0x01,$Hkey,$Xi,$T1
+	  vaesenc	$rndkey,$inout1,$inout1
+	movbe		0x28($in0),%r13
+	 vpxor		$T2,$Z3,$Z3
+	vpclmulqdq	\$0x00,$Hkey,$Xi,$T2
+	  vaesenc	$rndkey,$inout2,$inout2
+	movbe		0x20($in0),%r12
+	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xi
+	  vaesenc	$rndkey,$inout3,$inout3
+	mov		%r13,0x50+8(%rsp)
+	  vaesenc	$rndkey,$inout4,$inout4
+	mov		%r12,0x58+8(%rsp)
+	vpxor		$Z1,$Z2,$Z2
+	  vaesenc	$rndkey,$inout5,$inout5
+	vpxor		$T1,$Z2,$Z2
+
+	  vmovups	0x70-0x80($key),$rndkey
+	vpslldq		\$8,$Z2,$Z1
+	vpxor		$T2,$Z0,$Z0
+	vmovdqu		0x10($const),$Hkey	# .Lpoly
+
+	  vaesenc	$rndkey,$inout0,$inout0
+	vpxor		$Xi,$Z3,$Z3
+	  vaesenc	$rndkey,$inout1,$inout1
+	vpxor		$Z1,$Z0,$Z0
+	movbe		0x18($in0),%r13
+	  vaesenc	$rndkey,$inout2,$inout2
+	movbe		0x10($in0),%r12
+	vpalignr	\$8,$Z0,$Z0,$Ii		# 1st phase
+	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0
+	mov		%r13,0x60+8(%rsp)
+	  vaesenc	$rndkey,$inout3,$inout3
+	mov		%r12,0x68+8(%rsp)
+	  vaesenc	$rndkey,$inout4,$inout4
+	  vmovups	0x80-0x80($key),$T1	# borrow $T1 for $rndkey
+	  vaesenc	$rndkey,$inout5,$inout5
+
+	  vaesenc	$T1,$inout0,$inout0
+	  vmovups	0x90-0x80($key),$rndkey
+	  vaesenc	$T1,$inout1,$inout1
+	vpsrldq		\$8,$Z2,$Z2
+	  vaesenc	$T1,$inout2,$inout2
+	vpxor		$Z2,$Z3,$Z3
+	  vaesenc	$T1,$inout3,$inout3
+	vpxor		$Ii,$Z0,$Z0
+	movbe		0x08($in0),%r13
+	  vaesenc	$T1,$inout4,$inout4
+	movbe		0x00($in0),%r12
+	  vaesenc	$T1,$inout5,$inout5
+	  vmovups	0xa0-0x80($key),$T1
+	  cmp		\$11,$rounds
+	  jb		.Lenc_tail		# 128-bit key
+
+	  vaesenc	$rndkey,$inout0,$inout0
+	  vaesenc	$rndkey,$inout1,$inout1
+	  vaesenc	$rndkey,$inout2,$inout2
+	  vaesenc	$rndkey,$inout3,$inout3
+	  vaesenc	$rndkey,$inout4,$inout4
+	  vaesenc	$rndkey,$inout5,$inout5
+
+	  vaesenc	$T1,$inout0,$inout0
+	  vaesenc	$T1,$inout1,$inout1
+	  vaesenc	$T1,$inout2,$inout2
+	  vaesenc	$T1,$inout3,$inout3
+	  vaesenc	$T1,$inout4,$inout4
+	  vmovups	0xb0-0x80($key),$rndkey
+	  vaesenc	$T1,$inout5,$inout5
+	  vmovups	0xc0-0x80($key),$T1
+	  je		.Lenc_tail		# 192-bit key
+
+	  vaesenc	$rndkey,$inout0,$inout0
+	  vaesenc	$rndkey,$inout1,$inout1
+	  vaesenc	$rndkey,$inout2,$inout2
+	  vaesenc	$rndkey,$inout3,$inout3
+	  vaesenc	$rndkey,$inout4,$inout4
+	  vaesenc	$rndkey,$inout5,$inout5
+
+	  vaesenc	$T1,$inout0,$inout0
+	  vaesenc	$T1,$inout1,$inout1
+	  vaesenc	$T1,$inout2,$inout2
+	  vaesenc	$T1,$inout3,$inout3
+	  vaesenc	$T1,$inout4,$inout4
+	  vmovups	0xd0-0x80($key),$rndkey
+	  vaesenc	$T1,$inout5,$inout5
+	  vmovups	0xe0-0x80($key),$T1
+	  jmp		.Lenc_tail		# 256-bit key
+
+.align	32
+.Lhandle_ctr32:
+	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
+	  vpshufb	$Ii,$T1,$Z2		# byte-swap counter
+	  vmovdqu	0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
+	  vpaddd	0x40($const),$Z2,$inout1	# .Lone_lsb
+	  vpaddd	$Z1,$Z2,$inout2
+	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
+	  vpaddd	$Z1,$inout1,$inout3
+	  vpshufb	$Ii,$inout1,$inout1
+	  vpaddd	$Z1,$inout2,$inout4
+	  vpshufb	$Ii,$inout2,$inout2
+	  vpxor		$rndkey,$inout1,$inout1
+	  vpaddd	$Z1,$inout3,$inout5
+	  vpshufb	$Ii,$inout3,$inout3
+	  vpxor		$rndkey,$inout2,$inout2
+	  vpaddd	$Z1,$inout4,$T1		# byte-swapped next counter value
+	  vpshufb	$Ii,$inout4,$inout4
+	  vpshufb	$Ii,$inout5,$inout5
+	  vpshufb	$Ii,$T1,$T1		# next counter value
+	jmp		.Lresume_ctr32
+
+.align	32
+.Lenc_tail:
+	  vaesenc	$rndkey,$inout0,$inout0
+	vmovdqu		$Z3,16+8(%rsp)		# postpone vpxor $Z3,$Xi,$Xi
+	vpalignr	\$8,$Z0,$Z0,$Xi		# 2nd phase
+	  vaesenc	$rndkey,$inout1,$inout1
+	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0
+	  vpxor		0x00($inp),$T1,$T2
+	  vaesenc	$rndkey,$inout2,$inout2
+	  vpxor		0x10($inp),$T1,$Ii
+	  vaesenc	$rndkey,$inout3,$inout3
+	  vpxor		0x20($inp),$T1,$Z1
+	  vaesenc	$rndkey,$inout4,$inout4
+	  vpxor		0x30($inp),$T1,$Z2
+	  vaesenc	$rndkey,$inout5,$inout5
+	  vpxor		0x40($inp),$T1,$Z3
+	  vpxor		0x50($inp),$T1,$Hkey
+	  vmovdqu	($ivp),$T1		# load next counter value
+
+	  vaesenclast	$T2,$inout0,$inout0
+	  vmovdqu	0x20($const),$T2	# borrow $T2, .Lone_msb
+	  vaesenclast	$Ii,$inout1,$inout1
+	 vpaddb		$T2,$T1,$Ii
+	mov		%r13,0x70+8(%rsp)
+	lea		0x60($inp),$inp
+	  vaesenclast	$Z1,$inout2,$inout2
+	 vpaddb		$T2,$Ii,$Z1
+	mov		%r12,0x78+8(%rsp)
+	lea		0x60($out),$out
+	  vmovdqu	0x00-0x80($key),$rndkey
+	  vaesenclast	$Z2,$inout3,$inout3
+	 vpaddb		$T2,$Z1,$Z2
+	  vaesenclast	$Z3, $inout4,$inout4
+	 vpaddb		$T2,$Z2,$Z3
+	  vaesenclast	$Hkey,$inout5,$inout5
+	 vpaddb		$T2,$Z3,$Hkey
+
+	add		\$0x60,$ret
+	sub		\$0x6,$len
+	jc		.L6x_done
+
+	  vmovups	$inout0,-0x60($out)	# save output
+	 vpxor		$rndkey,$T1,$inout0
+	  vmovups	$inout1,-0x50($out)
+	 vmovdqa	$Ii,$inout1		# 0 latency
+	  vmovups	$inout2,-0x40($out)
+	 vmovdqa	$Z1,$inout2		# 0 latency
+	  vmovups	$inout3,-0x30($out)
+	 vmovdqa	$Z2,$inout3		# 0 latency
+	  vmovups	$inout4,-0x20($out)
+	 vmovdqa	$Z3,$inout4		# 0 latency
+	  vmovups	$inout5,-0x10($out)
+	 vmovdqa	$Hkey,$inout5		# 0 latency
+	vmovdqu		0x20+8(%rsp),$Z3	# I[5]
+	jmp		.Loop6x
+
+.L6x_done:
+	vpxor		16+8(%rsp),$Xi,$Xi	# modulo-scheduled
+	vpxor		$Z0,$Xi,$Xi		# modulo-scheduled
+
+	ret
+.size	_aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
+___
+######################################################################
+#
+# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
+#		const AES_KEY *key, unsigned char iv[16],
+#		struct { u128 Xi,H,Htbl[9]; } *Xip);
+$code.=<<___;
+.globl	aesni_gcm_decrypt
+.type	aesni_gcm_decrypt,\@function,6
+.align	32
+aesni_gcm_decrypt:
+	xor	$ret,$ret
+	cmp	\$0x60,$len			# minimal accepted length
+	jb	.Lgcm_dec_abort
+
+	lea	(%rsp),%rax			# save stack pointer
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+___
+$code.=<<___ if ($win64);
+	lea	-0xa8(%rsp),%rsp
+	movaps	%xmm6,-0xd8(%rax)
+	movaps	%xmm7,-0xc8(%rax)
+	movaps	%xmm8,-0xb8(%rax)
+	movaps	%xmm9,-0xa8(%rax)
+	movaps	%xmm10,-0x98(%rax)
+	movaps	%xmm11,-0x88(%rax)
+	movaps	%xmm12,-0x78(%rax)
+	movaps	%xmm13,-0x68(%rax)
+	movaps	%xmm14,-0x58(%rax)
+	movaps	%xmm15,-0x48(%rax)
+.Lgcm_dec_body:
+___
+$code.=<<___;
+	vzeroupper
+
+	vmovdqu		($ivp),$T1		# input counter value
+	add		\$-128,%rsp
+	mov		12($ivp),$counter
+	lea		.Lbswap_mask(%rip),$const
+	lea		-0x80($key),$in0	# borrow $in0
+	mov		\$0xf80,$end0		# borrow $end0
+	vmovdqu		($Xip),$Xi		# load Xi
+	and		\$-128,%rsp		# ensure stack alignment
+	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
+	lea		0x80($key),$key		# size optimization
+	lea		0x20+0x20($Xip),$Xip	# size optimization
+	mov		0xf0-0x80($key),$rounds
+	vpshufb		$Ii,$Xi,$Xi
+
+	and		$end0,$in0
+	and		%rsp,$end0
+	sub		$in0,$end0
+	jc		.Ldec_no_key_aliasing
+	cmp		\$768,$end0
+	jnc		.Ldec_no_key_aliasing
+	sub		$end0,%rsp		# avoid aliasing with key
+.Ldec_no_key_aliasing:
+
+	vmovdqu		0x50($inp),$Z3		# I[5]
+	lea		($inp),$in0
+	vmovdqu		0x40($inp),$Z0
+	lea		-0xc0($inp,$len),$end0
+	vmovdqu		0x30($inp),$Z1
+	shr		\$4,$len
+	xor		$ret,$ret
+	vmovdqu		0x20($inp),$Z2
+	 vpshufb	$Ii,$Z3,$Z3		# passed to _aesni_ctr32_ghash_6x
+	vmovdqu		0x10($inp),$T2
+	 vpshufb	$Ii,$Z0,$Z0
+	vmovdqu		($inp),$Hkey
+	 vpshufb	$Ii,$Z1,$Z1
+	vmovdqu		$Z0,0x30(%rsp)
+	 vpshufb	$Ii,$Z2,$Z2
+	vmovdqu		$Z1,0x40(%rsp)
+	 vpshufb	$Ii,$T2,$T2
+	vmovdqu		$Z2,0x50(%rsp)
+	 vpshufb	$Ii,$Hkey,$Hkey
+	vmovdqu		$T2,0x60(%rsp)
+	vmovdqu		$Hkey,0x70(%rsp)
+
+	call		_aesni_ctr32_ghash_6x
+
+	vmovups		$inout0,-0x60($out)	# save output
+	vmovups		$inout1,-0x50($out)
+	vmovups		$inout2,-0x40($out)
+	vmovups		$inout3,-0x30($out)
+	vmovups		$inout4,-0x20($out)
+	vmovups		$inout5,-0x10($out)
+
+	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask
+	vmovdqu		$Xi,-0x40($Xip)		# output Xi
+
+	vzeroupper
+___
+$code.=<<___ if ($win64);
+	movaps	-0xd8(%rax),%xmm6
+	movaps	-0xd8(%rax),%xmm7
+	movaps	-0xb8(%rax),%xmm8
+	movaps	-0xa8(%rax),%xmm9
+	movaps	-0x98(%rax),%xmm10
+	movaps	-0x88(%rax),%xmm11
+	movaps	-0x78(%rax),%xmm12
+	movaps	-0x68(%rax),%xmm13
+	movaps	-0x58(%rax),%xmm14
+	movaps	-0x48(%rax),%xmm15
+___
+$code.=<<___;
+	mov	-48(%rax),%r15
+	mov	-40(%rax),%r14
+	mov	-32(%rax),%r13
+	mov	-24(%rax),%r12
+	mov	-16(%rax),%rbp
+	mov	-8(%rax),%rbx
+	lea	(%rax),%rsp		# restore %rsp
+.Lgcm_dec_abort:
+	mov	$ret,%rax		# return value
+	ret
+.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
+___
+
+$code.=<<___;
+.type	_aesni_ctr32_6x,\@abi-omnipotent
+.align	32
+_aesni_ctr32_6x:
+	vmovdqu		0x00-0x80($key),$Z0	# borrow $Z0 for $rndkey
+	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb
+	lea		-1($rounds),%r13
+	vmovups		0x10-0x80($key),$rndkey
+	lea		0x20-0x80($key),%r12
+	vpxor		$Z0,$T1,$inout0
+	add		\$`6<<24`,$counter
+	jc		.Lhandle_ctr32_2
+	vpaddb		$T2,$T1,$inout1
+	vpaddb		$T2,$inout1,$inout2
+	vpxor		$Z0,$inout1,$inout1
+	vpaddb		$T2,$inout2,$inout3
+	vpxor		$Z0,$inout2,$inout2
+	vpaddb		$T2,$inout3,$inout4
+	vpxor		$Z0,$inout3,$inout3
+	vpaddb		$T2,$inout4,$inout5
+	vpxor		$Z0,$inout4,$inout4
+	vpaddb		$T2,$inout5,$T1
+	vpxor		$Z0,$inout5,$inout5
+	jmp		.Loop_ctr32
+
+.align	16
+.Loop_ctr32:
+	vaesenc		$rndkey,$inout0,$inout0
+	vaesenc		$rndkey,$inout1,$inout1
+	vaesenc		$rndkey,$inout2,$inout2
+	vaesenc		$rndkey,$inout3,$inout3
+	vaesenc		$rndkey,$inout4,$inout4
+	vaesenc		$rndkey,$inout5,$inout5
+	vmovups		(%r12),$rndkey
+	lea		0x10(%r12),%r12
+	dec		%r13d
+	jnz		.Loop_ctr32
+
+	vmovdqu		(%r12),$Hkey		# last round key
+	vaesenc		$rndkey,$inout0,$inout0
+	vpxor		0x00($inp),$Hkey,$Z0
+	vaesenc		$rndkey,$inout1,$inout1
+	vpxor		0x10($inp),$Hkey,$Z1
+	vaesenc		$rndkey,$inout2,$inout2
+	vpxor		0x20($inp),$Hkey,$Z2
+	vaesenc		$rndkey,$inout3,$inout3
+	vpxor		0x30($inp),$Hkey,$Xi
+	vaesenc		$rndkey,$inout4,$inout4
+	vpxor		0x40($inp),$Hkey,$T2
+	vaesenc		$rndkey,$inout5,$inout5
+	vpxor		0x50($inp),$Hkey,$Hkey
+	lea		0x60($inp),$inp
+
+	vaesenclast	$Z0,$inout0,$inout0
+	vaesenclast	$Z1,$inout1,$inout1
+	vaesenclast	$Z2,$inout2,$inout2
+	vaesenclast	$Xi,$inout3,$inout3
+	vaesenclast	$T2,$inout4,$inout4
+	vaesenclast	$Hkey,$inout5,$inout5
+	vmovups		$inout0,0x00($out)
+	vmovups		$inout1,0x10($out)
+	vmovups		$inout2,0x20($out)
+	vmovups		$inout3,0x30($out)
+	vmovups		$inout4,0x40($out)
+	vmovups		$inout5,0x50($out)
+	lea		0x60($out),$out
+
+	ret
+.align	32
+.Lhandle_ctr32_2:
+	vpshufb		$Ii,$T1,$Z2		# byte-swap counter
+	vmovdqu		0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
+	vpaddd		0x40($const),$Z2,$inout1	# .Lone_lsb
+	vpaddd		$Z1,$Z2,$inout2
+	vpaddd		$Z1,$inout1,$inout3
+	vpshufb		$Ii,$inout1,$inout1
+	vpaddd		$Z1,$inout2,$inout4
+	vpshufb		$Ii,$inout2,$inout2
+	vpxor		$Z0,$inout1,$inout1
+	vpaddd		$Z1,$inout3,$inout5
+	vpshufb		$Ii,$inout3,$inout3
+	vpxor		$Z0,$inout2,$inout2
+	vpaddd		$Z1,$inout4,$T1		# byte-swapped next counter value
+	vpshufb		$Ii,$inout4,$inout4
+	vpxor		$Z0,$inout3,$inout3
+	vpshufb		$Ii,$inout5,$inout5
+	vpxor		$Z0,$inout4,$inout4
+	vpshufb		$Ii,$T1,$T1		# next counter value
+	vpxor		$Z0,$inout5,$inout5
+	jmp	.Loop_ctr32
+.size	_aesni_ctr32_6x,.-_aesni_ctr32_6x
+
+.globl	aesni_gcm_encrypt
+.type	aesni_gcm_encrypt,\@function,6
+.align	32
+aesni_gcm_encrypt:
+	xor	$ret,$ret
+	cmp	\$0x60*3,$len			# minimal accepted length
+	jb	.Lgcm_enc_abort
+
+	lea	(%rsp),%rax			# save stack pointer
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+___
+$code.=<<___ if ($win64);
+	lea	-0xa8(%rsp),%rsp
+	movaps	%xmm6,-0xd8(%rax)
+	movaps	%xmm7,-0xc8(%rax)
+	movaps	%xmm8,-0xb8(%rax)
+	movaps	%xmm9,-0xa8(%rax)
+	movaps	%xmm10,-0x98(%rax)
+	movaps	%xmm11,-0x88(%rax)
+	movaps	%xmm12,-0x78(%rax)
+	movaps	%xmm13,-0x68(%rax)
+	movaps	%xmm14,-0x58(%rax)
+	movaps	%xmm15,-0x48(%rax)
+.Lgcm_enc_body:
+___
+$code.=<<___;
+	vzeroupper
+
+	vmovdqu		($ivp),$T1		# input counter value
+	add		\$-128,%rsp
+	mov		12($ivp),$counter
+	lea		.Lbswap_mask(%rip),$const
+	lea		-0x80($key),$in0	# borrow $in0
+	mov		\$0xf80,$end0		# borrow $end0
+	lea		0x80($key),$key		# size optimization
+	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
+	and		\$-128,%rsp		# ensure stack alignment
+	mov		0xf0-0x80($key),$rounds
+
+	and		$end0,$in0
+	and		%rsp,$end0
+	sub		$in0,$end0
+	jc		.Lenc_no_key_aliasing
+	cmp		\$768,$end0
+	jnc		.Lenc_no_key_aliasing
+	sub		$end0,%rsp		# avoid aliasing with key
+.Lenc_no_key_aliasing:
+
+	lea		($out),$in0
+	lea		-0xc0($out,$len),$end0
+	shr		\$4,$len
+
+	call		_aesni_ctr32_6x
+	vpshufb		$Ii,$inout0,$Xi		# save bswapped output on stack
+	vpshufb		$Ii,$inout1,$T2
+	vmovdqu		$Xi,0x70(%rsp)
+	vpshufb		$Ii,$inout2,$Z0
+	vmovdqu		$T2,0x60(%rsp)
+	vpshufb		$Ii,$inout3,$Z1
+	vmovdqu		$Z0,0x50(%rsp)
+	vpshufb		$Ii,$inout4,$Z2
+	vmovdqu		$Z1,0x40(%rsp)
+	vpshufb		$Ii,$inout5,$Z3		# passed to _aesni_ctr32_ghash_6x
+	vmovdqu		$Z2,0x30(%rsp)
+
+	call		_aesni_ctr32_6x
+
+	vmovdqu		($Xip),$Xi		# load Xi
+	lea		0x20+0x20($Xip),$Xip	# size optimization
+	sub		\$12,$len
+	mov		\$0x60*2,$ret
+	vpshufb		$Ii,$Xi,$Xi
+
+	call		_aesni_ctr32_ghash_6x
+	vmovdqu		0x20(%rsp),$Z3		# I[5]
+	 vmovdqu	($const),$Ii		# borrow $Ii for .Lbswap_mask
+	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
+	vpunpckhqdq	$Z3,$Z3,$T1
+	vmovdqu		0x20-0x20($Xip),$rndkey	# borrow $rndkey for $HK
+	 vmovups	$inout0,-0x60($out)	# save output
+	 vpshufb	$Ii,$inout0,$inout0	# but keep bswapped copy
+	vpxor		$Z3,$T1,$T1
+	 vmovups	$inout1,-0x50($out)
+	 vpshufb	$Ii,$inout1,$inout1
+	 vmovups	$inout2,-0x40($out)
+	 vpshufb	$Ii,$inout2,$inout2
+	 vmovups	$inout3,-0x30($out)
+	 vpshufb	$Ii,$inout3,$inout3
+	 vmovups	$inout4,-0x20($out)
+	 vpshufb	$Ii,$inout4,$inout4
+	 vmovups	$inout5,-0x10($out)
+	 vpshufb	$Ii,$inout5,$inout5
+	 vmovdqu	$inout0,0x10(%rsp)	# free $inout0
+___
+{ my ($HK,$T3)=($rndkey,$inout0);
+
+$code.=<<___;
+	 vmovdqu	0x30(%rsp),$Z2		# I[4]
+	 vmovdqu	0x10-0x20($Xip),$Ii	# borrow $Ii for $Hkey^2
+	 vpunpckhqdq	$Z2,$Z2,$T2
+	vpclmulqdq	\$0x00,$Hkey,$Z3,$Z1
+	 vpxor		$Z2,$T2,$T2
+	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3
+	vpclmulqdq	\$0x00,$HK,$T1,$T1
+
+	 vmovdqu	0x40(%rsp),$T3		# I[3]
+	vpclmulqdq	\$0x00,$Ii,$Z2,$Z0
+	 vmovdqu	0x30-0x20($Xip),$Hkey	# $Hkey^3
+	vpxor		$Z1,$Z0,$Z0
+	 vpunpckhqdq	$T3,$T3,$Z1
+	vpclmulqdq	\$0x11,$Ii,$Z2,$Z2
+	 vpxor		$T3,$Z1,$Z1
+	vpxor		$Z3,$Z2,$Z2
+	vpclmulqdq	\$0x10,$HK,$T2,$T2
+	 vmovdqu	0x50-0x20($Xip),$HK
+	vpxor		$T1,$T2,$T2
+
+	 vmovdqu	0x50(%rsp),$T1		# I[2]
+	vpclmulqdq	\$0x00,$Hkey,$T3,$Z3
+	 vmovdqu	0x40-0x20($Xip),$Ii	# borrow $Ii for $Hkey^4
+	vpxor		$Z0,$Z3,$Z3
+	 vpunpckhqdq	$T1,$T1,$Z0
+	vpclmulqdq	\$0x11,$Hkey,$T3,$T3
+	 vpxor		$T1,$Z0,$Z0
+	vpxor		$Z2,$T3,$T3
+	vpclmulqdq	\$0x00,$HK,$Z1,$Z1
+	vpxor		$T2,$Z1,$Z1
+
+	 vmovdqu	0x60(%rsp),$T2		# I[1]
+	vpclmulqdq	\$0x00,$Ii,$T1,$Z2
+	 vmovdqu	0x60-0x20($Xip),$Hkey	# $Hkey^5
+	vpxor		$Z3,$Z2,$Z2
+	 vpunpckhqdq	$T2,$T2,$Z3
+	vpclmulqdq	\$0x11,$Ii,$T1,$T1
+	 vpxor		$T2,$Z3,$Z3
+	vpxor		$T3,$T1,$T1
+	vpclmulqdq	\$0x10,$HK,$Z0,$Z0
+	 vmovdqu	0x80-0x20($Xip),$HK
+	vpxor		$Z1,$Z0,$Z0
+
+	 vpxor		0x70(%rsp),$Xi,$Xi	# accumulate I[0]
+	vpclmulqdq	\$0x00,$Hkey,$T2,$Z1
+	 vmovdqu	0x70-0x20($Xip),$Ii	# borrow $Ii for $Hkey^6
+	 vpunpckhqdq	$Xi,$Xi,$T3
+	vpxor		$Z2,$Z1,$Z1
+	vpclmulqdq	\$0x11,$Hkey,$T2,$T2
+	 vpxor		$Xi,$T3,$T3
+	vpxor		$T1,$T2,$T2
+	vpclmulqdq	\$0x00,$HK,$Z3,$Z3
+	vpxor		$Z0,$Z3,$Z0
+
+	vpclmulqdq	\$0x00,$Ii,$Xi,$Z2
+	 vmovdqu	0x00-0x20($Xip),$Hkey	# $Hkey^1
+	 vpunpckhqdq	$inout5,$inout5,$T1
+	vpclmulqdq	\$0x11,$Ii,$Xi,$Xi
+	 vpxor		$inout5,$T1,$T1
+	vpxor		$Z1,$Z2,$Z1
+	vpclmulqdq	\$0x10,$HK,$T3,$T3
+	 vmovdqu	0x20-0x20($Xip),$HK
+	vpxor		$T2,$Xi,$Z3
+	vpxor		$Z0,$T3,$Z2
+
+	 vmovdqu	0x10-0x20($Xip),$Ii	# borrow $Ii for $Hkey^2
+	  vpxor		$Z1,$Z3,$T3		# aggregated Karatsuba post-processing
+	vpclmulqdq	\$0x00,$Hkey,$inout5,$Z0
+	  vpxor		$T3,$Z2,$Z2
+	 vpunpckhqdq	$inout4,$inout4,$T2
+	vpclmulqdq	\$0x11,$Hkey,$inout5,$inout5
+	 vpxor		$inout4,$T2,$T2
+	  vpslldq	\$8,$Z2,$T3
+	vpclmulqdq	\$0x00,$HK,$T1,$T1
+	  vpxor		$T3,$Z1,$Xi
+	  vpsrldq	\$8,$Z2,$Z2
+	  vpxor		$Z2,$Z3,$Z3
+
+	vpclmulqdq	\$0x00,$Ii,$inout4,$Z1
+	 vmovdqu	0x30-0x20($Xip),$Hkey	# $Hkey^3
+	vpxor		$Z0,$Z1,$Z1
+	 vpunpckhqdq	$inout3,$inout3,$T3
+	vpclmulqdq	\$0x11,$Ii,$inout4,$inout4
+	 vpxor		$inout3,$T3,$T3
+	vpxor		$inout5,$inout4,$inout4
+	  vpalignr	\$8,$Xi,$Xi,$inout5	# 1st phase
+	vpclmulqdq	\$0x10,$HK,$T2,$T2
+	 vmovdqu	0x50-0x20($Xip),$HK
+	vpxor		$T1,$T2,$T2
+
+	vpclmulqdq	\$0x00,$Hkey,$inout3,$Z0
+	 vmovdqu	0x40-0x20($Xip),$Ii	# borrow $Ii for $Hkey^4
+	vpxor		$Z1,$Z0,$Z0
+	 vpunpckhqdq	$inout2,$inout2,$T1
+	vpclmulqdq	\$0x11,$Hkey,$inout3,$inout3
+	 vpxor		$inout2,$T1,$T1
+	vpxor		$inout4,$inout3,$inout3
+	  vxorps	0x10(%rsp),$Z3,$Z3	# accumulate $inout0
+	vpclmulqdq	\$0x00,$HK,$T3,$T3
+	vpxor		$T2,$T3,$T3
+
+	  vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi
+	  vxorps	$inout5,$Xi,$Xi
+
+	vpclmulqdq	\$0x00,$Ii,$inout2,$Z1
+	 vmovdqu	0x60-0x20($Xip),$Hkey	# $Hkey^5
+	vpxor		$Z0,$Z1,$Z1
+	 vpunpckhqdq	$inout1,$inout1,$T2
+	vpclmulqdq	\$0x11,$Ii,$inout2,$inout2
+	 vpxor		$inout1,$T2,$T2
+	  vpalignr	\$8,$Xi,$Xi,$inout5	# 2nd phase
+	vpxor		$inout3,$inout2,$inout2
+	vpclmulqdq	\$0x10,$HK,$T1,$T1
+	 vmovdqu	0x80-0x20($Xip),$HK
+	vpxor		$T3,$T1,$T1
+
+	  vxorps	$Z3,$inout5,$inout5
+	  vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi
+	  vxorps	$inout5,$Xi,$Xi
+
+	vpclmulqdq	\$0x00,$Hkey,$inout1,$Z0
+	 vmovdqu	0x70-0x20($Xip),$Ii	# borrow $Ii for $Hkey^6
+	vpxor		$Z1,$Z0,$Z0
+	 vpunpckhqdq	$Xi,$Xi,$T3
+	vpclmulqdq	\$0x11,$Hkey,$inout1,$inout1
+	 vpxor		$Xi,$T3,$T3
+	vpxor		$inout2,$inout1,$inout1
+	vpclmulqdq	\$0x00,$HK,$T2,$T2
+	vpxor		$T1,$T2,$T2
+
+	vpclmulqdq	\$0x00,$Ii,$Xi,$Z1
+	vpclmulqdq	\$0x11,$Ii,$Xi,$Z3
+	vpxor		$Z0,$Z1,$Z1
+	vpclmulqdq	\$0x10,$HK,$T3,$Z2
+	vpxor		$inout1,$Z3,$Z3
+	vpxor		$T2,$Z2,$Z2
+
+	vpxor		$Z1,$Z3,$Z0		# aggregated Karatsuba post-processing
+	vpxor		$Z0,$Z2,$Z2
+	vpslldq		\$8,$Z2,$T1
+	vmovdqu		0x10($const),$Hkey	# .Lpoly
+	vpsrldq		\$8,$Z2,$Z2
+	vpxor		$T1,$Z1,$Xi
+	vpxor		$Z2,$Z3,$Z3
+
+	vpalignr	\$8,$Xi,$Xi,$T2		# 1st phase
+	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi
+	vpxor		$T2,$Xi,$Xi
+
+	vpalignr	\$8,$Xi,$Xi,$T2		# 2nd phase
+	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi
+	vpxor		$Z3,$T2,$T2
+	vpxor		$T2,$Xi,$Xi
+___
+}
+$code.=<<___;
+	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask
+	vmovdqu		$Xi,-0x40($Xip)		# output Xi
+
+	vzeroupper
+___
+$code.=<<___ if ($win64);
+	movaps	-0xd8(%rax),%xmm6
+	movaps	-0xc8(%rax),%xmm7
+	movaps	-0xb8(%rax),%xmm8
+	movaps	-0xa8(%rax),%xmm9
+	movaps	-0x98(%rax),%xmm10
+	movaps	-0x88(%rax),%xmm11
+	movaps	-0x78(%rax),%xmm12
+	movaps	-0x68(%rax),%xmm13
+	movaps	-0x58(%rax),%xmm14
+	movaps	-0x48(%rax),%xmm15
+___
+$code.=<<___;
+	mov	-48(%rax),%r15
+	mov	-40(%rax),%r14
+	mov	-32(%rax),%r13
+	mov	-24(%rax),%r12
+	mov	-16(%rax),%rbp
+	mov	-8(%rax),%rbx
+	lea	(%rax),%rsp		# restore %rsp
+.Lgcm_enc_abort:
+	mov	$ret,%rax		# return value
+	ret
+.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
+___
+
+$code.=<<___;
+.align	64
+.Lbswap_mask:
+	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lpoly:
+	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.Lone_msb:
+	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Ltwo_lsb:
+	.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.Lone_lsb:
+	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.asciz	"AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align	64
+___
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___
+.extern	__imp_RtlVirtualUnwind
+.type	gcm_se_handler,\@abi-omnipotent
+.align	16
+gcm_se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	mov	120($context),%rax	# pull context->Rax
+
+	mov	-48(%rax),%r15
+	mov	-40(%rax),%r14
+	mov	-32(%rax),%r13
+	mov	-24(%rax),%r12
+	mov	-16(%rax),%rbp
+	mov	-8(%rax),%rbx
+	mov	%r15,240($context)
+	mov	%r14,232($context)
+	mov	%r13,224($context)
+	mov	%r12,216($context)
+	mov	%rbp,160($context)
+	mov	%rbx,144($context)
+
+	lea	-0xd8(%rax),%rsi	# %xmm save area
+	lea	512($context),%rdi	# & context.Xmm6
+	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
+	.long	0xa548f3fc		# cld; rep movsq
+
+.Lcommon_seh_tail:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	gcm_se_handler,.-gcm_se_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_aesni_gcm_decrypt
+	.rva	.LSEH_end_aesni_gcm_decrypt
+	.rva	.LSEH_gcm_dec_info
+
+	.rva	.LSEH_begin_aesni_gcm_encrypt
+	.rva	.LSEH_end_aesni_gcm_encrypt
+	.rva	.LSEH_gcm_enc_info
+.section	.xdata
+.align	8
+.LSEH_gcm_dec_info:
+	.byte	9,0,0,0
+	.rva	gcm_se_handler
+	.rva	.Lgcm_dec_body,.Lgcm_dec_abort
+.LSEH_gcm_enc_info:
+	.byte	9,0,0,0
+	.rva	gcm_se_handler
+	.rva	.Lgcm_enc_body,.Lgcm_enc_abort
+___
+}
+}}} else {{{
+$code=<<___;	# assembler is too old
+.text
+
+.globl	aesni_gcm_encrypt
+.type	aesni_gcm_encrypt,\@abi-omnipotent
+aesni_gcm_encrypt:
+	xor	%eax,%eax
+	ret
+.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
+
+.globl	aesni_gcm_decrypt
+.type	aesni_gcm_decrypt,\@abi-omnipotent
+aesni_gcm_decrypt:
+	xor	%eax,%eax
+	ret
+.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
+___
+}}}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/openssl/crypto/modes/asm/ghash-armv4.pl b/openssl/crypto/modes/asm/ghash-armv4.pl
index d91586ee2..77fbf3446 100644
--- a/openssl/crypto/modes/asm/ghash-armv4.pl
+++ b/openssl/crypto/modes/asm/ghash-armv4.pl
@@ -35,6 +35,20 @@
 # Add NEON implementation featuring polynomial multiplication, i.e. no
 # lookup tables involved. On Cortex A8 it was measured to process one
 # byte in 15 cycles or 55% faster than integer-only code.
+#
+# April 2014
+#
+# Switch to multiplication algorithm suggested in paper referred
+# below and combine it with reduction algorithm from x86 module.
+# Performance improvement over previous version varies from 65% on
+# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
+# processes one byte in 8.45 cycles, A9 - in 10.2, Snapdragon S4 -
+# in 9.33.
+#
+# C�mara, D.; Gouv�a, C. P. L.; L�pez, J. & Dahab, R.: Fast Software
+# Polynomial Multiplication on ARM Processors using the NEON Engine.
+# 
+# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
 
 # ====================================================================
 # Note about "528B" variant. In ARM case it makes lesser sense to
@@ -303,117 +317,161 @@ $code.=<<___;
 .size	gcm_gmult_4bit,.-gcm_gmult_4bit
 ___
 {
-my $cnt=$Htbl;	# $Htbl is used once in the very beginning
-
-my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
-my ($Qhi, $Qlo, $Z,  $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
-
-# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
-# in Zo. Or should I say "top bit", because GHASH is specified in
-# reverse bit order? Otherwise straightforward 128-bt H by one input
-# byte multiplication and modulo-reduction, times 16.
+my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
+my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
+my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));
 
-sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
-sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
-sub Q()     { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
+sub clmul64x64 {
+my ($r,$a,$b)=@_;
+$code.=<<___;
+	vext.8		$t0#lo, $a, $a, #1	@ A1
+	vmull.p8	$t0, $t0#lo, $b		@ F = A1*B
+	vext.8		$r#lo, $b, $b, #1	@ B1
+	vmull.p8	$r, $a, $r#lo		@ E = A*B1
+	vext.8		$t1#lo, $a, $a, #2	@ A2
+	vmull.p8	$t1, $t1#lo, $b		@ H = A2*B
+	vext.8		$t3#lo, $b, $b, #2	@ B2
+	vmull.p8	$t3, $a, $t3#lo		@ G = A*B2
+	vext.8		$t2#lo, $a, $a, #3	@ A3
+	veor		$t0, $t0, $r		@ L = E + F
+	vmull.p8	$t2, $t2#lo, $b		@ J = A3*B
+	vext.8		$r#lo, $b, $b, #3	@ B3
+	veor		$t1, $t1, $t3		@ M = G + H
+	vmull.p8	$r, $a, $r#lo		@ I = A*B3
+	veor		$t0#lo, $t0#lo, $t0#hi	@ t0 = (L) (P0 + P1) << 8
+	vand		$t0#hi, $t0#hi, $k48
+	vext.8		$t3#lo, $b, $b, #4	@ B4
+	veor		$t1#lo, $t1#lo, $t1#hi	@ t1 = (M) (P2 + P3) << 16
+	vand		$t1#hi, $t1#hi, $k32
+	vmull.p8	$t3, $a, $t3#lo		@ K = A*B4
+	veor		$t2, $t2, $r		@ N = I + J
+	veor		$t0#lo, $t0#lo, $t0#hi
+	veor		$t1#lo, $t1#lo, $t1#hi
+	veor		$t2#lo, $t2#lo, $t2#hi	@ t2 = (N) (P4 + P5) << 24
+	vand		$t2#hi, $t2#hi, $k16
+	vext.8		$t0, $t0, $t0, #15
+	veor		$t3#lo, $t3#lo, $t3#hi	@ t3 = (K) (P6 + P7) << 32
+	vmov.i64	$t3#hi, #0
+	vext.8		$t1, $t1, $t1, #14
+	veor		$t2#lo, $t2#lo, $t2#hi
+	vmull.p8	$r, $a, $b		@ D = A*B
+	vext.8		$t3, $t3, $t3, #12
+	vext.8		$t2, $t2, $t2, #13
+	veor		$t0, $t0, $t1
+	veor		$t2, $t2, $t3
+	veor		$r, $r, $t0
+	veor		$r, $r, $t2
+___
+}
 
 $code.=<<___;
-#if __ARM_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
 .fpu	neon
 
+.global	gcm_init_neon
+.type	gcm_init_neon,%function
+.align	4
+gcm_init_neon:
+	vld1.64		$IN#hi,[r1,:64]!	@ load H
+	vmov.i8		$t0,#0xe1
+	vld1.64		$IN#lo,[r1,:64]
+	vshl.i64	$t0#hi,#57
+	vshr.u64	$t0#lo,#63		@ t0=0xc2....01
+	vdup.8		$t1,$IN#hi[7]
+	vshr.u64	$Hlo,$IN#lo,#63
+	vshr.s8		$t1,#7			@ broadcast carry bit
+	vshl.i64	$IN,$IN,#1
+	vand		$t0,$t0,$t1
+	vorr		$IN#hi,$Hlo		@ H<<<=1
+	veor		$IN,$IN,$t0		@ twisted H
+	vstmia		r0,{$IN}
+
+	ret					@ bx lr
+.size	gcm_init_neon,.-gcm_init_neon
+
 .global	gcm_gmult_neon
 .type	gcm_gmult_neon,%function
 .align	4
 gcm_gmult_neon:
-	sub		$Htbl,#16		@ point at H in GCM128_CTX
-	vld1.64		`&Dhi("$IN")`,[$Xi,:64]!@ load Xi
-	vmov.i32	$mod,#0xe1		@ our irreducible polynomial
-	vld1.64		`&Dlo("$IN")`,[$Xi,:64]!
-	vshr.u64	$mod,#32
-	vldmia		$Htbl,{$Hhi-$Hlo}	@ load H
-	veor		$zero,$zero
+	vld1.64		$IN#hi,[$Xi,:64]!	@ load Xi
+	vld1.64		$IN#lo,[$Xi,:64]!
+	vmov.i64	$k48,#0x0000ffffffffffff
+	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H
+	vmov.i64	$k32,#0x00000000ffffffff
 #ifdef __ARMEL__
 	vrev64.8	$IN,$IN
 #endif
-	veor		$Qpost,$Qpost
-	veor		$R,$R
-	mov		$cnt,#16
-	veor		$Z,$Z
+	vmov.i64	$k16,#0x000000000000ffff
+	veor		$Hhl,$Hlo,$Hhi		@ Karatsuba pre-processing
 	mov		$len,#16
-	veor		$Zo,$Zo
-	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
-	b		.Linner_neon
+	b		.Lgmult_neon
 .size	gcm_gmult_neon,.-gcm_gmult_neon
 
 .global	gcm_ghash_neon
 .type	gcm_ghash_neon,%function
 .align	4
 gcm_ghash_neon:
-	vld1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ load Xi
-	vmov.i32	$mod,#0xe1		@ our irreducible polynomial
-	vld1.64		`&Dlo("$Z")`,[$Xi,:64]!
-	vshr.u64	$mod,#32
-	vldmia		$Xi,{$Hhi-$Hlo}		@ load H
-	veor		$zero,$zero
-	nop
+	vld1.64		$Xl#hi,[$Xi,:64]!	@ load Xi
+	vld1.64		$Xl#lo,[$Xi,:64]!
+	vmov.i64	$k48,#0x0000ffffffffffff
+	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H
+	vmov.i64	$k32,#0x00000000ffffffff
 #ifdef __ARMEL__
-	vrev64.8	$Z,$Z
+	vrev64.8	$Xl,$Xl
 #endif
-.Louter_neon:
-	vld1.64		`&Dhi($IN)`,[$inp]!	@ load inp
-	veor		$Qpost,$Qpost
-	vld1.64		`&Dlo($IN)`,[$inp]!
-	veor		$R,$R
-	mov		$cnt,#16
+	vmov.i64	$k16,#0x000000000000ffff
+	veor		$Hhl,$Hlo,$Hhi		@ Karatsuba pre-processing
+
+.Loop_neon:
+	vld1.64		$IN#hi,[$inp]!		@ load inp
+	vld1.64		$IN#lo,[$inp]!
 #ifdef __ARMEL__
 	vrev64.8	$IN,$IN
 #endif
-	veor		$Zo,$Zo
-	veor		$IN,$Z			@ inp^=Xi
-	veor		$Z,$Z
-	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
-.Linner_neon:
-	subs		$cnt,$cnt,#1
-	vmull.p8	$Qlo,$Hlo,$xi		@ H.lo�Xi[i]
-	vmull.p8	$Qhi,$Hhi,$xi		@ H.hi�Xi[i]
-	vext.8		$IN,$zero,#1		@ IN>>=8
-
-	veor		$Z,$Qpost		@ modulo-scheduled part
-	vshl.i64	`&Dlo("$R")`,#48
-	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
-	veor		$T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
-
-	veor		`&Dhi("$Z")`,`&Dlo("$R")`
-	vuzp.8		$Qlo,$Qhi
-	vsli.8		$Zo,$T,#1		@ compose the "carry" byte
-	vext.8		$Z,$zero,#1		@ Z>>=8
-
-	vmull.p8	$R,$Zo,$mod		@ "carry"�0xe1
-	vshr.u8		$Zo,$T,#7		@ save Z's bottom bit
-	vext.8		$Qpost,$Qlo,$zero,#1	@ Qlo>>=8
-	veor		$Z,$Qhi
-	bne		.Linner_neon
-
-	veor		$Z,$Qpost		@ modulo-scheduled artefact
-	vshl.i64	`&Dlo("$R")`,#48
-	veor		`&Dhi("$Z")`,`&Dlo("$R")`
-
-	@ finalization, normalize Z:Zo
-	vand		$Zo,$mod		@ suffices to mask the bit
-	vshr.u64	`&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
-	vshl.i64	$Z,#1
+	veor		$IN,$Xl			@ inp^=Xi
+.Lgmult_neon:
+___
+	&clmul64x64	($Xl,$Hlo,"$IN#lo");	# H.lo�Xi.lo
+$code.=<<___;
+	veor		$IN#lo,$IN#lo,$IN#hi	@ Karatsuba pre-processing
+___
+	&clmul64x64	($Xm,$Hhl,"$IN#lo");	# (H.lo+H.hi)�(Xi.lo+Xi.hi)
+	&clmul64x64	($Xh,$Hhi,"$IN#hi");	# H.hi�Xi.hi
+$code.=<<___;
+	veor		$Xm,$Xm,$Xl		@ Karatsuba post-processing
+	veor		$Xm,$Xm,$Xh
+	veor		$Xl#hi,$Xl#hi,$Xm#lo
+	veor		$Xh#lo,$Xh#lo,$Xm#hi	@ Xh|Xl - 256-bit result
+
+	@ equivalent of reduction_avx from ghash-x86_64.pl
+	vshl.i64	$t1,$Xl,#57		@ 1st phase
+	vshl.i64	$t2,$Xl,#62
+	veor		$t2,$t2,$t1		@
+	vshl.i64	$t1,$Xl,#63
+	veor		$t2, $t2, $t1		@
+ 	veor		$Xl#hi,$Xl#hi,$t2#lo	@
+	veor		$Xh#lo,$Xh#lo,$t2#hi
+
+	vshr.u64	$t2,$Xl,#1		@ 2nd phase
+	veor		$Xh,$Xh,$Xl
+	veor		$Xl,$Xl,$t2		@
+	vshr.u64	$t2,$t2,#6
+	vshr.u64	$Xl,$Xl,#1		@
+	veor		$Xl,$Xl,$Xh		@
+	veor		$Xl,$Xl,$t2		@
+
 	subs		$len,#16
-	vorr		$Z,`&Q("$Zo")`		@ Z=Z:Zo<<1
-	bne		.Louter_neon
+	bne		.Loop_neon
 
 #ifdef __ARMEL__
-	vrev64.8	$Z,$Z
+	vrev64.8	$Xl,$Xl
 #endif
 	sub		$Xi,#16	
-	vst1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ write out Xi
-	vst1.64		`&Dlo("$Z")`,[$Xi,:64]
+	vst1.64		$Xl#hi,[$Xi,:64]!	@ write out Xi
+	vst1.64		$Xl#lo,[$Xi,:64]
 
-	bx	lr
+	ret					@ bx lr
 .size	gcm_ghash_neon,.-gcm_ghash_neon
 #endif
 ___
@@ -423,7 +481,13 @@ $code.=<<___;
 .align  2
 ___
 
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
-print $code;
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/geo;
+
+	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
+	s/\bret\b/bx	lr/go		or
+	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;    # make it possible to compile with -march=armv4
+
+	print $_,"\n";
+}
 close STDOUT; # enforce flush
diff --git a/openssl/crypto/modes/asm/ghash-s390x.pl b/openssl/crypto/modes/asm/ghash-s390x.pl
index 6a40d5d89..39096b423 100644
--- a/openssl/crypto/modes/asm/ghash-s390x.pl
+++ b/openssl/crypto/modes/asm/ghash-s390x.pl
@@ -186,13 +186,13 @@ $code.=<<___;
 	sllg	$rem1,$Zlo,3
 	xgr	$Zlo,$tmp
 	ngr	$rem1,$x78
+	sllg	$tmp,$Zhi,60
 	j	.Lghash_inner
 .align	16
 .Lghash_inner:
 	srlg	$Zlo,$Zlo,4
-	sllg	$tmp,$Zhi,60
-	xg	$Zlo,8($nlo,$Htbl)
 	srlg	$Zhi,$Zhi,4
+	xg	$Zlo,8($nlo,$Htbl)
 	llgc	$xi,0($cnt,$Xi)
 	xg	$Zhi,0($nlo,$Htbl)
 	sllg	$nlo,$xi,4
@@ -213,9 +213,9 @@ $code.=<<___;
 	sllg	$rem1,$Zlo,3
 	xgr	$Zlo,$tmp
 	ngr	$rem1,$x78
+	sllg	$tmp,$Zhi,60
 	brct	$cnt,.Lghash_inner
 
-	sllg	$tmp,$Zhi,60
 	srlg	$Zlo,$Zlo,4
 	srlg	$Zhi,$Zhi,4
 	xg	$Zlo,8($nlo,$Htbl)
diff --git a/openssl/crypto/modes/asm/ghash-sparcv9.pl b/openssl/crypto/modes/asm/ghash-sparcv9.pl
index 70e7b044a..0365e0f1f 100644
--- a/openssl/crypto/modes/asm/ghash-sparcv9.pl
+++ b/openssl/crypto/modes/asm/ghash-sparcv9.pl
@@ -36,6 +36,15 @@
 # references to input data and Z.hi updates to achieve 12 cycles
 # timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
 # cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
+#
+# October 2012
+#
+# Add VIS3 lookup-table-free implementation using polynomial
+# multiplication xmulx[hi] and extended addition addxc[cc]
+# instructions. 4.52/7.63x improvement on T3/T4 or in absolute
+# terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark
+# saturates at ~15.5x single-process result on 8-core processor,
+# or ~20.5GBps per 2.85GHz socket.
 
 $bits=32;
 for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
@@ -66,6 +75,10 @@ $Htbl="%i1";
 $inp="%i2";
 $len="%i3";
 
+$code.=<<___ if ($bits==64);
+.register	%g2,#scratch
+.register	%g3,#scratch
+___
 $code.=<<___;
 .section	".text",#alloc,#execinstr
 
@@ -321,10 +334,238 @@ gcm_gmult_4bit:
 	restore
 .type	gcm_gmult_4bit,#function
 .size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
-.asciz	"GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+{{{
+# Straightforward 128x128-bit multiplication using Karatsuba algorithm
+# followed by pair of 64-bit reductions [with a shortcut in first one,
+# which allowed to break dependency between reductions and remove one
+# multiplication from critical path]. While it might be suboptimal
+# with regard to sheer number of multiplications, other methods [such
+# as aggregate reduction] would require more 64-bit registers, which
+# we don't have in 32-bit application context.
+
+($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));
+
+($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
+	(map("%o$_",(0..5,7)),map("%g$_",(1..5)));
+
+($shl,$shr)=map("%l$_",(0..7));
+
+# For details regarding "twisted H" see ghash-x86.pl.
+$code.=<<___;
+.globl	gcm_init_vis3
+.align	32
+gcm_init_vis3:
+	save	%sp,-$frame,%sp
+
+	ldx	[%i1+0],$Hhi
+	ldx	[%i1+8],$Hlo
+	mov	0xE1,$Xhi
+	mov	1,$Xlo
+	sllx	$Xhi,57,$Xhi
+	srax	$Hhi,63,$C0		! broadcast carry
+	addcc	$Hlo,$Hlo,$Hlo		! H<<=1
+	addxc	$Hhi,$Hhi,$Hhi
+	and	$C0,$Xlo,$Xlo
+	and	$C0,$Xhi,$Xhi
+	xor	$Xlo,$Hlo,$Hlo
+	xor	$Xhi,$Hhi,$Hhi
+	stx	$Hlo,[%i0+8]		! save twisted H
+	stx	$Hhi,[%i0+0]
+
+	sethi	%hi(0xA0406080),$V
+	sethi	%hi(0x20C0E000),%l0
+	or	$V,%lo(0xA0406080),$V
+	or	%l0,%lo(0x20C0E000),%l0
+	sllx	$V,32,$V
+	or	%l0,$V,$V		! (0xE0�i)&0xff=0xA040608020C0E000
+	stx	$V,[%i0+16]
+
+	ret
+	restore
+.type	gcm_init_vis3,#function
+.size	gcm_init_vis3,.-gcm_init_vis3
+
+.globl	gcm_gmult_vis3
+.align	32
+gcm_gmult_vis3:
+	save	%sp,-$frame,%sp
+
+	ldx	[$Xip+8],$Xlo		! load Xi
+	ldx	[$Xip+0],$Xhi
+	ldx	[$Htable+8],$Hlo	! load twisted H
+	ldx	[$Htable+0],$Hhi
+
+	mov	0xE1,%l7
+	sllx	%l7,57,$xE1		! 57 is not a typo
+	ldx	[$Htable+16],$V		! (0xE0�i)&0xff=0xA040608020C0E000
+
+	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
+	xmulx	$Xlo,$Hlo,$C0
+	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
+	xmulx	$C2,$Hhl,$C1
+	xmulxhi	$Xlo,$Hlo,$Xlo
+	xmulxhi	$C2,$Hhl,$C2
+	xmulxhi	$Xhi,$Hhi,$C3
+	xmulx	$Xhi,$Hhi,$Xhi
+
+	sll	$C0,3,$sqr
+	srlx	$V,$sqr,$sqr		! �0xE0 [implicit &(7<<3)]
+	xor	$C0,$sqr,$sqr
+	sllx	$sqr,57,$sqr		! ($C0�0xE1)<<1<<56 [implicit &0x7f]
+
+	xor	$C0,$C1,$C1		! Karatsuba post-processing
+	xor	$Xlo,$C2,$C2
+	 xor	$sqr,$Xlo,$Xlo		! real destination is $C1
+	xor	$C3,$C2,$C2
+	xor	$Xlo,$C1,$C1
+	xor	$Xhi,$C2,$C2
+	xor	$Xhi,$C1,$C1
+
+	xmulxhi	$C0,$xE1,$Xlo		! �0xE1<<1<<56
+	 xor	$C0,$C2,$C2
+	xmulx	$C1,$xE1,$C0
+	 xor	$C1,$C3,$C3
+	xmulxhi	$C1,$xE1,$C1
+
+	xor	$Xlo,$C2,$C2
+	xor	$C0,$C2,$C2
+	xor	$C1,$C3,$C3
+
+	stx	$C2,[$Xip+8]		! save Xi
+	stx	$C3,[$Xip+0]
+
+	ret
+	restore
+.type	gcm_gmult_vis3,#function
+.size	gcm_gmult_vis3,.-gcm_gmult_vis3
+
+.globl	gcm_ghash_vis3
+.align	32
+gcm_ghash_vis3:
+	save	%sp,-$frame,%sp
+
+	ldx	[$Xip+8],$C2		! load Xi
+	ldx	[$Xip+0],$C3
+	ldx	[$Htable+8],$Hlo	! load twisted H
+	ldx	[$Htable+0],$Hhi
+
+	mov	0xE1,%l7
+	sllx	%l7,57,$xE1		! 57 is not a typo
+	ldx	[$Htable+16],$V		! (0xE0�i)&0xff=0xA040608020C0E000
+
+	and	$inp,7,$shl
+	andn	$inp,7,$inp
+	sll	$shl,3,$shl
+	prefetch [$inp+63], 20
+	sub	%g0,$shl,$shr
+
+	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
+.Loop:
+	ldx	[$inp+8],$Xlo
+	brz,pt	$shl,1f
+	ldx	[$inp+0],$Xhi
+
+	ldx	[$inp+16],$C1		! align data
+	srlx	$Xlo,$shr,$C0
+	sllx	$Xlo,$shl,$Xlo
+	sllx	$Xhi,$shl,$Xhi
+	srlx	$C1,$shr,$C1
+	or	$C0,$Xhi,$Xhi
+	or	$C1,$Xlo,$Xlo
+1:
+	add	$inp,16,$inp
+	sub	$len,16,$len
+	xor	$C2,$Xlo,$Xlo
+	xor	$C3,$Xhi,$Xhi
+	prefetch [$inp+63], 20
+
+	xmulx	$Xlo,$Hlo,$C0
+	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
+	xmulx	$C2,$Hhl,$C1
+	xmulxhi	$Xlo,$Hlo,$Xlo
+	xmulxhi	$C2,$Hhl,$C2
+	xmulxhi	$Xhi,$Hhi,$C3
+	xmulx	$Xhi,$Hhi,$Xhi
+
+	sll	$C0,3,$sqr
+	srlx	$V,$sqr,$sqr		! �0xE0 [implicit &(7<<3)]
+	xor	$C0,$sqr,$sqr
+	sllx	$sqr,57,$sqr		! ($C0�0xE1)<<1<<56 [implicit &0x7f]
+
+	xor	$C0,$C1,$C1		! Karatsuba post-processing
+	xor	$Xlo,$C2,$C2
+	 xor	$sqr,$Xlo,$Xlo		! real destination is $C1
+	xor	$C3,$C2,$C2
+	xor	$Xlo,$C1,$C1
+	xor	$Xhi,$C2,$C2
+	xor	$Xhi,$C1,$C1
+
+	xmulxhi	$C0,$xE1,$Xlo		! �0xE1<<1<<56
+	 xor	$C0,$C2,$C2
+	xmulx	$C1,$xE1,$C0
+	 xor	$C1,$C3,$C3
+	xmulxhi	$C1,$xE1,$C1
+
+	xor	$Xlo,$C2,$C2
+	xor	$C0,$C2,$C2
+	brnz,pt	$len,.Loop
+	xor	$C1,$C3,$C3
+
+	stx	$C2,[$Xip+8]		! save Xi
+	stx	$C3,[$Xip+0]
+
+	ret
+	restore
+.type	gcm_ghash_vis3,#function
+.size	gcm_ghash_vis3,.-gcm_ghash_vis3
+___
+}}}
+$code.=<<___;
+.asciz	"GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
 .align	4
 ___
 
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-print $code;
+
+# Purpose of these subroutines is to explicitly encode VIS instructions,
+# so that one can compile the module without having to specify VIS
+# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# Idea is to reserve for option to produce "universal" binary and let
+# programmer detect if current CPU is VIS capable at run-time.
+sub unvis3 {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
+my ($ref,$opf);
+my %visopf = (	"addxc"		=> 0x011,
+		"addxccc"	=> 0x013,
+		"xmulx"		=> 0x115,
+		"xmulxhi"	=> 0x116	);
+
+    $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+    if ($opf=$visopf{$mnemonic}) {
+	foreach ($rs1,$rs2,$rd) {
+	    return $ref if (!/%([goli])([0-9])/);
+	    $_=$bias{$1}+$2;
+	}
+
+	return	sprintf ".word\t0x%08x !%s",
+			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+			$ref;
+    } else {
+	return $ref;
+    }
+}
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/ge;
+
+	s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
+		&unvis3($1,$2,$3,$4)
+	 /ge;
+
+	print $_,"\n";
+}
+
 close STDOUT;
diff --git a/openssl/crypto/modes/asm/ghash-x86.pl b/openssl/crypto/modes/asm/ghash-x86.pl
index 83c727e07..23a5527b3 100644
--- a/openssl/crypto/modes/asm/ghash-x86.pl
+++ b/openssl/crypto/modes/asm/ghash-x86.pl
@@ -12,25 +12,27 @@
 # The module implements "4-bit" GCM GHASH function and underlying
 # single multiplication operation in GF(2^128). "4-bit" means that it
 # uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
-# code paths: vanilla x86 and vanilla MMX. Former will be executed on
-# 486 and Pentium, latter on all others. MMX GHASH features so called
+# code paths: vanilla x86 and vanilla SSE. Former will be executed on
+# 486 and Pentium, latter on all others. SSE GHASH features so called
 # "528B" variant of "4-bit" method utilizing additional 256+16 bytes
 # of per-key storage [+512 bytes shared table]. Performance results
 # are for streamed GHASH subroutine and are expressed in cycles per
 # processed byte, less is better:
 #
-#		gcc 2.95.3(*)	MMX assembler	x86 assembler
+#		gcc 2.95.3(*)	SSE assembler	x86 assembler
 #
 # Pentium	105/111(**)	-		50
 # PIII		68 /75		12.2		24
 # P4		125/125		17.8		84(***)
 # Opteron	66 /70		10.1		30
 # Core2		54 /67		8.4		18
+# Atom		105/105		16.8		53
+# VIA Nano	69 /71		13.0		27
 #
 # (*)	gcc 3.4.x was observed to generate few percent slower code,
 #	which is one of reasons why 2.95.3 results were chosen,
 #	another reason is lack of 3.4.x results for older CPUs;
-#	comparison with MMX results is not completely fair, because C
+#	comparison with SSE results is not completely fair, because C
 #	results are for vanilla "256B" implementation, while
 #	assembler results are for "528B";-)
 # (**)	second number is result for code compiled with -fPIC flag,
@@ -40,8 +42,8 @@
 #
 # To summarize, it's >2-5 times faster than gcc-generated code. To
 # anchor it to something else SHA1 assembler processes one byte in
-# 11-13 cycles on contemporary x86 cores. As for choice of MMX in
-# particular, see comment at the end of the file...
+# ~7 cycles on contemporary x86 cores. As for choice of MMX/SSE
+# in particular, see comment at the end of the file...
 
 # May 2010
 #
@@ -113,6 +115,16 @@
 # similar manner resulted in almost 20% degradation on Sandy Bridge,
 # where original 64-bit code processes one byte in 1.95 cycles.
 
+#####################################################################
+# For reference, AMD Bulldozer processes one byte in 1.98 cycles in
+# 32-bit mode and 1.89 in 64-bit.
+
+# February 2013
+#
+# Overhaul: aggregate Karatsuba post-processing, improve ILP in
+# reduction_alg9. Resulting performance is 1.96 cycles per byte on
+# Westmere, 1.95 - on Sandy/Ivy Bridge, 1.76 - on Bulldozer.
+
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
@@ -822,17 +834,18 @@ $len="ebx";
 &static_label("bswap");
 
 sub clmul64x64_T2 {	# minimal "register" pressure
-my ($Xhi,$Xi,$Hkey)=@_;
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
 
 	&movdqa		($Xhi,$Xi);		#
 	&pshufd		($T1,$Xi,0b01001110);
-	&pshufd		($T2,$Hkey,0b01001110);
+	&pshufd		($T2,$Hkey,0b01001110)	if (!defined($HK));
 	&pxor		($T1,$Xi);		#
-	&pxor		($T2,$Hkey);
+	&pxor		($T2,$Hkey)		if (!defined($HK));
+			$HK=$T2			if (!defined($HK));
 
 	&pclmulqdq	($Xi,$Hkey,0x00);	#######
 	&pclmulqdq	($Xhi,$Hkey,0x11);	#######
-	&pclmulqdq	($T1,$T2,0x00);		#######
+	&pclmulqdq	($T1,$HK,0x00);		#######
 	&xorps		($T1,$Xi);		#
 	&xorps		($T1,$Xhi);		#
 
@@ -879,31 +892,32 @@ if (1) {		# Algorithm 9 with <<1 twist.
 			# below. Algorithm 9 was therefore chosen for
 			# further optimization...
 
-sub reduction_alg9 {	# 17/13 times faster than Intel version
+sub reduction_alg9 {	# 17/11 times faster than Intel version
 my ($Xhi,$Xi) = @_;
 
 	# 1st phase
-	&movdqa		($T1,$Xi);		#
+	&movdqa		($T2,$Xi);		#
+	&movdqa		($T1,$Xi);
+	&psllq		($Xi,5);
+	&pxor		($T1,$Xi);		#
 	&psllq		($Xi,1);
 	&pxor		($Xi,$T1);		#
-	&psllq		($Xi,5);		#
-	&pxor		($Xi,$T1);		#
 	&psllq		($Xi,57);		#
-	&movdqa		($T2,$Xi);		#
+	&movdqa		($T1,$Xi);		#
 	&pslldq		($Xi,8);
-	&psrldq		($T2,8);		#
-	&pxor		($Xi,$T1);
-	&pxor		($Xhi,$T2);		#
+	&psrldq		($T1,8);		#	
+	&pxor		($Xi,$T2);
+	&pxor		($Xhi,$T1);		#
 
 	# 2nd phase
 	&movdqa		($T2,$Xi);
+	&psrlq		($Xi,1);
+	&pxor		($Xhi,$T2);		#
+	&pxor		($T2,$Xi);
 	&psrlq		($Xi,5);
 	&pxor		($Xi,$T2);		#
 	&psrlq		($Xi,1);		#
-	&pxor		($Xi,$T2);		#
-	&pxor		($T2,$Xhi);
-	&psrlq		($Xi,1);		#
-	&pxor		($Xi,$T2);		#
+	&pxor		($Xi,$Xhi)		#
 }
 
 &function_begin_B("gcm_init_clmul");
@@ -937,8 +951,14 @@ my ($Xhi,$Xi) = @_;
 	&clmul64x64_T2	($Xhi,$Xi,$Hkey);
 	&reduction_alg9	($Xhi,$Xi);
 
+	&pshufd		($T1,$Hkey,0b01001110);
+	&pshufd		($T2,$Xi,0b01001110);
+	&pxor		($T1,$Hkey);		# Karatsuba pre-processing
 	&movdqu		(&QWP(0,$Htbl),$Hkey);	# save H
+	&pxor		($T2,$Xi);		# Karatsuba pre-processing
 	&movdqu		(&QWP(16,$Htbl),$Xi);	# save H^2
+	&palignr	($T2,$T1,8);		# low part is H.lo^H.hi
+	&movdqu		(&QWP(32,$Htbl),$T2);	# save Karatsuba "salt"
 
 	&ret		();
 &function_end_B("gcm_init_clmul");
@@ -956,8 +976,9 @@ my ($Xhi,$Xi) = @_;
 	&movdqa		($T3,&QWP(0,$const));
 	&movups		($Hkey,&QWP(0,$Htbl));
 	&pshufb		($Xi,$T3);
+	&movups		($T2,&QWP(32,$Htbl));
 
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey);
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$T2);
 	&reduction_alg9	($Xhi,$Xi);
 
 	&pshufb		($Xi,$T3);
@@ -994,79 +1015,109 @@ my ($Xhi,$Xi) = @_;
 	&movdqu		($Xn,&QWP(16,$inp));	# Ii+1
 	&pshufb		($T1,$T3);
 	&pshufb		($Xn,$T3);
+	&movdqu		($T3,&QWP(32,$Htbl));
 	&pxor		($Xi,$T1);		# Ii+Xi
 
-	&clmul64x64_T2	($Xhn,$Xn,$Hkey);	# H*Ii+1
+	&pshufd		($T1,$Xn,0b01001110);	# H*Ii+1
+	&movdqa		($Xhn,$Xn);
+	&pxor		($T1,$Xn);		#
+	&lea		($inp,&DWP(32,$inp));	# i+=2
+
+	&pclmulqdq	($Xn,$Hkey,0x00);	#######
+	&pclmulqdq	($Xhn,$Hkey,0x11);	#######
+	&pclmulqdq	($T1,$T3,0x00);		#######
 	&movups		($Hkey,&QWP(16,$Htbl));	# load H^2
+	&nop		();
 
-	&lea		($inp,&DWP(32,$inp));	# i+=2
 	&sub		($len,0x20);
 	&jbe		(&label("even_tail"));
+	&jmp		(&label("mod_loop"));
 
-&set_label("mod_loop");
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H^2*(Ii+Xi)
-	&movdqu		($T1,&QWP(0,$inp));	# Ii
-	&movups		($Hkey,&QWP(0,$Htbl));	# load H
+&set_label("mod_loop",32);
+	&pshufd		($T2,$Xi,0b01001110);	# H^2*(Ii+Xi)
+	&movdqa		($Xhi,$Xi);
+	&pxor		($T2,$Xi);		#
+	&nop		();
 
-	&pxor		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
-	&pxor		($Xhi,$Xhn);
+	&pclmulqdq	($Xi,$Hkey,0x00);	#######
+	&pclmulqdq	($Xhi,$Hkey,0x11);	#######
+	&pclmulqdq	($T2,$T3,0x10);		#######
+	&movups		($Hkey,&QWP(0,$Htbl));	# load H
 
-	&movdqu		($Xn,&QWP(16,$inp));	# Ii+1
-	&pshufb		($T1,$T3);
-	&pshufb		($Xn,$T3);
+	&xorps		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
+	&movdqa		($T3,&QWP(0,$const));
+	&xorps		($Xhi,$Xhn);
+	 &movdqu	($Xhn,&QWP(0,$inp));	# Ii
+	&pxor		($T1,$Xi);		# aggregated Karatsuba post-processing
+	 &movdqu	($Xn,&QWP(16,$inp));	# Ii+1
+	&pxor		($T1,$Xhi);		#
 
-	&movdqa		($T3,$Xn);		#&clmul64x64_TX	($Xhn,$Xn,$Hkey); H*Ii+1
-	&movdqa		($Xhn,$Xn);
-	 &pxor		($Xhi,$T1);		# "Ii+Xi", consume early
+	 &pshufb	($Xhn,$T3);
+	&pxor		($T2,$T1);		#
 
-	  &movdqa	($T1,$Xi);		#&reduction_alg9($Xhi,$Xi); 1st phase
+	&movdqa		($T1,$T2);		#
+	&psrldq		($T2,8);
+	&pslldq		($T1,8);		#
+	&pxor		($Xhi,$T2);
+	&pxor		($Xi,$T1);		#
+	 &pshufb	($Xn,$T3);
+	 &pxor		($Xhi,$Xhn);		# "Ii+Xi", consume early
+
+	&movdqa		($Xhn,$Xn);		#&clmul64x64_TX	($Xhn,$Xn,$Hkey); H*Ii+1
+	  &movdqa	($T2,$Xi);		#&reduction_alg9($Xhi,$Xi); 1st phase
+	  &movdqa	($T1,$Xi);
+	  &psllq	($Xi,5);
+	  &pxor		($T1,$Xi);		#
 	  &psllq	($Xi,1);
 	  &pxor		($Xi,$T1);		#
-	  &psllq	($Xi,5);		#
-	  &pxor		($Xi,$T1);		#
 	&pclmulqdq	($Xn,$Hkey,0x00);	#######
+	&movups		($T3,&QWP(32,$Htbl));
 	  &psllq	($Xi,57);		#
-	  &movdqa	($T2,$Xi);		#
+	  &movdqa	($T1,$Xi);		#
 	  &pslldq	($Xi,8);
-	  &psrldq	($T2,8);		#	
-	  &pxor		($Xi,$T1);
-	&pshufd		($T1,$T3,0b01001110);
+	  &psrldq	($T1,8);		#	
+	  &pxor		($Xi,$T2);
+	  &pxor		($Xhi,$T1);		#
+	&pshufd		($T1,$Xhn,0b01001110);
+	  &movdqa	($T2,$Xi);		# 2nd phase
+	  &psrlq	($Xi,1);
+	&pxor		($T1,$Xhn);
 	  &pxor		($Xhi,$T2);		#
-	&pxor		($T1,$T3);
-	&pshufd		($T3,$Hkey,0b01001110);
-	&pxor		($T3,$Hkey);		#
-
 	&pclmulqdq	($Xhn,$Hkey,0x11);	#######
-	  &movdqa	($T2,$Xi);		# 2nd phase
+	&movups		($Hkey,&QWP(16,$Htbl));	# load H^2
+	  &pxor		($T2,$Xi);
 	  &psrlq	($Xi,5);
 	  &pxor		($Xi,$T2);		#
 	  &psrlq	($Xi,1);		#
-	  &pxor		($Xi,$T2);		#
-	  &pxor		($T2,$Xhi);
-	  &psrlq	($Xi,1);		#
-	  &pxor		($Xi,$T2);		#
-
+	  &pxor		($Xi,$Xhi)		#
 	&pclmulqdq	($T1,$T3,0x00);		#######
-	&movups		($Hkey,&QWP(16,$Htbl));	# load H^2
-	&xorps		($T1,$Xn);		#
-	&xorps		($T1,$Xhn);		#
-
-	&movdqa		($T3,$T1);		#
-	&psrldq		($T1,8);
-	&pslldq		($T3,8);		#
-	&pxor		($Xhn,$T1);
-	&pxor		($Xn,$T3);		#
-	&movdqa		($T3,&QWP(0,$const));
 
 	&lea		($inp,&DWP(32,$inp));
 	&sub		($len,0x20);
 	&ja		(&label("mod_loop"));
 
 &set_label("even_tail");
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H^2*(Ii+Xi)
+	&pshufd		($T2,$Xi,0b01001110);	# H^2*(Ii+Xi)
+	&movdqa		($Xhi,$Xi);
+	&pxor		($T2,$Xi);		#
 
-	&pxor		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
-	&pxor		($Xhi,$Xhn);
+	&pclmulqdq	($Xi,$Hkey,0x00);	#######
+	&pclmulqdq	($Xhi,$Hkey,0x11);	#######
+	&pclmulqdq	($T2,$T3,0x10);		#######
+	&movdqa		($T3,&QWP(0,$const));
+
+	&xorps		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
+	&xorps		($Xhi,$Xhn);
+	&pxor		($T1,$Xi);		# aggregated Karatsuba post-processing
+	&pxor		($T1,$Xhi);		#
+
+	&pxor		($T2,$T1);		#
+
+	&movdqa		($T1,$T2);		#
+	&psrldq		($T2,8);
+	&pslldq		($T1,8);		#
+	&pxor		($Xhi,$T2);
+	&pxor		($Xi,$T1);		#
 
 	&reduction_alg9	($Xhi,$Xi);
 
@@ -1273,13 +1324,6 @@ my ($Xhi,$Xi)=@_;
 &set_label("bswap",64);
 	&data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
 	&data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2);	# 0x1c2_polynomial
-}}	# $sse2
-
-&set_label("rem_4bit",64);
-	&data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
-	&data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
-	&data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
-	&data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
 &set_label("rem_8bit",64);
 	&data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
 	&data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
@@ -1313,6 +1357,13 @@ my ($Xhi,$Xi)=@_;
 	&data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
 	&data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
 	&data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
+}}	# $sse2
+
+&set_label("rem_4bit",64);
+	&data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
+	&data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
+	&data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
+	&data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
 }}}	# !$x86only
 
 &asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
diff --git a/openssl/crypto/modes/asm/ghash-x86_64.pl b/openssl/crypto/modes/asm/ghash-x86_64.pl
index 38d779edb..6e656ca13 100644
--- a/openssl/crypto/modes/asm/ghash-x86_64.pl
+++ b/openssl/crypto/modes/asm/ghash-x86_64.pl
@@ -22,6 +22,8 @@
 # P4		28.6		14.0		+100%
 # Opteron	19.3		7.7		+150%
 # Core2		17.8		8.1(**)		+120%
+# Atom		31.6		16.8		+88%
+# VIA Nano	21.8		10.1		+115%
 #
 # (*)	comparison is not completely fair, because C results are
 #	for vanilla "256B" implementation, while assembler results
@@ -39,6 +41,44 @@
 # providing access to a Westmere-based system on behalf of Intel
 # Open Source Technology Centre.
 
+# December 2012
+#
+# Overhaul: aggregate Karatsuba post-processing, improve ILP in
+# reduction_alg9, increase reduction aggregate factor to 4x. As for
+# the latter. ghash-x86.pl discusses that it makes lesser sense to
+# increase aggregate factor. Then why increase here? Critical path
+# consists of 3 independent pclmulqdq instructions, Karatsuba post-
+# processing and reduction. "On top" of this we lay down aggregated
+# multiplication operations, triplets of independent pclmulqdq's. As
+# issue rate for pclmulqdq is limited, it makes lesser sense to
+# aggregate more multiplications than it takes to perform remaining
+# non-multiplication operations. 2x is near-optimal coefficient for
+# contemporary Intel CPUs (therefore modest improvement coefficient),
+# but not for Bulldozer. Latter is because logical SIMD operations
+# are twice as slow in comparison to Intel, so that critical path is
+# longer. A CPU with higher pclmulqdq issue rate would also benefit
+# from higher aggregate factor...
+#
+# Westmere	1.78(+13%)
+# Sandy Bridge	1.80(+8%)
+# Ivy Bridge	1.80(+7%)
+# Haswell	0.55(+93%) (if system doesn't support AVX)
+# Broadwell	0.45(+110%)(if system doesn't support AVX)
+# Bulldozer	1.49(+27%)
+# Silvermont	2.88(+13%)
+
+# March 2013
+#
+# ... 8x aggregate factor AVX code path is using reduction algorithm
+# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
+# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
+# sub-optimally in comparison to above mentioned version. But thanks
+# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
+# it performs in 0.41 cycles per byte on Haswell processor, and in
+# 0.29 on Broadwell.
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+
 $flavour = shift;
 $output  = shift;
 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -50,9 +90,30 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";
 
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+	$avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+	$avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+	$avx = ($1>=10) + ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+	$avx = ($2>=3.0) + ($2>3.0);
+}
+
 open OUT,"| \"$^X\" $xlate $flavour $output";
 *STDOUT=*OUT;
 
+$do4xaggr=1;
+
 # common register layout
 $nlo="%rax";
 $nhi="%rbx";
@@ -160,6 +221,7 @@ ___
 
 $code=<<___;
 .text
+.extern	OPENSSL_ia32cap_P
 
 .globl	gcm_gmult_4bit
 .type	gcm_gmult_4bit,\@function,2
@@ -352,19 +414,27 @@ ___
 ($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
 
 sub clmul64x64_T2 {	# minimal register pressure
-my ($Xhi,$Xi,$Hkey,$modulo)=@_;
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
 
-$code.=<<___ if (!defined($modulo));
+if (!defined($HK)) {	$HK = $T2;
+$code.=<<___;
 	movdqa		$Xi,$Xhi		#
 	pshufd		\$0b01001110,$Xi,$T1
 	pshufd		\$0b01001110,$Hkey,$T2
 	pxor		$Xi,$T1			#
 	pxor		$Hkey,$T2
 ___
+} else {
+$code.=<<___;
+	movdqa		$Xi,$Xhi		#
+	pshufd		\$0b01001110,$Xi,$T1
+	pxor		$Xi,$T1			#
+___
+}
 $code.=<<___;
 	pclmulqdq	\$0x00,$Hkey,$Xi	#######
 	pclmulqdq	\$0x11,$Hkey,$Xhi	#######
-	pclmulqdq	\$0x00,$T2,$T1		#######
+	pclmulqdq	\$0x00,$HK,$T1		#######
 	pxor		$Xi,$T1			#
 	pxor		$Xhi,$T1		#
 
@@ -376,42 +446,53 @@ $code.=<<___;
 ___
 }
 
-sub reduction_alg9 {	# 17/13 times faster than Intel version
+sub reduction_alg9 {	# 17/11 times faster than Intel version
 my ($Xhi,$Xi) = @_;
 
 $code.=<<___;
 	# 1st phase
-	movdqa		$Xi,$T1			#
+	movdqa		$Xi,$T2			#
+	movdqa		$Xi,$T1
+	psllq		\$5,$Xi
+	pxor		$Xi,$T1			#
 	psllq		\$1,$Xi
 	pxor		$T1,$Xi			#
-	psllq		\$5,$Xi			#
-	pxor		$T1,$Xi			#
 	psllq		\$57,$Xi		#
-	movdqa		$Xi,$T2			#
+	movdqa		$Xi,$T1			#
 	pslldq		\$8,$Xi
-	psrldq		\$8,$T2			#	
-	pxor		$T1,$Xi
-	pxor		$T2,$Xhi		#
+	psrldq		\$8,$T1			#	
+	pxor		$T2,$Xi
+	pxor		$T1,$Xhi		#
 
 	# 2nd phase
 	movdqa		$Xi,$T2
+	psrlq		\$1,$Xi
+	pxor		$T2,$Xhi		#
+	pxor		$Xi,$T2
 	psrlq		\$5,$Xi
 	pxor		$T2,$Xi			#
 	psrlq		\$1,$Xi			#
-	pxor		$T2,$Xi			#
-	pxor		$Xhi,$T2
-	psrlq		\$1,$Xi			#
-	pxor		$T2,$Xi			#
+	pxor		$Xhi,$Xi		#
 ___
 }
 
 { my ($Htbl,$Xip)=@_4args;
+  my $HK="%xmm6";
 
 $code.=<<___;
 .globl	gcm_init_clmul
 .type	gcm_init_clmul,\@abi-omnipotent
 .align	16
 gcm_init_clmul:
+.L_init_clmul:
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_gcm_init_clmul:
+	# I can't trust assembler to use specific encoding:-(
+	.byte	0x48,0x83,0xec,0x18		#sub	$0x18,%rsp
+	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
+___
+$code.=<<___;
 	movdqu		($Xip),$Hkey
 	pshufd		\$0b01001110,$Hkey,$Hkey	# dword swap
 
@@ -430,13 +511,47 @@ gcm_init_clmul:
 	pxor		$T3,$Hkey		# if(carry) H^=0x1c2_polynomial
 
 	# calculate H^2
+	pshufd		\$0b01001110,$Hkey,$HK
 	movdqa		$Hkey,$Xi
+	pxor		$Hkey,$HK
 ___
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey);
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);
 	&reduction_alg9	($Xhi,$Xi);
 $code.=<<___;
-	movdqu		$Hkey,($Htbl)		# save H
-	movdqu		$Xi,16($Htbl)		# save H^2
+	pshufd		\$0b01001110,$Hkey,$T1
+	pshufd		\$0b01001110,$Xi,$T2
+	pxor		$Hkey,$T1		# Karatsuba pre-processing
+	movdqu		$Hkey,0x00($Htbl)	# save H
+	pxor		$Xi,$T2			# Karatsuba pre-processing
+	movdqu		$Xi,0x10($Htbl)		# save H^2
+	palignr		\$8,$T1,$T2		# low part is H.lo^H.hi...
+	movdqu		$T2,0x20($Htbl)		# save Karatsuba "salt"
+___
+if ($do4xaggr) {
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^3
+	&reduction_alg9	($Xhi,$Xi);
+$code.=<<___;
+	movdqa		$Xi,$T3
+___
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^4
+	&reduction_alg9	($Xhi,$Xi);
+$code.=<<___;
+	pshufd		\$0b01001110,$T3,$T1
+	pshufd		\$0b01001110,$Xi,$T2
+	pxor		$T3,$T1			# Karatsuba pre-processing
+	movdqu		$T3,0x30($Htbl)		# save H^3
+	pxor		$Xi,$T2			# Karatsuba pre-processing
+	movdqu		$Xi,0x40($Htbl)		# save H^4
+	palignr		\$8,$T1,$T2		# low part is H^3.lo^H^3.hi...
+	movdqu		$T2,0x50($Htbl)		# save Karatsuba "salt"
+___
+}
+$code.=<<___ if ($win64);
+	movaps	(%rsp),%xmm6
+	lea	0x18(%rsp),%rsp
+.LSEH_end_gcm_init_clmul:
+___
+$code.=<<___;
 	ret
 .size	gcm_init_clmul,.-gcm_init_clmul
 ___
@@ -449,13 +564,38 @@ $code.=<<___;
 .type	gcm_gmult_clmul,\@abi-omnipotent
 .align	16
 gcm_gmult_clmul:
+.L_gmult_clmul:
 	movdqu		($Xip),$Xi
 	movdqa		.Lbswap_mask(%rip),$T3
 	movdqu		($Htbl),$Hkey
+	movdqu		0x20($Htbl),$T2
 	pshufb		$T3,$Xi
 ___
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey);
-	&reduction_alg9	($Xhi,$Xi);
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$T2);
+$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
+	# experimental alternative. special thing about is that there
+	# no dependency between the two multiplications... 
+	mov		\$`0xE1<<1`,%eax
+	mov		\$0xA040608020C0E000,%r10	# ((7..0)�0xE0)&0xff
+	mov		\$0x07,%r11d
+	movq		%rax,$T1
+	movq		%r10,$T2
+	movq		%r11,$T3		# borrow $T3
+	pand		$Xi,$T3
+	pshufb		$T3,$T2			# ($Xi&7)�0xE0
+	movq		%rax,$T3
+	pclmulqdq	\$0x00,$Xi,$T1		# �(0xE1<<1)
+	pxor		$Xi,$T2
+	pslldq		\$15,$T2
+	paddd		$T2,$T2			# <<(64+56+1)
+	pxor		$T2,$Xi
+	pclmulqdq	\$0x01,$T3,$Xi
+	movdqa		.Lbswap_mask(%rip),$T3	# reload $T3
+	psrldq		\$1,$T1
+	pxor		$T1,$Xhi
+	pslldq		\$7,$Xi
+	pxor		$Xhi,$Xi
+___
 $code.=<<___;
 	pshufb		$T3,$Xi
 	movdqu		$Xi,($Xip)
@@ -465,129 +605,327 @@ ___
 }
 
 { my ($Xip,$Htbl,$inp,$len)=@_4args;
-  my $Xn="%xmm6";
-  my $Xhn="%xmm7";
-  my $Hkey2="%xmm8";
-  my $T1n="%xmm9";
-  my $T2n="%xmm10";
+  my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
+  my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
 
 $code.=<<___;
 .globl	gcm_ghash_clmul
 .type	gcm_ghash_clmul,\@abi-omnipotent
-.align	16
+.align	32
 gcm_ghash_clmul:
+.L_ghash_clmul:
 ___
 $code.=<<___ if ($win64);
+	lea	-0x88(%rsp),%rax
 .LSEH_begin_gcm_ghash_clmul:
 	# I can't trust assembler to use specific encoding:-(
-	.byte	0x48,0x83,0xec,0x58		#sub	\$0x58,%rsp
-	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
-	.byte	0x0f,0x29,0x7c,0x24,0x10	#movdqa	%xmm7,0x10(%rsp)
-	.byte	0x44,0x0f,0x29,0x44,0x24,0x20	#movaps	%xmm8,0x20(%rsp)
-	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30	#movaps	%xmm9,0x30(%rsp)
-	.byte	0x44,0x0f,0x29,0x54,0x24,0x40	#movaps	%xmm10,0x40(%rsp)
+	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
+	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6,-0x20(%rax)
+	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7,-0x10(%rax)
+	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8,0(%rax)
+	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9,0x10(%rax)
+	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10,0x20(%rax)
+	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11,0x30(%rax)
+	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12,0x40(%rax)
+	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13,0x50(%rax)
+	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14,0x60(%rax)
+	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15,0x70(%rax)
 ___
 $code.=<<___;
 	movdqa		.Lbswap_mask(%rip),$T3
 
 	movdqu		($Xip),$Xi
 	movdqu		($Htbl),$Hkey
+	movdqu		0x20($Htbl),$HK
 	pshufb		$T3,$Xi
 
 	sub		\$0x10,$len
 	jz		.Lodd_tail
 
-	movdqu		16($Htbl),$Hkey2
+	movdqu		0x10($Htbl),$Hkey2
+___
+if ($do4xaggr) {
+my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
+
+$code.=<<___;
+	mov		OPENSSL_ia32cap_P+4(%rip),%eax
+	cmp		\$0x30,$len
+	jb		.Lskip4x
+
+	and		\$`1<<26|1<<22`,%eax	# isolate MOVBE+XSAVE
+	cmp		\$`1<<22`,%eax		# check for MOVBE without XSAVE
+	je		.Lskip4x
+
+	sub		\$0x30,$len
+	mov		\$0xA040608020C0E000,%rax	# ((7..0)�0xE0)&0xff
+	movdqu		0x30($Htbl),$Hkey3
+	movdqu		0x40($Htbl),$Hkey4
+
+	#######
+	# Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
+	#
+	movdqu		0x30($inp),$Xln
+	 movdqu		0x20($inp),$Xl
+	pshufb		$T3,$Xln
+	 pshufb		$T3,$Xl
+	movdqa		$Xln,$Xhn
+	pshufd		\$0b01001110,$Xln,$Xmn
+	pxor		$Xln,$Xmn
+	pclmulqdq	\$0x00,$Hkey,$Xln
+	pclmulqdq	\$0x11,$Hkey,$Xhn
+	pclmulqdq	\$0x00,$HK,$Xmn
+
+	movdqa		$Xl,$Xh
+	pshufd		\$0b01001110,$Xl,$Xm
+	pxor		$Xl,$Xm
+	pclmulqdq	\$0x00,$Hkey2,$Xl
+	pclmulqdq	\$0x11,$Hkey2,$Xh
+	pclmulqdq	\$0x10,$HK,$Xm
+	xorps		$Xl,$Xln
+	xorps		$Xh,$Xhn
+	movups		0x50($Htbl),$HK
+	xorps		$Xm,$Xmn
+
+	movdqu		0x10($inp),$Xl
+	 movdqu		0($inp),$T1
+	pshufb		$T3,$Xl
+	 pshufb		$T3,$T1
+	movdqa		$Xl,$Xh
+	pshufd		\$0b01001110,$Xl,$Xm
+	 pxor		$T1,$Xi
+	pxor		$Xl,$Xm
+	pclmulqdq	\$0x00,$Hkey3,$Xl
+	 movdqa		$Xi,$Xhi
+	 pshufd		\$0b01001110,$Xi,$T1
+	 pxor		$Xi,$T1
+	pclmulqdq	\$0x11,$Hkey3,$Xh
+	pclmulqdq	\$0x00,$HK,$Xm
+	xorps		$Xl,$Xln
+	xorps		$Xh,$Xhn
+
+	lea	0x40($inp),$inp
+	sub	\$0x40,$len
+	jc	.Ltail4x
+
+	jmp	.Lmod4_loop
+.align	32
+.Lmod4_loop:
+	pclmulqdq	\$0x00,$Hkey4,$Xi
+	xorps		$Xm,$Xmn
+	 movdqu		0x30($inp),$Xl
+	 pshufb		$T3,$Xl
+	pclmulqdq	\$0x11,$Hkey4,$Xhi
+	xorps		$Xln,$Xi
+	 movdqu		0x20($inp),$Xln
+	 movdqa		$Xl,$Xh
+	pclmulqdq	\$0x10,$HK,$T1
+	 pshufd		\$0b01001110,$Xl,$Xm
+	xorps		$Xhn,$Xhi
+	 pxor		$Xl,$Xm
+	 pshufb		$T3,$Xln
+	movups		0x20($Htbl),$HK
+	xorps		$Xmn,$T1
+	 pclmulqdq	\$0x00,$Hkey,$Xl
+	 pshufd		\$0b01001110,$Xln,$Xmn
+
+	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
+	 movdqa		$Xln,$Xhn
+	pxor		$Xhi,$T1		#
+	 pxor		$Xln,$Xmn
+	movdqa		$T1,$T2			#
+	 pclmulqdq	\$0x11,$Hkey,$Xh
+	pslldq		\$8,$T1
+	psrldq		\$8,$T2			#
+	pxor		$T1,$Xi
+	movdqa		.L7_mask(%rip),$T1
+	pxor		$T2,$Xhi		#
+	movq		%rax,$T2
+
+	pand		$Xi,$T1			# 1st phase
+	pshufb		$T1,$T2			#
+	pxor		$Xi,$T2			#
+	 pclmulqdq	\$0x00,$HK,$Xm
+	psllq		\$57,$T2		#
+	movdqa		$T2,$T1			#
+	pslldq		\$8,$T2
+	 pclmulqdq	\$0x00,$Hkey2,$Xln
+	psrldq		\$8,$T1			#	
+	pxor		$T2,$Xi
+	pxor		$T1,$Xhi		#
+	movdqu		0($inp),$T1
+
+	movdqa		$Xi,$T2			# 2nd phase
+	psrlq		\$1,$Xi
+	 pclmulqdq	\$0x11,$Hkey2,$Xhn
+	 xorps		$Xl,$Xln
+	 movdqu		0x10($inp),$Xl
+	 pshufb		$T3,$Xl
+	 pclmulqdq	\$0x10,$HK,$Xmn
+	 xorps		$Xh,$Xhn
+	 movups		0x50($Htbl),$HK
+	pshufb		$T3,$T1
+	pxor		$T2,$Xhi		#
+	pxor		$Xi,$T2
+	psrlq		\$5,$Xi
+
+	 movdqa		$Xl,$Xh
+	 pxor		$Xm,$Xmn
+	 pshufd		\$0b01001110,$Xl,$Xm
+	pxor		$T2,$Xi			#
+	pxor		$T1,$Xhi
+	 pxor		$Xl,$Xm
+	 pclmulqdq	\$0x00,$Hkey3,$Xl
+	psrlq		\$1,$Xi			#
+	pxor		$Xhi,$Xi		#
+	movdqa		$Xi,$Xhi
+	 pclmulqdq	\$0x11,$Hkey3,$Xh
+	 xorps		$Xl,$Xln
+	pshufd		\$0b01001110,$Xi,$T1
+	pxor		$Xi,$T1
+
+	 pclmulqdq	\$0x00,$HK,$Xm
+	 xorps		$Xh,$Xhn
+
+	lea	0x40($inp),$inp
+	sub	\$0x40,$len
+	jnc	.Lmod4_loop
+
+.Ltail4x:
+	pclmulqdq	\$0x00,$Hkey4,$Xi
+	pclmulqdq	\$0x11,$Hkey4,$Xhi
+	pclmulqdq	\$0x10,$HK,$T1
+	xorps		$Xm,$Xmn
+	xorps		$Xln,$Xi
+	xorps		$Xhn,$Xhi
+	pxor		$Xi,$Xhi		# aggregated Karatsuba post-processing
+	pxor		$Xmn,$T1
+
+	pxor		$Xhi,$T1		#
+	pxor		$Xi,$Xhi
+
+	movdqa		$T1,$T2			#
+	psrldq		\$8,$T1
+	pslldq		\$8,$T2			#
+	pxor		$T1,$Xhi
+	pxor		$T2,$Xi			#
+___
+	&reduction_alg9($Xhi,$Xi);
+$code.=<<___;
+	add	\$0x40,$len
+	jz	.Ldone
+	movdqu	0x20($Htbl),$HK
+	sub	\$0x10,$len
+	jz	.Lodd_tail
+.Lskip4x:
+___
+}
+$code.=<<___;
 	#######
 	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
 	#	[(H*Ii+1) + (H*Xi+1)] mod P =
 	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
 	#
 	movdqu		($inp),$T1		# Ii
-	movdqu		16($inp),$Xn		# Ii+1
+	movdqu		16($inp),$Xln		# Ii+1
 	pshufb		$T3,$T1
-	pshufb		$T3,$Xn
+	pshufb		$T3,$Xln
 	pxor		$T1,$Xi			# Ii+Xi
-___
-	&clmul64x64_T2	($Xhn,$Xn,$Hkey);	# H*Ii+1
-$code.=<<___;
-	movdqa		$Xi,$Xhi		#
-	pshufd		\$0b01001110,$Xi,$T1
-	pshufd		\$0b01001110,$Hkey2,$T2
-	pxor		$Xi,$T1			#
-	pxor		$Hkey2,$T2
+
+	movdqa		$Xln,$Xhn
+	pshufd		\$0b01001110,$Xln,$Xmn
+	pxor		$Xln,$Xmn
+	pclmulqdq	\$0x00,$Hkey,$Xln
+	pclmulqdq	\$0x11,$Hkey,$Xhn
+	pclmulqdq	\$0x00,$HK,$Xmn
 
 	lea		32($inp),$inp		# i+=2
+	nop
 	sub		\$0x20,$len
 	jbe		.Leven_tail
+	nop
+	jmp		.Lmod_loop
 
+.align	32
 .Lmod_loop:
-___
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey2,1);	# H^2*(Ii+Xi)
-$code.=<<___;
-	movdqu		($inp),$T1		# Ii
-	pxor		$Xn,$Xi			# (H*Ii+1) + H^2*(Ii+Xi)
-	pxor		$Xhn,$Xhi
+	movdqa		$Xi,$Xhi
+	movdqa		$Xmn,$T1
+	pshufd		\$0b01001110,$Xi,$Xmn	#
+	pxor		$Xi,$Xmn		#
 
-	movdqu		16($inp),$Xn		# Ii+1
-	pshufb		$T3,$T1
-	pshufb		$T3,$Xn
+	pclmulqdq	\$0x00,$Hkey2,$Xi
+	pclmulqdq	\$0x11,$Hkey2,$Xhi
+	pclmulqdq	\$0x10,$HK,$Xmn
 
-	movdqa		$Xn,$Xhn		#
-	pshufd		\$0b01001110,$Xn,$T1n
-	pshufd		\$0b01001110,$Hkey,$T2n
-	pxor		$Xn,$T1n		#
-	pxor		$Hkey,$T2n
-	 pxor		$T1,$Xhi		# "Ii+Xi", consume early
+	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
+	pxor		$Xhn,$Xhi
+	  movdqu	($inp),$T2		# Ii
+	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
+	  pshufb	$T3,$T2
+	  movdqu	16($inp),$Xln		# Ii+1
+
+	pxor		$Xhi,$T1
+	  pxor		$T2,$Xhi		# "Ii+Xi", consume early
+	pxor		$T1,$Xmn
+	 pshufb		$T3,$Xln
+	movdqa		$Xmn,$T1		#
+	psrldq		\$8,$T1
+	pslldq		\$8,$Xmn		#
+	pxor		$T1,$Xhi
+	pxor		$Xmn,$Xi		#
+
+	movdqa		$Xln,$Xhn		#
 
-	  movdqa	$Xi,$T1			# 1st phase
+	  movdqa	$Xi,$T2			# 1st phase
+	  movdqa	$Xi,$T1
+	  psllq		\$5,$Xi
+	  pxor		$Xi,$T1			#
+	pclmulqdq	\$0x00,$Hkey,$Xln	#######
 	  psllq		\$1,$Xi
 	  pxor		$T1,$Xi			#
-	  psllq		\$5,$Xi			#
-	  pxor		$T1,$Xi			#
-	pclmulqdq	\$0x00,$Hkey,$Xn	#######
 	  psllq		\$57,$Xi		#
-	  movdqa	$Xi,$T2			#
+	  movdqa	$Xi,$T1			#
 	  pslldq	\$8,$Xi
-	  psrldq	\$8,$T2			#	
-	  pxor		$T1,$Xi
-	  pxor		$T2,$Xhi		#
+	  psrldq	\$8,$T1			#	
+	  pxor		$T2,$Xi
+	pshufd		\$0b01001110,$Xhn,$Xmn
+	  pxor		$T1,$Xhi		#
+	pxor		$Xhn,$Xmn		#
 
-	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
 	  movdqa	$Xi,$T2			# 2nd phase
+	  psrlq		\$1,$Xi
+	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
+	  pxor		$T2,$Xhi		#
+	  pxor		$Xi,$T2
 	  psrlq		\$5,$Xi
 	  pxor		$T2,$Xi			#
+	lea		32($inp),$inp
 	  psrlq		\$1,$Xi			#
-	  pxor		$T2,$Xi			#
-	  pxor		$Xhi,$T2
-	  psrlq		\$1,$Xi			#
-	  pxor		$T2,$Xi			#
+	pclmulqdq	\$0x00,$HK,$Xmn		#######
+	  pxor		$Xhi,$Xi		#
 
-	pclmulqdq	\$0x00,$T2n,$T1n	#######
-	 movdqa		$Xi,$Xhi		#
-	 pshufd		\$0b01001110,$Xi,$T1
-	 pshufd		\$0b01001110,$Hkey2,$T2
-	 pxor		$Xi,$T1			#
-	 pxor		$Hkey2,$T2
-
-	pxor		$Xn,$T1n		#
-	pxor		$Xhn,$T1n		#
-	movdqa		$T1n,$T2n		#
-	psrldq		\$8,$T1n
-	pslldq		\$8,$T2n		#
-	pxor		$T1n,$Xhn
-	pxor		$T2n,$Xn		#
-
-	lea		32($inp),$inp
 	sub		\$0x20,$len
 	ja		.Lmod_loop
 
 .Leven_tail:
-___
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey2,1);	# H^2*(Ii+Xi)
-$code.=<<___;
-	pxor		$Xn,$Xi			# (H*Ii+1) + H^2*(Ii+Xi)
+	 movdqa		$Xi,$Xhi
+	 movdqa		$Xmn,$T1
+	 pshufd		\$0b01001110,$Xi,$Xmn	#
+	 pxor		$Xi,$Xmn		#
+
+	pclmulqdq	\$0x00,$Hkey2,$Xi
+	pclmulqdq	\$0x11,$Hkey2,$Xhi
+	pclmulqdq	\$0x10,$HK,$Xmn
+
+	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
 	pxor		$Xhn,$Xhi
+	pxor		$Xi,$T1
+	pxor		$Xhi,$T1
+	pxor		$T1,$Xmn
+	movdqa		$Xmn,$T1		#
+	psrldq		\$8,$T1
+	pslldq		\$8,$Xmn		#
+	pxor		$T1,$Xhi
+	pxor		$Xmn,$Xi		#
 ___
 	&reduction_alg9	($Xhi,$Xi);
 $code.=<<___;
@@ -599,7 +937,7 @@ $code.=<<___;
 	pshufb		$T3,$T1
 	pxor		$T1,$Xi			# Ii+Xi
 ___
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H*(Ii+Xi)
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H*(Ii+Xi)
 	&reduction_alg9	($Xhi,$Xi);
 $code.=<<___;
 .Ldone:
@@ -612,21 +950,607 @@ $code.=<<___ if ($win64);
 	movaps	0x20(%rsp),%xmm8
 	movaps	0x30(%rsp),%xmm9
 	movaps	0x40(%rsp),%xmm10
-	add	\$0x58,%rsp
+	movaps	0x50(%rsp),%xmm11
+	movaps	0x60(%rsp),%xmm12
+	movaps	0x70(%rsp),%xmm13
+	movaps	0x80(%rsp),%xmm14
+	movaps	0x90(%rsp),%xmm15
+	lea	0xa8(%rsp),%rsp
+.LSEH_end_gcm_ghash_clmul:
 ___
 $code.=<<___;
 	ret
-.LSEH_end_gcm_ghash_clmul:
 .size	gcm_ghash_clmul,.-gcm_ghash_clmul
 ___
 }
+
+$code.=<<___;
+.globl	gcm_init_avx
+.type	gcm_init_avx,\@abi-omnipotent
+.align	32
+gcm_init_avx:
+___
+if ($avx) {
+my ($Htbl,$Xip)=@_4args;
+my $HK="%xmm6";
+
+$code.=<<___ if ($win64);
+.LSEH_begin_gcm_init_avx:
+	# I can't trust assembler to use specific encoding:-(
+	.byte	0x48,0x83,0xec,0x18		#sub	$0x18,%rsp
+	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
+___
+$code.=<<___;
+	vzeroupper
+
+	vmovdqu		($Xip),$Hkey
+	vpshufd		\$0b01001110,$Hkey,$Hkey	# dword swap
+
+	# <<1 twist
+	vpshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
+	vpsrlq		\$63,$Hkey,$T1
+	vpsllq		\$1,$Hkey,$Hkey
+	vpxor		$T3,$T3,$T3		#
+	vpcmpgtd	$T2,$T3,$T3		# broadcast carry bit
+	vpslldq		\$8,$T1,$T1
+	vpor		$T1,$Hkey,$Hkey		# H<<=1
+
+	# magic reduction
+	vpand		.L0x1c2_polynomial(%rip),$T3,$T3
+	vpxor		$T3,$Hkey,$Hkey		# if(carry) H^=0x1c2_polynomial
+
+	vpunpckhqdq	$Hkey,$Hkey,$HK
+	vmovdqa		$Hkey,$Xi
+	vpxor		$Hkey,$HK,$HK
+	mov		\$4,%r10		# up to H^8
+	jmp		.Linit_start_avx
+___
+
+sub clmul64x64_avx {
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
+
+if (!defined($HK)) {	$HK = $T2;
+$code.=<<___;
+	vpunpckhqdq	$Xi,$Xi,$T1
+	vpunpckhqdq	$Hkey,$Hkey,$T2
+	vpxor		$Xi,$T1,$T1		#
+	vpxor		$Hkey,$T2,$T2
+___
+} else {
+$code.=<<___;
+	vpunpckhqdq	$Xi,$Xi,$T1
+	vpxor		$Xi,$T1,$T1		#
+___
+}
+$code.=<<___;
+	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xhi	#######
+	vpclmulqdq	\$0x00,$Hkey,$Xi,$Xi	#######
+	vpclmulqdq	\$0x00,$HK,$T1,$T1	#######
+	vpxor		$Xi,$Xhi,$T2		#
+	vpxor		$T2,$T1,$T1		#
+
+	vpslldq		\$8,$T1,$T2		#
+	vpsrldq		\$8,$T1,$T1
+	vpxor		$T2,$Xi,$Xi		#
+	vpxor		$T1,$Xhi,$Xhi
+___
+}
+
+sub reduction_avx {
+my ($Xhi,$Xi) = @_;
+
+$code.=<<___;
+	vpsllq		\$57,$Xi,$T1		# 1st phase
+	vpsllq		\$62,$Xi,$T2
+	vpxor		$T1,$T2,$T2		#
+	vpsllq		\$63,$Xi,$T1
+	vpxor		$T1,$T2,$T2		#
+	vpslldq		\$8,$T2,$T1		#
+	vpsrldq		\$8,$T2,$T2
+	vpxor		$T1,$Xi,$Xi		#
+	vpxor		$T2,$Xhi,$Xhi
+
+	vpsrlq		\$1,$Xi,$T2		# 2nd phase
+	vpxor		$Xi,$Xhi,$Xhi
+	vpxor		$T2,$Xi,$Xi		#
+	vpsrlq		\$5,$T2,$T2
+	vpxor		$T2,$Xi,$Xi		#
+	vpsrlq		\$1,$Xi,$Xi		#
+	vpxor		$Xhi,$Xi,$Xi		#
+___
+}
 
 $code.=<<___;
+.align	32
+.Linit_loop_avx:
+	vpalignr	\$8,$T1,$T2,$T3		# low part is H.lo^H.hi...
+	vmovdqu		$T3,-0x10($Htbl)	# save Karatsuba "salt"
+___
+	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^3,5,7
+	&reduction_avx	($Xhi,$Xi);
+$code.=<<___;
+.Linit_start_avx:
+	vmovdqa		$Xi,$T3
+___
+	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^2,4,6,8
+	&reduction_avx	($Xhi,$Xi);
+$code.=<<___;
+	vpshufd		\$0b01001110,$T3,$T1
+	vpshufd		\$0b01001110,$Xi,$T2
+	vpxor		$T3,$T1,$T1		# Karatsuba pre-processing
+	vmovdqu		$T3,0x00($Htbl)		# save H^1,3,5,7
+	vpxor		$Xi,$T2,$T2		# Karatsuba pre-processing
+	vmovdqu		$Xi,0x10($Htbl)		# save H^2,4,6,8
+	lea		0x30($Htbl),$Htbl
+	sub		\$1,%r10
+	jnz		.Linit_loop_avx
+
+	vpalignr	\$8,$T2,$T1,$T3		# last "salt" is flipped
+	vmovdqu		$T3,-0x10($Htbl)
+
+	vzeroupper
+___
+$code.=<<___ if ($win64);
+	movaps	(%rsp),%xmm6
+	lea	0x18(%rsp),%rsp
+.LSEH_end_gcm_init_avx:
+___
+$code.=<<___;
+	ret
+.size	gcm_init_avx,.-gcm_init_avx
+___
+} else {
+$code.=<<___;
+	jmp	.L_init_clmul
+.size	gcm_init_avx,.-gcm_init_avx
+___
+}
+
+$code.=<<___;
+.globl	gcm_gmult_avx
+.type	gcm_gmult_avx,\@abi-omnipotent
+.align	32
+gcm_gmult_avx:
+	jmp	.L_gmult_clmul
+.size	gcm_gmult_avx,.-gcm_gmult_avx
+___
+
+$code.=<<___;
+.globl	gcm_ghash_avx
+.type	gcm_ghash_avx,\@abi-omnipotent
+.align	32
+gcm_ghash_avx:
+___
+if ($avx) {
+my ($Xip,$Htbl,$inp,$len)=@_4args;
+my ($Xlo,$Xhi,$Xmi,
+    $Zlo,$Zhi,$Zmi,
+    $Hkey,$HK,$T1,$T2,
+    $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));
+
+$code.=<<___ if ($win64);
+	lea	-0x88(%rsp),%rax
+.LSEH_begin_gcm_ghash_avx:
+	# I can't trust assembler to use specific encoding:-(
+	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
+	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6,-0x20(%rax)
+	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7,-0x10(%rax)
+	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8,0(%rax)
+	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9,0x10(%rax)
+	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10,0x20(%rax)
+	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11,0x30(%rax)
+	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12,0x40(%rax)
+	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13,0x50(%rax)
+	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14,0x60(%rax)
+	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15,0x70(%rax)
+___
+$code.=<<___;
+	vzeroupper
+
+	vmovdqu		($Xip),$Xi		# load $Xi
+	lea		.L0x1c2_polynomial(%rip),%r10
+	lea		0x40($Htbl),$Htbl	# size optimization
+	vmovdqu		.Lbswap_mask(%rip),$bswap
+	vpshufb		$bswap,$Xi,$Xi
+	cmp		\$0x80,$len
+	jb		.Lshort_avx
+	sub		\$0x80,$len
+
+	vmovdqu		0x70($inp),$Ii		# I[7]
+	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
+	vpshufb		$bswap,$Ii,$Ii
+	vmovdqu		0x20-0x40($Htbl),$HK
+
+	vpunpckhqdq	$Ii,$Ii,$T2
+	 vmovdqu	0x60($inp),$Ij		# I[6]
+	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
+	vpxor		$Ii,$T2,$T2
+	 vpshufb	$bswap,$Ij,$Ij
+	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
+	 vmovdqu	0x10-0x40($Htbl),$Hkey	# $Hkey^2
+	 vpunpckhqdq	$Ij,$Ij,$T1
+	 vmovdqu	0x50($inp),$Ii		# I[5]
+	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
+	 vpxor		$Ij,$T1,$T1
+
+	 vpshufb	$bswap,$Ii,$Ii
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
+	 vpunpckhqdq	$Ii,$Ii,$T2
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
+	 vmovdqu	0x30-0x40($Htbl),$Hkey	# $Hkey^3
+	 vpxor		$Ii,$T2,$T2
+	 vmovdqu	0x40($inp),$Ij		# I[4]
+	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
+	 vmovdqu	0x50-0x40($Htbl),$HK
+
+	 vpshufb	$bswap,$Ij,$Ij
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
+	vpxor		$Xhi,$Zhi,$Zhi
+	 vpunpckhqdq	$Ij,$Ij,$T1
+	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
+	 vmovdqu	0x40-0x40($Htbl),$Hkey	# $Hkey^4
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
+	 vpxor		$Ij,$T1,$T1
+
+	 vmovdqu	0x30($inp),$Ii		# I[3]
+	vpxor		$Zlo,$Xlo,$Xlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
+	vpxor		$Zhi,$Xhi,$Xhi
+	 vpshufb	$bswap,$Ii,$Ii
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
+	 vmovdqu	0x60-0x40($Htbl),$Hkey	# $Hkey^5
+	vpxor		$Zmi,$Xmi,$Xmi
+	 vpunpckhqdq	$Ii,$Ii,$T2
+	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
+	 vmovdqu	0x80-0x40($Htbl),$HK
+	 vpxor		$Ii,$T2,$T2
+
+	 vmovdqu	0x20($inp),$Ij		# I[2]
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
+	vpxor		$Xhi,$Zhi,$Zhi
+	 vpshufb	$bswap,$Ij,$Ij
+	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
+	 vmovdqu	0x70-0x40($Htbl),$Hkey	# $Hkey^6
+	vpxor		$Xmi,$Zmi,$Zmi
+	 vpunpckhqdq	$Ij,$Ij,$T1
+	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
+	 vpxor		$Ij,$T1,$T1
+
+	 vmovdqu	0x10($inp),$Ii		# I[1]
+	vpxor		$Zlo,$Xlo,$Xlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
+	vpxor		$Zhi,$Xhi,$Xhi
+	 vpshufb	$bswap,$Ii,$Ii
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
+	 vmovdqu	0x90-0x40($Htbl),$Hkey	# $Hkey^7
+	vpxor		$Zmi,$Xmi,$Xmi
+	 vpunpckhqdq	$Ii,$Ii,$T2
+	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
+	 vmovdqu	0xb0-0x40($Htbl),$HK
+	 vpxor		$Ii,$T2,$T2
+
+	 vmovdqu	($inp),$Ij		# I[0]
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
+	vpxor		$Xhi,$Zhi,$Zhi
+	 vpshufb	$bswap,$Ij,$Ij
+	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
+	 vmovdqu	0xa0-0x40($Htbl),$Hkey	# $Hkey^8
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x10,$HK,$T2,$Xmi
+
+	lea		0x80($inp),$inp
+	cmp		\$0x80,$len
+	jb		.Ltail_avx
+
+	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
+	sub		\$0x80,$len
+	jmp		.Loop8x_avx
+
+.align	32
+.Loop8x_avx:
+	vpunpckhqdq	$Ij,$Ij,$T1
+	 vmovdqu	0x70($inp),$Ii		# I[7]
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpxor		$Ij,$T1,$T1
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xi
+	 vpshufb	$bswap,$Ii,$Ii
+	vpxor		$Xhi,$Zhi,$Zhi
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xo
+	 vmovdqu	0x00-0x40($Htbl),$Hkey	# $Hkey^1
+	 vpunpckhqdq	$Ii,$Ii,$T2
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T1,$Tred
+	 vmovdqu	0x20-0x40($Htbl),$HK
+	 vpxor		$Ii,$T2,$T2
+
+	  vmovdqu	0x60($inp),$Ij		# I[6]
+	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
+	vpxor		$Zlo,$Xi,$Xi		# collect result
+	  vpshufb	$bswap,$Ij,$Ij
+	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
+	vxorps		$Zhi,$Xo,$Xo
+	  vmovdqu	0x10-0x40($Htbl),$Hkey	# $Hkey^2
+	 vpunpckhqdq	$Ij,$Ij,$T1
+	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi
+	vpxor		$Zmi,$Tred,$Tred
+	 vxorps		$Ij,$T1,$T1
+
+	  vmovdqu	0x50($inp),$Ii		# I[5]
+	vpxor		$Xi,$Tred,$Tred		# aggregated Karatsuba post-processing
+	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
+	vpxor		$Xo,$Tred,$Tred
+	vpslldq		\$8,$Tred,$T2
+	 vpxor		$Xlo,$Zlo,$Zlo
+	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
+	vpsrldq		\$8,$Tred,$Tred
+	vpxor		$T2, $Xi, $Xi
+	  vmovdqu	0x30-0x40($Htbl),$Hkey	# $Hkey^3
+	  vpshufb	$bswap,$Ii,$Ii
+	vxorps		$Tred,$Xo, $Xo
+	 vpxor		$Xhi,$Zhi,$Zhi
+	 vpunpckhqdq	$Ii,$Ii,$T2
+	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi
+	  vmovdqu	0x50-0x40($Htbl),$HK
+	 vpxor		$Ii,$T2,$T2
+	 vpxor		$Xmi,$Zmi,$Zmi
+
+	  vmovdqu	0x40($inp),$Ij		# I[4]
+	vpalignr	\$8,$Xi,$Xi,$Tred	# 1st phase
+	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
+	  vpshufb	$bswap,$Ij,$Ij
+	 vpxor		$Zlo,$Xlo,$Xlo
+	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
+	  vmovdqu	0x40-0x40($Htbl),$Hkey	# $Hkey^4
+	 vpunpckhqdq	$Ij,$Ij,$T1
+	 vpxor		$Zhi,$Xhi,$Xhi
+	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi
+	 vxorps		$Ij,$T1,$T1
+	 vpxor		$Zmi,$Xmi,$Xmi
+
+	  vmovdqu	0x30($inp),$Ii		# I[3]
+	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
+	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
+	  vpshufb	$bswap,$Ii,$Ii
+	 vpxor		$Xlo,$Zlo,$Zlo
+	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
+	  vmovdqu	0x60-0x40($Htbl),$Hkey	# $Hkey^5
+	 vpunpckhqdq	$Ii,$Ii,$T2
+	 vpxor		$Xhi,$Zhi,$Zhi
+	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi
+	  vmovdqu	0x80-0x40($Htbl),$HK
+	 vpxor		$Ii,$T2,$T2
+	 vpxor		$Xmi,$Zmi,$Zmi
+
+	  vmovdqu	0x20($inp),$Ij		# I[2]
+	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
+	  vpshufb	$bswap,$Ij,$Ij
+	 vpxor		$Zlo,$Xlo,$Xlo
+	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
+	  vmovdqu	0x70-0x40($Htbl),$Hkey	# $Hkey^6
+	 vpunpckhqdq	$Ij,$Ij,$T1
+	 vpxor		$Zhi,$Xhi,$Xhi
+	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi
+	 vpxor		$Ij,$T1,$T1
+	 vpxor		$Zmi,$Xmi,$Xmi
+	vxorps		$Tred,$Xi,$Xi
+
+	  vmovdqu	0x10($inp),$Ii		# I[1]
+	vpalignr	\$8,$Xi,$Xi,$Tred	# 2nd phase
+	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
+	  vpshufb	$bswap,$Ii,$Ii
+	 vpxor		$Xlo,$Zlo,$Zlo
+	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
+	  vmovdqu	0x90-0x40($Htbl),$Hkey	# $Hkey^7
+	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
+	vxorps		$Xo,$Tred,$Tred
+	 vpunpckhqdq	$Ii,$Ii,$T2
+	 vpxor		$Xhi,$Zhi,$Zhi
+	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi
+	  vmovdqu	0xb0-0x40($Htbl),$HK
+	 vpxor		$Ii,$T2,$T2
+	 vpxor		$Xmi,$Zmi,$Zmi
+
+	  vmovdqu	($inp),$Ij		# I[0]
+	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
+	  vpshufb	$bswap,$Ij,$Ij
+	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
+	  vmovdqu	0xa0-0x40($Htbl),$Hkey	# $Hkey^8
+	vpxor		$Tred,$Ij,$Ij
+	 vpclmulqdq	\$0x10,$HK,  $T2,$Xmi
+	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
+
+	lea		0x80($inp),$inp
+	sub		\$0x80,$len
+	jnc		.Loop8x_avx
+
+	add		\$0x80,$len
+	jmp		.Ltail_no_xor_avx
+
+.align	32
+.Lshort_avx:
+	vmovdqu		-0x10($inp,$len),$Ii	# very last word
+	lea		($inp,$len),$inp
+	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
+	vmovdqu		0x20-0x40($Htbl),$HK
+	vpshufb		$bswap,$Ii,$Ij
+
+	vmovdqa		$Xlo,$Zlo		# subtle way to zero $Zlo,
+	vmovdqa		$Xhi,$Zhi		# $Zhi and
+	vmovdqa		$Xmi,$Zmi		# $Zmi
+	sub		\$0x10,$len
+	jz		.Ltail_avx
+
+	vpunpckhqdq	$Ij,$Ij,$T1
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
+	vpxor		$Ij,$T1,$T1
+	 vmovdqu	-0x20($inp),$Ii
+	vpxor		$Xhi,$Zhi,$Zhi
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
+	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
+	 vpshufb	$bswap,$Ii,$Ij
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
+	vpsrldq		\$8,$HK,$HK
+	sub		\$0x10,$len
+	jz		.Ltail_avx
+
+	vpunpckhqdq	$Ij,$Ij,$T1
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
+	vpxor		$Ij,$T1,$T1
+	 vmovdqu	-0x30($inp),$Ii
+	vpxor		$Xhi,$Zhi,$Zhi
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
+	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
+	 vpshufb	$bswap,$Ii,$Ij
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
+	vmovdqu		0x50-0x40($Htbl),$HK
+	sub		\$0x10,$len
+	jz		.Ltail_avx
+
+	vpunpckhqdq	$Ij,$Ij,$T1
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
+	vpxor		$Ij,$T1,$T1
+	 vmovdqu	-0x40($inp),$Ii
+	vpxor		$Xhi,$Zhi,$Zhi
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
+	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
+	 vpshufb	$bswap,$Ii,$Ij
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
+	vpsrldq		\$8,$HK,$HK
+	sub		\$0x10,$len
+	jz		.Ltail_avx
+
+	vpunpckhqdq	$Ij,$Ij,$T1
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
+	vpxor		$Ij,$T1,$T1
+	 vmovdqu	-0x50($inp),$Ii
+	vpxor		$Xhi,$Zhi,$Zhi
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
+	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
+	 vpshufb	$bswap,$Ii,$Ij
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
+	vmovdqu		0x80-0x40($Htbl),$HK
+	sub		\$0x10,$len
+	jz		.Ltail_avx
+
+	vpunpckhqdq	$Ij,$Ij,$T1
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
+	vpxor		$Ij,$T1,$T1
+	 vmovdqu	-0x60($inp),$Ii
+	vpxor		$Xhi,$Zhi,$Zhi
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
+	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
+	 vpshufb	$bswap,$Ii,$Ij
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
+	vpsrldq		\$8,$HK,$HK
+	sub		\$0x10,$len
+	jz		.Ltail_avx
+
+	vpunpckhqdq	$Ij,$Ij,$T1
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
+	vpxor		$Ij,$T1,$T1
+	 vmovdqu	-0x70($inp),$Ii
+	vpxor		$Xhi,$Zhi,$Zhi
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
+	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
+	 vpshufb	$bswap,$Ii,$Ij
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
+	vmovq		0xb8-0x40($Htbl),$HK
+	sub		\$0x10,$len
+	jmp		.Ltail_avx
+
+.align	32
+.Ltail_avx:
+	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
+.Ltail_no_xor_avx:
+	vpunpckhqdq	$Ij,$Ij,$T1
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
+	vpxor		$Ij,$T1,$T1
+	vpxor		$Xhi,$Zhi,$Zhi
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
+
+	vmovdqu		(%r10),$Tred
+
+	vpxor		$Xlo,$Zlo,$Xi
+	vpxor		$Xhi,$Zhi,$Xo
+	vpxor		$Xmi,$Zmi,$Zmi
+
+	vpxor		$Xi, $Zmi,$Zmi		# aggregated Karatsuba post-processing
+	vpxor		$Xo, $Zmi,$Zmi
+	vpslldq		\$8, $Zmi,$T2
+	vpsrldq		\$8, $Zmi,$Zmi
+	vpxor		$T2, $Xi, $Xi
+	vpxor		$Zmi,$Xo, $Xo
+
+	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 1st phase
+	vpalignr	\$8,$Xi,$Xi,$Xi
+	vpxor		$T2,$Xi,$Xi
+
+	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 2nd phase
+	vpalignr	\$8,$Xi,$Xi,$Xi
+	vpxor		$Xo,$Xi,$Xi
+	vpxor		$T2,$Xi,$Xi
+
+	cmp		\$0,$len
+	jne		.Lshort_avx
+
+	vpshufb		$bswap,$Xi,$Xi
+	vmovdqu		$Xi,($Xip)
+	vzeroupper
+___
+$code.=<<___ if ($win64);
+	movaps	(%rsp),%xmm6
+	movaps	0x10(%rsp),%xmm7
+	movaps	0x20(%rsp),%xmm8
+	movaps	0x30(%rsp),%xmm9
+	movaps	0x40(%rsp),%xmm10
+	movaps	0x50(%rsp),%xmm11
+	movaps	0x60(%rsp),%xmm12
+	movaps	0x70(%rsp),%xmm13
+	movaps	0x80(%rsp),%xmm14
+	movaps	0x90(%rsp),%xmm15
+	lea	0xa8(%rsp),%rsp
+.LSEH_end_gcm_ghash_avx:
+___
+$code.=<<___;
+	ret
+.size	gcm_ghash_avx,.-gcm_ghash_avx
+___
+} else {
+$code.=<<___;
+	jmp	.L_ghash_clmul
+.size	gcm_ghash_avx,.-gcm_ghash_avx
+___
+}
+
+$code.=<<___;
 .align	64
 .Lbswap_mask:
 	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
 .L0x1c2_polynomial:
 	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.L7_mask:
+	.long	7,0,7,0
+.L7_mask_poly:
+	.long	7,0,`0xE1<<1`,0
 .align	64
 .type	.Lrem_4bit,\@object
 .Lrem_4bit:
@@ -774,10 +1698,24 @@ se_handler:
 	.rva	.LSEH_end_gcm_ghash_4bit
 	.rva	.LSEH_info_gcm_ghash_4bit
 
+	.rva	.LSEH_begin_gcm_init_clmul
+	.rva	.LSEH_end_gcm_init_clmul
+	.rva	.LSEH_info_gcm_init_clmul
+
 	.rva	.LSEH_begin_gcm_ghash_clmul
 	.rva	.LSEH_end_gcm_ghash_clmul
 	.rva	.LSEH_info_gcm_ghash_clmul
+___
+$code.=<<___	if ($avx);
+	.rva	.LSEH_begin_gcm_init_avx
+	.rva	.LSEH_end_gcm_init_avx
+	.rva	.LSEH_info_gcm_init_clmul
 
+	.rva	.LSEH_begin_gcm_ghash_avx
+	.rva	.LSEH_end_gcm_ghash_avx
+	.rva	.LSEH_info_gcm_ghash_clmul
+___
+$code.=<<___;
 .section	.xdata
 .align	8
 .LSEH_info_gcm_gmult_4bit:
@@ -788,14 +1726,23 @@ se_handler:
 	.byte	9,0,0,0
 	.rva	se_handler
 	.rva	.Lghash_prologue,.Lghash_epilogue	# HandlerData
+.LSEH_info_gcm_init_clmul:
+	.byte	0x01,0x08,0x03,0x00
+	.byte	0x08,0x68,0x00,0x00	#movaps	0x00(rsp),xmm6
+	.byte	0x04,0x22,0x00,0x00	#sub	rsp,0x18
 .LSEH_info_gcm_ghash_clmul:
-	.byte	0x01,0x1f,0x0b,0x00
-	.byte	0x1f,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
-	.byte	0x19,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
-	.byte	0x13,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
-	.byte	0x0d,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
-	.byte	0x08,0x68,0x00,0x00	#movaps (rsp),xmm6
-	.byte	0x04,0xa2,0x00,0x00	#sub	rsp,0x58
+	.byte	0x01,0x33,0x16,0x00
+	.byte	0x33,0xf8,0x09,0x00	#movaps 0x90(rsp),xmm15
+	.byte	0x2e,0xe8,0x08,0x00	#movaps 0x80(rsp),xmm14
+	.byte	0x29,0xd8,0x07,0x00	#movaps 0x70(rsp),xmm13
+	.byte	0x24,0xc8,0x06,0x00	#movaps 0x60(rsp),xmm12
+	.byte	0x1f,0xb8,0x05,0x00	#movaps 0x50(rsp),xmm11
+	.byte	0x1a,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
+	.byte	0x15,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
+	.byte	0x10,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
+	.byte	0x0c,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
+	.byte	0x08,0x68,0x00,0x00	#movaps 0x00(rsp),xmm6
+	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
 ___
 }
 
diff --git a/openssl/crypto/modes/asm/ghashp8-ppc.pl b/openssl/crypto/modes/asm/ghashp8-ppc.pl
new file mode 100755
index 000000000..e76a58c34
--- /dev/null
+++ b/openssl/crypto/modes/asm/ghashp8-ppc.pl
@@ -0,0 +1,234 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# GHASH for for PowerISA v2.07.
+#
+# July 2014
+#
+# Accurate performance measurements are problematic, because it's
+# always virtualized setup with possibly throttled processor.
+# Relative comparison is therefore more informative. This initial
+# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
+# faster than "4-bit" integer-only compiler-generated 64-bit code.
+# "Initial version" means that there is room for futher improvement.
+
+$flavour=shift;
+$output =shift;
+
+if ($flavour =~ /64/) {
+	$SIZE_T=8;
+	$LRSAVE=2*$SIZE_T;
+	$STU="stdu";
+	$POP="ld";
+	$PUSH="std";
+} elsif ($flavour =~ /32/) {
+	$SIZE_T=4;
+	$LRSAVE=$SIZE_T;
+	$STU="stwu";
+	$POP="lwz";
+	$PUSH="stw";
+} else { die "nonsense $flavour"; }
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
+
+my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block
+
+my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
+my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
+my $vrsave="r12";
+
+$code=<<___;
+.machine	"any"
+
+.text
+
+.globl	.gcm_init_p8
+.align	5
+.gcm_init_p8:
+	lis		r0,0xfff0
+	li		r8,0x10
+	mfspr		$vrsave,256
+	li		r9,0x20
+	mtspr		256,r0
+	li		r10,0x30
+	lvx_u		$H,0,r4			# load H
+
+	vspltisb	$xC2,-16		# 0xf0
+	vspltisb	$t0,1			# one
+	vaddubm		$xC2,$xC2,$xC2		# 0xe0
+	vxor		$zero,$zero,$zero
+	vor		$xC2,$xC2,$t0		# 0xe1
+	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
+	vsldoi		$t1,$zero,$t0,1		# ...1
+	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
+	vspltisb	$t2,7
+	vor		$xC2,$xC2,$t1		# 0xc2....01
+	vspltb		$t1,$H,0		# most significant byte
+	vsl		$H,$H,$t0		# H<<=1
+	vsrab		$t1,$t1,$t2		# broadcast carry bit
+	vand		$t1,$t1,$xC2
+	vxor		$H,$H,$t1		# twisted H
+
+	vsldoi		$H,$H,$H,8		# twist even more ...
+	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
+	vsldoi		$Hl,$zero,$H,8		# ... and split
+	vsldoi		$Hh,$H,$zero,8
+
+	stvx_u		$xC2,0,r3		# save pre-computed table
+	stvx_u		$Hl,r8,r3
+	stvx_u		$H, r9,r3
+	stvx_u		$Hh,r10,r3
+
+	mtspr		256,$vrsave
+	blr
+	.long		0
+	.byte		0,12,0x14,0,0,0,2,0
+	.long		0
+.size	.gcm_init_p8,.-.gcm_init_p8
+
+.globl	.gcm_gmult_p8
+.align	5
+.gcm_gmult_p8:
+	lis		r0,0xfff8
+	li		r8,0x10
+	mfspr		$vrsave,256
+	li		r9,0x20
+	mtspr		256,r0
+	li		r10,0x30
+	lvx_u		$IN,0,$Xip		# load Xi
+
+	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
+	 le?lvsl	$lemask,r0,r0
+	lvx_u		$H, r9,$Htbl
+	 le?vspltisb	$t0,0x07
+	lvx_u		$Hh,r10,$Htbl
+	 le?vxor	$lemask,$lemask,$t0
+	lvx_u		$xC2,0,$Htbl
+	 le?vperm	$IN,$IN,$IN,$lemask
+	vxor		$zero,$zero,$zero
+
+	vpmsumd		$Xl,$IN,$Hl		# H.lo�Xi.lo
+	vpmsumd		$Xm,$IN,$H		# H.hi�Xi.lo+H.lo�Xi.hi
+	vpmsumd		$Xh,$IN,$Hh		# H.hi�Xi.hi
+
+	vpmsumd		$t2,$Xl,$xC2		# 1st phase
+
+	vsldoi		$t0,$Xm,$zero,8
+	vsldoi		$t1,$zero,$Xm,8
+	vxor		$Xl,$Xl,$t0
+	vxor		$Xh,$Xh,$t1
+
+	vsldoi		$Xl,$Xl,$Xl,8
+	vxor		$Xl,$Xl,$t2
+
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
+	vpmsumd		$Xl,$Xl,$xC2
+	vxor		$t1,$t1,$Xh
+	vxor		$Xl,$Xl,$t1
+
+	le?vperm	$Xl,$Xl,$Xl,$lemask
+	stvx_u		$Xl,0,$Xip		# write out Xi
+
+	mtspr		256,$vrsave
+	blr
+	.long		0
+	.byte		0,12,0x14,0,0,0,2,0
+	.long		0
+.size	.gcm_gmult_p8,.-.gcm_gmult_p8
+
+.globl	.gcm_ghash_p8
+.align	5
+.gcm_ghash_p8:
+	lis		r0,0xfff8
+	li		r8,0x10
+	mfspr		$vrsave,256
+	li		r9,0x20
+	mtspr		256,r0
+	li		r10,0x30
+	lvx_u		$Xl,0,$Xip		# load Xi
+
+	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
+	 le?lvsl	$lemask,r0,r0
+	lvx_u		$H, r9,$Htbl
+	 le?vspltisb	$t0,0x07
+	lvx_u		$Hh,r10,$Htbl
+	 le?vxor	$lemask,$lemask,$t0
+	lvx_u		$xC2,0,$Htbl
+	 le?vperm	$Xl,$Xl,$Xl,$lemask
+	vxor		$zero,$zero,$zero
+
+	lvx_u		$IN,0,$inp
+	addi		$inp,$inp,16
+	subi		$len,$len,16
+	 le?vperm	$IN,$IN,$IN,$lemask
+	vxor		$IN,$IN,$Xl
+	b		Loop
+
+.align	5
+Loop:
+	 subic		$len,$len,16
+	vpmsumd		$Xl,$IN,$Hl		# H.lo�Xi.lo
+	 subfe.		r0,r0,r0		# borrow?-1:0
+	vpmsumd		$Xm,$IN,$H		# H.hi�Xi.lo+H.lo�Xi.hi
+	 and		r0,r0,$len
+	vpmsumd		$Xh,$IN,$Hh		# H.hi�Xi.hi
+	 add		$inp,$inp,r0
+
+	vpmsumd		$t2,$Xl,$xC2		# 1st phase
+
+	vsldoi		$t0,$Xm,$zero,8
+	vsldoi		$t1,$zero,$Xm,8
+	vxor		$Xl,$Xl,$t0
+	vxor		$Xh,$Xh,$t1
+
+	vsldoi		$Xl,$Xl,$Xl,8
+	vxor		$Xl,$Xl,$t2
+	 lvx_u		$IN,0,$inp
+	 addi		$inp,$inp,16
+
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
+	vpmsumd		$Xl,$Xl,$xC2
+	 le?vperm	$IN,$IN,$IN,$lemask
+	vxor		$t1,$t1,$Xh
+	vxor		$IN,$IN,$t1
+	vxor		$IN,$IN,$Xl
+	beq		Loop			# did $len-=16 borrow?
+
+	vxor		$Xl,$Xl,$t1
+	le?vperm	$Xl,$Xl,$Xl,$lemask
+	stvx_u		$Xl,0,$Xip		# write out Xi
+
+	mtspr		256,$vrsave
+	blr
+	.long		0
+	.byte		0,12,0x14,0,0,0,4,0
+	.long		0
+.size	.gcm_ghash_p8,.-.gcm_ghash_p8
+
+.asciz  "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
+.align  2
+___
+
+foreach (split("\n",$code)) {
+	if ($flavour =~ /le$/o) {	# little-endian
+	    s/le\?//o		or
+	    s/be\?/#be#/o;
+	} else {
+	    s/le\?/#le#/o	or
+	    s/be\?//o;
+	}
+	print $_,"\n";
+}
+
+close STDOUT; # enforce flush
diff --git a/openssl/crypto/modes/asm/ghashv8-armx.pl b/openssl/crypto/modes/asm/ghashv8-armx.pl
new file mode 100755
index 000000000..54a1ac4db
--- /dev/null
+++ b/openssl/crypto/modes/asm/ghashv8-armx.pl
@@ -0,0 +1,241 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
+#
+# June 2014
+#
+# Initial version was developed in tight cooperation with Ard
+# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from
+# other assembly modules. Just like aesv8-armx.pl this module
+# supports both AArch32 and AArch64 execution modes.
+#
+# Current performance in cycles per processed byte:
+#
+#		PMULL[2]	32-bit NEON(*)
+# Apple A7	1.76		5.62
+# Cortex-A53	1.45		8.39
+# Cortex-A57	2.22		7.61
+#
+# (*)	presented for reference/comparison purposes;
+
+$flavour = shift;
+open STDOUT,">".shift;
+
+$Xi="x0";	# argument block
+$Htbl="x1";
+$inp="x2";
+$len="x3";
+
+$inc="x12";
+
+{
+my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
+my ($t0,$t1,$t2,$t3,$H,$Hhl)=map("q$_",(8..14));
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+___
+$code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/);
+$code.=".fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
+
+$code.=<<___;
+.global	gcm_init_v8
+.type	gcm_init_v8,%function
+.align	4
+gcm_init_v8:
+	vld1.64		{$t1},[x1]		@ load H
+	vmov.i8		$t0,#0xe1
+	vext.8		$IN,$t1,$t1,#8
+	vshl.i64	$t0,$t0,#57
+	vshr.u64	$t2,$t0,#63
+	vext.8		$t0,$t2,$t0,#8		@ t0=0xc2....01
+	vdup.32		$t1,${t1}[1]
+	vshr.u64	$t3,$IN,#63
+	vshr.s32	$t1,$t1,#31		@ broadcast carry bit
+	vand		$t3,$t3,$t0
+	vshl.i64	$IN,$IN,#1
+	vext.8		$t3,$t3,$t3,#8
+	vand		$t0,$t0,$t1
+	vorr		$IN,$IN,$t3		@ H<<<=1
+	veor		$IN,$IN,$t0		@ twisted H
+	vst1.64		{$IN},[x0]
+
+	ret
+.size	gcm_init_v8,.-gcm_init_v8
+
+.global	gcm_gmult_v8
+.type	gcm_gmult_v8,%function
+.align	4
+gcm_gmult_v8:
+	vld1.64		{$t1},[$Xi]		@ load Xi
+	vmov.i8		$t3,#0xe1
+	vld1.64		{$H},[$Htbl]		@ load twisted H
+	vshl.u64	$t3,$t3,#57
+#ifndef __ARMEB__
+	vrev64.8	$t1,$t1
+#endif
+	vext.8		$Hhl,$H,$H,#8
+	mov		$len,#0
+	vext.8		$IN,$t1,$t1,#8
+	mov		$inc,#0
+	veor		$Hhl,$Hhl,$H		@ Karatsuba pre-processing
+	mov		$inp,$Xi
+	b		.Lgmult_v8
+.size	gcm_gmult_v8,.-gcm_gmult_v8
+
+.global	gcm_ghash_v8
+.type	gcm_ghash_v8,%function
+.align	4
+gcm_ghash_v8:
+	vld1.64		{$Xl},[$Xi]		@ load [rotated] Xi
+	subs		$len,$len,#16
+	vmov.i8		$t3,#0xe1
+	mov		$inc,#16
+	vld1.64		{$H},[$Htbl]		@ load twisted H
+	cclr		$inc,eq
+	vext.8		$Xl,$Xl,$Xl,#8
+	vshl.u64	$t3,$t3,#57
+	vld1.64		{$t1},[$inp],$inc	@ load [rotated] inp
+	vext.8		$Hhl,$H,$H,#8
+#ifndef __ARMEB__
+	vrev64.8	$Xl,$Xl
+	vrev64.8	$t1,$t1
+#endif
+	veor		$Hhl,$Hhl,$H		@ Karatsuba pre-processing
+	vext.8		$IN,$t1,$t1,#8
+	b		.Loop_v8
+
+.align	4
+.Loop_v8:
+	vext.8		$t2,$Xl,$Xl,#8
+	veor		$IN,$IN,$Xl		@ inp^=Xi
+	veor		$t1,$t1,$t2		@ $t1 is rotated inp^Xi
+
+.Lgmult_v8:
+	vpmull.p64	$Xl,$H,$IN		@ H.lo�Xi.lo
+	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
+	vpmull2.p64	$Xh,$H,$IN		@ H.hi�Xi.hi
+	subs		$len,$len,#16
+	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)�(Xi.lo+Xi.hi)
+	cclr		$inc,eq
+
+	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
+	veor		$t2,$Xl,$Xh
+	veor		$Xm,$Xm,$t1
+	 vld1.64	{$t1},[$inp],$inc	@ load [rotated] inp
+	veor		$Xm,$Xm,$t2
+	vpmull.p64	$t2,$Xl,$t3		@ 1st phase
+
+	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
+	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
+#ifndef __ARMEB__
+	 vrev64.8	$t1,$t1
+#endif
+	veor		$Xl,$Xm,$t2
+	 vext.8		$IN,$t1,$t1,#8
+
+	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
+	vpmull.p64	$Xl,$Xl,$t3
+	veor		$t2,$t2,$Xh
+	veor		$Xl,$Xl,$t2
+	b.hs		.Loop_v8
+
+#ifndef __ARMEB__
+	vrev64.8	$Xl,$Xl
+#endif
+	vext.8		$Xl,$Xl,$Xl,#8
+	vst1.64		{$Xl},[$Xi]		@ write out Xi
+
+	ret
+.size	gcm_ghash_v8,.-gcm_ghash_v8
+___
+}
+$code.=<<___;
+.asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align  2
+___
+
+if ($flavour =~ /64/) {			######## 64-bit code
+    sub unvmov {
+	my $arg=shift;
+
+	$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
+	sprintf	"ins	v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
+    }
+    foreach(split("\n",$code)) {
+	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
+	s/vmov\.i8/movi/o		or	# fix up legacy mnemonics
+	s/vmov\s+(.*)/unvmov($1)/geo	or
+	s/vext\.8/ext/o			or
+	s/vshr\.s/sshr\.s/o		or
+	s/vshr/ushr/o			or
+	s/^(\s+)v/$1/o			or	# strip off v prefix
+	s/\bbx\s+lr\b/ret/o;
+
+	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
+	s/@\s/\/\//o;				# old->new style commentary
+
+	# fix up remainig legacy suffixes
+	s/\.[ui]?8(\s)/$1/o;
+	s/\.[uis]?32//o and s/\.16b/\.4s/go;
+	m/\.p64/o and s/\.16b/\.1q/o;		# 1st pmull argument
+	m/l\.p64/o and s/\.16b/\.1d/go;		# 2nd and 3rd pmull arguments
+	s/\.[uisp]?64//o and s/\.16b/\.2d/go;
+	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
+
+	print $_,"\n";
+    }
+} else {				######## 32-bit code
+    sub unvdup32 {
+	my $arg=shift;
+
+	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
+	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
+    }
+    sub unvpmullp64 {
+	my ($mnemonic,$arg)=@_;
+
+	if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
+	    my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
+				 |(($2&7)<<17)|(($2&8)<<4)
+				 |(($3&7)<<1) |(($3&8)<<2);
+	    $word |= 0x00010001	 if ($mnemonic =~ "2");
+	    # since ARMv7 instructions are always encoded little-endian.
+	    # correct solution is to use .inst directive, but older
+	    # assemblers don't implement it:-(
+	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+			$word&0xff,($word>>8)&0xff,
+			($word>>16)&0xff,($word>>24)&0xff,
+			$mnemonic,$arg;
+	}
+    }
+
+    foreach(split("\n",$code)) {
+	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
+	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
+        s/\/\/\s?/@ /o;				# new->old style commentary
+
+	# fix up remainig new-style suffixes
+	s/\],#[0-9]+/]!/o;
+
+	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o			or
+	s/vdup\.32\s+(.*)/unvdup32($1)/geo				or
+	s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo		or
+	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
+	s/^(\s+)b\./$1b/o						or
+	s/^(\s+)ret/$1bx\tlr/o;
+
+        print $_,"\n";
+    }
+}
+
+close STDOUT; # enforce flush