20 files changed, 6834 insertions, 3171 deletions
diff --git a/openssl/crypto/modes/Makefile b/openssl/crypto/modes/Makefile
index 3d8bafd57..cbcbfad4b 100644
--- a/openssl/crypto/modes/Makefile
+++ b/openssl/crypto/modes/Makefile
@@ -22,9 +22,9 @@ APPS=
 
 LIB=$(TOP)/libcrypto.a
 LIBSRC= cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c \
-	ccm128.c xts128.c
+	ccm128.c xts128.c wrap128.c
 LIBOBJ= cbc128.o ctr128.o cts128.o cfb128.o ofb128.o gcm128.o \
-	ccm128.o xts128.o $(MODES_ASM_OBJ)
+	ccm128.o xts128.o wrap128.o $(MODES_ASM_OBJ)
 
 SRC= $(LIBSRC)
 
@@ -50,20 +50,26 @@ ghash-x86.s:	asm/ghash-x86.pl
 	$(PERL) asm/ghash-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
 ghash-x86_64.s:	asm/ghash-x86_64.pl
 	$(PERL) asm/ghash-x86_64.pl $(PERLASM_SCHEME) > $@
+aesni-gcm-x86_64.s:	asm/aesni-gcm-x86_64.pl
+	$(PERL) asm/aesni-gcm-x86_64.pl $(PERLASM_SCHEME) > $@
 ghash-sparcv9.s:	asm/ghash-sparcv9.pl
 	$(PERL) asm/ghash-sparcv9.pl $@ $(CFLAGS)
 ghash-alpha.s:	asm/ghash-alpha.pl
-	(preproc=/tmp/$$$$.$@; trap "rm $$preproc" INT; \
+	(preproc=$$$$.$@.S; trap "rm $$preproc" INT; \
 	$(PERL) asm/ghash-alpha.pl > $$preproc && \
-	$(CC) -E $$preproc > $@ && rm $$preproc)
-
+	$(CC) -E -P $$preproc > $@ && rm $$preproc)
 ghash-parisc.s:	asm/ghash-parisc.pl
 	$(PERL) asm/ghash-parisc.pl $(PERLASM_SCHEME) $@
+ghashv8-armx.S:	asm/ghashv8-armx.pl
+	$(PERL) asm/ghashv8-armx.pl $(PERLASM_SCHEME) $@
+ghashp8-ppc.s:	asm/ghashp8-ppc.pl
+	$(PERL) asm/ghashp8-ppc.pl $(PERLASM_SCHEME) $@
 
 # GNU make "catch all"
 ghash-%.S:	asm/ghash-%.pl;	$(PERL) $< $(PERLASM_SCHEME) $@
 
 ghash-armv4.o:	ghash-armv4.S
+ghashv8-armx.o:	ghashv8-armx.S
 
 files:
 	$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
@@ -137,6 +143,14 @@ ofb128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
 ofb128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
 ofb128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
 ofb128.o: ../../include/openssl/symhacks.h modes_lcl.h ofb128.c
+wrap128.o: ../../e_os.h ../../include/openssl/bio.h
+wrap128.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+wrap128.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+wrap128.o: ../../include/openssl/lhash.h ../../include/openssl/modes.h
+wrap128.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h
+wrap128.o: ../../include/openssl/ossl_typ.h ../../include/openssl/safestack.h
+wrap128.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
+wrap128.o: ../cryptlib.h wrap128.c
 xts128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
 xts128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
 xts128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
diff --git a/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl b/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl
new file mode 100755
index 000000000..7e4e04ea2
--- /dev/null
+++ b/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl
@@ -0,0 +1,1057 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+#
+# AES-NI-CTR+GHASH stitch.
+#
+# February 2013
+#
+# OpenSSL GCM implementation is organized in such way that its
+# performance is rather close to the sum of its streamed components,
+# in the context parallelized AES-NI CTR and modulo-scheduled
+# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
+# was observed to perform significantly better than the sum of the
+# components on contemporary CPUs, the effort was deemed impossible to
+# justify. This module is based on combination of Intel submissions,
+# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
+# Locktyukhin of Intel Corp. who verified that it reduces shuffles
+# pressure with notable relative improvement, achieving 1.0 cycle per
+# byte processed with 128-bit key on Haswell processor, and 0.74 -
+# on Broadwell. [Mentioned results are raw profiled measurements for
+# favourable packet size, one divisible by 96. Applications using the
+# EVP interface will observe a few percent worse performance.]
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+	$avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+	$avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+	$avx = ($1>=10) + ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+	$avx = ($2>=3.0) + ($2>3.0);
+}
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+if ($avx>1) {{{
+
+($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+
+($Ii,$T1,$T2,$Hkey,
+ $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));
+
+($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));
+
+($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");
+
+$code=<<___;
+.text
+
+.type	_aesni_ctr32_ghash_6x,\@abi-omnipotent
+.align	32
+_aesni_ctr32_ghash_6x:
+	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb
+	sub		\$6,$len
+	vpxor		$Z0,$Z0,$Z0		# $Z0   = 0
+	vmovdqu		0x00-0x80($key),$rndkey
+	vpaddb		$T2,$T1,$inout1
+	vpaddb		$T2,$inout1,$inout2
+	vpaddb		$T2,$inout2,$inout3
+	vpaddb		$T2,$inout3,$inout4
+	vpaddb		$T2,$inout4,$inout5
+	vpxor		$rndkey,$T1,$inout0
+	vmovdqu		$Z0,16+8(%rsp)		# "$Z3" = 0
+	jmp		.Loop6x
+
+.align	32
+.Loop6x:
+	add		\$`6<<24`,$counter
+	jc		.Lhandle_ctr32		# discard $inout[1-5]?
+	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
+	  vpaddb	$T2,$inout5,$T1		# next counter value
+	  vpxor		$rndkey,$inout1,$inout1
+	  vpxor		$rndkey,$inout2,$inout2
+
+.Lresume_ctr32:
+	vmovdqu		$T1,($ivp)		# save next counter value
+	vpclmulqdq	\$0x10,$Hkey,$Z3,$Z1
+	  vpxor		$rndkey,$inout3,$inout3
+	  vmovups	0x10-0x80($key),$T2	# borrow $T2 for $rndkey
+	vpclmulqdq	\$0x01,$Hkey,$Z3,$Z2
+	xor		%r12,%r12
+	cmp		$in0,$end0
+
+	  vaesenc	$T2,$inout0,$inout0
+	vmovdqu		0x30+8(%rsp),$Ii	# I[4]
+	  vpxor		$rndkey,$inout4,$inout4
+	vpclmulqdq	\$0x00,$Hkey,$Z3,$T1
+	  vaesenc	$T2,$inout1,$inout1
+	  vpxor		$rndkey,$inout5,$inout5
+	setnc		%r12b
+	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3
+	  vaesenc	$T2,$inout2,$inout2
+	vmovdqu		0x10-0x20($Xip),$Hkey	# $Hkey^2
+	neg		%r12
+	  vaesenc	$T2,$inout3,$inout3
+	 vpxor		$Z1,$Z2,$Z2
+	vpclmulqdq	\$0x00,$Hkey,$Ii,$Z1
+	 vpxor		$Z0,$Xi,$Xi		# modulo-scheduled
+	  vaesenc	$T2,$inout4,$inout4
+	 vpxor		$Z1,$T1,$Z0
+	and		\$0x60,%r12
+	  vmovups	0x20-0x80($key),$rndkey
+	vpclmulqdq	\$0x10,$Hkey,$Ii,$T1
+	  vaesenc	$T2,$inout5,$inout5
+
+	vpclmulqdq	\$0x01,$Hkey,$Ii,$T2
+	lea		($in0,%r12),$in0
+	  vaesenc	$rndkey,$inout0,$inout0
+	 vpxor		16+8(%rsp),$Xi,$Xi	# modulo-scheduled [vpxor $Z3,$Xi,$Xi]
+	vpclmulqdq	\$0x11,$Hkey,$Ii,$Hkey
+	 vmovdqu	0x40+8(%rsp),$Ii	# I[3]
+	  vaesenc	$rndkey,$inout1,$inout1
+	movbe		0x58($in0),%r13
+	  vaesenc	$rndkey,$inout2,$inout2
+	movbe		0x50($in0),%r12
+	  vaesenc	$rndkey,$inout3,$inout3
+	mov		%r13,0x20+8(%rsp)
+	  vaesenc	$rndkey,$inout4,$inout4
+	mov		%r12,0x28+8(%rsp)
+	vmovdqu		0x30-0x20($Xip),$Z1	# borrow $Z1 for $Hkey^3
+	  vaesenc	$rndkey,$inout5,$inout5
+
+	  vmovups	0x30-0x80($key),$rndkey
+	 vpxor		$T1,$Z2,$Z2
+	vpclmulqdq	\$0x00,$Z1,$Ii,$T1
+	  vaesenc	$rndkey,$inout0,$inout0
+	 vpxor		$T2,$Z2,$Z2
+	vpclmulqdq	\$0x10,$Z1,$Ii,$T2
+	  vaesenc	$rndkey,$inout1,$inout1
+	 vpxor		$Hkey,$Z3,$Z3
+	vpclmulqdq	\$0x01,$Z1,$Ii,$Hkey
+	  vaesenc	$rndkey,$inout2,$inout2
+	vpclmulqdq	\$0x11,$Z1,$Ii,$Z1
+	 vmovdqu	0x50+8(%rsp),$Ii	# I[2]
+	  vaesenc	$rndkey,$inout3,$inout3
+	  vaesenc	$rndkey,$inout4,$inout4
+	 vpxor		$T1,$Z0,$Z0
+	vmovdqu		0x40-0x20($Xip),$T1	# borrow $T1 for $Hkey^4
+	  vaesenc	$rndkey,$inout5,$inout5
+
+	  vmovups	0x40-0x80($key),$rndkey
+	 vpxor		$T2,$Z2,$Z2
+	vpclmulqdq	\$0x00,$T1,$Ii,$T2
+	  vaesenc	$rndkey,$inout0,$inout0
+	 vpxor		$Hkey,$Z2,$Z2
+	vpclmulqdq	\$0x10,$T1,$Ii,$Hkey
+	  vaesenc	$rndkey,$inout1,$inout1
+	movbe		0x48($in0),%r13
+	 vpxor		$Z1,$Z3,$Z3
+	vpclmulqdq	\$0x01,$T1,$Ii,$Z1
+	  vaesenc	$rndkey,$inout2,$inout2
+	movbe		0x40($in0),%r12
+	vpclmulqdq	\$0x11,$T1,$Ii,$T1
+	 vmovdqu	0x60+8(%rsp),$Ii	# I[1]
+	  vaesenc	$rndkey,$inout3,$inout3
+	mov		%r13,0x30+8(%rsp)
+	  vaesenc	$rndkey,$inout4,$inout4
+	mov		%r12,0x38+8(%rsp)
+	 vpxor		$T2,$Z0,$Z0
+	vmovdqu		0x60-0x20($Xip),$T2	# borrow $T2 for $Hkey^5
+	  vaesenc	$rndkey,$inout5,$inout5
+
+	  vmovups	0x50-0x80($key),$rndkey
+	 vpxor		$Hkey,$Z2,$Z2
+	vpclmulqdq	\$0x00,$T2,$Ii,$Hkey
+	  vaesenc	$rndkey,$inout0,$inout0
+	 vpxor		$Z1,$Z2,$Z2
+	vpclmulqdq	\$0x10,$T2,$Ii,$Z1
+	  vaesenc	$rndkey,$inout1,$inout1
+	movbe		0x38($in0),%r13
+	 vpxor		$T1,$Z3,$Z3
+	vpclmulqdq	\$0x01,$T2,$Ii,$T1
+	 vpxor		0x70+8(%rsp),$Xi,$Xi	# accumulate I[0]
+	  vaesenc	$rndkey,$inout2,$inout2
+	movbe		0x30($in0),%r12
+	vpclmulqdq	\$0x11,$T2,$Ii,$T2
+	  vaesenc	$rndkey,$inout3,$inout3
+	mov		%r13,0x40+8(%rsp)
+	  vaesenc	$rndkey,$inout4,$inout4
+	mov		%r12,0x48+8(%rsp)
+	 vpxor		$Hkey,$Z0,$Z0
+	 vmovdqu	0x70-0x20($Xip),$Hkey	# $Hkey^6
+	  vaesenc	$rndkey,$inout5,$inout5
+
+	  vmovups	0x60-0x80($key),$rndkey
+	 vpxor		$Z1,$Z2,$Z2
+	vpclmulqdq	\$0x10,$Hkey,$Xi,$Z1
+	  vaesenc	$rndkey,$inout0,$inout0
+	 vpxor		$T1,$Z2,$Z2
+	vpclmulqdq	\$0x01,$Hkey,$Xi,$T1
+	  vaesenc	$rndkey,$inout1,$inout1
+	movbe		0x28($in0),%r13
+	 vpxor		$T2,$Z3,$Z3
+	vpclmulqdq	\$0x00,$Hkey,$Xi,$T2
+	  vaesenc	$rndkey,$inout2,$inout2
+	movbe		0x20($in0),%r12
+	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xi
+	  vaesenc	$rndkey,$inout3,$inout3
+	mov		%r13,0x50+8(%rsp)
+	  vaesenc	$rndkey,$inout4,$inout4
+	mov		%r12,0x58+8(%rsp)
+	vpxor		$Z1,$Z2,$Z2
+	  vaesenc	$rndkey,$inout5,$inout5
+	vpxor		$T1,$Z2,$Z2
+
+	  vmovups	0x70-0x80($key),$rndkey
+	vpslldq		\$8,$Z2,$Z1
+	vpxor		$T2,$Z0,$Z0
+	vmovdqu		0x10($const),$Hkey	# .Lpoly
+
+	  vaesenc	$rndkey,$inout0,$inout0
+	vpxor		$Xi,$Z3,$Z3
+	  vaesenc	$rndkey,$inout1,$inout1
+	vpxor		$Z1,$Z0,$Z0
+	movbe		0x18($in0),%r13
+	  vaesenc	$rndkey,$inout2,$inout2
+	movbe		0x10($in0),%r12
+	vpalignr	\$8,$Z0,$Z0,$Ii		# 1st phase
+	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0
+	mov		%r13,0x60+8(%rsp)
+	  vaesenc	$rndkey,$inout3,$inout3
+	mov		%r12,0x68+8(%rsp)
+	  vaesenc	$rndkey,$inout4,$inout4
+	  vmovups	0x80-0x80($key),$T1	# borrow $T1 for $rndkey
+	  vaesenc	$rndkey,$inout5,$inout5
+
+	  vaesenc	$T1,$inout0,$inout0
+	  vmovups	0x90-0x80($key),$rndkey
+	  vaesenc	$T1,$inout1,$inout1
+	vpsrldq		\$8,$Z2,$Z2
+	  vaesenc	$T1,$inout2,$inout2
+	vpxor		$Z2,$Z3,$Z3
+	  vaesenc	$T1,$inout3,$inout3
+	vpxor		$Ii,$Z0,$Z0
+	movbe		0x08($in0),%r13
+	  vaesenc	$T1,$inout4,$inout4
+	movbe		0x00($in0),%r12
+	  vaesenc	$T1,$inout5,$inout5
+	  vmovups	0xa0-0x80($key),$T1
+	  cmp		\$11,$rounds
+	  jb		.Lenc_tail		# 128-bit key
+
+	  vaesenc	$rndkey,$inout0,$inout0
+	  vaesenc	$rndkey,$inout1,$inout1
+	  vaesenc	$rndkey,$inout2,$inout2
+	  vaesenc	$rndkey,$inout3,$inout3
+	  vaesenc	$rndkey,$inout4,$inout4
+	  vaesenc	$rndkey,$inout5,$inout5
+
+	  vaesenc	$T1,$inout0,$inout0
+	  vaesenc	$T1,$inout1,$inout1
+	  vaesenc	$T1,$inout2,$inout2
+	  vaesenc	$T1,$inout3,$inout3
+	  vaesenc	$T1,$inout4,$inout4
+	  vmovups	0xb0-0x80($key),$rndkey
+	  vaesenc	$T1,$inout5,$inout5
+	  vmovups	0xc0-0x80($key),$T1
+	  je		.Lenc_tail		# 192-bit key
+
+	  vaesenc	$rndkey,$inout0,$inout0
+	  vaesenc	$rndkey,$inout1,$inout1
+	  vaesenc	$rndkey,$inout2,$inout2
+	  vaesenc	$rndkey,$inout3,$inout3
+	  vaesenc	$rndkey,$inout4,$inout4
+	  vaesenc	$rndkey,$inout5,$inout5
+
+	  vaesenc	$T1,$inout0,$inout0
+	  vaesenc	$T1,$inout1,$inout1
+	  vaesenc	$T1,$inout2,$inout2
+	  vaesenc	$T1,$inout3,$inout3
+	  vaesenc	$T1,$inout4,$inout4
+	  vmovups	0xd0-0x80($key),$rndkey
+	  vaesenc	$T1,$inout5,$inout5
+	  vmovups	0xe0-0x80($key),$T1
+	  jmp		.Lenc_tail		# 256-bit key
+
+.align	32
+.Lhandle_ctr32:
+	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
+	  vpshufb	$Ii,$T1,$Z2		# byte-swap counter
+	  vmovdqu	0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
+	  vpaddd	0x40($const),$Z2,$inout1	# .Lone_lsb
+	  vpaddd	$Z1,$Z2,$inout2
+	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
+	  vpaddd	$Z1,$inout1,$inout3
+	  vpshufb	$Ii,$inout1,$inout1
+	  vpaddd	$Z1,$inout2,$inout4
+	  vpshufb	$Ii,$inout2,$inout2
+	  vpxor		$rndkey,$inout1,$inout1
+	  vpaddd	$Z1,$inout3,$inout5
+	  vpshufb	$Ii,$inout3,$inout3
+	  vpxor		$rndkey,$inout2,$inout2
+	  vpaddd	$Z1,$inout4,$T1		# byte-swapped next counter value
+	  vpshufb	$Ii,$inout4,$inout4
+	  vpshufb	$Ii,$inout5,$inout5
+	  vpshufb	$Ii,$T1,$T1		# next counter value
+	jmp		.Lresume_ctr32
+
+.align	32
+.Lenc_tail:
+	  vaesenc	$rndkey,$inout0,$inout0
+	vmovdqu		$Z3,16+8(%rsp)		# postpone vpxor $Z3,$Xi,$Xi
+	vpalignr	\$8,$Z0,$Z0,$Xi		# 2nd phase
+	  vaesenc	$rndkey,$inout1,$inout1
+	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0
+	  vpxor		0x00($inp),$T1,$T2
+	  vaesenc	$rndkey,$inout2,$inout2
+	  vpxor		0x10($inp),$T1,$Ii
+	  vaesenc	$rndkey,$inout3,$inout3
+	  vpxor		0x20($inp),$T1,$Z1
+	  vaesenc	$rndkey,$inout4,$inout4
+	  vpxor		0x30($inp),$T1,$Z2
+	  vaesenc	$rndkey,$inout5,$inout5
+	  vpxor		0x40($inp),$T1,$Z3
+	  vpxor		0x50($inp),$T1,$Hkey
+	  vmovdqu	($ivp),$T1		# load next counter value
+
+	  vaesenclast	$T2,$inout0,$inout0
+	  vmovdqu	0x20($const),$T2	# borrow $T2, .Lone_msb
+	  vaesenclast	$Ii,$inout1,$inout1
+	 vpaddb		$T2,$T1,$Ii
+	mov		%r13,0x70+8(%rsp)
+	lea		0x60($inp),$inp
+	  vaesenclast	$Z1,$inout2,$inout2
+	 vpaddb		$T2,$Ii,$Z1
+	mov		%r12,0x78+8(%rsp)
+	lea		0x60($out),$out
+	  vmovdqu	0x00-0x80($key),$rndkey
+	  vaesenclast	$Z2,$inout3,$inout3
+	 vpaddb		$T2,$Z1,$Z2
+	  vaesenclast	$Z3, $inout4,$inout4
+	 vpaddb		$T2,$Z2,$Z3
+	  vaesenclast	$Hkey,$inout5,$inout5
+	 vpaddb		$T2,$Z3,$Hkey
+
+	add		\$0x60,$ret
+	sub		\$0x6,$len
+	jc		.L6x_done
+
+	  vmovups	$inout0,-0x60($out)	# save output
+	 vpxor		$rndkey,$T1,$inout0
+	  vmovups	$inout1,-0x50($out)
+	 vmovdqa	$Ii,$inout1		# 0 latency
+	  vmovups	$inout2,-0x40($out)
+	 vmovdqa	$Z1,$inout2		# 0 latency
+	  vmovups	$inout3,-0x30($out)
+	 vmovdqa	$Z2,$inout3		# 0 latency
+	  vmovups	$inout4,-0x20($out)
+	 vmovdqa	$Z3,$inout4		# 0 latency
+	  vmovups	$inout5,-0x10($out)
+	 vmovdqa	$Hkey,$inout5		# 0 latency
+	vmovdqu		0x20+8(%rsp),$Z3	# I[5]
+	jmp		.Loop6x
+
+.L6x_done:
+	vpxor		16+8(%rsp),$Xi,$Xi	# modulo-scheduled
+	vpxor		$Z0,$Xi,$Xi		# modulo-scheduled
+
+	ret
+.size	_aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
+___
+######################################################################
+#
+# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
+#		const AES_KEY *key, unsigned char iv[16],
+#		struct { u128 Xi,H,Htbl[9]; } *Xip);
+$code.=<<___;
+.globl	aesni_gcm_decrypt
+.type	aesni_gcm_decrypt,\@function,6
+.align	32
+aesni_gcm_decrypt:
+	xor	$ret,$ret
+	cmp	\$0x60,$len			# minimal accepted length
+	jb	.Lgcm_dec_abort
+
+	lea	(%rsp),%rax			# save stack pointer
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+___
+$code.=<<___ if ($win64);
+	lea	-0xa8(%rsp),%rsp
+	movaps	%xmm6,-0xd8(%rax)
+	movaps	%xmm7,-0xc8(%rax)
+	movaps	%xmm8,-0xb8(%rax)
+	movaps	%xmm9,-0xa8(%rax)
+	movaps	%xmm10,-0x98(%rax)
+	movaps	%xmm11,-0x88(%rax)
+	movaps	%xmm12,-0x78(%rax)
+	movaps	%xmm13,-0x68(%rax)
+	movaps	%xmm14,-0x58(%rax)
+	movaps	%xmm15,-0x48(%rax)
+.Lgcm_dec_body:
+___
+$code.=<<___;
+	vzeroupper
+
+	vmovdqu		($ivp),$T1		# input counter value
+	add		\$-128,%rsp
+	mov		12($ivp),$counter
+	lea		.Lbswap_mask(%rip),$const
+	lea		-0x80($key),$in0	# borrow $in0
+	mov		\$0xf80,$end0		# borrow $end0
+	vmovdqu		($Xip),$Xi		# load Xi
+	and		\$-128,%rsp		# ensure stack alignment
+	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
+	lea		0x80($key),$key		# size optimization
+	lea		0x20+0x20($Xip),$Xip	# size optimization
+	mov		0xf0-0x80($key),$rounds
+	vpshufb		$Ii,$Xi,$Xi
+
+	and		$end0,$in0
+	and		%rsp,$end0
+	sub		$in0,$end0
+	jc		.Ldec_no_key_aliasing
+	cmp		\$768,$end0
+	jnc		.Ldec_no_key_aliasing
+	sub		$end0,%rsp		# avoid aliasing with key
+.Ldec_no_key_aliasing:
+
+	vmovdqu		0x50($inp),$Z3		# I[5]
+	lea		($inp),$in0
+	vmovdqu		0x40($inp),$Z0
+	lea		-0xc0($inp,$len),$end0
+	vmovdqu		0x30($inp),$Z1
+	shr		\$4,$len
+	xor		$ret,$ret
+	vmovdqu		0x20($inp),$Z2
+	 vpshufb	$Ii,$Z3,$Z3		# passed to _aesni_ctr32_ghash_6x
+	vmovdqu		0x10($inp),$T2
+	 vpshufb	$Ii,$Z0,$Z0
+	vmovdqu		($inp),$Hkey
+	 vpshufb	$Ii,$Z1,$Z1
+	vmovdqu		$Z0,0x30(%rsp)
+	 vpshufb	$Ii,$Z2,$Z2
+	vmovdqu		$Z1,0x40(%rsp)
+	 vpshufb	$Ii,$T2,$T2
+	vmovdqu		$Z2,0x50(%rsp)
+	 vpshufb	$Ii,$Hkey,$Hkey
+	vmovdqu		$T2,0x60(%rsp)
+	vmovdqu		$Hkey,0x70(%rsp)
+
+	call		_aesni_ctr32_ghash_6x
+
+	vmovups		$inout0,-0x60($out)	# save output
+	vmovups		$inout1,-0x50($out)
+	vmovups		$inout2,-0x40($out)
+	vmovups		$inout3,-0x30($out)
+	vmovups		$inout4,-0x20($out)
+	vmovups		$inout5,-0x10($out)
+
+	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask
+	vmovdqu		$Xi,-0x40($Xip)		# output Xi
+
+	vzeroupper
+___
+$code.=<<___ if ($win64);
+	movaps	-0xd8(%rax),%xmm6
+	movaps	-0xd8(%rax),%xmm7
+	movaps	-0xb8(%rax),%xmm8
+	movaps	-0xa8(%rax),%xmm9
+	movaps	-0x98(%rax),%xmm10
+	movaps	-0x88(%rax),%xmm11
+	movaps	-0x78(%rax),%xmm12
+	movaps	-0x68(%rax),%xmm13
+	movaps	-0x58(%rax),%xmm14
+	movaps	-0x48(%rax),%xmm15
+___
+$code.=<<___;
+	mov	-48(%rax),%r15
+	mov	-40(%rax),%r14
+	mov	-32(%rax),%r13
+	mov	-24(%rax),%r12
+	mov	-16(%rax),%rbp
+	mov	-8(%rax),%rbx
+	lea	(%rax),%rsp		# restore %rsp
+.Lgcm_dec_abort:
+	mov	$ret,%rax		# return value
+	ret
+.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
+___
+
+$code.=<<___;
+.type	_aesni_ctr32_6x,\@abi-omnipotent
+.align	32
+_aesni_ctr32_6x:
+	vmovdqu		0x00-0x80($key),$Z0	# borrow $Z0 for $rndkey
+	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb
+	lea		-1($rounds),%r13
+	vmovups		0x10-0x80($key),$rndkey
+	lea		0x20-0x80($key),%r12
+	vpxor		$Z0,$T1,$inout0
+	add		\$`6<<24`,$counter
+	jc		.Lhandle_ctr32_2
+	vpaddb		$T2,$T1,$inout1
+	vpaddb		$T2,$inout1,$inout2
+	vpxor		$Z0,$inout1,$inout1
+	vpaddb		$T2,$inout2,$inout3
+	vpxor		$Z0,$inout2,$inout2
+	vpaddb		$T2,$inout3,$inout4
+	vpxor		$Z0,$inout3,$inout3
+	vpaddb		$T2,$inout4,$inout5
+	vpxor		$Z0,$inout4,$inout4
+	vpaddb		$T2,$inout5,$T1
+	vpxor		$Z0,$inout5,$inout5
+	jmp		.Loop_ctr32
+
+.align	16
+.Loop_ctr32:
+	vaesenc		$rndkey,$inout0,$inout0
+	vaesenc		$rndkey,$inout1,$inout1
+	vaesenc		$rndkey,$inout2,$inout2
+	vaesenc		$rndkey,$inout3,$inout3
+	vaesenc		$rndkey,$inout4,$inout4
+	vaesenc		$rndkey,$inout5,$inout5
+	vmovups		(%r12),$rndkey
+	lea		0x10(%r12),%r12
+	dec		%r13d
+	jnz		.Loop_ctr32
+
+	vmovdqu		(%r12),$Hkey		# last round key
+	vaesenc		$rndkey,$inout0,$inout0
+	vpxor		0x00($inp),$Hkey,$Z0
+	vaesenc		$rndkey,$inout1,$inout1
+	vpxor		0x10($inp),$Hkey,$Z1
+	vaesenc		$rndkey,$inout2,$inout2
+	vpxor		0x20($inp),$Hkey,$Z2
+	vaesenc		$rndkey,$inout3,$inout3
+	vpxor		0x30($inp),$Hkey,$Xi
+	vaesenc		$rndkey,$inout4,$inout4
+	vpxor		0x40($inp),$Hkey,$T2
+	vaesenc		$rndkey,$inout5,$inout5
+	vpxor		0x50($inp),$Hkey,$Hkey
+	lea		0x60($inp),$inp
+
+	vaesenclast	$Z0,$inout0,$inout0
+	vaesenclast	$Z1,$inout1,$inout1
+	vaesenclast	$Z2,$inout2,$inout2
+	vaesenclast	$Xi,$inout3,$inout3
+	vaesenclast	$T2,$inout4,$inout4
+	vaesenclast	$Hkey,$inout5,$inout5
+	vmovups		$inout0,0x00($out)
+	vmovups		$inout1,0x10($out)
+	vmovups		$inout2,0x20($out)
+	vmovups		$inout3,0x30($out)
+	vmovups		$inout4,0x40($out)
+	vmovups		$inout5,0x50($out)
+	lea		0x60($out),$out
+
+	ret
+.align	32
+.Lhandle_ctr32_2:
+	vpshufb		$Ii,$T1,$Z2		# byte-swap counter
+	vmovdqu		0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
+	vpaddd		0x40($const),$Z2,$inout1	# .Lone_lsb
+	vpaddd		$Z1,$Z2,$inout2
+	vpaddd		$Z1,$inout1,$inout3
+	vpshufb		$Ii,$inout1,$inout1
+	vpaddd		$Z1,$inout2,$inout4
+	vpshufb		$Ii,$inout2,$inout2
+	vpxor		$Z0,$inout1,$inout1
+	vpaddd		$Z1,$inout3,$inout5
+	vpshufb		$Ii,$inout3,$inout3
+	vpxor		$Z0,$inout2,$inout2
+	vpaddd		$Z1,$inout4,$T1		# byte-swapped next counter value
+	vpshufb		$Ii,$inout4,$inout4
+	vpxor		$Z0,$inout3,$inout3
+	vpshufb		$Ii,$inout5,$inout5
+	vpxor		$Z0,$inout4,$inout4
+	vpshufb		$Ii,$T1,$T1		# next counter value
+	vpxor		$Z0,$inout5,$inout5
+	jmp	.Loop_ctr32
+.size	_aesni_ctr32_6x,.-_aesni_ctr32_6x
+
+.globl	aesni_gcm_encrypt
+.type	aesni_gcm_encrypt,\@function,6
+.align	32
+aesni_gcm_encrypt:
+	xor	$ret,$ret
+	cmp	\$0x60*3,$len			# minimal accepted length
+	jb	.Lgcm_enc_abort
+
+	lea	(%rsp),%rax			# save stack pointer
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+___
+$code.=<<___ if ($win64);
+	lea	-0xa8(%rsp),%rsp
+	movaps	%xmm6,-0xd8(%rax)
+	movaps	%xmm7,-0xc8(%rax)
+	movaps	%xmm8,-0xb8(%rax)
+	movaps	%xmm9,-0xa8(%rax)
+	movaps	%xmm10,-0x98(%rax)
+	movaps	%xmm11,-0x88(%rax)
+	movaps	%xmm12,-0x78(%rax)
+	movaps	%xmm13,-0x68(%rax)
+	movaps	%xmm14,-0x58(%rax)
+	movaps	%xmm15,-0x48(%rax)
+.Lgcm_enc_body:
+___
+$code.=<<___;
+	vzeroupper
+
+	vmovdqu		($ivp),$T1		# input counter value
+	add		\$-128,%rsp
+	mov		12($ivp),$counter
+	lea		.Lbswap_mask(%rip),$const
+	lea		-0x80($key),$in0	# borrow $in0
+	mov		\$0xf80,$end0		# borrow $end0
+	lea		0x80($key),$key		# size optimization
+	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
+	and		\$-128,%rsp		# ensure stack alignment
+	mov		0xf0-0x80($key),$rounds
+
+	and		$end0,$in0
+	and		%rsp,$end0
+	sub		$in0,$end0
+	jc		.Lenc_no_key_aliasing
+	cmp		\$768,$end0
+	jnc		.Lenc_no_key_aliasing
+	sub		$end0,%rsp		# avoid aliasing with key
+.Lenc_no_key_aliasing:
+
+	lea		($out),$in0
+	lea		-0xc0($out,$len),$end0
+	shr		\$4,$len
+
+	call		_aesni_ctr32_6x
+	vpshufb		$Ii,$inout0,$Xi		# save bswapped output on stack
+	vpshufb		$Ii,$inout1,$T2
+	vmovdqu		$Xi,0x70(%rsp)
+	vpshufb		$Ii,$inout2,$Z0
+	vmovdqu		$T2,0x60(%rsp)
+	vpshufb		$Ii,$inout3,$Z1
+	vmovdqu		$Z0,0x50(%rsp)
+	vpshufb		$Ii,$inout4,$Z2
+	vmovdqu		$Z1,0x40(%rsp)
+	vpshufb		$Ii,$inout5,$Z3		# passed to _aesni_ctr32_ghash_6x
+	vmovdqu		$Z2,0x30(%rsp)
+
+	call		_aesni_ctr32_6x
+
+	vmovdqu		($Xip),$Xi		# load Xi
+	lea		0x20+0x20($Xip),$Xip	# size optimization
+	sub		\$12,$len
+	mov		\$0x60*2,$ret
+	vpshufb		$Ii,$Xi,$Xi
+
+	call		_aesni_ctr32_ghash_6x
+	vmovdqu		0x20(%rsp),$Z3		# I[5]
+	 vmovdqu	($const),$Ii		# borrow $Ii for .Lbswap_mask
+	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
+	vpunpckhqdq	$Z3,$Z3,$T1
+	vmovdqu		0x20-0x20($Xip),$rndkey	# borrow $rndkey for $HK
+	 vmovups	$inout0,-0x60($out)	# save output
+	 vpshufb	$Ii,$inout0,$inout0	# but keep bswapped copy
+	vpxor		$Z3,$T1,$T1
+	 vmovups	$inout1,-0x50($out)
+	 vpshufb	$Ii,$inout1,$inout1
+	 vmovups	$inout2,-0x40($out)
+	 vpshufb	$Ii,$inout2,$inout2
+	 vmovups	$inout3,-0x30($out)
+	 vpshufb	$Ii,$inout3,$inout3
+	 vmovups	$inout4,-0x20($out)
+	 vpshufb	$Ii,$inout4,$inout4
+	 vmovups	$inout5,-0x10($out)
+	 vpshufb	$Ii,$inout5,$inout5
+	 vmovdqu	$inout0,0x10(%rsp)	# free $inout0
+___
+{ my ($HK,$T3)=($rndkey,$inout0);
+
+$code.=<<___;
+	 vmovdqu	0x30(%rsp),$Z2		# I[4]
+	 vmovdqu	0x10-0x20($Xip),$Ii	# borrow $Ii for $Hkey^2
+	 vpunpckhqdq	$Z2,$Z2,$T2
+	vpclmulqdq	\$0x00,$Hkey,$Z3,$Z1
+	 vpxor		$Z2,$T2,$T2
+	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3
+	vpclmulqdq	\$0x00,$HK,$T1,$T1
+
+	 vmovdqu	0x40(%rsp),$T3		# I[3]
+	vpclmulqdq	\$0x00,$Ii,$Z2,$Z0
+	 vmovdqu	0x30-0x20($Xip),$Hkey	# $Hkey^3
+	vpxor		$Z1,$Z0,$Z0
+	 vpunpckhqdq	$T3,$T3,$Z1
+	vpclmulqdq	\$0x11,$Ii,$Z2,$Z2
+	 vpxor		$T3,$Z1,$Z1
+	vpxor		$Z3,$Z2,$Z2
+	vpclmulqdq	\$0x10,$HK,$T2,$T2
+	 vmovdqu	0x50-0x20($Xip),$HK
+	vpxor		$T1,$T2,$T2
+
+	 vmovdqu	0x50(%rsp),$T1		# I[2]
+	vpclmulqdq	\$0x00,$Hkey,$T3,$Z3
+	 vmovdqu	0x40-0x20($Xip),$Ii	# borrow $Ii for $Hkey^4
+	vpxor		$Z0,$Z3,$Z3
+	 vpunpckhqdq	$T1,$T1,$Z0
+	vpclmulqdq	\$0x11,$Hkey,$T3,$T3
+	 vpxor		$T1,$Z0,$Z0
+	vpxor		$Z2,$T3,$T3
+	vpclmulqdq	\$0x00,$HK,$Z1,$Z1
+	vpxor		$T2,$Z1,$Z1
+
+	 vmovdqu	0x60(%rsp),$T2		# I[1]
+	vpclmulqdq	\$0x00,$Ii,$T1,$Z2
+	 vmovdqu	0x60-0x20($Xip),$Hkey	# $Hkey^5
+	vpxor		$Z3,$Z2,$Z2
+	 vpunpckhqdq	$T2,$T2,$Z3
+	vpclmulqdq	\$0x11,$Ii,$T1,$T1
+	 vpxor		$T2,$Z3,$Z3
+	vpxor		$T3,$T1,$T1
+	vpclmulqdq	\$0x10,$HK,$Z0,$Z0
+	 vmovdqu	0x80-0x20($Xip),$HK
+	vpxor		$Z1,$Z0,$Z0
+
+	 vpxor		0x70(%rsp),$Xi,$Xi	# accumulate I[0]
+	vpclmulqdq	\$0x00,$Hkey,$T2,$Z1
+	 vmovdqu	0x70-0x20($Xip),$Ii	# borrow $Ii for $Hkey^6
+	 vpunpckhqdq	$Xi,$Xi,$T3
+	vpxor		$Z2,$Z1,$Z1
+	vpclmulqdq	\$0x11,$Hkey,$T2,$T2
+	 vpxor		$Xi,$T3,$T3
+	vpxor		$T1,$T2,$T2
+	vpclmulqdq	\$0x00,$HK,$Z3,$Z3
+	vpxor		$Z0,$Z3,$Z0
+
+	vpclmulqdq	\$0x00,$Ii,$Xi,$Z2
+	 vmovdqu	0x00-0x20($Xip),$Hkey	# $Hkey^1
+	 vpunpckhqdq	$inout5,$inout5,$T1
+	vpclmulqdq	\$0x11,$Ii,$Xi,$Xi
+	 vpxor		$inout5,$T1,$T1
+	vpxor		$Z1,$Z2,$Z1
+	vpclmulqdq	\$0x10,$HK,$T3,$T3
+	 vmovdqu	0x20-0x20($Xip),$HK
+	vpxor		$T2,$Xi,$Z3
+	vpxor		$Z0,$T3,$Z2
+
+	 vmovdqu	0x10-0x20($Xip),$Ii	# borrow $Ii for $Hkey^2
+	  vpxor		$Z1,$Z3,$T3		# aggregated Karatsuba post-processing
+	vpclmulqdq	\$0x00,$Hkey,$inout5,$Z0
+	  vpxor		$T3,$Z2,$Z2
+	 vpunpckhqdq	$inout4,$inout4,$T2
+	vpclmulqdq	\$0x11,$Hkey,$inout5,$inout5
+	 vpxor		$inout4,$T2,$T2
+	  vpslldq	\$8,$Z2,$T3
+	vpclmulqdq	\$0x00,$HK,$T1,$T1
+	  vpxor		$T3,$Z1,$Xi
+	  vpsrldq	\$8,$Z2,$Z2
+	  vpxor		$Z2,$Z3,$Z3
+
+	vpclmulqdq	\$0x00,$Ii,$inout4,$Z1
+	 vmovdqu	0x30-0x20($Xip),$Hkey	# $Hkey^3
+	vpxor		$Z0,$Z1,$Z1
+	 vpunpckhqdq	$inout3,$inout3,$T3
+	vpclmulqdq	\$0x11,$Ii,$inout4,$inout4
+	 vpxor		$inout3,$T3,$T3
+	vpxor		$inout5,$inout4,$inout4
+	  vpalignr	\$8,$Xi,$Xi,$inout5	# 1st phase
+	vpclmulqdq	\$0x10,$HK,$T2,$T2
+	 vmovdqu	0x50-0x20($Xip),$HK
+	vpxor		$T1,$T2,$T2
+
+	vpclmulqdq	\$0x00,$Hkey,$inout3,$Z0
+	 vmovdqu	0x40-0x20($Xip),$Ii	# borrow $Ii for $Hkey^4
+	vpxor		$Z1,$Z0,$Z0
+	 vpunpckhqdq	$inout2,$inout2,$T1
+	vpclmulqdq	\$0x11,$Hkey,$inout3,$inout3
+	 vpxor		$inout2,$T1,$T1
+	vpxor		$inout4,$inout3,$inout3
+	  vxorps	0x10(%rsp),$Z3,$Z3	# accumulate $inout0
+	vpclmulqdq	\$0x00,$HK,$T3,$T3
+	vpxor		$T2,$T3,$T3
+
+	  vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi
+	  vxorps	$inout5,$Xi,$Xi
+
+	vpclmulqdq	\$0x00,$Ii,$inout2,$Z1
+	 vmovdqu	0x60-0x20($Xip),$Hkey	# $Hkey^5
+	vpxor		$Z0,$Z1,$Z1
+	 vpunpckhqdq	$inout1,$inout1,$T2
+	vpclmulqdq	\$0x11,$Ii,$inout2,$inout2
+	 vpxor		$inout1,$T2,$T2
+	  vpalignr	\$8,$Xi,$Xi,$inout5	# 2nd phase
+	vpxor		$inout3,$inout2,$inout2
+	vpclmulqdq	\$0x10,$HK,$T1,$T1
+	 vmovdqu	0x80-0x20($Xip),$HK
+	vpxor		$T3,$T1,$T1
+
+	  vxorps	$Z3,$inout5,$inout5
+	  vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi
+	  vxorps	$inout5,$Xi,$Xi
+
+	vpclmulqdq	\$0x00,$Hkey,$inout1,$Z0
+	 vmovdqu	0x70-0x20($Xip),$Ii	# borrow $Ii for $Hkey^6
+	vpxor		$Z1,$Z0,$Z0
+	 vpunpckhqdq	$Xi,$Xi,$T3
+	vpclmulqdq	\$0x11,$Hkey,$inout1,$inout1
+	 vpxor		$Xi,$T3,$T3
+	vpxor		$inout2,$inout1,$inout1
+	vpclmulqdq	\$0x00,$HK,$T2,$T2
+	vpxor		$T1,$T2,$T2
+
+	vpclmulqdq	\$0x00,$Ii,$Xi,$Z1
+	vpclmulqdq	\$0x11,$Ii,$Xi,$Z3
+	vpxor		$Z0,$Z1,$Z1
+	vpclmulqdq	\$0x10,$HK,$T3,$Z2
+	vpxor		$inout1,$Z3,$Z3
+	vpxor		$T2,$Z2,$Z2
+
+	vpxor		$Z1,$Z3,$Z0		# aggregated Karatsuba post-processing
+	vpxor		$Z0,$Z2,$Z2
+	vpslldq		\$8,$Z2,$T1
+	vmovdqu		0x10($const),$Hkey	# .Lpoly
+	vpsrldq		\$8,$Z2,$Z2
+	vpxor		$T1,$Z1,$Xi
+	vpxor		$Z2,$Z3,$Z3
+
+	vpalignr	\$8,$Xi,$Xi,$T2		# 1st phase
+	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi
+	vpxor		$T2,$Xi,$Xi
+
+	vpalignr	\$8,$Xi,$Xi,$T2		# 2nd phase
+	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi
+	vpxor		$Z3,$T2,$T2
+	vpxor		$T2,$Xi,$Xi
+___
+}
+$code.=<<___;
+	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask
+	vmovdqu		$Xi,-0x40($Xip)		# output Xi
+
+	vzeroupper
+___
+$code.=<<___ if ($win64);
+	movaps	-0xd8(%rax),%xmm6
+	movaps	-0xc8(%rax),%xmm7
+	movaps	-0xb8(%rax),%xmm8
+	movaps	-0xa8(%rax),%xmm9
+	movaps	-0x98(%rax),%xmm10
+	movaps	-0x88(%rax),%xmm11
+	movaps	-0x78(%rax),%xmm12
+	movaps	-0x68(%rax),%xmm13
+	movaps	-0x58(%rax),%xmm14
+	movaps	-0x48(%rax),%xmm15
+___
+$code.=<<___;
+	mov	-48(%rax),%r15
+	mov	-40(%rax),%r14
+	mov	-32(%rax),%r13
+	mov	-24(%rax),%r12
+	mov	-16(%rax),%rbp
+	mov	-8(%rax),%rbx
+	lea	(%rax),%rsp		# restore %rsp
+.Lgcm_enc_abort:
+	mov	$ret,%rax		# return value
+	ret
+.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
+___
+
+$code.=<<___;
+.align	64
+.Lbswap_mask:
+	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lpoly:
+	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.Lone_msb:
+	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Ltwo_lsb:
+	.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.Lone_lsb:
+	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.asciz	"AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align	64
+___
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___
+.extern	__imp_RtlVirtualUnwind
+.type	gcm_se_handler,\@abi-omnipotent
+.align	16
+gcm_se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	mov	120($context),%rax	# pull context->Rax
+
+	mov	-48(%rax),%r15
+	mov	-40(%rax),%r14
+	mov	-32(%rax),%r13
+	mov	-24(%rax),%r12
+	mov	-16(%rax),%rbp
+	mov	-8(%rax),%rbx
+	mov	%r15,240($context)
+	mov	%r14,232($context)
+	mov	%r13,224($context)
+	mov	%r12,216($context)
+	mov	%rbp,160($context)
+	mov	%rbx,144($context)
+
+	lea	-0xd8(%rax),%rsi	# %xmm save area
+	lea	512($context),%rdi	# & context.Xmm6
+	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
+	.long	0xa548f3fc		# cld; rep movsq
+
+.Lcommon_seh_tail:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	gcm_se_handler,.-gcm_se_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_aesni_gcm_decrypt
+	.rva	.LSEH_end_aesni_gcm_decrypt
+	.rva	.LSEH_gcm_dec_info
+
+	.rva	.LSEH_begin_aesni_gcm_encrypt
+	.rva	.LSEH_end_aesni_gcm_encrypt
+	.rva	.LSEH_gcm_enc_info
+.section	.xdata
+.align	8
+.LSEH_gcm_dec_info:
+	.byte	9,0,0,0
+	.rva	gcm_se_handler
+	.rva	.Lgcm_dec_body,.Lgcm_dec_abort
+.LSEH_gcm_enc_info:
+	.byte	9,0,0,0
+	.rva	gcm_se_handler
+	.rva	.Lgcm_enc_body,.Lgcm_enc_abort
+___
+}
+}}} else {{{
+$code=<<___;	# assembler is too old
+.text
+
+.globl	aesni_gcm_encrypt
+.type	aesni_gcm_encrypt,\@abi-omnipotent
+aesni_gcm_encrypt:
+	xor	%eax,%eax
+	ret
+.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
+
+.globl	aesni_gcm_decrypt
+.type	aesni_gcm_decrypt,\@abi-omnipotent
+aesni_gcm_decrypt:
+	xor	%eax,%eax
+	ret
+.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
+___
+}}}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/openssl/crypto/modes/asm/ghash-armv4.pl b/openssl/crypto/modes/asm/ghash-armv4.pl
index d91586ee2..77fbf3446 100644
--- a/openssl/crypto/modes/asm/ghash-armv4.pl
+++ b/openssl/crypto/modes/asm/ghash-armv4.pl
@@ -35,6 +35,20 @@
 # Add NEON implementation featuring polynomial multiplication, i.e. no
 # lookup tables involved. On Cortex A8 it was measured to process one
 # byte in 15 cycles or 55% faster than integer-only code.
+#
+# April 2014
+#
+# Switch to multiplication algorithm suggested in paper referred
+# below and combine it with reduction algorithm from x86 module.
+# Performance improvement over previous version varies from 65% on
+# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
+# processes one byte in 8.45 cycles, A9 - in 10.2, Snapdragon S4 -
+# in 9.33.
+#
+# C�mara, D.; Gouv�a, C. P. L.; L�pez, J. & Dahab, R.: Fast Software
+# Polynomial Multiplication on ARM Processors using the NEON Engine.
+# 
+# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
 
 # ====================================================================
 # Note about "528B" variant. In ARM case it makes lesser sense to
@@ -303,117 +317,161 @@ $code.=<<___;
 .size	gcm_gmult_4bit,.-gcm_gmult_4bit
 ___
 {
-my $cnt=$Htbl;	# $Htbl is used once in the very beginning
-
-my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
-my ($Qhi, $Qlo, $Z,  $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
-
-# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
-# in Zo. Or should I say "top bit", because GHASH is specified in
-# reverse bit order? Otherwise straightforward 128-bt H by one input
-# byte multiplication and modulo-reduction, times 16.
+my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
+my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
+my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));
 
-sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
-sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
-sub Q()     { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
+sub clmul64x64 {
+my ($r,$a,$b)=@_;
+$code.=<<___;
+	vext.8		$t0#lo, $a, $a, #1	@ A1
+	vmull.p8	$t0, $t0#lo, $b		@ F = A1*B
+	vext.8		$r#lo, $b, $b, #1	@ B1
+	vmull.p8	$r, $a, $r#lo		@ E = A*B1
+	vext.8		$t1#lo, $a, $a, #2	@ A2
+	vmull.p8	$t1, $t1#lo, $b		@ H = A2*B
+	vext.8		$t3#lo, $b, $b, #2	@ B2
+	vmull.p8	$t3, $a, $t3#lo		@ G = A*B2
+	vext.8		$t2#lo, $a, $a, #3	@ A3
+	veor		$t0, $t0, $r		@ L = E + F
+	vmull.p8	$t2, $t2#lo, $b		@ J = A3*B
+	vext.8		$r#lo, $b, $b, #3	@ B3
+	veor		$t1, $t1, $t3		@ M = G + H
+	vmull.p8	$r, $a, $r#lo		@ I = A*B3
+	veor		$t0#lo, $t0#lo, $t0#hi	@ t0 = (L) (P0 + P1) << 8
+	vand		$t0#hi, $t0#hi, $k48
+	vext.8		$t3#lo, $b, $b, #4	@ B4
+	veor		$t1#lo, $t1#lo, $t1#hi	@ t1 = (M) (P2 + P3) << 16
+	vand		$t1#hi, $t1#hi, $k32
+	vmull.p8	$t3, $a, $t3#lo		@ K = A*B4
+	veor		$t2, $t2, $r		@ N = I + J
+	veor		$t0#lo, $t0#lo, $t0#hi
+	veor		$t1#lo, $t1#lo, $t1#hi
+	veor		$t2#lo, $t2#lo, $t2#hi	@ t2 = (N) (P4 + P5) << 24
+	vand		$t2#hi, $t2#hi, $k16
+	vext.8		$t0, $t0, $t0, #15
+	veor		$t3#lo, $t3#lo, $t3#hi	@ t3 = (K) (P6 + P7) << 32
+	vmov.i64	$t3#hi, #0
+	vext.8		$t1, $t1, $t1, #14
+	veor		$t2#lo, $t2#lo, $t2#hi
+	vmull.p8	$r, $a, $b		@ D = A*B
+	vext.8		$t3, $t3, $t3, #12
+	vext.8		$t2, $t2, $t2, #13
+	veor		$t0, $t0, $t1
+	veor		$t2, $t2, $t3
+	veor		$r, $r, $t0
+	veor		$r, $r, $t2
+___
+}
 
 $code.=<<___;
-#if __ARM_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
 .fpu	neon
 
+.global	gcm_init_neon
+.type	gcm_init_neon,%function
+.align	4
+gcm_init_neon:
+	vld1.64		$IN#hi,[r1,:64]!	@ load H
+	vmov.i8		$t0,#0xe1
+	vld1.64		$IN#lo,[r1,:64]
+	vshl.i64	$t0#hi,#57
+	vshr.u64	$t0#lo,#63		@ t0=0xc2....01
+	vdup.8		$t1,$IN#hi[7]
+	vshr.u64	$Hlo,$IN#lo,#63
+	vshr.s8		$t1,#7			@ broadcast carry bit
+	vshl.i64	$IN,$IN,#1
+	vand		$t0,$t0,$t1
+	vorr		$IN#hi,$Hlo		@ H<<<=1
+	veor		$IN,$IN,$t0		@ twisted H
+	vstmia		r0,{$IN}
+
+	ret					@ bx lr
+.size	gcm_init_neon,.-gcm_init_neon
+
 .global	gcm_gmult_neon
 .type	gcm_gmult_neon,%function
 .align	4
 gcm_gmult_neon:
-	sub		$Htbl,#16		@ point at H in GCM128_CTX
-	vld1.64		`&Dhi("$IN")`,[$Xi,:64]!@ load Xi
-	vmov.i32	$mod,#0xe1		@ our irreducible polynomial
-	vld1.64		`&Dlo("$IN")`,[$Xi,:64]!
-	vshr.u64	$mod,#32
-	vldmia		$Htbl,{$Hhi-$Hlo}	@ load H
-	veor		$zero,$zero
+	vld1.64		$IN#hi,[$Xi,:64]!	@ load Xi
+	vld1.64		$IN#lo,[$Xi,:64]!
+	vmov.i64	$k48,#0x0000ffffffffffff
+	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H
+	vmov.i64	$k32,#0x00000000ffffffff
 #ifdef __ARMEL__
 	vrev64.8	$IN,$IN
 #endif
-	veor		$Qpost,$Qpost
-	veor		$R,$R
-	mov		$cnt,#16
-	veor		$Z,$Z
+	vmov.i64	$k16,#0x000000000000ffff
+	veor		$Hhl,$Hlo,$Hhi		@ Karatsuba pre-processing
 	mov		$len,#16
-	veor		$Zo,$Zo
-	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
-	b		.Linner_neon
+	b		.Lgmult_neon
 .size	gcm_gmult_neon,.-gcm_gmult_neon
 
 .global	gcm_ghash_neon
 .type	gcm_ghash_neon,%function
 .align	4
 gcm_ghash_neon:
-	vld1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ load Xi
-	vmov.i32	$mod,#0xe1		@ our irreducible polynomial
-	vld1.64		`&Dlo("$Z")`,[$Xi,:64]!
-	vshr.u64	$mod,#32
-	vldmia		$Xi,{$Hhi-$Hlo}		@ load H
-	veor		$zero,$zero
-	nop
+	vld1.64		$Xl#hi,[$Xi,:64]!	@ load Xi
+	vld1.64		$Xl#lo,[$Xi,:64]!
+	vmov.i64	$k48,#0x0000ffffffffffff
+	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H
+	vmov.i64	$k32,#0x00000000ffffffff
 #ifdef __ARMEL__
-	vrev64.8	$Z,$Z
+	vrev64.8	$Xl,$Xl
 #endif
-.Louter_neon:
-	vld1.64		`&Dhi($IN)`,[$inp]!	@ load inp
-	veor		$Qpost,$Qpost
-	vld1.64		`&Dlo($IN)`,[$inp]!
-	veor		$R,$R
-	mov		$cnt,#16
+	vmov.i64	$k16,#0x000000000000ffff
+	veor		$Hhl,$Hlo,$Hhi		@ Karatsuba pre-processing
+
+.Loop_neon:
+	vld1.64		$IN#hi,[$inp]!		@ load inp
+	vld1.64		$IN#lo,[$inp]!
 #ifdef __ARMEL__
 	vrev64.8	$IN,$IN
 #endif
-	veor		$Zo,$Zo
-	veor		$IN,$Z			@ inp^=Xi
-	veor		$Z,$Z
-	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
-.Linner_neon:
-	subs		$cnt,$cnt,#1
-	vmull.p8	$Qlo,$Hlo,$xi		@ H.lo�Xi[i]
-	vmull.p8	$Qhi,$Hhi,$xi		@ H.hi�Xi[i]
-	vext.8		$IN,$zero,#1		@ IN>>=8
-
-	veor		$Z,$Qpost		@ modulo-scheduled part
-	vshl.i64	`&Dlo("$R")`,#48
-	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
-	veor		$T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
-
-	veor		`&Dhi("$Z")`,`&Dlo("$R")`
-	vuzp.8		$Qlo,$Qhi
-	vsli.8		$Zo,$T,#1		@ compose the "carry" byte
-	vext.8		$Z,$zero,#1		@ Z>>=8
-
-	vmull.p8	$R,$Zo,$mod		@ "carry"�0xe1
-	vshr.u8		$Zo,$T,#7		@ save Z's bottom bit
-	vext.8		$Qpost,$Qlo,$zero,#1	@ Qlo>>=8
-	veor		$Z,$Qhi
-	bne		.Linner_neon
-
-	veor		$Z,$Qpost		@ modulo-scheduled artefact
-	vshl.i64	`&Dlo("$R")`,#48
-	veor		`&Dhi("$Z")`,`&Dlo("$R")`
-
-	@ finalization, normalize Z:Zo
-	vand		$Zo,$mod		@ suffices to mask the bit
-	vshr.u64	`&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
-	vshl.i64	$Z,#1
+	veor		$IN,$Xl			@ inp^=Xi
+.Lgmult_neon:
+___
+	&clmul64x64	($Xl,$Hlo,"$IN#lo");	# H.lo�Xi.lo
+$code.=<<___;
+	veor		$IN#lo,$IN#lo,$IN#hi	@ Karatsuba pre-processing
+___
+	&clmul64x64	($Xm,$Hhl,"$IN#lo");	# (H.lo+H.hi)�(Xi.lo+Xi.hi)
+	&clmul64x64	($Xh,$Hhi,"$IN#hi");	# H.hi�Xi.hi
+$code.=<<___;
+	veor		$Xm,$Xm,$Xl		@ Karatsuba post-processing
+	veor		$Xm,$Xm,$Xh
+	veor		$Xl#hi,$Xl#hi,$Xm#lo
+	veor		$Xh#lo,$Xh#lo,$Xm#hi	@ Xh|Xl - 256-bit result
+
+	@ equivalent of reduction_avx from ghash-x86_64.pl
+	vshl.i64	$t1,$Xl,#57		@ 1st phase
+	vshl.i64	$t2,$Xl,#62
+	veor		$t2,$t2,$t1		@
+	vshl.i64	$t1,$Xl,#63
+	veor		$t2, $t2, $t1		@
+ 	veor		$Xl#hi,$Xl#hi,$t2#lo	@
+	veor		$Xh#lo,$Xh#lo,$t2#hi
+
+	vshr.u64	$t2,$Xl,#1		@ 2nd phase
+	veor		$Xh,$Xh,$Xl
+	veor		$Xl,$Xl,$t2		@
+	vshr.u64	$t2,$t2,#6
+	vshr.u64	$Xl,$Xl,#1		@
+	veor		$Xl,$Xl,$Xh		@
+	veor		$Xl,$Xl,$t2		@
+
 	subs		$len,#16
-	vorr		$Z,`&Q("$Zo")`		@ Z=Z:Zo<<1
-	bne		.Louter_neon
+	bne		.Loop_neon
 
 #ifdef __ARMEL__
-	vrev64.8	$Z,$Z
+	vrev64.8	$Xl,$Xl
 #endif
 	sub		$Xi,#16	
-	vst1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ write out Xi
-	vst1.64		`&Dlo("$Z")`,[$Xi,:64]
+	vst1.64		$Xl#hi,[$Xi,:64]!	@ write out Xi
+	vst1.64		$Xl#lo,[$Xi,:64]
 
-	bx	lr
+	ret					@ bx lr
 .size	gcm_ghash_neon,.-gcm_ghash_neon
 #endif
 ___
@@ -423,7 +481,13 @@ $code.=<<___;
 .align  2
 ___
 
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
-print $code;
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/geo;
+
+	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
+	s/\bret\b/bx	lr/go		or
+	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;    # make it possible to compile with -march=armv4
+
+	print $_,"\n";
+}
 close STDOUT; # enforce flush
diff --git a/openssl/crypto/modes/asm/ghash-s390x.pl b/openssl/crypto/modes/asm/ghash-s390x.pl
index 6a40d5d89..39096b423 100644
--- a/openssl/crypto/modes/asm/ghash-s390x.pl
+++ b/openssl/crypto/modes/asm/ghash-s390x.pl
@@ -186,13 +186,13 @@ $code.=<<___;
 	sllg	$rem1,$Zlo,3
 	xgr	$Zlo,$tmp
 	ngr	$rem1,$x78
+	sllg	$tmp,$Zhi,60
 	j	.Lghash_inner
 .align	16
 .Lghash_inner:
 	srlg	$Zlo,$Zlo,4
-	sllg	$tmp,$Zhi,60
-	xg	$Zlo,8($nlo,$Htbl)
 	srlg	$Zhi,$Zhi,4
+	xg	$Zlo,8($nlo,$Htbl)
 	llgc	$xi,0($cnt,$Xi)
 	xg	$Zhi,0($nlo,$Htbl)
 	sllg	$nlo,$xi,4
@@ -213,9 +213,9 @@ $code.=<<___;
 	sllg	$rem1,$Zlo,3
 	xgr	$Zlo,$tmp
 	ngr	$rem1,$x78
+	sllg	$tmp,$Zhi,60
 	brct	$cnt,.Lghash_inner
 
-	sllg	$tmp,$Zhi,60
 	srlg	$Zlo,$Zlo,4
 	srlg	$Zhi,$Zhi,4
 	xg	$Zlo,8($nlo,$Htbl)
diff --git a/openssl/crypto/modes/asm/ghash-sparcv9.pl b/openssl/crypto/modes/asm/ghash-sparcv9.pl
index 70e7b044a..0365e0f1f 100644
--- a/openssl/crypto/modes/asm/ghash-sparcv9.pl
+++ b/openssl/crypto/modes/asm/ghash-sparcv9.pl
@@ -36,6 +36,15 @@
 # references to input data and Z.hi updates to achieve 12 cycles
 # timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
 # cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
+#
+# October 2012
+#
+# Add VIS3 lookup-table-free implementation using polynomial
+# multiplication xmulx[hi] and extended addition addxc[cc]
+# instructions. 4.52/7.63x improvement on T3/T4 or in absolute
+# terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark
+# saturates at ~15.5x single-process result on 8-core processor,
+# or ~20.5GBps per 2.85GHz socket.
 
 $bits=32;
 for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
@@ -66,6 +75,10 @@ $Htbl="%i1";
 $inp="%i2";
 $len="%i3";
 
+$code.=<<___ if ($bits==64);
+.register	%g2,#scratch
+.register	%g3,#scratch
+___
 $code.=<<___;
 .section	".text",#alloc,#execinstr
 
@@ -321,10 +334,238 @@ gcm_gmult_4bit:
 	restore
 .type	gcm_gmult_4bit,#function
 .size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
-.asciz	"GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+{{{
+# Straightforward 128x128-bit multiplication using Karatsuba algorithm
+# followed by pair of 64-bit reductions [with a shortcut in first one,
+# which allowed to break dependency between reductions and remove one
+# multiplication from critical path]. While it might be suboptimal
+# with regard to sheer number of multiplications, other methods [such
+# as aggregate reduction] would require more 64-bit registers, which
+# we don't have in 32-bit application context.
+
+($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));
+
+($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
+	(map("%o$_",(0..5,7)),map("%g$_",(1..5)));
+
+($shl,$shr)=map("%l$_",(0..7));
+
+# For details regarding "twisted H" see ghash-x86.pl.
+$code.=<<___;
+.globl	gcm_init_vis3
+.align	32
+gcm_init_vis3:
+	save	%sp,-$frame,%sp
+
+	ldx	[%i1+0],$Hhi
+	ldx	[%i1+8],$Hlo
+	mov	0xE1,$Xhi
+	mov	1,$Xlo
+	sllx	$Xhi,57,$Xhi
+	srax	$Hhi,63,$C0		! broadcast carry
+	addcc	$Hlo,$Hlo,$Hlo		! H<<=1
+	addxc	$Hhi,$Hhi,$Hhi
+	and	$C0,$Xlo,$Xlo
+	and	$C0,$Xhi,$Xhi
+	xor	$Xlo,$Hlo,$Hlo
+	xor	$Xhi,$Hhi,$Hhi
+	stx	$Hlo,[%i0+8]		! save twisted H
+	stx	$Hhi,[%i0+0]
+
+	sethi	%hi(0xA0406080),$V
+	sethi	%hi(0x20C0E000),%l0
+	or	$V,%lo(0xA0406080),$V
+	or	%l0,%lo(0x20C0E000),%l0
+	sllx	$V,32,$V
+	or	%l0,$V,$V		! (0xE0�i)&0xff=0xA040608020C0E000
+	stx	$V,[%i0+16]
+
+	ret
+	restore
+.type	gcm_init_vis3,#function
+.size	gcm_init_vis3,.-gcm_init_vis3
+
+.globl	gcm_gmult_vis3
+.align	32
+gcm_gmult_vis3:
+	save	%sp,-$frame,%sp
+
+	ldx	[$Xip+8],$Xlo		! load Xi
+	ldx	[$Xip+0],$Xhi
+	ldx	[$Htable+8],$Hlo	! load twisted H
+	ldx	[$Htable+0],$Hhi
+
+	mov	0xE1,%l7
+	sllx	%l7,57,$xE1		! 57 is not a typo
+	ldx	[$Htable+16],$V		! (0xE0�i)&0xff=0xA040608020C0E000
+
+	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
+	xmulx	$Xlo,$Hlo,$C0
+	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
+	xmulx	$C2,$Hhl,$C1
+	xmulxhi	$Xlo,$Hlo,$Xlo
+	xmulxhi	$C2,$Hhl,$C2
+	xmulxhi	$Xhi,$Hhi,$C3
+	xmulx	$Xhi,$Hhi,$Xhi
+
+	sll	$C0,3,$sqr
+	srlx	$V,$sqr,$sqr		! �0xE0 [implicit &(7<<3)]
+	xor	$C0,$sqr,$sqr
+	sllx	$sqr,57,$sqr		! ($C0�0xE1)<<1<<56 [implicit &0x7f]
+
+	xor	$C0,$C1,$C1		! Karatsuba post-processing
+	xor	$Xlo,$C2,$C2
+	 xor	$sqr,$Xlo,$Xlo		! real destination is $C1
+	xor	$C3,$C2,$C2
+	xor	$Xlo,$C1,$C1
+	xor	$Xhi,$C2,$C2
+	xor	$Xhi,$C1,$C1
+
+	xmulxhi	$C0,$xE1,$Xlo		! �0xE1<<1<<56
+	 xor	$C0,$C2,$C2
+	xmulx	$C1,$xE1,$C0
+	 xor	$C1,$C3,$C3
+	xmulxhi	$C1,$xE1,$C1
+
+	xor	$Xlo,$C2,$C2
+	xor	$C0,$C2,$C2
+	xor	$C1,$C3,$C3
+
+	stx	$C2,[$Xip+8]		! save Xi
+	stx	$C3,[$Xip+0]
+
+	ret
+	restore
+.type	gcm_gmult_vis3,#function
+.size	gcm_gmult_vis3,.-gcm_gmult_vis3
+
+.globl	gcm_ghash_vis3
+.align	32
+gcm_ghash_vis3:
+	save	%sp,-$frame,%sp
+
+	ldx	[$Xip+8],$C2		! load Xi
+	ldx	[$Xip+0],$C3
+	ldx	[$Htable+8],$Hlo	! load twisted H
+	ldx	[$Htable+0],$Hhi
+
+	mov	0xE1,%l7
+	sllx	%l7,57,$xE1		! 57 is not a typo
+	ldx	[$Htable+16],$V		! (0xE0�i)&0xff=0xA040608020C0E000
+
+	and	$inp,7,$shl
+	andn	$inp,7,$inp
+	sll	$shl,3,$shl
+	prefetch [$inp+63], 20
+	sub	%g0,$shl,$shr
+
+	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
+.Loop:
+	ldx	[$inp+8],$Xlo
+	brz,pt	$shl,1f
+	ldx	[$inp+0],$Xhi
+
+	ldx	[$inp+16],$C1		! align data
+	srlx	$Xlo,$shr,$C0
+	sllx	$Xlo,$shl,$Xlo
+	sllx	$Xhi,$shl,$Xhi
+	srlx	$C1,$shr,$C1
+	or	$C0,$Xhi,$Xhi
+	or	$C1,$Xlo,$Xlo
+1:
+	add	$inp,16,$inp
+	sub	$len,16,$len
+	xor	$C2,$Xlo,$Xlo
+	xor	$C3,$Xhi,$Xhi
+	prefetch [$inp+63], 20
+
+	xmulx	$Xlo,$Hlo,$C0
+	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
+	xmulx	$C2,$Hhl,$C1
+	xmulxhi	$Xlo,$Hlo,$Xlo
+	xmulxhi	$C2,$Hhl,$C2
+	xmulxhi	$Xhi,$Hhi,$C3
+	xmulx	$Xhi,$Hhi,$Xhi
+
+	sll	$C0,3,$sqr
+	srlx	$V,$sqr,$sqr		! �0xE0 [implicit &(7<<3)]
+	xor	$C0,$sqr,$sqr
+	sllx	$sqr,57,$sqr		! ($C0�0xE1)<<1<<56 [implicit &0x7f]
+
+	xor	$C0,$C1,$C1		! Karatsuba post-processing
+	xor	$Xlo,$C2,$C2
+	 xor	$sqr,$Xlo,$Xlo		! real destination is $C1
+	xor	$C3,$C2,$C2
+	xor	$Xlo,$C1,$C1
+	xor	$Xhi,$C2,$C2
+	xor	$Xhi,$C1,$C1
+
+	xmulxhi	$C0,$xE1,$Xlo		! �0xE1<<1<<56
+	 xor	$C0,$C2,$C2
+	xmulx	$C1,$xE1,$C0
+	 xor	$C1,$C3,$C3
+	xmulxhi	$C1,$xE1,$C1
+
+	xor	$Xlo,$C2,$C2
+	xor	$C0,$C2,$C2
+	brnz,pt	$len,.Loop
+	xor	$C1,$C3,$C3
+
+	stx	$C2,[$Xip+8]		! save Xi
+	stx	$C3,[$Xip+0]
+
+	ret
+	restore
+.type	gcm_ghash_vis3,#function
+.size	gcm_ghash_vis3,.-gcm_ghash_vis3
+___
+}}}
+$code.=<<___;
+.asciz	"GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
 .align	4
 ___
 
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-print $code;
+
+# Purpose of these subroutines is to explicitly encode VIS instructions,
+# so that one can compile the module without having to specify VIS
+# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# Idea is to reserve for option to produce "universal" binary and let
+# programmer detect if current CPU is VIS capable at run-time.
+sub unvis3 {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
+my ($ref,$opf);
+my %visopf = (	"addxc"		=> 0x011,
+		"addxccc"	=> 0x013,
+		"xmulx"		=> 0x115,
+		"xmulxhi"	=> 0x116	);
+
+    $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+    if ($opf=$visopf{$mnemonic}) {
+	foreach ($rs1,$rs2,$rd) {
+	    return $ref if (!/%([goli])([0-9])/);
+	    $_=$bias{$1}+$2;
+	}
+
+	return	sprintf ".word\t0x%08x !%s",
+			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+			$ref;
+    } else {
+	return $ref;
+    }
+}
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/ge;
+
+	s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
+		&unvis3($1,$2,$3,$4)
+	 /ge;
+
+	print $_,"\n";
+}
+
 close STDOUT;
diff --git a/openssl/crypto/modes/asm/ghash-x86.pl b/openssl/crypto/modes/asm/ghash-x86.pl
index 83c727e07..23a5527b3 100644
--- a/openssl/crypto/modes/asm/ghash-x86.pl
+++ b/openssl/crypto/modes/asm/ghash-x86.pl
@@ -12,25 +12,27 @@
 # The module implements "4-bit" GCM GHASH function and underlying
 # single multiplication operation in GF(2^128). "4-bit" means that it
 # uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
-# code paths: vanilla x86 and vanilla MMX. Former will be executed on
-# 486 and Pentium, latter on all others. MMX GHASH features so called
+# code paths: vanilla x86 and vanilla SSE. Former will be executed on
+# 486 and Pentium, latter on all others. SSE GHASH features so called
 # "528B" variant of "4-bit" method utilizing additional 256+16 bytes
 # of per-key storage [+512 bytes shared table]. Performance results
 # are for streamed GHASH subroutine and are expressed in cycles per
 # processed byte, less is better:
 #
-#		gcc 2.95.3(*)	MMX assembler	x86 assembler
+#		gcc 2.95.3(*)	SSE assembler	x86 assembler
 #
 # Pentium	105/111(**)	-		50
 # PIII		68 /75		12.2		24
 # P4		125/125		17.8		84(***)
 # Opteron	66 /70		10.1		30
 # Core2		54 /67		8.4		18
+# Atom		105/105		16.8		53
+# VIA Nano	69 /71		13.0		27
 #
 # (*)	gcc 3.4.x was observed to generate few percent slower code,
 #	which is one of reasons why 2.95.3 results were chosen,
 #	another reason is lack of 3.4.x results for older CPUs;
-#	comparison with MMX results is not completely fair, because C
+#	comparison with SSE results is not completely fair, because C
 #	results are for vanilla "256B" implementation, while
 #	assembler results are for "528B";-)
 # (**)	second number is result for code compiled with -fPIC flag,
@@ -40,8 +42,8 @@
 #
 # To summarize, it's >2-5 times faster than gcc-generated code. To
 # anchor it to something else SHA1 assembler processes one byte in
-# 11-13 cycles on contemporary x86 cores. As for choice of MMX in
-# particular, see comment at the end of the file...
+# ~7 cycles on contemporary x86 cores. As for choice of MMX/SSE
+# in particular, see comment at the end of the file...
 
 # May 2010
 #
@@ -113,6 +115,16 @@
 # similar manner resulted in almost 20% degradation on Sandy Bridge,
 # where original 64-bit code processes one byte in 1.95 cycles.
 
+#####################################################################
+# For reference, AMD Bulldozer processes one byte in 1.98 cycles in
+# 32-bit mode and 1.89 in 64-bit.
+
+# February 2013
+#
+# Overhaul: aggregate Karatsuba post-processing, improve ILP in
+# reduction_alg9. Resulting performance is 1.96 cycles per byte on
+# Westmere, 1.95 - on Sandy/Ivy Bridge, 1.76 - on Bulldozer.
+
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
@@ -822,17 +834,18 @@ $len="ebx";
 &static_label("bswap");
 
 sub clmul64x64_T2 {	# minimal "register" pressure
-my ($Xhi,$Xi,$Hkey)=@_;
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
 
 	&movdqa		($Xhi,$Xi);		#
 	&pshufd		($T1,$Xi,0b01001110);
-	&pshufd		($T2,$Hkey,0b01001110);
+	&pshufd		($T2,$Hkey,0b01001110)	if (!defined($HK));
 	&pxor		($T1,$Xi);		#
-	&pxor		($T2,$Hkey);
+	&pxor		($T2,$Hkey)		if (!defined($HK));
+			$HK=$T2			if (!defined($HK));
 
 	&pclmulqdq	($Xi,$Hkey,0x00);	#######
 	&pclmulqdq	($Xhi,$Hkey,0x11);	#######
-	&pclmulqdq	($T1,$T2,0x00);		#######
+	&pclmulqdq	($T1,$HK,0x00);		#######
 	&xorps		($T1,$Xi);		#
 	&xorps		($T1,$Xhi);		#
 
@@ -879,31 +892,32 @@ if (1) {		# Algorithm 9 with <<1 twist.
 			# below. Algorithm 9 was therefore chosen for
 			# further optimization...
 
-sub reduction_alg9 {	# 17/13 times faster than Intel version
+sub reduction_alg9 {	# 17/11 times faster than Intel version
 my ($Xhi,$Xi) = @_;
 
 	# 1st phase
-	&movdqa		($T1,$Xi);		#
+	&movdqa		($T2,$Xi);		#
+	&movdqa		($T1,$Xi);
+	&psllq		($Xi,5);
+	&pxor		($T1,$Xi);		#
 	&psllq		($Xi,1);
 	&pxor		($Xi,$T1);		#
-	&psllq		($Xi,5);		#
-	&pxor		($Xi,$T1);		#
 	&psllq		($Xi,57);		#
-	&movdqa		($T2,$Xi);		#
+	&movdqa		($T1,$Xi);		#
 	&pslldq		($Xi,8);
-	&psrldq		($T2,8);		#
-	&pxor		($Xi,$T1);
-	&pxor		($Xhi,$T2);		#
+	&psrldq		($T1,8);		#	
+	&pxor		($Xi,$T2);
+	&pxor		($Xhi,$T1);		#
 
 	# 2nd phase
 	&movdqa		($T2,$Xi);
+	&psrlq		($Xi,1);
+	&pxor		($Xhi,$T2);		#
+	&pxor		($T2,$Xi);
 	&psrlq		($Xi,5);
 	&pxor		($Xi,$T2);		#
 	&psrlq		($Xi,1);		#
-	&pxor		($Xi,$T2);		#
-	&pxor		($T2,$Xhi);
-	&psrlq		($Xi,1);		#
-	&pxor		($Xi,$T2);		#
+	&pxor		($Xi,$Xhi)		#
 }
 
 &function_begin_B("gcm_init_clmul");
@@ -937,8 +951,14 @@ my ($Xhi,$Xi) = @_;
 	&clmul64x64_T2	($Xhi,$Xi,$Hkey);
 	&reduction_alg9	($Xhi,$Xi);
 
+	&pshufd		($T1,$Hkey,0b01001110);
+	&pshufd		($T2,$Xi,0b01001110);
+	&pxor		($T1,$Hkey);		# Karatsuba pre-processing
 	&movdqu		(&QWP(0,$Htbl),$Hkey);	# save H
+	&pxor		($T2,$Xi);		# Karatsuba pre-processing
 	&movdqu		(&QWP(16,$Htbl),$Xi);	# save H^2
+	&palignr	($T2,$T1,8);		# low part is H.lo^H.hi
+	&movdqu		(&QWP(32,$Htbl),$T2);	# save Karatsuba "salt"
 
 	&ret		();
 &function_end_B("gcm_init_clmul");
@@ -956,8 +976,9 @@ my ($Xhi,$Xi) = @_;
 	&movdqa		($T3,&QWP(0,$const));
 	&movups		($Hkey,&QWP(0,$Htbl));
 	&pshufb		($Xi,$T3);
+	&movups		($T2,&QWP(32,$Htbl));
 
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey);
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$T2);
 	&reduction_alg9	($Xhi,$Xi);
 
 	&pshufb		($Xi,$T3);
@@ -994,79 +1015,109 @@ my ($Xhi,$Xi) = @_;
 	&movdqu		($Xn,&QWP(16,$inp));	# Ii+1
 	&pshufb		($T1,$T3);
 	&pshufb		($Xn,$T3);
+	&movdqu		($T3,&QWP(32,$Htbl));
 	&pxor		($Xi,$T1);		# Ii+Xi
 
-	&clmul64x64_T2	($Xhn,$Xn,$Hkey);	# H*Ii+1
+	&pshufd		($T1,$Xn,0b01001110);	# H*Ii+1
+	&movdqa		($Xhn,$Xn);
+	&pxor		($T1,$Xn);		#
+	&lea		($inp,&DWP(32,$inp));	# i+=2
+
+	&pclmulqdq	($Xn,$Hkey,0x00);	#######
+	&pclmulqdq	($Xhn,$Hkey,0x11);	#######
+	&pclmulqdq	($T1,$T3,0x00);		#######
 	&movups		($Hkey,&QWP(16,$Htbl));	# load H^2
+	&nop		();
 
-	&lea		($inp,&DWP(32,$inp));	# i+=2
 	&sub		($len,0x20);
 	&jbe		(&label("even_tail"));
+	&jmp		(&label("mod_loop"));
 
-&set_label("mod_loop");
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H^2*(Ii+Xi)
-	&movdqu		($T1,&QWP(0,$inp));	# Ii
-	&movups		($Hkey,&QWP(0,$Htbl));	# load H
+&set_label("mod_loop",32);
+	&pshufd		($T2,$Xi,0b01001110);	# H^2*(Ii+Xi)
+	&movdqa		($Xhi,$Xi);
+	&pxor		($T2,$Xi);		#
+	&nop		();
 
-	&pxor		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
-	&pxor		($Xhi,$Xhn);
+	&pclmulqdq	($Xi,$Hkey,0x00);	#######
+	&pclmulqdq	($Xhi,$Hkey,0x11);	#######
+	&pclmulqdq	($T2,$T3,0x10);		#######
+	&movups		($Hkey,&QWP(0,$Htbl));	# load H
 
-	&movdqu		($Xn,&QWP(16,$inp));	# Ii+1
-	&pshufb		($T1,$T3);
-	&pshufb		($Xn,$T3);
+	&xorps		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
+	&movdqa		($T3,&QWP(0,$const));
+	&xorps		($Xhi,$Xhn);
+	 &movdqu	($Xhn,&QWP(0,$inp));	# Ii
+	&pxor		($T1,$Xi);		# aggregated Karatsuba post-processing
+	 &movdqu	($Xn,&QWP(16,$inp));	# Ii+1
+	&pxor		($T1,$Xhi);		#
 
-	&movdqa		($T3,$Xn);		#&clmul64x64_TX	($Xhn,$Xn,$Hkey); H*Ii+1
-	&movdqa		($Xhn,$Xn);
-	 &pxor		($Xhi,$T1);		# "Ii+Xi", consume early
+	 &pshufb	($Xhn,$T3);
+	&pxor		($T2,$T1);		#
 
-	  &movdqa	($T1,$Xi);		#&reduction_alg9($Xhi,$Xi); 1st phase
+	&movdqa		($T1,$T2);		#
+	&psrldq		($T2,8);
+	&pslldq		($T1,8);		#
+	&pxor		($Xhi,$T2);
+	&pxor		($Xi,$T1);		#
+	 &pshufb	($Xn,$T3);
+	 &pxor		($Xhi,$Xhn);		# "Ii+Xi", consume early
+
+	&movdqa		($Xhn,$Xn);		#&clmul64x64_TX	($Xhn,$Xn,$Hkey); H*Ii+1
+	  &movdqa	($T2,$Xi);		#&reduction_alg9($Xhi,$Xi); 1st phase
+	  &movdqa	($T1,$Xi);
+	  &psllq	($Xi,5);
+	  &pxor		($T1,$Xi);		#
 	  &psllq	($Xi,1);
 	  &pxor		($Xi,$T1);		#
-	  &psllq	($Xi,5);		#
-	  &pxor		($Xi,$T1);		#
 	&pclmulqdq	($Xn,$Hkey,0x00);	#######
+	&movups		($T3,&QWP(32,$Htbl));
 	  &psllq	($Xi,57);		#
-	  &movdqa	($T2,$Xi);		#
+	  &movdqa	($T1,$Xi);		#
 	  &pslldq	($Xi,8);
-	  &psrldq	($T2,8);		#	
-	  &pxor		($Xi,$T1);
-	&pshufd		($T1,$T3,0b01001110);
+	  &psrldq	($T1,8);		#	
+	  &pxor		($Xi,$T2);
+	  &pxor		($Xhi,$T1);		#
+	&pshufd		($T1,$Xhn,0b01001110);
+	  &movdqa	($T2,$Xi);		# 2nd phase
+	  &psrlq	($Xi,1);
+	&pxor		($T1,$Xhn);
 	  &pxor		($Xhi,$T2);		#
-	&pxor		($T1,$T3);
-	&pshufd		($T3,$Hkey,0b01001110);
-	&pxor		($T3,$Hkey);		#
-
 	&pclmulqdq	($Xhn,$Hkey,0x11);	#######
-	  &movdqa	($T2,$Xi);		# 2nd phase
+	&movups		($Hkey,&QWP(16,$Htbl));	# load H^2
+	  &pxor		($T2,$Xi);
 	  &psrlq	($Xi,5);
 	  &pxor		($Xi,$T2);		#
 	  &psrlq	($Xi,1);		#
-	  &pxor		($Xi,$T2);		#
-	  &pxor		($T2,$Xhi);
-	  &psrlq	($Xi,1);		#
-	  &pxor		($Xi,$T2);		#
-
+	  &pxor		($Xi,$Xhi)		#
 	&pclmulqdq	($T1,$T3,0x00);		#######
-	&movups		($Hkey,&QWP(16,$Htbl));	# load H^2
-	&xorps		($T1,$Xn);		#
-	&xorps		($T1,$Xhn);		#
-
-	&movdqa		($T3,$T1);		#
-	&psrldq		($T1,8);
-	&pslldq		($T3,8);		#
-	&pxor		($Xhn,$T1);
-	&pxor		($Xn,$T3);		#
-	&movdqa		($T3,&QWP(0,$const));
 
 	&lea		($inp,&DWP(32,$inp));
 	&sub		($len,0x20);
 	&ja		(&label("mod_loop"));
 
 &set_label("even_tail");
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H^2*(Ii+Xi)
+	&pshufd		($T2,$Xi,0b01001110);	# H^2*(Ii+Xi)
+	&movdqa		($Xhi,$Xi);
+	&pxor		($T2,$Xi);		#
 
-	&pxor		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
-	&pxor		($Xhi,$Xhn);
+	&pclmulqdq	($Xi,$Hkey,0x00);	#######
+	&pclmulqdq	($Xhi,$Hkey,0x11);	#######
+	&pclmulqdq	($T2,$T3,0x10);		#######
+	&movdqa		($T3,&QWP(0,$const));
+
+	&xorps		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
+	&xorps		($Xhi,$Xhn);
+	&pxor		($T1,$Xi);		# aggregated Karatsuba post-processing
+	&pxor		($T1,$Xhi);		#
+
+	&pxor		($T2,$T1);		#
+
+	&movdqa		($T1,$T2);		#
+	&psrldq		($T2,8);
+	&pslldq		($T1,8);		#
+	&pxor		($Xhi,$T2);
+	&pxor		($Xi,$T1);		#
 
 	&reduction_alg9	($Xhi,$Xi);
 
@@ -1273,13 +1324,6 @@ my ($Xhi,$Xi)=@_;
 &set_label("bswap",64);
 	&data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
 	&data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2);	# 0x1c2_polynomial
-}}	# $sse2
-
-&set_label("rem_4bit",64);
-	&data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
-	&data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
-	&data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
-	&data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
 &set_label("rem_8bit",64);
 	&data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
 	&data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
@@ -1313,6 +1357,13 @@ my ($Xhi,$Xi)=@_;
 	&data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
 	&data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
 	&data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
+}}	# $sse2
+
+&set_label("rem_4bit",64);
+	&data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
+	&data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
+	&data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
+	&data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
 }}}	# !$x86only
 
 &asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
diff --git a/openssl/crypto/modes/asm/ghash-x86_64.pl b/openssl/crypto/modes/asm/ghash-x86_64.pl
index 38d779edb..6e656ca13 100644
--- a/openssl/crypto/modes/asm/ghash-x86_64.pl
+++ b/openssl/crypto/modes/asm/ghash-x86_64.pl
@@ -22,6 +22,8 @@
 # P4		28.6		14.0		+100%
 # Opteron	19.3		7.7		+150%
 # Core2		17.8		8.1(**)		+120%
+# Atom		31.6		16.8		+88%
+# VIA Nano	21.8		10.1		+115%
 #
 # (*)	comparison is not completely fair, because C results are
 #	for vanilla "256B" implementation, while assembler results
@@ -39,6 +41,44 @@
 # providing access to a Westmere-based system on behalf of Intel
 # Open Source Technology Centre.
 
+# December 2012
+#
+# Overhaul: aggregate Karatsuba post-processing, improve ILP in
+# reduction_alg9, increase reduction aggregate factor to 4x. As for
+# the latter. ghash-x86.pl discusses that it makes lesser sense to
+# increase aggregate factor. Then why increase here? Critical path
+# consists of 3 independent pclmulqdq instructions, Karatsuba post-
+# processing and reduction. "On top" of this we lay down aggregated
+# multiplication operations, triplets of independent pclmulqdq's. As
+# issue rate for pclmulqdq is limited, it makes lesser sense to
+# aggregate more multiplications than it takes to perform remaining
+# non-multiplication operations. 2x is near-optimal coefficient for
+# contemporary Intel CPUs (therefore modest improvement coefficient),
+# but not for Bulldozer. Latter is because logical SIMD operations
+# are twice as slow in comparison to Intel, so that critical path is
+# longer. A CPU with higher pclmulqdq issue rate would also benefit
+# from higher aggregate factor...
+#
+# Westmere	1.78(+13%)
+# Sandy Bridge	1.80(+8%)
+# Ivy Bridge	1.80(+7%)
+# Haswell	0.55(+93%) (if system doesn't support AVX)
+# Broadwell	0.45(+110%)(if system doesn't support AVX)
+# Bulldozer	1.49(+27%)
+# Silvermont	2.88(+13%)
+
+# March 2013
+#
+# ... 8x aggregate factor AVX code path is using reduction algorithm
+# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
+# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
+# sub-optimally in comparison to above mentioned version. But thanks
+# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
+# it performs in 0.41 cycles per byte on Haswell processor, and in
+# 0.29 on Broadwell.
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+
 $flavour = shift;
 $output  = shift;
 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -50,9 +90,30 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";
 
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+	$avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+	$avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+	$avx = ($1>=10) + ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+	$avx = ($2>=3.0) + ($2>3.0);
+}
+
 open OUT,"| \"$^X\" $xlate $flavour $output";
 *STDOUT=*OUT;
 
+$do4xaggr=1;
+
 # common register layout
 $nlo="%rax";
 $nhi="%rbx";
@@ -160,6 +221,7 @@ ___
 
 $code=<<___;
 .text
+.extern	OPENSSL_ia32cap_P
 
 .globl	gcm_gmult_4bit
 .type	gcm_gmult_4bit,\@function,2
@@ -352,19 +414,27 @@ ___
 ($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
 
 sub clmul64x64_T2 {	# minimal register pressure
-my ($Xhi,$Xi,$Hkey,$modulo)=@_;
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
 
-$code.=<<___ if (!defined($modulo));
+if (!defined($HK)) {	$HK = $T2;
+$code.=<<___;
 	movdqa		$Xi,$Xhi		#
 	pshufd		\$0b01001110,$Xi,$T1
 	pshufd		\$0b01001110,$Hkey,$T2
 	pxor		$Xi,$T1			#
 	pxor		$Hkey,$T2
 ___
+} else {
+$code.=<<___;
+	movdqa		$Xi,$Xhi		#
+	pshufd		\$0b01001110,$Xi,$T1
+	pxor		$Xi,$T1			#
+___
+}
 $code.=<<___;
 	pclmulqdq	\$0x00,$Hkey,$Xi	#######
 	pclmulqdq	\$0x11,$Hkey,$Xhi	#######
-	pclmulqdq	\$0x00,$T2,$T1		#######
+	pclmulqdq	\$0x00,$HK,$T1		#######
 	pxor		$Xi,$T1			#
 	pxor		$Xhi,$T1		#
 
@@ -376,42 +446,53 @@ $code.=<<___;
 ___
 }
 
-sub reduction_alg9 {	# 17/13 times faster than Intel version
+sub reduction_alg9 {	# 17/11 times faster than Intel version
 my ($Xhi,$Xi) = @_;
 
 $code.=<<___;
 	# 1st phase
-	movdqa		$Xi,$T1			#
+	movdqa		$Xi,$T2			#
+	movdqa		$Xi,$T1
+	psllq		\$5,$Xi
+	pxor		$Xi,$T1			#
 	psllq		\$1,$Xi
 	pxor		$T1,$Xi			#
-	psllq		\$5,$Xi			#
-	pxor		$T1,$Xi			#
 	psllq		\$57,$Xi		#
-	movdqa		$Xi,$T2			#
+	movdqa		$Xi,$T1			#
 	pslldq		\$8,$Xi
-	psrldq		\$8,$T2			#	
-	pxor		$T1,$Xi
-	pxor		$T2,$Xhi		#
+	psrldq		\$8,$T1			#	
+	pxor		$T2,$Xi
+	pxor		$T1,$Xhi		#
 
 	# 2nd phase
 	movdqa		$Xi,$T2
+	psrlq		\$1,$Xi
+	pxor		$T2,$Xhi		#
+	pxor		$Xi,$T2
 	psrlq		\$5,$Xi
 	pxor		$T2,$Xi			#
 	psrlq		\$1,$Xi			#
-	pxor		$T2,$Xi			#
-	pxor		$Xhi,$T2
-	psrlq		\$1,$Xi			#
-	pxor		$T2,$Xi			#
+	pxor		$Xhi,$Xi		#
 ___
 }
 
 { my ($Htbl,$Xip)=@_4args;
+  my $HK="%xmm6";
 
 $code.=<<___;
 .globl	gcm_init_clmul
 .type	gcm_init_clmul,\@abi-omnipotent
 .align	16
 gcm_init_clmul:
+.L_init_clmul:
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_gcm_init_clmul:
+	# I can't trust assembler to use specific encoding:-(
+	.byte	0x48,0x83,0xec,0x18		#sub	$0x18,%rsp
+	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
+___
+$code.=<<___;
 	movdqu		($Xip),$Hkey
 	pshufd		\$0b01001110,$Hkey,$Hkey	# dword swap
 
@@ -430,13 +511,47 @@ gcm_init_clmul:
 	pxor		$T3,$Hkey		# if(carry) H^=0x1c2_polynomial
 
 	# calculate H^2
+	pshufd		\$0b01001110,$Hkey,$HK
 	movdqa		$Hkey,$Xi
+	pxor		$Hkey,$HK
 ___
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey);
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);
 	&reduction_alg9	($Xhi,$Xi);
 $code.=<<___;
-	movdqu		$Hkey,($Htbl)		# save H
-	movdqu		$Xi,16($Htbl)		# save H^2
+	pshufd		\$0b01001110,$Hkey,$T1
+	pshufd		\$0b01001110,$Xi,$T2
+	pxor		$Hkey,$T1		# Karatsuba pre-processing
+	movdqu		$Hkey,0x00($Htbl)	# save H
+	pxor		$Xi,$T2			# Karatsuba pre-processing
+	movdqu		$Xi,0x10($Htbl)		# save H^2
+	palignr		\$8,$T1,$T2		# low part is H.lo^H.hi...
+	movdqu		$T2,0x20($Htbl)		# save Karatsuba "salt"
+___
+if ($do4xaggr) {
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^3
+	&reduction_alg9	($Xhi,$Xi);
+$code.=<<___;
+	movdqa		$Xi,$T3
+___
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^4
+	&reduction_alg9	($Xhi,$Xi);
+$code.=<<___;
+	pshufd		\$0b01001110,$T3,$T1
+	pshufd		\$0b01001110,$Xi,$T2
+	pxor		$T3,$T1			# Karatsuba pre-processing
+	movdqu		$T3,0x30($Htbl)		# save H^3
+	pxor		$Xi,$T2			# Karatsuba pre-processing
+	movdqu		$Xi,0x40($Htbl)		# save H^4
+	palignr		\$8,$T1,$T2		# low part is H^3.lo^H^3.hi...
+	movdqu		$T2,0x50($Htbl)		# save Karatsuba "salt"
+___
+}
+$code.=<<___ if ($win64);
+	movaps	(%rsp),%xmm6
+	lea	0x18(%rsp),%rsp
+.LSEH_end_gcm_init_clmul:
+___
+$code.=<<___;
 	ret
 .size	gcm_init_clmul,.-gcm_init_clmul
 ___
@@ -449,13 +564,38 @@ $code.=<<___;
 .type	gcm_gmult_clmul,\@abi-omnipotent
 .align	16
 gcm_gmult_clmul:
+.L_gmult_clmul:
 	movdqu		($Xip),$Xi
 	movdqa		.Lbswap_mask(%rip),$T3
 	movdqu		($Htbl),$Hkey
+	movdqu		0x20($Htbl),$T2
 	pshufb		$T3,$Xi
 ___
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey);
-	&reduction_alg9	($Xhi,$Xi);
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$T2);
+$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
+	# experimental alternative. special thing about is that there
+	# no dependency between the two multiplications... 
+	mov		\$`0xE1<<1`,%eax
+	mov		\$0xA040608020C0E000,%r10	# ((7..0)�0xE0)&0xff
+	mov		\$0x07,%r11d
+	movq		%rax,$T1
+	movq		%r10,$T2
+	movq		%r11,$T3		# borrow $T3
+	pand		$Xi,$T3
+	pshufb		$T3,$T2			# ($Xi&7)�0xE0
+	movq		%rax,$T3
+	pclmulqdq	\$0x00,$Xi,$T1		# �(0xE1<<1)
+	pxor		$Xi,$T2
+	pslldq		\$15,$T2
+	paddd		$T2,$T2			# <<(64+56+1)
+	pxor		$T2,$Xi
+	pclmulqdq	\$0x01,$T3,$Xi
+	movdqa		.Lbswap_mask(%rip),$T3	# reload $T3
+	psrldq		\$1,$T1
+	pxor		$T1,$Xhi
+	pslldq		\$7,$Xi
+	pxor		$Xhi,$Xi
+___
 $code.=<<___;
 	pshufb		$T3,$Xi
 	movdqu		$Xi,($Xip)
@@ -465,129 +605,327 @@ ___
 }
 
 { my ($Xip,$Htbl,$inp,$len)=@_4args;
-  my $Xn="%xmm6";
-  my $Xhn="%xmm7";
-  my $Hkey2="%xmm8";
-  my $T1n="%xmm9";
-  my $T2n="%xmm10";
+  my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
+  my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
 
 $code.=<<___;
 .globl	gcm_ghash_clmul
 .type	gcm_ghash_clmul,\@abi-omnipotent
-.align	16
+.align	32
 gcm_ghash_clmul:
+.L_ghash_clmul:
 ___
 $code.=<<___ if ($win64);
+	lea	-0x88(%rsp),%rax
 .LSEH_begin_gcm_ghash_clmul:
 	# I can't trust assembler to use specific encoding:-(
-	.byte	0x48,0x83,0xec,0x58		#sub	\$0x58,%rsp
-	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
-	.byte	0x0f,0x29,0x7c,0x24,0x10	#movdqa	%xmm7,0x10(%rsp)
-	.byte	0x44,0x0f,0x29,0x44,0x24,0x20	#movaps	%xmm8,0x20(%rsp)
-	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30	#movaps	%xmm9,0x30(%rsp)
-	.byte	0x44,0x0f,0x29,0x54,0x24,0x40	#movaps	%xmm10,0x40(%rsp)
+	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
+	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6,-0x20(%rax)
+	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7,-0x10(%rax)
+	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8,0(%rax)
+	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9,0x10(%rax)
+	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10,0x20(%rax)
+	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11,0x30(%rax)
+	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12,0x40(%rax)
+	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13,0x50(%rax)
+	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14,0x60(%rax)
+	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15,0x70(%rax)
 ___
 $code.=<<___;
 	movdqa		.Lbswap_mask(%rip),$T3
 
 	movdqu		($Xip),$Xi
 	movdqu		($Htbl),$Hkey
+	movdqu		0x20($Htbl),$HK
 	pshufb		$T3,$Xi
 
 	sub		\$0x10,$len
 	jz		.Lodd_tail
 
-	movdqu		16($Htbl),$Hkey2
+	movdqu		0x10($Htbl),$Hkey2
+___
+if ($do4xaggr) {
+my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
+
+$code.=<<___;
+	mov		OPENSSL_ia32cap_P+4(%rip),%eax
+	cmp		\$0x30,$len
+	jb		.Lskip4x
+
+	and		\$`1<<26|1<<22`,%eax	# isolate MOVBE+XSAVE
+	cmp		\$`1<<22`,%eax		# check for MOVBE without XSAVE
+	je		.Lskip4x
+
+	sub		\$0x30,$len
+	mov		\$0xA040608020C0E000,%rax	# ((7..0)�0xE0)&0xff
+	movdqu		0x30($Htbl),$Hkey3
+	movdqu		0x40($Htbl),$Hkey4
+
+	#######
+	# Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
+	#
+	movdqu		0x30($inp),$Xln
+	 movdqu		0x20($inp),$Xl
+	pshufb		$T3,$Xln
+	 pshufb		$T3,$Xl
+	movdqa		$Xln,$Xhn
+	pshufd		\$0b01001110,$Xln,$Xmn
+	pxor		$Xln,$Xmn
+	pclmulqdq	\$0x00,$Hkey,$Xln
+	pclmulqdq	\$0x11,$Hkey,$Xhn
+	pclmulqdq	\$0x00,$HK,$Xmn
+
+	movdqa		$Xl,$Xh
+	pshufd		\$0b01001110,$Xl,$Xm
+	pxor		$Xl,$Xm
+	pclmulqdq	\$0x00,$Hkey2,$Xl
+	pclmulqdq	\$0x11,$Hkey2,$Xh
+	pclmulqdq	\$0x10,$HK,$Xm
+	xorps		$Xl,$Xln
+	xorps		$Xh,$Xhn
+	movups		0x50($Htbl),$HK
+	xorps		$Xm,$Xmn
+
+	movdqu		0x10($inp),$Xl
+	 movdqu		0($inp),$T1
+	pshufb		$T3,$Xl
+	 pshufb		$T3,$T1
+	movdqa		$Xl,$Xh
+	pshufd		\$0b01001110,$Xl,$Xm
+	 pxor		$T1,$Xi
+	pxor		$Xl,$Xm
+	pclmulqdq	\$0x00,$Hkey3,$Xl
+	 movdqa		$Xi,$Xhi
+	 pshufd		\$0b01001110,$Xi,$T1
+	 pxor		$Xi,$T1
+	pclmulqdq	\$0x11,$Hkey3,$Xh
+	pclmulqdq	\$0x00,$HK,$Xm
+	xorps		$Xl,$Xln
+	xorps		$Xh,$Xhn
+
+	lea	0x40($inp),$inp
+	sub	\$0x40,$len
+	jc	.Ltail4x
+
+	jmp	.Lmod4_loop
+.align	32
+.Lmod4_loop:
+	pclmulqdq	\$0x00,$Hkey4,$Xi
+	xorps		$Xm,$Xmn
+	 movdqu		0x30($inp),$Xl
+	 pshufb		$T3,$Xl
+	pclmulqdq	\$0x11,$Hkey4,$Xhi
+	xorps		$Xln,$Xi
+	 movdqu		0x20($inp),$Xln
+	 movdqa		$Xl,$Xh
+	pclmulqdq	\$0x10,$HK,$T1
+	 pshufd		\$0b01001110,$Xl,$Xm
+	xorps		$Xhn,$Xhi
+	 pxor		$Xl,$Xm
+	 pshufb		$T3,$Xln
+	movups		0x20($Htbl),$HK
+	xorps		$Xmn,$T1
+	 pclmulqdq	\$0x00,$Hkey,$Xl
+	 pshufd		\$0b01001110,$Xln,$Xmn
+
+	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
+	 movdqa		$Xln,$Xhn
+	pxor		$Xhi,$T1		#
+	 pxor		$Xln,$Xmn
+	movdqa		$T1,$T2			#
+	 pclmulqdq	\$0x11,$Hkey,$Xh
+	pslldq		\$8,$T1
+	psrldq		\$8,$T2			#
+	pxor		$T1,$Xi
+	movdqa		.L7_mask(%rip),$T1
+	pxor		$T2,$Xhi		#
+	movq		%rax,$T2
+
+	pand		$Xi,$T1			# 1st phase
+	pshufb		$T1,$T2			#
+	pxor		$Xi,$T2			#
+	 pclmulqdq	\$0x00,$HK,$Xm
+	psllq		\$57,$T2		#
+	movdqa		$T2,$T1			#
+	pslldq		\$8,$T2
+	 pclmulqdq	\$0x00,$Hkey2,$Xln
+	psrldq		\$8,$T1			#	
+	pxor		$T2,$Xi
+	pxor		$T1,$Xhi		#
+	movdqu		0($inp),$T1
+
+	movdqa		$Xi,$T2			# 2nd phase
+	psrlq		\$1,$Xi
+	 pclmulqdq	\$0x11,$Hkey2,$Xhn
+	 xorps		$Xl,$Xln
+	 movdqu		0x10($inp),$Xl
+	 pshufb		$T3,$Xl
+	 pclmulqdq	\$0x10,$HK,$Xmn
+	 xorps		$Xh,$Xhn
+	 movups		0x50($Htbl),$HK
+	pshufb		$T3,$T1
+	pxor		$T2,$Xhi		#
+	pxor		$Xi,$T2
+	psrlq		\$5,$Xi
+
+	 movdqa		$Xl,$Xh
+	 pxor		$Xm,$Xmn
+	 pshufd		\$0b01001110,$Xl,$Xm
+	pxor		$T2,$Xi			#
+	pxor		$T1,$Xhi
+	 pxor		$Xl,$Xm
+	 pclmulqdq	\$0x00,$Hkey3,$Xl
+	psrlq		\$1,$Xi			#
+	pxor		$Xhi,$Xi		#
+	movdqa		$Xi,$Xhi
+	 pclmulqdq	\$0x11,$Hkey3,$Xh
+	 xorps		$Xl,$Xln
+	pshufd		\$0b01001110,$Xi,$T1
+	pxor		$Xi,$T1
+
+	 pclmulqdq	\$0x00,$HK,$Xm
+	 xorps		$Xh,$Xhn
+
+	lea	0x40($inp),$inp
+	sub	\$0x40,$len
+	jnc	.Lmod4_loop
+
+.Ltail4x:
+	pclmulqdq	\$0x00,$Hkey4,$Xi
+	pclmulqdq	\$0x11,$Hkey4,$Xhi
+	pclmulqdq	\$0x10,$HK,$T1
+	xorps		$Xm,$Xmn
+	xorps		$Xln,$Xi
+	xorps		$Xhn,$Xhi
+	pxor		$Xi,$Xhi		# aggregated Karatsuba post-processing
+	pxor		$Xmn,$T1
+
+	pxor		$Xhi,$T1		#
+	pxor		$Xi,$Xhi
+
+	movdqa		$T1,$T2			#
+	psrldq		\$8,$T1
+	pslldq		\$8,$T2			#
+	pxor		$T1,$Xhi
+	pxor		$T2,$Xi			#
+___
+	&reduction_alg9($Xhi,$Xi);
+$code.=<<___;
+	add	\$0x40,$len
+	jz	.Ldone
+	movdqu	0x20($Htbl),$HK
+	sub	\$0x10,$len
+	jz	.Lodd_tail
+.Lskip4x:
+___
+}
+$code.=<<___;
 	#######
 	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
 	#	[(H*Ii+1) + (H*Xi+1)] mod P =
 	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
 	#
 	movdqu		($inp),$T1		# Ii
-	movdqu		16($inp),$Xn		# Ii+1
+	movdqu		16($inp),$Xln		# Ii+1
 	pshufb		$T3,$T1
-	pshufb		$T3,$Xn
+	pshufb		$T3,$Xln
 	pxor		$T1,$Xi			# Ii+Xi
-___
-	&clmul64x64_T2	($Xhn,$Xn,$Hkey);	# H*Ii+1
-$code.=<<___;
-	movdqa		$Xi,$Xhi		#
-	pshufd		\$0b01001110,$Xi,$T1
-	pshufd		\$0b01001110,$Hkey2,$T2
-	pxor		$Xi,$T1			#
-	pxor		$Hkey2,$T2
+
+	movdqa		$Xln,$Xhn
+	pshufd		\$0b01001110,$Xln,$Xmn
+	pxor		$Xln,$Xmn
+	pclmulqdq	\$0x00,$Hkey,$Xln
+	pclmulqdq	\$0x11,$Hkey,$Xhn
+	pclmulqdq	\$0x00,$HK,$Xmn
 
 	lea		32($inp),$inp		# i+=2
+	nop
 	sub		\$0x20,$len
 	jbe		.Leven_tail
+	nop
+	jmp		.Lmod_loop
 
+.align	32
 .Lmod_loop:
-___
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey2,1);	# H^2*(Ii+Xi)
-$code.=<<___;
-	movdqu		($inp),$T1		# Ii
-	pxor		$Xn,$Xi			# (H*Ii+1) + H^2*(Ii+Xi)
-	pxor		$Xhn,$Xhi
+	movdqa		$Xi,$Xhi
+	movdqa		$Xmn,$T1
+	pshufd		\$0b01001110,$Xi,$Xmn	#
+	pxor		$Xi,$Xmn		#
 
-	movdqu		16($inp),$Xn		# Ii+1
-	pshufb		$T3,$T1
-	pshufb		$T3,$Xn
+	pclmulqdq	\$0x00,$Hkey2,$Xi
+	pclmulqdq	\$0x11,$Hkey2,$Xhi
+	pclmulqdq	\$0x10,$HK,$Xmn
 
-	movdqa		$Xn,$Xhn		#
-	pshufd		\$0b01001110,$Xn,$T1n
-	pshufd		\$0b01001110,$Hkey,$T2n
-	pxor		$Xn,$T1n		#
-	pxor		$Hkey,$T2n
-	 pxor		$T1,$Xhi		# "Ii+Xi", consume early
+	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
+	pxor		$Xhn,$Xhi
+	  movdqu	($inp),$T2		# Ii
+	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
+	  pshufb	$T3,$T2
+	  movdqu	16($inp),$Xln		# Ii+1
+
+	pxor		$Xhi,$T1
+	  pxor		$T2,$Xhi		# "Ii+Xi", consume early
+	pxor		$T1,$Xmn
+	 pshufb		$T3,$Xln
+	movdqa		$Xmn,$T1		#
+	psrldq		\$8,$T1
+	pslldq		\$8,$Xmn		#
+	pxor		$T1,$Xhi
+	pxor		$Xmn,$Xi		#
+
+	movdqa		$Xln,$Xhn		#
 
-	  movdqa	$Xi,$T1			# 1st phase
+	  movdqa	$Xi,$T2			# 1st phase
+	  movdqa	$Xi,$T1
+	  psllq		\$5,$Xi
+	  pxor		$Xi,$T1			#
+	pclmulqdq	\$0x00,$Hkey,$Xln	#######
 	  psllq		\$1,$Xi
 	  pxor		$T1,$Xi			#
-	  psllq		\$5,$Xi			#
-	  pxor		$T1,$Xi			#
-	pclmulqdq	\$0x00,$Hkey,$Xn	#######
 	  psllq		\$57,$Xi		#
-	  movdqa	$Xi,$T2			#
+	  movdqa	$Xi,$T1			#
 	  pslldq	\$8,$Xi
-	  psrldq	\$8,$T2			#	
-	  pxor		$T1,$Xi
-	  pxor		$T2,$Xhi		#
+	  psrldq	\$8,$T1			#	
+	  pxor		$T2,$Xi
+	pshufd		\$0b01001110,$Xhn,$Xmn
+	  pxor		$T1,$Xhi		#
+	pxor		$Xhn,$Xmn		#
 
-	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
 	  movdqa	$Xi,$T2			# 2nd phase
+	  psrlq		\$1,$Xi
+	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
+	  pxor		$T2,$Xhi		#
+	  pxor		$Xi,$T2
 	  psrlq		\$5,$Xi
 	  pxor		$T2,$Xi			#
+	lea		32($inp),$inp
 	  psrlq		\$1,$Xi			#
-	  pxor		$T2,$Xi			#
-	  pxor		$Xhi,$T2
-	  psrlq		\$1,$Xi			#
-	  pxor		$T2,$Xi			#
+	pclmulqdq	\$0x00,$HK,$Xmn		#######
+	  pxor		$Xhi,$Xi		#
 
-	pclmulqdq	\$0x00,$T2n,$T1n	#######
-	 movdqa		$Xi,$Xhi		#
-	 pshufd		\$0b01001110,$Xi,$T1
-	 pshufd		\$0b01001110,$Hkey2,$T2
-	 pxor		$Xi,$T1			#
-	 pxor		$Hkey2,$T2
-
-	pxor		$Xn,$T1n		#
-	pxor		$Xhn,$T1n		#
-	movdqa		$T1n,$T2n		#
-	psrldq		\$8,$T1n
-	pslldq		\$8,$T2n		#
-	pxor		$T1n,$Xhn
-	pxor		$T2n,$Xn		#
-
-	lea		32($inp),$inp
 	sub		\$0x20,$len
 	ja		.Lmod_loop
 
 .Leven_tail:
-___
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey2,1);	# H^2*(Ii+Xi)
-$code.=<<___;
-	pxor		$Xn,$Xi			# (H*Ii+1) + H^2*(Ii+Xi)
+	 movdqa		$Xi,$Xhi
+	 movdqa		$Xmn,$T1
+	 pshufd		\$0b01001110,$Xi,$Xmn	#
+	 pxor		$Xi,$Xmn		#
+
+	pclmulqdq	\$0x00,$Hkey2,$Xi
+	pclmulqdq	\$0x11,$Hkey2,$Xhi
+	pclmulqdq	\$0x10,$HK,$Xmn
+
+	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
 	pxor		$Xhn,$Xhi
+	pxor		$Xi,$T1
+	pxor		$Xhi,$T1
+	pxor		$T1,$Xmn
+	movdqa		$Xmn,$T1		#
+	psrldq		\$8,$T1
+	pslldq		\$8,$Xmn		#
+	pxor		$T1,$Xhi
+	pxor		$Xmn,$Xi		#
 ___
 	&reduction_alg9	($Xhi,$Xi);
 $code.=<<___;
@@ -599,7 +937,7 @@ $code.=<<___;
 	pshufb		$T3,$T1
 	pxor		$T1,$Xi			# Ii+Xi
 ___
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H*(Ii+Xi)
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H*(Ii+Xi)
 	&reduction_alg9	($Xhi,$Xi);
 $code.=<<___;
 .Ldone:
@@ -612,21 +950,607 @@ $code.=<<___ if ($win64);
 	movaps	0x20(%rsp),%xmm8
 	movaps	0x30(%rsp),%xmm9
 	movaps	0x40(%rsp),%xmm10
-	add	\$0x58,%rsp
+	movaps	0x50(%rsp),%xmm11
+	movaps	0x60(%rsp),%xmm12
+	movaps	0x70(%rsp),%xmm13
+	movaps	0x80(%rsp),%xmm14
+	movaps	0x90(%rsp),%xmm15
+	lea	0xa8(%rsp),%rsp
+.LSEH_end_gcm_ghash_clmul:
 ___
 $code.=<<___;
 	ret
-.LSEH_end_gcm_ghash_clmul:
 .size	gcm_ghash_clmul,.-gcm_ghash_clmul
 ___
 }
+
+$code.=<<___;
+.globl	gcm_init_avx
+.type	gcm_init_avx,\@abi-omnipotent
+.align	32
+gcm_init_avx:
+___
+if ($avx) {
+my ($Htbl,$Xip)=@_4args;
+my $HK="%xmm6";
+
+$code.=<<___ if ($win64);
+.LSEH_begin_gcm_init_avx:
+	# I can't trust assembler to use specific encoding:-(
+	.byte	0x48,0x83,0xec,0x18		#sub	$0x18,%rsp
+	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
+___
+$code.=<<___;
+	vzeroupper
+
+	vmovdqu		($Xip),$Hkey
+	vpshufd		\$0b01001110,$Hkey,$Hkey	# dword swap
+
+	# <<1 twist
+	vpshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
+	vpsrlq		\$63,$Hkey,$T1
+	vpsllq		\$1,$Hkey,$Hkey
+	vpxor		$T3,$T3,$T3		#
+	vpcmpgtd	$T2,$T3,$T3		# broadcast carry bit
+	vpslldq		\$8,$T1,$T1
+	vpor		$T1,$Hkey,$Hkey		# H<<=1
+
+	# magic reduction
+	vpand		.L0x1c2_polynomial(%rip),$T3,$T3
+	vpxor		$T3,$Hkey,$Hkey		# if(carry) H^=0x1c2_polynomial
+
+	vpunpckhqdq	$Hkey,$Hkey,$HK
+	vmovdqa		$Hkey,$Xi
+	vpxor		$Hkey,$HK,$HK
+	mov		\$4,%r10		# up to H^8
+	jmp		.Linit_start_avx
+___
+
+sub clmul64x64_avx {
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
+
+if (!defined($HK)) {	$HK = $T2;
+$code.=<<___;
+	vpunpckhqdq	$Xi,$Xi,$T1
+	vpunpckhqdq	$Hkey,$Hkey,$T2
+	vpxor		$Xi,$T1,$T1		#
+	vpxor		$Hkey,$T2,$T2
+___
+} else {
+$code.=<<___;
+	vpunpckhqdq	$Xi,$Xi,$T1
+	vpxor		$Xi,$T1,$T1		#
+___
+}
+$code.=<<___;
+	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xhi	#######
+	vpclmulqdq	\$0x00,$Hkey,$Xi,$Xi	#######
+	vpclmulqdq	\$0x00,$HK,$T1,$T1	#######
+	vpxor		$Xi,$Xhi,$T2		#
+	vpxor		$T2,$T1,$T1		#
+
+	vpslldq		\$8,$T1,$T2		#
+	vpsrldq		\$8,$T1,$T1
+	vpxor		$T2,$Xi,$Xi		#
+	vpxor		$T1,$Xhi,$Xhi
+___
+}
+
+sub reduction_avx {
+my ($Xhi,$Xi) = @_;
+
+$code.=<<___;
+	vpsllq		\$57,$Xi,$T1		# 1st phase
+	vpsllq		\$62,$Xi,$T2
+	vpxor		$T1,$T2,$T2		#
+	vpsllq		\$63,$Xi,$T1
+	vpxor		$T1,$T2,$T2		#
+	vpslldq		\$8,$T2,$T1		#
+	vpsrldq		\$8,$T2,$T2
+	vpxor		$T1,$Xi,$Xi		#
+	vpxor		$T2,$Xhi,$Xhi
+
+	vpsrlq		\$1,$Xi,$T2		# 2nd phase
+	vpxor		$Xi,$Xhi,$Xhi
+	vpxor		$T2,$Xi,$Xi		#
+	vpsrlq		\$5,$T2,$T2
+	vpxor		$T2,$Xi,$Xi		#
+	vpsrlq		\$1,$Xi,$Xi		#
+	vpxor		$Xhi,$Xi,$Xi		#
+___
+}
 
 $code.=<<___;
+.align	32
+.Linit_loop_avx:
+	vpalignr	\$8,$T1,$T2,$T3		# low part is H.lo^H.hi...
+	vmovdqu		$T3,-0x10($Htbl)	# save Karatsuba "salt"
+___
+	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^3,5,7
+	&reduction_avx	($Xhi,$Xi);
+$code.=<<___;
+.Linit_start_avx:
+	vmovdqa		$Xi,$T3
+___
+	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^2,4,6,8
+	&reduction_avx	($Xhi,$Xi);
+$code.=<<___;
+	vpshufd		\$0b01001110,$T3,$T1
+	vpshufd		\$0b01001110,$Xi,$T2
+	vpxor		$T3,$T1,$T1		# Karatsuba pre-processing
+	vmovdqu		$T3,0x00($Htbl)		# save H^1,3,5,7
+	vpxor		$Xi,$T2,$T2		# Karatsuba pre-processing
+	vmovdqu		$Xi,0x10($Htbl)		# save H^2,4,6,8
+	lea		0x30($Htbl),$Htbl
+	sub		\$1,%r10
+	jnz		.Linit_loop_avx
+
+	vpalignr	\$8,$T2,$T1,$T3		# last "salt" is flipped
+	vmovdqu		$T3,-0x10($Htbl)
+
+	vzeroupper
+___
+$code.=<<___ if ($win64);
+	movaps	(%rsp),%xmm6
+	lea	0x18(%rsp),%rsp
+.LSEH_end_gcm_init_avx:
+___
+$code.=<<___;
+	ret
+.size	gcm_init_avx,.-gcm_init_avx
+___
+} else {
+$code.=<<___;
+	jmp	.L_init_clmul
+.size	gcm_init_avx,.-gcm_init_avx
+___
+}
+
+$code.=<<___;
+.globl	gcm_gmult_avx
+.type	gcm_gmult_avx,\@abi-omnipotent
+.align	32
+gcm_gmult_avx:
+	jmp	.L_gmult_clmul
+.size	gcm_gmult_avx,.-gcm_gmult_avx
+___
+
+$code.=<<___;
+.globl	gcm_ghash_avx
+.type	gcm_ghash_avx,\@abi-omnipotent
+.align	32
+gcm_ghash_avx:
+___
+if ($avx) {
+my ($Xip,$Htbl,$inp,$len)=@_4args;
+my ($Xlo,$Xhi,$Xmi,
+    $Zlo,$Zhi,$Zmi,
+    $Hkey,$HK,$T1,$T2,
+    $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));
+
+$code.=<<___ if ($win64);
+	lea	-0x88(%rsp),%rax
+.LSEH_begin_gcm_ghash_avx:
+	# I can't trust assembler to use specific encoding:-(
+	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
+	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6,-0x20(%rax)
+	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7,-0x10(%rax)
+	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8,0(%rax)
+	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9,0x10(%rax)
+	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10,0x20(%rax)
+	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11,0x30(%rax)
+	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12,0x40(%rax)
+	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13,0x50(%rax)
+	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14,0x60(%rax)
+	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15,0x70(%rax)
+___
+$code.=<<___;
+	vzeroupper
+
+	vmovdqu		($Xip),$Xi		# load $Xi
+	lea		.L0x1c2_polynomial(%rip),%r10
+	lea		0x40($Htbl),$Htbl	# size optimization
+	vmovdqu		.Lbswap_mask(%rip),$bswap
+	vpshufb		$bswap,$Xi,$Xi
+	cmp		\$0x80,$len
+	jb		.Lshort_avx
+	sub		\$0x80,$len
+
+	vmovdqu		0x70($inp),$Ii		# I[7]
+	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
+	vpshufb		$bswap,$Ii,$Ii
+	vmovdqu		0x20-0x40($Htbl),$HK
+
+	vpunpckhqdq	$Ii,$Ii,$T2
+	 vmovdqu	0x60($inp),$Ij		# I[6]
+	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
+	vpxor		$Ii,$T2,$T2
+	 vpshufb	$bswap,$Ij,$Ij
+	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
+	 vmovdqu	0x10-0x40($Htbl),$Hkey	# $Hkey^2
+	 vpunpckhqdq	$Ij,$Ij,$T1
+	 vmovdqu	0x50($inp),$Ii		# I[5]
+	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
+	 vpxor		$Ij,$T1,$T1
+
+	 vpshufb	$bswap,$Ii,$Ii
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
+	 vpunpckhqdq	$Ii,$Ii,$T2
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
+	 vmovdqu	0x30-0x40($Htbl),$Hkey	# $Hkey^3
+	 vpxor		$Ii,$T2,$T2
+	 vmovdqu	0x40($inp),$Ij		# I[4]
+	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
+	 vmovdqu	0x50-0x40($Htbl),$HK
+
+	 vpshufb	$bswap,$Ij,$Ij
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
+	vpxor		$Xhi,$Zhi,$Zhi
+	 vpunpckhqdq	$Ij,$Ij,$T1
+	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
+	 vmovdqu	0x40-0x40($Htbl),$Hkey	# $Hkey^4
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
+	 vpxor		$Ij,$T1,$T1
+
+	 vmovdqu	0x30($inp),$Ii		# I[3]
+	vpxor		$Zlo,$Xlo,$Xlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
+	vpxor		$Zhi,$Xhi,$Xhi
+	 vpshufb	$bswap,$Ii,$Ii
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
+	 vmovdqu	0x60-0x40($Htbl),$Hkey	# $Hkey^5
+	vpxor		$Zmi,$Xmi,$Xmi
+	 vpunpckhqdq	$Ii,$Ii,$T2
+	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
+	 vmovdqu	0x80-0x40($Htbl),$HK
+	 vpxor		$Ii,$T2,$T2
+
+	 vmovdqu	0x20($inp),$Ij		# I[2]
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
+	vpxor		$Xhi,$Zhi,$Zhi
+	 vpshufb	$bswap,$Ij,$Ij
+	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
+	 vmovdqu	0x70-0x40($Htbl),$Hkey	# $Hkey^6
+	vpxor		$Xmi,$Zmi,$Zmi
+	 vpunpckhqdq	$Ij,$Ij,$T1
+	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
+	 vpxor		$Ij,$T1,$T1
+
+	 vmovdqu	0x10($inp),$Ii		# I[1]
+	vpxor		$Zlo,$Xlo,$Xlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
+	vpxor		$Zhi,$Xhi,$Xhi
+	 vpshufb	$bswap,$Ii,$Ii
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
+	 vmovdqu	0x90-0x40($Htbl),$Hkey	# $Hkey^7
+	vpxor		$Zmi,$Xmi,$Xmi
+	 vpunpckhqdq	$Ii,$Ii,$T2
+	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
+	 vmovdqu	0xb0-0x40($Htbl),$HK
+	 vpxor		$Ii,$T2,$T2
+
+	 vmovdqu	($inp),$Ij		# I[0]
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
+	vpxor		$Xhi,$Zhi,$Zhi
+	 vpshufb	$bswap,$Ij,$Ij
+	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
+	 vmovdqu	0xa0-0x40($Htbl),$Hkey	# $Hkey^8
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x10,$HK,$T2,$Xmi
+
+	lea		0x80($inp),$inp
+	cmp		\$0x80,$len
+	jb		.Ltail_avx
+
+	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
+	sub		\$0x80,$len
+	jmp		.Loop8x_avx
+
+.align	32
+.Loop8x_avx:
+	vpunpckhqdq	$Ij,$Ij,$T1
+	 vmovdqu	0x70($inp),$Ii		# I[7]
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpxor		$Ij,$T1,$T1
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xi
+	 vpshufb	$bswap,$Ii,$Ii
+	vpxor		$Xhi,$Zhi,$Zhi
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xo
+	 vmovdqu	0x00-0x40($Htbl),$Hkey	# $Hkey^1
+	 vpunpckhqdq	$Ii,$Ii,$T2
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T1,$Tred
+	 vmovdqu	0x20-0x40($Htbl),$HK
+	 vpxor		$Ii,$T2,$T2
+
+	  vmovdqu	0x60($inp),$Ij		# I[6]
+	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
+	vpxor		$Zlo,$Xi,$Xi		# collect result
+	  vpshufb	$bswap,$Ij,$Ij
+	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
+	vxorps		$Zhi,$Xo,$Xo
+	  vmovdqu	0x10-0x40($Htbl),$Hkey	# $Hkey^2
+	 vpunpckhqdq	$Ij,$Ij,$T1
+	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi
+	vpxor		$Zmi,$Tred,$Tred
+	 vxorps		$Ij,$T1,$T1
+
+	  vmovdqu	0x50($inp),$Ii		# I[5]
+	vpxor		$Xi,$Tred,$Tred		# aggregated Karatsuba post-processing
+	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
+	vpxor		$Xo,$Tred,$Tred
+	vpslldq		\$8,$Tred,$T2
+	 vpxor		$Xlo,$Zlo,$Zlo
+	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
+	vpsrldq		\$8,$Tred,$Tred
+	vpxor		$T2, $Xi, $Xi
+	  vmovdqu	0x30-0x40($Htbl),$Hkey	# $Hkey^3
+	  vpshufb	$bswap,$Ii,$Ii
+	vxorps		$Tred,$Xo, $Xo
+	 vpxor		$Xhi,$Zhi,$Zhi
+	 vpunpckhqdq	$Ii,$Ii,$T2
+	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi
+	  vmovdqu	0x50-0x40($Htbl),$HK
+	 vpxor		$Ii,$T2,$T2
+	 vpxor		$Xmi,$Zmi,$Zmi
+
+	  vmovdqu	0x40($inp),$Ij		# I[4]
+	vpalignr	\$8,$Xi,$Xi,$Tred	# 1st phase
+	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
+	  vpshufb	$bswap,$Ij,$Ij
+	 vpxor		$Zlo,$Xlo,$Xlo
+	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
+	  vmovdqu	0x40-0x40($Htbl),$Hkey	# $Hkey^4
+	 vpunpckhqdq	$Ij,$Ij,$T1
+	 vpxor		$Zhi,$Xhi,$Xhi
+	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi
+	 vxorps		$Ij,$T1,$T1
+	 vpxor		$Zmi,$Xmi,$Xmi
+
+	  vmovdqu	0x30($inp),$Ii		# I[3]
+	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
+	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
+	  vpshufb	$bswap,$Ii,$Ii
+	 vpxor		$Xlo,$Zlo,$Zlo
+	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
+	  vmovdqu	0x60-0x40($Htbl),$Hkey	# $Hkey^5
+	 vpunpckhqdq	$Ii,$Ii,$T2
+	 vpxor		$Xhi,$Zhi,$Zhi
+	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi
+	  vmovdqu	0x80-0x40($Htbl),$HK
+	 vpxor		$Ii,$T2,$T2
+	 vpxor		$Xmi,$Zmi,$Zmi
+
+	  vmovdqu	0x20($inp),$Ij		# I[2]
+	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
+	  vpshufb	$bswap,$Ij,$Ij
+	 vpxor		$Zlo,$Xlo,$Xlo
+	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
+	  vmovdqu	0x70-0x40($Htbl),$Hkey	# $Hkey^6
+	 vpunpckhqdq	$Ij,$Ij,$T1
+	 vpxor		$Zhi,$Xhi,$Xhi
+	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi
+	 vpxor		$Ij,$T1,$T1
+	 vpxor		$Zmi,$Xmi,$Xmi
+	vxorps		$Tred,$Xi,$Xi
+
+	  vmovdqu	0x10($inp),$Ii		# I[1]
+	vpalignr	\$8,$Xi,$Xi,$Tred	# 2nd phase
+	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
+	  vpshufb	$bswap,$Ii,$Ii
+	 vpxor		$Xlo,$Zlo,$Zlo
+	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
+	  vmovdqu	0x90-0x40($Htbl),$Hkey	# $Hkey^7
+	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
+	vxorps		$Xo,$Tred,$Tred
+	 vpunpckhqdq	$Ii,$Ii,$T2
+	 vpxor		$Xhi,$Zhi,$Zhi
+	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi
+	  vmovdqu	0xb0-0x40($Htbl),$HK
+	 vpxor		$Ii,$T2,$T2
+	 vpxor		$Xmi,$Zmi,$Zmi
+
+	  vmovdqu	($inp),$Ij		# I[0]
+	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
+	  vpshufb	$bswap,$Ij,$Ij
+	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
+	  vmovdqu	0xa0-0x40($Htbl),$Hkey	# $Hkey^8
+	vpxor		$Tred,$Ij,$Ij
+	 vpclmulqdq	\$0x10,$HK,  $T2,$Xmi
+	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
+
+	lea		0x80($inp),$inp
+	sub		\$0x80,$len
+	jnc		.Loop8x_avx
+
+	add		\$0x80,$len
+	jmp		.Ltail_no_xor_avx
+
+.align	32
+.Lshort_avx:
+	vmovdqu		-0x10($inp,$len),$Ii	# very last word
+	lea		($inp,$len),$inp
+	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
+	vmovdqu		0x20-0x40($Htbl),$HK
+	vpshufb		$bswap,$Ii,$Ij
+
+	vmovdqa		$Xlo,$Zlo		# subtle way to zero $Zlo,
+	vmovdqa		$Xhi,$Zhi		# $Zhi and
+	vmovdqa		$Xmi,$Zmi		# $Zmi
+	sub		\$0x10,$len
+	jz		.Ltail_avx
+
+	vpunpckhqdq	$Ij,$Ij,$T1
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
+	vpxor		$Ij,$T1,$T1
+	 vmovdqu	-0x20($inp),$Ii
+	vpxor		$Xhi,$Zhi,$Zhi
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
+	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
+	 vpshufb	$bswap,$Ii,$Ij
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
+	vpsrldq		\$8,$HK,$HK
+	sub		\$0x10,$len
+	jz		.Ltail_avx
+
+	vpunpckhqdq	$Ij,$Ij,$T1
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
+	vpxor		$Ij,$T1,$T1
+	 vmovdqu	-0x30($inp),$Ii
+	vpxor		$Xhi,$Zhi,$Zhi
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
+	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
+	 vpshufb	$bswap,$Ii,$Ij
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
+	vmovdqu		0x50-0x40($Htbl),$HK
+	sub		\$0x10,$len
+	jz		.Ltail_avx
+
+	vpunpckhqdq	$Ij,$Ij,$T1
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
+	vpxor		$Ij,$T1,$T1
+	 vmovdqu	-0x40($inp),$Ii
+	vpxor		$Xhi,$Zhi,$Zhi
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
+	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
+	 vpshufb	$bswap,$Ii,$Ij
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
+	vpsrldq		\$8,$HK,$HK
+	sub		\$0x10,$len
+	jz		.Ltail_avx
+
+	vpunpckhqdq	$Ij,$Ij,$T1
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
+	vpxor		$Ij,$T1,$T1
+	 vmovdqu	-0x50($inp),$Ii
+	vpxor		$Xhi,$Zhi,$Zhi
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
+	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
+	 vpshufb	$bswap,$Ii,$Ij
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
+	vmovdqu		0x80-0x40($Htbl),$HK
+	sub		\$0x10,$len
+	jz		.Ltail_avx
+
+	vpunpckhqdq	$Ij,$Ij,$T1
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
+	vpxor		$Ij,$T1,$T1
+	 vmovdqu	-0x60($inp),$Ii
+	vpxor		$Xhi,$Zhi,$Zhi
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
+	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
+	 vpshufb	$bswap,$Ii,$Ij
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
+	vpsrldq		\$8,$HK,$HK
+	sub		\$0x10,$len
+	jz		.Ltail_avx
+
+	vpunpckhqdq	$Ij,$Ij,$T1
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
+	vpxor		$Ij,$T1,$T1
+	 vmovdqu	-0x70($inp),$Ii
+	vpxor		$Xhi,$Zhi,$Zhi
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
+	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
+	 vpshufb	$bswap,$Ii,$Ij
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
+	vmovq		0xb8-0x40($Htbl),$HK
+	sub		\$0x10,$len
+	jmp		.Ltail_avx
+
+.align	32
+.Ltail_avx:
+	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
+.Ltail_no_xor_avx:
+	vpunpckhqdq	$Ij,$Ij,$T1
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
+	vpxor		$Ij,$T1,$T1
+	vpxor		$Xhi,$Zhi,$Zhi
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
+
+	vmovdqu		(%r10),$Tred
+
+	vpxor		$Xlo,$Zlo,$Xi
+	vpxor		$Xhi,$Zhi,$Xo
+	vpxor		$Xmi,$Zmi,$Zmi
+
+	vpxor		$Xi, $Zmi,$Zmi		# aggregated Karatsuba post-processing
+	vpxor		$Xo, $Zmi,$Zmi
+	vpslldq		\$8, $Zmi,$T2
+	vpsrldq		\$8, $Zmi,$Zmi
+	vpxor		$T2, $Xi, $Xi
+	vpxor		$Zmi,$Xo, $Xo
+
+	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 1st phase
+	vpalignr	\$8,$Xi,$Xi,$Xi
+	vpxor		$T2,$Xi,$Xi
+
+	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 2nd phase
+	vpalignr	\$8,$Xi,$Xi,$Xi
+	vpxor		$Xo,$Xi,$Xi
+	vpxor		$T2,$Xi,$Xi
+
+	cmp		\$0,$len
+	jne		.Lshort_avx
+
+	vpshufb		$bswap,$Xi,$Xi
+	vmovdqu		$Xi,($Xip)
+	vzeroupper
+___
+$code.=<<___ if ($win64);
+	movaps	(%rsp),%xmm6
+	movaps	0x10(%rsp),%xmm7
+	movaps	0x20(%rsp),%xmm8
+	movaps	0x30(%rsp),%xmm9
+	movaps	0x40(%rsp),%xmm10
+	movaps	0x50(%rsp),%xmm11
+	movaps	0x60(%rsp),%xmm12
+	movaps	0x70(%rsp),%xmm13
+	movaps	0x80(%rsp),%xmm14
+	movaps	0x90(%rsp),%xmm15
+	lea	0xa8(%rsp),%rsp
+.LSEH_end_gcm_ghash_avx:
+___
+$code.=<<___;
+	ret
+.size	gcm_ghash_avx,.-gcm_ghash_avx
+___
+} else {
+$code.=<<___;
+	jmp	.L_ghash_clmul
+.size	gcm_ghash_avx,.-gcm_ghash_avx
+___
+}
+
+$code.=<<___;
 .align	64
 .Lbswap_mask:
 	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
 .L0x1c2_polynomial:
 	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.L7_mask:
+	.long	7,0,7,0
+.L7_mask_poly:
+	.long	7,0,`0xE1<<1`,0
 .align	64
 .type	.Lrem_4bit,\@object
 .Lrem_4bit:
@@ -774,10 +1698,24 @@ se_handler:
 	.rva	.LSEH_end_gcm_ghash_4bit
 	.rva	.LSEH_info_gcm_ghash_4bit
 
+	.rva	.LSEH_begin_gcm_init_clmul
+	.rva	.LSEH_end_gcm_init_clmul
+	.rva	.LSEH_info_gcm_init_clmul
+
 	.rva	.LSEH_begin_gcm_ghash_clmul
 	.rva	.LSEH_end_gcm_ghash_clmul
 	.rva	.LSEH_info_gcm_ghash_clmul
+___
+$code.=<<___	if ($avx);
+	.rva	.LSEH_begin_gcm_init_avx
+	.rva	.LSEH_end_gcm_init_avx
+	.rva	.LSEH_info_gcm_init_clmul
 
+	.rva	.LSEH_begin_gcm_ghash_avx
+	.rva	.LSEH_end_gcm_ghash_avx
+	.rva	.LSEH_info_gcm_ghash_clmul
+___
+$code.=<<___;
 .section	.xdata
 .align	8
 .LSEH_info_gcm_gmult_4bit:
@@ -788,14 +1726,23 @@ se_handler:
 	.byte	9,0,0,0
 	.rva	se_handler
 	.rva	.Lghash_prologue,.Lghash_epilogue	# HandlerData
+.LSEH_info_gcm_init_clmul:
+	.byte	0x01,0x08,0x03,0x00
+	.byte	0x08,0x68,0x00,0x00	#movaps	0x00(rsp),xmm6
+	.byte	0x04,0x22,0x00,0x00	#sub	rsp,0x18
 .LSEH_info_gcm_ghash_clmul:
-	.byte	0x01,0x1f,0x0b,0x00
-	.byte	0x1f,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
-	.byte	0x19,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
-	.byte	0x13,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
-	.byte	0x0d,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
-	.byte	0x08,0x68,0x00,0x00	#movaps (rsp),xmm6
-	.byte	0x04,0xa2,0x00,0x00	#sub	rsp,0x58
+	.byte	0x01,0x33,0x16,0x00
+	.byte	0x33,0xf8,0x09,0x00	#movaps 0x90(rsp),xmm15
+	.byte	0x2e,0xe8,0x08,0x00	#movaps 0x80(rsp),xmm14
+	.byte	0x29,0xd8,0x07,0x00	#movaps 0x70(rsp),xmm13
+	.byte	0x24,0xc8,0x06,0x00	#movaps 0x60(rsp),xmm12
+	.byte	0x1f,0xb8,0x05,0x00	#movaps 0x50(rsp),xmm11
+	.byte	0x1a,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
+	.byte	0x15,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
+	.byte	0x10,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
+	.byte	0x0c,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
+	.byte	0x08,0x68,0x00,0x00	#movaps 0x00(rsp),xmm6
+	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
 ___
 }
 
diff --git a/openssl/crypto/modes/asm/ghashp8-ppc.pl b/openssl/crypto/modes/asm/ghashp8-ppc.pl
new file mode 100755
index 000000000..e76a58c34
--- /dev/null
+++ b/openssl/crypto/modes/asm/ghashp8-ppc.pl
@@ -0,0 +1,234 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# GHASH for for PowerISA v2.07.
+#
+# July 2014
+#
+# Accurate performance measurements are problematic, because it's
+# always virtualized setup with possibly throttled processor.
+# Relative comparison is therefore more informative. This initial
+# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
+# faster than "4-bit" integer-only compiler-generated 64-bit code.
+# "Initial version" means that there is room for futher improvement.
+
+$flavour=shift;
+$output =shift;
+
+if ($flavour =~ /64/) {
+	$SIZE_T=8;
+	$LRSAVE=2*$SIZE_T;
+	$STU="stdu";
+	$POP="ld";
+	$PUSH="std";
+} elsif ($flavour =~ /32/) {
+	$SIZE_T=4;
+	$LRSAVE=$SIZE_T;
+	$STU="stwu";
+	$POP="lwz";
+	$PUSH="stw";
+} else { die "nonsense $flavour"; }
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
+
+my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block
+
+my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
+my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
+my $vrsave="r12";
+
+$code=<<___;
+.machine	"any"
+
+.text
+
+.globl	.gcm_init_p8
+.align	5
+.gcm_init_p8:
+	lis		r0,0xfff0
+	li		r8,0x10
+	mfspr		$vrsave,256
+	li		r9,0x20
+	mtspr		256,r0
+	li		r10,0x30
+	lvx_u		$H,0,r4			# load H
+
+	vspltisb	$xC2,-16		# 0xf0
+	vspltisb	$t0,1			# one
+	vaddubm		$xC2,$xC2,$xC2		# 0xe0
+	vxor		$zero,$zero,$zero
+	vor		$xC2,$xC2,$t0		# 0xe1
+	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
+	vsldoi		$t1,$zero,$t0,1		# ...1
+	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
+	vspltisb	$t2,7
+	vor		$xC2,$xC2,$t1		# 0xc2....01
+	vspltb		$t1,$H,0		# most significant byte
+	vsl		$H,$H,$t0		# H<<=1
+	vsrab		$t1,$t1,$t2		# broadcast carry bit
+	vand		$t1,$t1,$xC2
+	vxor		$H,$H,$t1		# twisted H
+
+	vsldoi		$H,$H,$H,8		# twist even more ...
+	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
+	vsldoi		$Hl,$zero,$H,8		# ... and split
+	vsldoi		$Hh,$H,$zero,8
+
+	stvx_u		$xC2,0,r3		# save pre-computed table
+	stvx_u		$Hl,r8,r3
+	stvx_u		$H, r9,r3
+	stvx_u		$Hh,r10,r3
+
+	mtspr		256,$vrsave
+	blr
+	.long		0
+	.byte		0,12,0x14,0,0,0,2,0
+	.long		0
+.size	.gcm_init_p8,.-.gcm_init_p8
+
+.globl	.gcm_gmult_p8
+.align	5
+.gcm_gmult_p8:
+	lis		r0,0xfff8
+	li		r8,0x10
+	mfspr		$vrsave,256
+	li		r9,0x20
+	mtspr		256,r0
+	li		r10,0x30
+	lvx_u		$IN,0,$Xip		# load Xi
+
+	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
+	 le?lvsl	$lemask,r0,r0
+	lvx_u		$H, r9,$Htbl
+	 le?vspltisb	$t0,0x07
+	lvx_u		$Hh,r10,$Htbl
+	 le?vxor	$lemask,$lemask,$t0
+	lvx_u		$xC2,0,$Htbl
+	 le?vperm	$IN,$IN,$IN,$lemask
+	vxor		$zero,$zero,$zero
+
+	vpmsumd		$Xl,$IN,$Hl		# H.lo�Xi.lo
+	vpmsumd		$Xm,$IN,$H		# H.hi�Xi.lo+H.lo�Xi.hi
+	vpmsumd		$Xh,$IN,$Hh		# H.hi�Xi.hi
+
+	vpmsumd		$t2,$Xl,$xC2		# 1st phase
+
+	vsldoi		$t0,$Xm,$zero,8
+	vsldoi		$t1,$zero,$Xm,8
+	vxor		$Xl,$Xl,$t0
+	vxor		$Xh,$Xh,$t1
+
+	vsldoi		$Xl,$Xl,$Xl,8
+	vxor		$Xl,$Xl,$t2
+
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
+	vpmsumd		$Xl,$Xl,$xC2
+	vxor		$t1,$t1,$Xh
+	vxor		$Xl,$Xl,$t1
+
+	le?vperm	$Xl,$Xl,$Xl,$lemask
+	stvx_u		$Xl,0,$Xip		# write out Xi
+
+	mtspr		256,$vrsave
+	blr
+	.long		0
+	.byte		0,12,0x14,0,0,0,2,0
+	.long		0
+.size	.gcm_gmult_p8,.-.gcm_gmult_p8
+
+.globl	.gcm_ghash_p8
+.align	5
+.gcm_ghash_p8:
+	lis		r0,0xfff8
+	li		r8,0x10
+	mfspr		$vrsave,256
+	li		r9,0x20
+	mtspr		256,r0
+	li		r10,0x30
+	lvx_u		$Xl,0,$Xip		# load Xi
+
+	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
+	 le?lvsl	$lemask,r0,r0
+	lvx_u		$H, r9,$Htbl
+	 le?vspltisb	$t0,0x07
+	lvx_u		$Hh,r10,$Htbl
+	 le?vxor	$lemask,$lemask,$t0
+	lvx_u		$xC2,0,$Htbl
+	 le?vperm	$Xl,$Xl,$Xl,$lemask
+	vxor		$zero,$zero,$zero
+
+	lvx_u		$IN,0,$inp
+	addi		$inp,$inp,16
+	subi		$len,$len,16
+	 le?vperm	$IN,$IN,$IN,$lemask
+	vxor		$IN,$IN,$Xl
+	b		Loop
+
+.align	5
+Loop:
+	 subic		$len,$len,16
+	vpmsumd		$Xl,$IN,$Hl		# H.lo�Xi.lo
+	 subfe.		r0,r0,r0		# borrow?-1:0
+	vpmsumd		$Xm,$IN,$H		# H.hi�Xi.lo+H.lo�Xi.hi
+	 and		r0,r0,$len
+	vpmsumd		$Xh,$IN,$Hh		# H.hi�Xi.hi
+	 add		$inp,$inp,r0
+
+	vpmsumd		$t2,$Xl,$xC2		# 1st phase
+
+	vsldoi		$t0,$Xm,$zero,8
+	vsldoi		$t1,$zero,$Xm,8
+	vxor		$Xl,$Xl,$t0
+	vxor		$Xh,$Xh,$t1
+
+	vsldoi		$Xl,$Xl,$Xl,8
+	vxor		$Xl,$Xl,$t2
+	 lvx_u		$IN,0,$inp
+	 addi		$inp,$inp,16
+
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
+	vpmsumd		$Xl,$Xl,$xC2
+	 le?vperm	$IN,$IN,$IN,$lemask
+	vxor		$t1,$t1,$Xh
+	vxor		$IN,$IN,$t1
+	vxor		$IN,$IN,$Xl
+	beq		Loop			# did $len-=16 borrow?
+
+	vxor		$Xl,$Xl,$t1
+	le?vperm	$Xl,$Xl,$Xl,$lemask
+	stvx_u		$Xl,0,$Xip		# write out Xi
+
+	mtspr		256,$vrsave
+	blr
+	.long		0
+	.byte		0,12,0x14,0,0,0,4,0
+	.long		0
+.size	.gcm_ghash_p8,.-.gcm_ghash_p8
+
+.asciz  "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
+.align  2
+___
+
+foreach (split("\n",$code)) {
+	if ($flavour =~ /le$/o) {	# little-endian
+	    s/le\?//o		or
+	    s/be\?/#be#/o;
+	} else {
+	    s/le\?/#le#/o	or
+	    s/be\?//o;
+	}
+	print $_,"\n";
+}
+
+close STDOUT; # enforce flush
diff --git a/openssl/crypto/modes/asm/ghashv8-armx.pl b/openssl/crypto/modes/asm/ghashv8-armx.pl
new file mode 100755
index 000000000..54a1ac4db
--- /dev/null
+++ b/openssl/crypto/modes/asm/ghashv8-armx.pl
@@ -0,0 +1,241 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
+#
+# June 2014
+#
+# Initial version was developed in tight cooperation with Ard
+# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from
+# other assembly modules. Just like aesv8-armx.pl this module
+# supports both AArch32 and AArch64 execution modes.
+#
+# Current performance in cycles per processed byte:
+#
+#		PMULL[2]	32-bit NEON(*)
+# Apple A7	1.76		5.62
+# Cortex-A53	1.45		8.39
+# Cortex-A57	2.22		7.61
+#
+# (*)	presented for reference/comparison purposes;
+
+$flavour = shift;
+open STDOUT,">".shift;
+
+$Xi="x0";	# argument block
+$Htbl="x1";
+$inp="x2";
+$len="x3";
+
+$inc="x12";
+
+{
+my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
+my ($t0,$t1,$t2,$t3,$H,$Hhl)=map("q$_",(8..14));
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+___
+$code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/);
+$code.=".fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
+
+$code.=<<___;
+.global	gcm_init_v8
+.type	gcm_init_v8,%function
+.align	4
+gcm_init_v8:
+	vld1.64		{$t1},[x1]		@ load H
+	vmov.i8		$t0,#0xe1
+	vext.8		$IN,$t1,$t1,#8
+	vshl.i64	$t0,$t0,#57
+	vshr.u64	$t2,$t0,#63
+	vext.8		$t0,$t2,$t0,#8		@ t0=0xc2....01
+	vdup.32		$t1,${t1}[1]
+	vshr.u64	$t3,$IN,#63
+	vshr.s32	$t1,$t1,#31		@ broadcast carry bit
+	vand		$t3,$t3,$t0
+	vshl.i64	$IN,$IN,#1
+	vext.8		$t3,$t3,$t3,#8
+	vand		$t0,$t0,$t1
+	vorr		$IN,$IN,$t3		@ H<<<=1
+	veor		$IN,$IN,$t0		@ twisted H
+	vst1.64		{$IN},[x0]
+
+	ret
+.size	gcm_init_v8,.-gcm_init_v8
+
+.global	gcm_gmult_v8
+.type	gcm_gmult_v8,%function
+.align	4
+gcm_gmult_v8:
+	vld1.64		{$t1},[$Xi]		@ load Xi
+	vmov.i8		$t3,#0xe1
+	vld1.64		{$H},[$Htbl]		@ load twisted H
+	vshl.u64	$t3,$t3,#57
+#ifndef __ARMEB__
+	vrev64.8	$t1,$t1
+#endif
+	vext.8		$Hhl,$H,$H,#8
+	mov		$len,#0
+	vext.8		$IN,$t1,$t1,#8
+	mov		$inc,#0
+	veor		$Hhl,$Hhl,$H		@ Karatsuba pre-processing
+	mov		$inp,$Xi
+	b		.Lgmult_v8
+.size	gcm_gmult_v8,.-gcm_gmult_v8
+
+.global	gcm_ghash_v8
+.type	gcm_ghash_v8,%function
+.align	4
+gcm_ghash_v8:
+	vld1.64		{$Xl},[$Xi]		@ load [rotated] Xi
+	subs		$len,$len,#16
+	vmov.i8		$t3,#0xe1
+	mov		$inc,#16
+	vld1.64		{$H},[$Htbl]		@ load twisted H
+	cclr		$inc,eq
+	vext.8		$Xl,$Xl,$Xl,#8
+	vshl.u64	$t3,$t3,#57
+	vld1.64		{$t1},[$inp],$inc	@ load [rotated] inp
+	vext.8		$Hhl,$H,$H,#8
+#ifndef __ARMEB__
+	vrev64.8	$Xl,$Xl
+	vrev64.8	$t1,$t1
+#endif
+	veor		$Hhl,$Hhl,$H		@ Karatsuba pre-processing
+	vext.8		$IN,$t1,$t1,#8
+	b		.Loop_v8
+
+.align	4
+.Loop_v8:
+	vext.8		$t2,$Xl,$Xl,#8
+	veor		$IN,$IN,$Xl		@ inp^=Xi
+	veor		$t1,$t1,$t2		@ $t1 is rotated inp^Xi
+
+.Lgmult_v8:
+	vpmull.p64	$Xl,$H,$IN		@ H.lo�Xi.lo
+	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
+	vpmull2.p64	$Xh,$H,$IN		@ H.hi�Xi.hi
+	subs		$len,$len,#16
+	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)�(Xi.lo+Xi.hi)
+	cclr		$inc,eq
+
+	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
+	veor		$t2,$Xl,$Xh
+	veor		$Xm,$Xm,$t1
+	 vld1.64	{$t1},[$inp],$inc	@ load [rotated] inp
+	veor		$Xm,$Xm,$t2
+	vpmull.p64	$t2,$Xl,$t3		@ 1st phase
+
+	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
+	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
+#ifndef __ARMEB__
+	 vrev64.8	$t1,$t1
+#endif
+	veor		$Xl,$Xm,$t2
+	 vext.8		$IN,$t1,$t1,#8
+
+	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
+	vpmull.p64	$Xl,$Xl,$t3
+	veor		$t2,$t2,$Xh
+	veor		$Xl,$Xl,$t2
+	b.hs		.Loop_v8
+
+#ifndef __ARMEB__
+	vrev64.8	$Xl,$Xl
+#endif
+	vext.8		$Xl,$Xl,$Xl,#8
+	vst1.64		{$Xl},[$Xi]		@ write out Xi
+
+	ret
+.size	gcm_ghash_v8,.-gcm_ghash_v8
+___
+}
+$code.=<<___;
+.asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align  2
+___
+
+if ($flavour =~ /64/) {			######## 64-bit code
+    sub unvmov {
+	my $arg=shift;
+
+	$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
+	sprintf	"ins	v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
+    }
+    foreach(split("\n",$code)) {
+	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
+	s/vmov\.i8/movi/o		or	# fix up legacy mnemonics
+	s/vmov\s+(.*)/unvmov($1)/geo	or
+	s/vext\.8/ext/o			or
+	s/vshr\.s/sshr\.s/o		or
+	s/vshr/ushr/o			or
+	s/^(\s+)v/$1/o			or	# strip off v prefix
+	s/\bbx\s+lr\b/ret/o;
+
+	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
+	s/@\s/\/\//o;				# old->new style commentary
+
+	# fix up remainig legacy suffixes
+	s/\.[ui]?8(\s)/$1/o;
+	s/\.[uis]?32//o and s/\.16b/\.4s/go;
+	m/\.p64/o and s/\.16b/\.1q/o;		# 1st pmull argument
+	m/l\.p64/o and s/\.16b/\.1d/go;		# 2nd and 3rd pmull arguments
+	s/\.[uisp]?64//o and s/\.16b/\.2d/go;
+	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
+
+	print $_,"\n";
+    }
+} else {				######## 32-bit code
+    sub unvdup32 {
+	my $arg=shift;
+
+	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
+	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
+    }
+    sub unvpmullp64 {
+	my ($mnemonic,$arg)=@_;
+
+	if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
+	    my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
+				 |(($2&7)<<17)|(($2&8)<<4)
+				 |(($3&7)<<1) |(($3&8)<<2);
+	    $word |= 0x00010001	 if ($mnemonic =~ "2");
+	    # since ARMv7 instructions are always encoded little-endian.
+	    # correct solution is to use .inst directive, but older
+	    # assemblers don't implement it:-(
+	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+			$word&0xff,($word>>8)&0xff,
+			($word>>16)&0xff,($word>>24)&0xff,
+			$mnemonic,$arg;
+	}
+    }
+
+    foreach(split("\n",$code)) {
+	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
+	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
+        s/\/\/\s?/@ /o;				# new->old style commentary
+
+	# fix up remainig new-style suffixes
+	s/\],#[0-9]+/]!/o;
+
+	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o			or
+	s/vdup\.32\s+(.*)/unvdup32($1)/geo				or
+	s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo		or
+	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
+	s/^(\s+)b\./$1b/o						or
+	s/^(\s+)ret/$1bx\tlr/o;
+
+        print $_,"\n";
+    }
+}
+
+close STDOUT; # enforce flush
diff --git a/openssl/crypto/modes/cbc128.c b/openssl/crypto/modes/cbc128.c
index 0e54f7547..c13caea53 100644
--- a/openssl/crypto/modes/cbc128.c
+++ b/openssl/crypto/modes/cbc128.c
@@ -6,7 +6,7 @@
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer. 
+ *    notice, this list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
@@ -59,147 +59,149 @@
 #endif
 #include <assert.h>
 
-#ifndef STRICT_ALIGNMENT
-#  define STRICT_ALIGNMENT 0
+#if !defined(STRICT_ALIGNMENT) && !defined(PEDANTIC)
+# define STRICT_ALIGNMENT 0
 #endif
 
 void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], block128_f block)
+                           size_t len, const void *key,
+                           unsigned char ivec[16], block128_f block)
 {
-	size_t n;
-	const unsigned char *iv = ivec;
+    size_t n;
+    const unsigned char *iv = ivec;
 
-	assert(in && out && key && ivec);
+    assert(in && out && key && ivec);
 
 #if !defined(OPENSSL_SMALL_FOOTPRINT)
-	if (STRICT_ALIGNMENT &&
-	    ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0) {
-		while (len>=16) {
-			for(n=0; n<16; ++n)
-				out[n] = in[n] ^ iv[n];
-			(*block)(out, out, key);
-			iv = out;
-			len -= 16;
-			in  += 16;
-			out += 16;
-		}
-	} else {
-		while (len>=16) {
-			for(n=0; n<16; n+=sizeof(size_t))
-				*(size_t*)(out+n) =
-				*(size_t*)(in+n) ^ *(size_t*)(iv+n);
-			(*block)(out, out, key);
-			iv = out;
-			len -= 16;
-			in  += 16;
-			out += 16;
-		}
-	}
+    if (STRICT_ALIGNMENT &&
+        ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
+        while (len >= 16) {
+            for (n = 0; n < 16; ++n)
+                out[n] = in[n] ^ iv[n];
+            (*block) (out, out, key);
+            iv = out;
+            len -= 16;
+            in += 16;
+            out += 16;
+        }
+    } else {
+        while (len >= 16) {
+            for (n = 0; n < 16; n += sizeof(size_t))
+                *(size_t *)(out + n) =
+                    *(size_t *)(in + n) ^ *(size_t *)(iv + n);
+            (*block) (out, out, key);
+            iv = out;
+            len -= 16;
+            in += 16;
+            out += 16;
+        }
+    }
 #endif
-	while (len) {
-		for(n=0; n<16 && n<len; ++n)
-			out[n] = in[n] ^ iv[n];
-		for(; n<16; ++n)
-			out[n] = iv[n];
-		(*block)(out, out, key);
-		iv = out;
-		if (len<=16) break;
-		len -= 16;
-		in  += 16;
-		out += 16;
-	}
-	memcpy(ivec,iv,16);
+    while (len) {
+        for (n = 0; n < 16 && n < len; ++n)
+            out[n] = in[n] ^ iv[n];
+        for (; n < 16; ++n)
+            out[n] = iv[n];
+        (*block) (out, out, key);
+        iv = out;
+        if (len <= 16)
+            break;
+        len -= 16;
+        in += 16;
+        out += 16;
+    }
+    memcpy(ivec, iv, 16);
 }
 
 void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], block128_f block)
+                           size_t len, const void *key,
+                           unsigned char ivec[16], block128_f block)
 {
-	size_t n;
-	union { size_t t[16/sizeof(size_t)]; unsigned char c[16]; } tmp;
+    size_t n;
+    union {
+        size_t t[16 / sizeof(size_t)];
+        unsigned char c[16];
+    } tmp;
 
-	assert(in && out && key && ivec);
+    assert(in && out && key && ivec);
 
 #if !defined(OPENSSL_SMALL_FOOTPRINT)
-	if (in != out) {
-		const unsigned char *iv = ivec;
+    if (in != out) {
+        const unsigned char *iv = ivec;
 
-		if (STRICT_ALIGNMENT &&
-		    ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0) {
-			while (len>=16) {
-				(*block)(in, out, key);
-				for(n=0; n<16; ++n)
-					out[n] ^= iv[n];
-				iv = in;
-				len -= 16;
-				in  += 16;
-				out += 16;
-			}
-		}
-		else  if (16%sizeof(size_t) == 0) { /* always true */
-			while (len>=16) {
-				size_t *out_t=(size_t *)out, *iv_t=(size_t *)iv;
+        if (STRICT_ALIGNMENT &&
+            ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
+            while (len >= 16) {
+                (*block) (in, out, key);
+                for (n = 0; n < 16; ++n)
+                    out[n] ^= iv[n];
+                iv = in;
+                len -= 16;
+                in += 16;
+                out += 16;
+            }
+        } else if (16 % sizeof(size_t) == 0) { /* always true */
+            while (len >= 16) {
+                size_t *out_t = (size_t *)out, *iv_t = (size_t *)iv;
 
-				(*block)(in, out, key);
-				for(n=0; n<16/sizeof(size_t); n++)
-					out_t[n] ^= iv_t[n];
-				iv = in;
-				len -= 16;
-				in  += 16;
-				out += 16;
-			}
-		}
-		memcpy(ivec,iv,16);
-	} else {
-		if (STRICT_ALIGNMENT &&
-		    ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0) {
-			unsigned char c;
-			while (len>=16) {
-				(*block)(in, tmp.c, key);
-				for(n=0; n<16; ++n) {
-					c = in[n];
-					out[n] = tmp.c[n] ^ ivec[n];
-					ivec[n] = c;
-				}
-				len -= 16;
-				in  += 16;
-				out += 16;
-			}
-		}
-		else if (16%sizeof(size_t) == 0) { /* always true */
-			while (len>=16) {
-				size_t c, *out_t=(size_t *)out, *ivec_t=(size_t *)ivec;
-				const size_t *in_t=(const size_t *)in;
+                (*block) (in, out, key);
+                for (n = 0; n < 16 / sizeof(size_t); n++)
+                    out_t[n] ^= iv_t[n];
+                iv = in;
+                len -= 16;
+                in += 16;
+                out += 16;
+            }
+        }
+        memcpy(ivec, iv, 16);
+    } else {
+        if (STRICT_ALIGNMENT &&
+            ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
+            unsigned char c;
+            while (len >= 16) {
+                (*block) (in, tmp.c, key);
+                for (n = 0; n < 16; ++n) {
+                    c = in[n];
+                    out[n] = tmp.c[n] ^ ivec[n];
+                    ivec[n] = c;
+                }
+                len -= 16;
+                in += 16;
+                out += 16;
+            }
+        } else if (16 % sizeof(size_t) == 0) { /* always true */
+            while (len >= 16) {
+                size_t c, *out_t = (size_t *)out, *ivec_t = (size_t *)ivec;
+                const size_t *in_t = (const size_t *)in;
 
-				(*block)(in, tmp.c, key);
-				for(n=0; n<16/sizeof(size_t); n++) {
-					c = in_t[n];
-					out_t[n] = tmp.t[n] ^ ivec_t[n];
-					ivec_t[n] = c;
-				}
-				len -= 16;
-				in  += 16;
-				out += 16;
-			}
-		}
-	}
+                (*block) (in, tmp.c, key);
+                for (n = 0; n < 16 / sizeof(size_t); n++) {
+                    c = in_t[n];
+                    out_t[n] = tmp.t[n] ^ ivec_t[n];
+                    ivec_t[n] = c;
+                }
+                len -= 16;
+                in += 16;
+                out += 16;
+            }
+        }
+    }
 #endif
-	while (len) {
-		unsigned char c;
-		(*block)(in, tmp.c, key);
-		for(n=0; n<16 && n<len; ++n) {
-			c = in[n];
-			out[n] = tmp.c[n] ^ ivec[n];
-			ivec[n] = c;
-		}
-		if (len<=16) {
-			for (; n<16; ++n)
-				ivec[n] = in[n];
-			break;
-		}
-		len -= 16;
-		in  += 16;
-		out += 16;
-	}
+    while (len) {
+        unsigned char c;
+        (*block) (in, tmp.c, key);
+        for (n = 0; n < 16 && n < len; ++n) {
+            c = in[n];
+            out[n] = tmp.c[n] ^ ivec[n];
+            ivec[n] = c;
+        }
+        if (len <= 16) {
+            for (; n < 16; ++n)
+                ivec[n] = in[n];
+            break;
+        }
+        len -= 16;
+        in += 16;
+        out += 16;
+    }
 }
diff --git a/openssl/crypto/modes/ccm128.c b/openssl/crypto/modes/ccm128.c
index 3ce11d0d9..c1ded0f91 100644
--- a/openssl/crypto/modes/ccm128.c
+++ b/openssl/crypto/modes/ccm128.c
@@ -6,7 +6,7 @@
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer. 
+ *    notice, this list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
@@ -58,384 +58,422 @@
 #endif
 #include <assert.h>
 
-/* First you setup M and L parameters and pass the key schedule.
- * This is called once per session setup... */
+/*
+ * First you setup M and L parameters and pass the key schedule. This is
+ * called once per session setup...
+ */
 void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
-	unsigned int M,unsigned int L,void *key,block128_f block)
+                        unsigned int M, unsigned int L, void *key,
+                        block128_f block)
 {
-	memset(ctx->nonce.c,0,sizeof(ctx->nonce.c));
-	ctx->nonce.c[0] = ((u8)(L-1)&7) | (u8)(((M-2)/2)&7)<<3;
-	ctx->blocks = 0;
-	ctx->block = block;
-	ctx->key = key;
+    memset(ctx->nonce.c, 0, sizeof(ctx->nonce.c));
+    ctx->nonce.c[0] = ((u8)(L - 1) & 7) | (u8)(((M - 2) / 2) & 7) << 3;
+    ctx->blocks = 0;
+    ctx->block = block;
+    ctx->key = key;
 }
 
 /* !!! Following interfaces are to be called *once* per packet !!! */
 
 /* Then you setup per-message nonce and pass the length of the message */
 int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
-	const unsigned char *nonce,size_t nlen,size_t mlen)
+                        const unsigned char *nonce, size_t nlen, size_t mlen)
 {
-	unsigned int L = ctx->nonce.c[0]&7;	/* the L parameter */
+    unsigned int L = ctx->nonce.c[0] & 7; /* the L parameter */
 
-	if (nlen<(14-L)) return -1;		/* nonce is too short */
+    if (nlen < (14 - L))
+        return -1;              /* nonce is too short */
 
-	if (sizeof(mlen)==8 && L>=3) {
-		ctx->nonce.c[8]  = (u8)(mlen>>(56%(sizeof(mlen)*8)));
-		ctx->nonce.c[9]  = (u8)(mlen>>(48%(sizeof(mlen)*8)));
-		ctx->nonce.c[10] = (u8)(mlen>>(40%(sizeof(mlen)*8)));
-		ctx->nonce.c[11] = (u8)(mlen>>(32%(sizeof(mlen)*8)));
-	}
-	else
-		ctx->nonce.u[1] = 0;
+    if (sizeof(mlen) == 8 && L >= 3) {
+        ctx->nonce.c[8] = (u8)(mlen >> (56 % (sizeof(mlen) * 8)));
+        ctx->nonce.c[9] = (u8)(mlen >> (48 % (sizeof(mlen) * 8)));
+        ctx->nonce.c[10] = (u8)(mlen >> (40 % (sizeof(mlen) * 8)));
+        ctx->nonce.c[11] = (u8)(mlen >> (32 % (sizeof(mlen) * 8)));
+    } else
+        ctx->nonce.u[1] = 0;
 
-	ctx->nonce.c[12] = (u8)(mlen>>24);
-	ctx->nonce.c[13] = (u8)(mlen>>16);
-	ctx->nonce.c[14] = (u8)(mlen>>8);
-	ctx->nonce.c[15] = (u8)mlen;
+    ctx->nonce.c[12] = (u8)(mlen >> 24);
+    ctx->nonce.c[13] = (u8)(mlen >> 16);
+    ctx->nonce.c[14] = (u8)(mlen >> 8);
+    ctx->nonce.c[15] = (u8)mlen;
 
-	ctx->nonce.c[0] &= ~0x40;	/* clear Adata flag */
-	memcpy(&ctx->nonce.c[1],nonce,14-L);
+    ctx->nonce.c[0] &= ~0x40;   /* clear Adata flag */
+    memcpy(&ctx->nonce.c[1], nonce, 14 - L);
 
-	return 0;
+    return 0;
 }
 
 /* Then you pass additional authentication data, this is optional */
 void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
-	const unsigned char *aad,size_t alen)
-{	unsigned int i;
-	block128_f block = ctx->block;
-
-	if (alen==0) return;
-
-	ctx->nonce.c[0] |= 0x40;	/* set Adata flag */
-	(*block)(ctx->nonce.c,ctx->cmac.c,ctx->key),
-	ctx->blocks++;
-
-	if (alen<(0x10000-0x100)) {
-		ctx->cmac.c[0] ^= (u8)(alen>>8);
-		ctx->cmac.c[1] ^= (u8)alen;
-		i=2;
-	}
-	else if (sizeof(alen)==8 && alen>=(size_t)1<<(32%(sizeof(alen)*8))) {
-		ctx->cmac.c[0] ^= 0xFF;
-		ctx->cmac.c[1] ^= 0xFF;
-		ctx->cmac.c[2] ^= (u8)(alen>>(56%(sizeof(alen)*8)));
-		ctx->cmac.c[3] ^= (u8)(alen>>(48%(sizeof(alen)*8)));
-		ctx->cmac.c[4] ^= (u8)(alen>>(40%(sizeof(alen)*8)));
-		ctx->cmac.c[5] ^= (u8)(alen>>(32%(sizeof(alen)*8)));
-		ctx->cmac.c[6] ^= (u8)(alen>>24);
-		ctx->cmac.c[7] ^= (u8)(alen>>16);
-		ctx->cmac.c[8] ^= (u8)(alen>>8);
-		ctx->cmac.c[9] ^= (u8)alen;
-		i=10;
-	}
-	else {
-		ctx->cmac.c[0] ^= 0xFF;
-		ctx->cmac.c[1] ^= 0xFE;
-		ctx->cmac.c[2] ^= (u8)(alen>>24);
-		ctx->cmac.c[3] ^= (u8)(alen>>16);
-		ctx->cmac.c[4] ^= (u8)(alen>>8);
-		ctx->cmac.c[5] ^= (u8)alen;
-		i=6;
-	}
-
-	do {
-		for(;i<16 && alen;++i,++aad,--alen)
-			ctx->cmac.c[i] ^= *aad;
-		(*block)(ctx->cmac.c,ctx->cmac.c,ctx->key),
-		ctx->blocks++;
-		i=0;
-	} while (alen);
+                       const unsigned char *aad, size_t alen)
+{
+    unsigned int i;
+    block128_f block = ctx->block;
+
+    if (alen == 0)
+        return;
+
+    ctx->nonce.c[0] |= 0x40;    /* set Adata flag */
+    (*block) (ctx->nonce.c, ctx->cmac.c, ctx->key), ctx->blocks++;
+
+    if (alen < (0x10000 - 0x100)) {
+        ctx->cmac.c[0] ^= (u8)(alen >> 8);
+        ctx->cmac.c[1] ^= (u8)alen;
+        i = 2;
+    } else if (sizeof(alen) == 8
+               && alen >= (size_t)1 << (32 % (sizeof(alen) * 8))) {
+        ctx->cmac.c[0] ^= 0xFF;
+        ctx->cmac.c[1] ^= 0xFF;
+        ctx->cmac.c[2] ^= (u8)(alen >> (56 % (sizeof(alen) * 8)));
+        ctx->cmac.c[3] ^= (u8)(alen >> (48 % (sizeof(alen) * 8)));
+        ctx->cmac.c[4] ^= (u8)(alen >> (40 % (sizeof(alen) * 8)));
+        ctx->cmac.c[5] ^= (u8)(alen >> (32 % (sizeof(alen) * 8)));
+        ctx->cmac.c[6] ^= (u8)(alen >> 24);
+        ctx->cmac.c[7] ^= (u8)(alen >> 16);
+        ctx->cmac.c[8] ^= (u8)(alen >> 8);
+        ctx->cmac.c[9] ^= (u8)alen;
+        i = 10;
+    } else {
+        ctx->cmac.c[0] ^= 0xFF;
+        ctx->cmac.c[1] ^= 0xFE;
+        ctx->cmac.c[2] ^= (u8)(alen >> 24);
+        ctx->cmac.c[3] ^= (u8)(alen >> 16);
+        ctx->cmac.c[4] ^= (u8)(alen >> 8);
+        ctx->cmac.c[5] ^= (u8)alen;
+        i = 6;
+    }
+
+    do {
+        for (; i < 16 && alen; ++i, ++aad, --alen)
+            ctx->cmac.c[i] ^= *aad;
+        (*block) (ctx->cmac.c, ctx->cmac.c, ctx->key), ctx->blocks++;
+        i = 0;
+    } while (alen);
 }
 
 /* Finally you encrypt or decrypt the message */
 
-/* counter part of nonce may not be larger than L*8 bits,
- * L is not larger than 8, therefore 64-bit counter... */
-static void ctr64_inc(unsigned char *counter) {
-	unsigned int n=8;
-	u8  c;
-
-	counter += 8;
-	do {
-		--n;
-		c = counter[n];
-		++c;
-		counter[n] = c;
-		if (c) return;
-	} while (n);
+/*
+ * counter part of nonce may not be larger than L*8 bits, L is not larger
+ * than 8, therefore 64-bit counter...
+ */
+static void ctr64_inc(unsigned char *counter)
+{
+    unsigned int n = 8;
+    u8 c;
+
+    counter += 8;
+    do {
+        --n;
+        c = counter[n];
+        ++c;
+        counter[n] = c;
+        if (c)
+            return;
+    } while (n);
 }
 
 int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
-	const unsigned char *inp, unsigned char *out,
-	size_t len)
+                          const unsigned char *inp, unsigned char *out,
+                          size_t len)
 {
-	size_t		n;
-	unsigned int	i,L;
-	unsigned char	flags0	= ctx->nonce.c[0];
-	block128_f	block	= ctx->block;
-	void *		key	= ctx->key;
-	union { u64 u[2]; u8 c[16]; } scratch;
-
-	if (!(flags0&0x40))
-		(*block)(ctx->nonce.c,ctx->cmac.c,key),
-		ctx->blocks++;
-
-	ctx->nonce.c[0] = L = flags0&7;
-	for (n=0,i=15-L;i<15;++i) {
-		n |= ctx->nonce.c[i];
-		ctx->nonce.c[i]=0;
-		n <<= 8;
-	}
-	n |= ctx->nonce.c[15];	/* reconstructed length */
-	ctx->nonce.c[15]=1;
-
-	if (n!=len) return -1;	/* length mismatch */
-
-	ctx->blocks += ((len+15)>>3)|1;
-	if (ctx->blocks > (U64(1)<<61))	return -2; /* too much data */
-
-	while (len>=16) {
+    size_t n;
+    unsigned int i, L;
+    unsigned char flags0 = ctx->nonce.c[0];
+    block128_f block = ctx->block;
+    void *key = ctx->key;
+    union {
+        u64 u[2];
+        u8 c[16];
+    } scratch;
+
+    if (!(flags0 & 0x40))
+        (*block) (ctx->nonce.c, ctx->cmac.c, key), ctx->blocks++;
+
+    ctx->nonce.c[0] = L = flags0 & 7;
+    for (n = 0, i = 15 - L; i < 15; ++i) {
+        n |= ctx->nonce.c[i];
+        ctx->nonce.c[i] = 0;
+        n <<= 8;
+    }
+    n |= ctx->nonce.c[15];      /* reconstructed length */
+    ctx->nonce.c[15] = 1;
+
+    if (n != len)
+        return -1;              /* length mismatch */
+
+    ctx->blocks += ((len + 15) >> 3) | 1;
+    if (ctx->blocks > (U64(1) << 61))
+        return -2;              /* too much data */
+
+    while (len >= 16) {
 #if defined(STRICT_ALIGNMENT)
-		union { u64 u[2]; u8 c[16]; } temp;
-
-		memcpy (temp.c,inp,16);
-		ctx->cmac.u[0] ^= temp.u[0];
-		ctx->cmac.u[1] ^= temp.u[1];
+        union {
+            u64 u[2];
+            u8 c[16];
+        } temp;
+
+        memcpy(temp.c, inp, 16);
+        ctx->cmac.u[0] ^= temp.u[0];
+        ctx->cmac.u[1] ^= temp.u[1];
 #else
-		ctx->cmac.u[0] ^= ((u64*)inp)[0];
-		ctx->cmac.u[1] ^= ((u64*)inp)[1];
+        ctx->cmac.u[0] ^= ((u64 *)inp)[0];
+        ctx->cmac.u[1] ^= ((u64 *)inp)[1];
 #endif
-		(*block)(ctx->cmac.c,ctx->cmac.c,key);
-		(*block)(ctx->nonce.c,scratch.c,key);
-		ctr64_inc(ctx->nonce.c);
+        (*block) (ctx->cmac.c, ctx->cmac.c, key);
+        (*block) (ctx->nonce.c, scratch.c, key);
+        ctr64_inc(ctx->nonce.c);
 #if defined(STRICT_ALIGNMENT)
-		temp.u[0] ^= scratch.u[0];
-		temp.u[1] ^= scratch.u[1];
-		memcpy(out,temp.c,16);
+        temp.u[0] ^= scratch.u[0];
+        temp.u[1] ^= scratch.u[1];
+        memcpy(out, temp.c, 16);
 #else
-		((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0];
-		((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1];
+        ((u64 *)out)[0] = scratch.u[0] ^ ((u64 *)inp)[0];
+        ((u64 *)out)[1] = scratch.u[1] ^ ((u64 *)inp)[1];
 #endif
-		inp += 16;
-		out += 16;
-		len -= 16;
-	}
-
-	if (len) {
-		for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
-		(*block)(ctx->cmac.c,ctx->cmac.c,key);
-		(*block)(ctx->nonce.c,scratch.c,key);
-		for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i];
-	}
-
-	for (i=15-L;i<16;++i)
-		ctx->nonce.c[i]=0;
-
-	(*block)(ctx->nonce.c,scratch.c,key);
-	ctx->cmac.u[0] ^= scratch.u[0];
-	ctx->cmac.u[1] ^= scratch.u[1];
-
-	ctx->nonce.c[0] = flags0;
-
-	return 0;
+        inp += 16;
+        out += 16;
+        len -= 16;
+    }
+
+    if (len) {
+        for (i = 0; i < len; ++i)
+            ctx->cmac.c[i] ^= inp[i];
+        (*block) (ctx->cmac.c, ctx->cmac.c, key);
+        (*block) (ctx->nonce.c, scratch.c, key);
+        for (i = 0; i < len; ++i)
+            out[i] = scratch.c[i] ^ inp[i];
+    }
+
+    for (i = 15 - L; i < 16; ++i)
+        ctx->nonce.c[i] = 0;
+
+    (*block) (ctx->nonce.c, scratch.c, key);
+    ctx->cmac.u[0] ^= scratch.u[0];
+    ctx->cmac.u[1] ^= scratch.u[1];
+
+    ctx->nonce.c[0] = flags0;
+
+    return 0;
 }
 
 int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
-	const unsigned char *inp, unsigned char *out,
-	size_t len)
+                          const unsigned char *inp, unsigned char *out,
+                          size_t len)
 {
-	size_t		n;
-	unsigned int	i,L;
-	unsigned char	flags0	= ctx->nonce.c[0];
-	block128_f	block	= ctx->block;
-	void *		key	= ctx->key;
-	union { u64 u[2]; u8 c[16]; } scratch;
-
-	if (!(flags0&0x40))
-		(*block)(ctx->nonce.c,ctx->cmac.c,key);
-
-	ctx->nonce.c[0] = L = flags0&7;
-	for (n=0,i=15-L;i<15;++i) {
-		n |= ctx->nonce.c[i];
-		ctx->nonce.c[i]=0;
-		n <<= 8;
-	}
-	n |= ctx->nonce.c[15];	/* reconstructed length */
-	ctx->nonce.c[15]=1;
-
-	if (n!=len) return -1;
-
-	while (len>=16) {
+    size_t n;
+    unsigned int i, L;
+    unsigned char flags0 = ctx->nonce.c[0];
+    block128_f block = ctx->block;
+    void *key = ctx->key;
+    union {
+        u64 u[2];
+        u8 c[16];
+    } scratch;
+
+    if (!(flags0 & 0x40))
+        (*block) (ctx->nonce.c, ctx->cmac.c, key);
+
+    ctx->nonce.c[0] = L = flags0 & 7;
+    for (n = 0, i = 15 - L; i < 15; ++i) {
+        n |= ctx->nonce.c[i];
+        ctx->nonce.c[i] = 0;
+        n <<= 8;
+    }
+    n |= ctx->nonce.c[15];      /* reconstructed length */
+    ctx->nonce.c[15] = 1;
+
+    if (n != len)
+        return -1;
+
+    while (len >= 16) {
 #if defined(STRICT_ALIGNMENT)
-		union { u64 u[2]; u8 c[16]; } temp;
+        union {
+            u64 u[2];
+            u8 c[16];
+        } temp;
 #endif
-		(*block)(ctx->nonce.c,scratch.c,key);
-		ctr64_inc(ctx->nonce.c);
+        (*block) (ctx->nonce.c, scratch.c, key);
+        ctr64_inc(ctx->nonce.c);
 #if defined(STRICT_ALIGNMENT)
-		memcpy (temp.c,inp,16);
-		ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]);
-		ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]);
-		memcpy (out,scratch.c,16);
+        memcpy(temp.c, inp, 16);
+        ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]);
+        ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]);
+        memcpy(out, scratch.c, 16);
 #else
-		ctx->cmac.u[0] ^= (((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0]);
-		ctx->cmac.u[1] ^= (((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1]);
+        ctx->cmac.u[0] ^= (((u64 *)out)[0] = scratch.u[0] ^ ((u64 *)inp)[0]);
+        ctx->cmac.u[1] ^= (((u64 *)out)[1] = scratch.u[1] ^ ((u64 *)inp)[1]);
 #endif
-		(*block)(ctx->cmac.c,ctx->cmac.c,key);
+        (*block) (ctx->cmac.c, ctx->cmac.c, key);
 
-		inp += 16;
-		out += 16;
-		len -= 16;
-	}
+        inp += 16;
+        out += 16;
+        len -= 16;
+    }
 
-	if (len) {
-		(*block)(ctx->nonce.c,scratch.c,key);
-		for (i=0; i<len; ++i)
-			ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);
-		(*block)(ctx->cmac.c,ctx->cmac.c,key);
-	}
+    if (len) {
+        (*block) (ctx->nonce.c, scratch.c, key);
+        for (i = 0; i < len; ++i)
+            ctx->cmac.c[i] ^= (out[i] = scratch.c[i] ^ inp[i]);
+        (*block) (ctx->cmac.c, ctx->cmac.c, key);
+    }
 
-	for (i=15-L;i<16;++i)
-		ctx->nonce.c[i]=0;
+    for (i = 15 - L; i < 16; ++i)
+        ctx->nonce.c[i] = 0;
 
-	(*block)(ctx->nonce.c,scratch.c,key);
-	ctx->cmac.u[0] ^= scratch.u[0];
-	ctx->cmac.u[1] ^= scratch.u[1];
+    (*block) (ctx->nonce.c, scratch.c, key);
+    ctx->cmac.u[0] ^= scratch.u[0];
+    ctx->cmac.u[1] ^= scratch.u[1];
 
-	ctx->nonce.c[0] = flags0;
+    ctx->nonce.c[0] = flags0;
 
-	return 0;
+    return 0;
 }
 
-static void ctr64_add (unsigned char *counter,size_t inc)
-{	size_t n=8, val=0;
-
-	counter += 8;
-	do {
-		--n;
-		val += counter[n] + (inc&0xff);
-		counter[n] = (unsigned char)val;
-		val >>= 8;	/* carry bit */
-		inc >>= 8;
-	} while(n && (inc || val));
+static void ctr64_add(unsigned char *counter, size_t inc)
+{
+    size_t n = 8, val = 0;
+
+    counter += 8;
+    do {
+        --n;
+        val += counter[n] + (inc & 0xff);
+        counter[n] = (unsigned char)val;
+        val >>= 8;              /* carry bit */
+        inc >>= 8;
+    } while (n && (inc || val));
 }
 
 int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
-	const unsigned char *inp, unsigned char *out,
-	size_t len,ccm128_f stream)
+                                const unsigned char *inp, unsigned char *out,
+                                size_t len, ccm128_f stream)
 {
-	size_t		n;
-	unsigned int	i,L;
-	unsigned char	flags0	= ctx->nonce.c[0];
-	block128_f	block	= ctx->block;
-	void *		key	= ctx->key;
-	union { u64 u[2]; u8 c[16]; } scratch;
-
-	if (!(flags0&0x40))
-		(*block)(ctx->nonce.c,ctx->cmac.c,key),
-		ctx->blocks++;
-
-	ctx->nonce.c[0] = L = flags0&7;
-	for (n=0,i=15-L;i<15;++i) {
-		n |= ctx->nonce.c[i];
-		ctx->nonce.c[i]=0;
-		n <<= 8;
-	}
-	n |= ctx->nonce.c[15];	/* reconstructed length */
-	ctx->nonce.c[15]=1;
-
-	if (n!=len) return -1;	/* length mismatch */
-
-	ctx->blocks += ((len+15)>>3)|1;
-	if (ctx->blocks > (U64(1)<<61))	return -2; /* too much data */
-
-	if ((n=len/16)) {
-		(*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c);
-		n   *= 16;
-		inp += n;
-		out += n;
-		len -= n;
-		if (len) ctr64_add(ctx->nonce.c,n/16);
-	}
-
-	if (len) {
-		for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
-		(*block)(ctx->cmac.c,ctx->cmac.c,key);
-		(*block)(ctx->nonce.c,scratch.c,key);
-		for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i];
-	}
-
-	for (i=15-L;i<16;++i)
-		ctx->nonce.c[i]=0;
-
-	(*block)(ctx->nonce.c,scratch.c,key);
-	ctx->cmac.u[0] ^= scratch.u[0];
-	ctx->cmac.u[1] ^= scratch.u[1];
-
-	ctx->nonce.c[0] = flags0;
-
-	return 0;
+    size_t n;
+    unsigned int i, L;
+    unsigned char flags0 = ctx->nonce.c[0];
+    block128_f block = ctx->block;
+    void *key = ctx->key;
+    union {
+        u64 u[2];
+        u8 c[16];
+    } scratch;
+
+    if (!(flags0 & 0x40))
+        (*block) (ctx->nonce.c, ctx->cmac.c, key), ctx->blocks++;
+
+    ctx->nonce.c[0] = L = flags0 & 7;
+    for (n = 0, i = 15 - L; i < 15; ++i) {
+        n |= ctx->nonce.c[i];
+        ctx->nonce.c[i] = 0;
+        n <<= 8;
+    }
+    n |= ctx->nonce.c[15];      /* reconstructed length */
+    ctx->nonce.c[15] = 1;
+
+    if (n != len)
+        return -1;              /* length mismatch */
+
+    ctx->blocks += ((len + 15) >> 3) | 1;
+    if (ctx->blocks > (U64(1) << 61))
+        return -2;              /* too much data */
+
+    if ((n = len / 16)) {
+        (*stream) (inp, out, n, key, ctx->nonce.c, ctx->cmac.c);
+        n *= 16;
+        inp += n;
+        out += n;
+        len -= n;
+        if (len)
+            ctr64_add(ctx->nonce.c, n / 16);
+    }
+
+    if (len) {
+        for (i = 0; i < len; ++i)
+            ctx->cmac.c[i] ^= inp[i];
+        (*block) (ctx->cmac.c, ctx->cmac.c, key);
+        (*block) (ctx->nonce.c, scratch.c, key);
+        for (i = 0; i < len; ++i)
+            out[i] = scratch.c[i] ^ inp[i];
+    }
+
+    for (i = 15 - L; i < 16; ++i)
+        ctx->nonce.c[i] = 0;
+
+    (*block) (ctx->nonce.c, scratch.c, key);
+    ctx->cmac.u[0] ^= scratch.u[0];
+    ctx->cmac.u[1] ^= scratch.u[1];
+
+    ctx->nonce.c[0] = flags0;
+
+    return 0;
 }
 
 int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
-	const unsigned char *inp, unsigned char *out,
-	size_t len,ccm128_f stream)
+                                const unsigned char *inp, unsigned char *out,
+                                size_t len, ccm128_f stream)
 {
-	size_t		n;
-	unsigned int	i,L;
-	unsigned char	flags0	= ctx->nonce.c[0];
-	block128_f	block	= ctx->block;
-	void *		key	= ctx->key;
-	union { u64 u[2]; u8 c[16]; } scratch;
-
-	if (!(flags0&0x40))
-		(*block)(ctx->nonce.c,ctx->cmac.c,key);
-
-	ctx->nonce.c[0] = L = flags0&7;
-	for (n=0,i=15-L;i<15;++i) {
-		n |= ctx->nonce.c[i];
-		ctx->nonce.c[i]=0;
-		n <<= 8;
-	}
-	n |= ctx->nonce.c[15];	/* reconstructed length */
-	ctx->nonce.c[15]=1;
-
-	if (n!=len) return -1;
-
-	if ((n=len/16)) {
-		(*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c);
-		n   *= 16;
-		inp += n;
-		out += n;
-		len -= n;
-		if (len) ctr64_add(ctx->nonce.c,n/16);
-	}
-
-	if (len) {
-		(*block)(ctx->nonce.c,scratch.c,key);
-		for (i=0; i<len; ++i)
-			ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);
-		(*block)(ctx->cmac.c,ctx->cmac.c,key);
-	}
-
-	for (i=15-L;i<16;++i)
-		ctx->nonce.c[i]=0;
-
-	(*block)(ctx->nonce.c,scratch.c,key);
-	ctx->cmac.u[0] ^= scratch.u[0];
-	ctx->cmac.u[1] ^= scratch.u[1];
-
-	ctx->nonce.c[0] = flags0;
-
-	return 0;
+    size_t n;
+    unsigned int i, L;
+    unsigned char flags0 = ctx->nonce.c[0];
+    block128_f block = ctx->block;
+    void *key = ctx->key;
+    union {
+        u64 u[2];
+        u8 c[16];
+    } scratch;
+
+    if (!(flags0 & 0x40))
+        (*block) (ctx->nonce.c, ctx->cmac.c, key);
+
+    ctx->nonce.c[0] = L = flags0 & 7;
+    for (n = 0, i = 15 - L; i < 15; ++i) {
+        n |= ctx->nonce.c[i];
+        ctx->nonce.c[i] = 0;
+        n <<= 8;
+    }
+    n |= ctx->nonce.c[15];      /* reconstructed length */
+    ctx->nonce.c[15] = 1;
+
+    if (n != len)
+        return -1;
+
+    if ((n = len / 16)) {
+        (*stream) (inp, out, n, key, ctx->nonce.c, ctx->cmac.c);
+        n *= 16;
+        inp += n;
+        out += n;
+        len -= n;
+        if (len)
+            ctr64_add(ctx->nonce.c, n / 16);
+    }
+
+    if (len) {
+        (*block) (ctx->nonce.c, scratch.c, key);
+        for (i = 0; i < len; ++i)
+            ctx->cmac.c[i] ^= (out[i] = scratch.c[i] ^ inp[i]);
+        (*block) (ctx->cmac.c, ctx->cmac.c, key);
+    }
+
+    for (i = 15 - L; i < 16; ++i)
+        ctx->nonce.c[i] = 0;
+
+    (*block) (ctx->nonce.c, scratch.c, key);
+    ctx->cmac.u[0] ^= scratch.u[0];
+    ctx->cmac.u[1] ^= scratch.u[1];
+
+    ctx->nonce.c[0] = flags0;
+
+    return 0;
 }
 
-size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx,unsigned char *tag,size_t len)
-{	unsigned int M = (ctx->nonce.c[0]>>3)&7;	/* the M parameter */
-
-	M *= 2; M += 2;
-	if (len<M)	return 0;
-	memcpy(tag,ctx->cmac.c,M);
-	return M;
+size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
+{
+    unsigned int M = (ctx->nonce.c[0] >> 3) & 7; /* the M parameter */
+
+    M *= 2;
+    M += 2;
+    if (len < M)
+        return 0;
+    memcpy(tag, ctx->cmac.c, M);
+    return M;
 }
diff --git a/openssl/crypto/modes/cfb128.c b/openssl/crypto/modes/cfb128.c
index 4e6f5d35e..d4ecbd08e 100644
--- a/openssl/crypto/modes/cfb128.c
+++ b/openssl/crypto/modes/cfb128.c
@@ -6,7 +6,7 @@
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer. 
+ *    notice, this list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
@@ -59,14 +59,15 @@
 #endif
 #include <assert.h>
 
-/* The input and output encrypted as though 128bit cfb mode is being
- * used.  The extra state information to record how much of the
- * 128bit block we have used is contained in *num;
+/*
+ * The input and output encrypted as though 128bit cfb mode is being used.
+ * The extra state information to record how much of the 128bit block we have
+ * used is contained in *num;
  */
 void CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], int *num,
-			int enc, block128_f block)
+                           size_t len, const void *key,
+                           unsigned char ivec[16], int *num,
+                           int enc, block128_f block)
 {
     unsigned int n;
     size_t l = 0;
@@ -77,166 +78,177 @@ void CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out,
 
     if (enc) {
 #if !defined(OPENSSL_SMALL_FOOTPRINT)
-	if (16%sizeof(size_t) == 0) do {	/* always true actually */
-		while (n && len) {
-			*(out++) = ivec[n] ^= *(in++);
-			--len;
-			n = (n+1) % 16;
-		}
-#if defined(STRICT_ALIGNMENT)
-		if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0)
-			break;
-#endif
-		while (len>=16) {
-			(*block)(ivec, ivec, key);
-			for (; n<16; n+=sizeof(size_t)) {
-				*(size_t*)(out+n) =
-				*(size_t*)(ivec+n) ^= *(size_t*)(in+n);
-			}
-			len -= 16;
-			out += 16;
-			in  += 16;
-			n = 0;
-		}
-		if (len) {
-			(*block)(ivec, ivec, key);
-			while (len--) {
-				out[n] = ivec[n] ^= in[n];
-				++n;
-			}
-		}
-		*num = n;
-		return;
-	} while (0);
-	/* the rest would be commonly eliminated by x86* compiler */
+        if (16 % sizeof(size_t) == 0) { /* always true actually */
+            do {
+                while (n && len) {
+                    *(out++) = ivec[n] ^= *(in++);
+                    --len;
+                    n = (n + 1) % 16;
+                }
+# if defined(STRICT_ALIGNMENT)
+                if (((size_t)in | (size_t)out | (size_t)ivec) %
+                    sizeof(size_t) != 0)
+                    break;
+# endif
+                while (len >= 16) {
+                    (*block) (ivec, ivec, key);
+                    for (; n < 16; n += sizeof(size_t)) {
+                        *(size_t *)(out + n) =
+                            *(size_t *)(ivec + n) ^= *(size_t *)(in + n);
+                    }
+                    len -= 16;
+                    out += 16;
+                    in += 16;
+                    n = 0;
+                }
+                if (len) {
+                    (*block) (ivec, ivec, key);
+                    while (len--) {
+                        out[n] = ivec[n] ^= in[n];
+                        ++n;
+                    }
+                }
+                *num = n;
+                return;
+            } while (0);
+        }
+        /* the rest would be commonly eliminated by x86* compiler */
 #endif
-	while (l<len) {
-		if (n == 0) {
-			(*block)(ivec, ivec, key);
-		}
-		out[l] = ivec[n] ^= in[l];
-		++l;
-		n = (n+1) % 16;
-	}
-	*num = n;
+        while (l < len) {
+            if (n == 0) {
+                (*block) (ivec, ivec, key);
+            }
+            out[l] = ivec[n] ^= in[l];
+            ++l;
+            n = (n + 1) % 16;
+        }
+        *num = n;
     } else {
 #if !defined(OPENSSL_SMALL_FOOTPRINT)
-	if (16%sizeof(size_t) == 0) do {	/* always true actually */
-		while (n && len) {
-			unsigned char c;
-			*(out++) = ivec[n] ^ (c = *(in++)); ivec[n] = c;
-			--len;
-			n = (n+1) % 16;
- 		}
-#if defined(STRICT_ALIGNMENT)
-		if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0)
-			break;
-#endif
-		while (len>=16) {
-			(*block)(ivec, ivec, key);
-			for (; n<16; n+=sizeof(size_t)) {
-				size_t t = *(size_t*)(in+n);
-				*(size_t*)(out+n) = *(size_t*)(ivec+n) ^ t;
-				*(size_t*)(ivec+n) = t;
-			}
-			len -= 16;
-			out += 16;
-			in  += 16;
-			n = 0;
-		}
-		if (len) {
-			(*block)(ivec, ivec, key);
-			while (len--) {
-				unsigned char c;
-				out[n] = ivec[n] ^ (c = in[n]); ivec[n] = c;
-				++n;
-			}
- 		}
-		*num = n;
-		return;
-	} while (0);
-	/* the rest would be commonly eliminated by x86* compiler */
+        if (16 % sizeof(size_t) == 0) { /* always true actually */
+            do {
+                while (n && len) {
+                    unsigned char c;
+                    *(out++) = ivec[n] ^ (c = *(in++));
+                    ivec[n] = c;
+                    --len;
+                    n = (n + 1) % 16;
+                }
+# if defined(STRICT_ALIGNMENT)
+                if (((size_t)in | (size_t)out | (size_t)ivec) %
+                    sizeof(size_t) != 0)
+                    break;
+# endif
+                while (len >= 16) {
+                    (*block) (ivec, ivec, key);
+                    for (; n < 16; n += sizeof(size_t)) {
+                        size_t t = *(size_t *)(in + n);
+                        *(size_t *)(out + n) = *(size_t *)(ivec + n) ^ t;
+                        *(size_t *)(ivec + n) = t;
+                    }
+                    len -= 16;
+                    out += 16;
+                    in += 16;
+                    n = 0;
+                }
+                if (len) {
+                    (*block) (ivec, ivec, key);
+                    while (len--) {
+                        unsigned char c;
+                        out[n] = ivec[n] ^ (c = in[n]);
+                        ivec[n] = c;
+                        ++n;
+                    }
+                }
+                *num = n;
+                return;
+            } while (0);
+        }
+        /* the rest would be commonly eliminated by x86* compiler */
 #endif
-	while (l<len) {
-		unsigned char c;
-		if (n == 0) {
-			(*block)(ivec, ivec, key);
-		}
-		out[l] = ivec[n] ^ (c = in[l]); ivec[n] = c;
-		++l;
-		n = (n+1) % 16;
-	}
-	*num=n;
+        while (l < len) {
+            unsigned char c;
+            if (n == 0) {
+                (*block) (ivec, ivec, key);
+            }
+            out[l] = ivec[n] ^ (c = in[l]);
+            ivec[n] = c;
+            ++l;
+            n = (n + 1) % 16;
+        }
+        *num = n;
     }
 }
 
-/* This expects a single block of size nbits for both in and out. Note that
-   it corrupts any extra bits in the last byte of out */
-static void cfbr_encrypt_block(const unsigned char *in,unsigned char *out,
-			    int nbits,const void *key,
-			    unsigned char ivec[16],int enc,
-			    block128_f block)
+/*
+ * This expects a single block of size nbits for both in and out. Note that
+ * it corrupts any extra bits in the last byte of out
+ */
+static void cfbr_encrypt_block(const unsigned char *in, unsigned char *out,
+                               int nbits, const void *key,
+                               unsigned char ivec[16], int enc,
+                               block128_f block)
 {
-    int n,rem,num;
-    unsigned char ovec[16*2 + 1];  /* +1 because we dererefence (but don't use) one byte off the end */
-
-    if (nbits<=0 || nbits>128) return;
-
-	/* fill in the first half of the new IV with the current IV */
-	memcpy(ovec,ivec,16);
-	/* construct the new IV */
-	(*block)(ivec,ivec,key);
-	num = (nbits+7)/8;
-	if (enc)	/* encrypt the input */
-	    for(n=0 ; n < num ; ++n)
-		out[n] = (ovec[16+n] = in[n] ^ ivec[n]);
-	else		/* decrypt the input */
-	    for(n=0 ; n < num ; ++n)
-		out[n] = (ovec[16+n] = in[n]) ^ ivec[n];
-	/* shift ovec left... */
-	rem = nbits%8;
-	num = nbits/8;
-	if(rem==0)
-	    memcpy(ivec,ovec+num,16);
-	else
-	    for(n=0 ; n < 16 ; ++n)
-		ivec[n] = ovec[n+num]<<rem | ovec[n+num+1]>>(8-rem);
+    int n, rem, num;
+    unsigned char ovec[16 * 2 + 1]; /* +1 because we dererefence (but don't
+                                     * use) one byte off the end */
+
+    if (nbits <= 0 || nbits > 128)
+        return;
+
+    /* fill in the first half of the new IV with the current IV */
+    memcpy(ovec, ivec, 16);
+    /* construct the new IV */
+    (*block) (ivec, ivec, key);
+    num = (nbits + 7) / 8;
+    if (enc)                    /* encrypt the input */
+        for (n = 0; n < num; ++n)
+            out[n] = (ovec[16 + n] = in[n] ^ ivec[n]);
+    else                        /* decrypt the input */
+        for (n = 0; n < num; ++n)
+            out[n] = (ovec[16 + n] = in[n]) ^ ivec[n];
+    /* shift ovec left... */
+    rem = nbits % 8;
+    num = nbits / 8;
+    if (rem == 0)
+        memcpy(ivec, ovec + num, 16);
+    else
+        for (n = 0; n < 16; ++n)
+            ivec[n] = ovec[n + num] << rem | ovec[n + num + 1] >> (8 - rem);
 
     /* it is not necessary to cleanse ovec, since the IV is not secret */
 }
 
 /* N.B. This expects the input to be packed, MS bit first */
 void CRYPTO_cfb128_1_encrypt(const unsigned char *in, unsigned char *out,
-		 	size_t bits, const void *key,
-			unsigned char ivec[16], int *num,
-			int enc, block128_f block)
+                             size_t bits, const void *key,
+                             unsigned char ivec[16], int *num,
+                             int enc, block128_f block)
 {
     size_t n;
-    unsigned char c[1],d[1];
+    unsigned char c[1], d[1];
 
     assert(in && out && key && ivec && num);
     assert(*num == 0);
 
-    for(n=0 ; n<bits ; ++n)
-	{
-	c[0]=(in[n/8]&(1 << (7-n%8))) ? 0x80 : 0;
-	cfbr_encrypt_block(c,d,1,key,ivec,enc,block);
-	out[n/8]=(out[n/8]&~(1 << (unsigned int)(7-n%8))) |
-		 ((d[0]&0x80) >> (unsigned int)(n%8));
-	}
+    for (n = 0; n < bits; ++n) {
+        c[0] = (in[n / 8] & (1 << (7 - n % 8))) ? 0x80 : 0;
+        cfbr_encrypt_block(c, d, 1, key, ivec, enc, block);
+        out[n / 8] = (out[n / 8] & ~(1 << (unsigned int)(7 - n % 8))) |
+            ((d[0] & 0x80) >> (unsigned int)(n % 8));
+    }
 }
 
 void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out,
-			size_t length, const void *key,
-			unsigned char ivec[16], int *num,
-			int enc, block128_f block)
+                             size_t length, const void *key,
+                             unsigned char ivec[16], int *num,
+                             int enc, block128_f block)
 {
     size_t n;
 
     assert(in && out && key && ivec && num);
     assert(*num == 0);
 
-    for(n=0 ; n<length ; ++n)
-	cfbr_encrypt_block(&in[n],&out[n],8,key,ivec,enc,block);
+    for (n = 0; n < length; ++n)
+        cfbr_encrypt_block(&in[n], &out[n], 8, key, ivec, enc, block);
 }
-
diff --git a/openssl/crypto/modes/ctr128.c b/openssl/crypto/modes/ctr128.c
index ee642c586..f3bbcbf72 100644
--- a/openssl/crypto/modes/ctr128.c
+++ b/openssl/crypto/modes/ctr128.c
@@ -6,7 +6,7 @@
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer. 
+ *    notice, this list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
@@ -59,194 +59,212 @@
 #endif
 #include <assert.h>
 
-/* NOTE: the IV/counter CTR mode is big-endian.  The code itself
- * is endian-neutral. */
+/*
+ * NOTE: the IV/counter CTR mode is big-endian.  The code itself is
+ * endian-neutral.
+ */
 
 /* increment counter (128-bit int) by 1 */
-static void ctr128_inc(unsigned char *counter) {
-	u32 n=16;
-	u8  c;
-
-	do {
-		--n;
-		c = counter[n];
-		++c;
-		counter[n] = c;
-		if (c) return;
-	} while (n);
+static void ctr128_inc(unsigned char *counter)
+{
+    u32 n = 16;
+    u8 c;
+
+    do {
+        --n;
+        c = counter[n];
+        ++c;
+        counter[n] = c;
+        if (c)
+            return;
+    } while (n);
 }
 
 #if !defined(OPENSSL_SMALL_FOOTPRINT)
-static void ctr128_inc_aligned(unsigned char *counter) {
-	size_t *data,c,n;
-	const union { long one; char little; } is_endian = {1};
-
-	if (is_endian.little) {
-		ctr128_inc(counter);
-		return;
-	}
-
-	data = (size_t *)counter;
-	n = 16/sizeof(size_t);
-	do {
-		--n;
-		c = data[n];
-		++c;
-		data[n] = c;
-		if (c) return;
-	} while (n);
+static void ctr128_inc_aligned(unsigned char *counter)
+{
+    size_t *data, c, n;
+    const union {
+        long one;
+        char little;
+    } is_endian = {
+        1
+    };
+
+    if (is_endian.little) {
+        ctr128_inc(counter);
+        return;
+    }
+
+    data = (size_t *)counter;
+    n = 16 / sizeof(size_t);
+    do {
+        --n;
+        c = data[n];
+        ++c;
+        data[n] = c;
+        if (c)
+            return;
+    } while (n);
 }
 #endif
 
-/* The input encrypted as though 128bit counter mode is being
- * used.  The extra state information to record how much of the
- * 128bit block we have used is contained in *num, and the
- * encrypted counter is kept in ecount_buf.  Both *num and
- * ecount_buf must be initialised with zeros before the first
- * call to CRYPTO_ctr128_encrypt().
- *
- * This algorithm assumes that the counter is in the x lower bits
- * of the IV (ivec), and that the application has full control over
- * overflow and the rest of the IV.  This implementation takes NO
- * responsability for checking that the counter doesn't overflow
- * into the rest of the IV when incremented.
+/*
+ * The input encrypted as though 128bit counter mode is being used.  The
+ * extra state information to record how much of the 128bit block we have
+ * used is contained in *num, and the encrypted counter is kept in
+ * ecount_buf.  Both *num and ecount_buf must be initialised with zeros
+ * before the first call to CRYPTO_ctr128_encrypt(). This algorithm assumes
+ * that the counter is in the x lower bits of the IV (ivec), and that the
+ * application has full control over overflow and the rest of the IV.  This
+ * implementation takes NO responsability for checking that the counter
+ * doesn't overflow into the rest of the IV when incremented.
  */
 void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], unsigned char ecount_buf[16],
-			unsigned int *num, block128_f block)
+                           size_t len, const void *key,
+                           unsigned char ivec[16],
+                           unsigned char ecount_buf[16], unsigned int *num,
+                           block128_f block)
 {
-	unsigned int n;
-	size_t l=0;
+    unsigned int n;
+    size_t l = 0;
 
-	assert(in && out && key && ecount_buf && num);
-	assert(*num < 16);
+    assert(in && out && key && ecount_buf && num);
+    assert(*num < 16);
 
-	n = *num;
+    n = *num;
 
 #if !defined(OPENSSL_SMALL_FOOTPRINT)
-	if (16%sizeof(size_t) == 0) do { /* always true actually */
-		while (n && len) {
-			*(out++) = *(in++) ^ ecount_buf[n];
-			--len;
-			n = (n+1) % 16;
-		}
-
-#if defined(STRICT_ALIGNMENT)
-		if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0)
-			break;
-#endif
-		while (len>=16) {
-			(*block)(ivec, ecount_buf, key);
-			ctr128_inc_aligned(ivec);
-			for (; n<16; n+=sizeof(size_t))
-				*(size_t *)(out+n) =
-				*(size_t *)(in+n) ^ *(size_t *)(ecount_buf+n);
-			len -= 16;
-			out += 16;
-			in  += 16;
-			n = 0;
-		}
-		if (len) {
-			(*block)(ivec, ecount_buf, key);
- 			ctr128_inc_aligned(ivec);
-			while (len--) {
-				out[n] = in[n] ^ ecount_buf[n];
-				++n;
-			}
-		}
-		*num = n;
-		return;
-	} while(0);
-	/* the rest would be commonly eliminated by x86* compiler */
+    if (16 % sizeof(size_t) == 0) { /* always true actually */
+        do {
+            while (n && len) {
+                *(out++) = *(in++) ^ ecount_buf[n];
+                --len;
+                n = (n + 1) % 16;
+            }
+
+# if defined(STRICT_ALIGNMENT)
+            if (((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) !=
+                0)
+                break;
+# endif
+            while (len >= 16) {
+                (*block) (ivec, ecount_buf, key);
+                ctr128_inc_aligned(ivec);
+                for (; n < 16; n += sizeof(size_t))
+                    *(size_t *)(out + n) =
+                        *(size_t *)(in + n) ^ *(size_t *)(ecount_buf + n);
+                len -= 16;
+                out += 16;
+                in += 16;
+                n = 0;
+            }
+            if (len) {
+                (*block) (ivec, ecount_buf, key);
+                ctr128_inc_aligned(ivec);
+                while (len--) {
+                    out[n] = in[n] ^ ecount_buf[n];
+                    ++n;
+                }
+            }
+            *num = n;
+            return;
+        } while (0);
+    }
+    /* the rest would be commonly eliminated by x86* compiler */
 #endif
-	while (l<len) {
-		if (n==0) {
-			(*block)(ivec, ecount_buf, key);
- 			ctr128_inc(ivec);
-		}
-		out[l] = in[l] ^ ecount_buf[n];
-		++l;
-		n = (n+1) % 16;
-	}
-
-	*num=n;
+    while (l < len) {
+        if (n == 0) {
+            (*block) (ivec, ecount_buf, key);
+            ctr128_inc(ivec);
+        }
+        out[l] = in[l] ^ ecount_buf[n];
+        ++l;
+        n = (n + 1) % 16;
+    }
+
+    *num = n;
 }
 
 /* increment upper 96 bits of 128-bit counter by 1 */
-static void ctr96_inc(unsigned char *counter) {
-	u32 n=12;
-	u8  c;
-
-	do {
-		--n;
-		c = counter[n];
-		++c;
-		counter[n] = c;
-		if (c) return;
-	} while (n);
+static void ctr96_inc(unsigned char *counter)
+{
+    u32 n = 12;
+    u8 c;
+
+    do {
+        --n;
+        c = counter[n];
+        ++c;
+        counter[n] = c;
+        if (c)
+            return;
+    } while (n);
 }
 
 void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], unsigned char ecount_buf[16],
-			unsigned int *num, ctr128_f func)
+                                 size_t len, const void *key,
+                                 unsigned char ivec[16],
+                                 unsigned char ecount_buf[16],
+                                 unsigned int *num, ctr128_f func)
 {
-	unsigned int n,ctr32;
-
-	assert(in && out && key && ecount_buf && num);
-	assert(*num < 16);
-
-	n = *num;
-
-	while (n && len) {
-		*(out++) = *(in++) ^ ecount_buf[n];
-		--len;
-		n = (n+1) % 16;
-	}
-
-	ctr32 = GETU32(ivec+12);
-	while (len>=16) {
-		size_t blocks = len/16;
-		/*
-		 * 1<<28 is just a not-so-small yet not-so-large number...
-		 * Below condition is practically never met, but it has to
-		 * be checked for code correctness.
-		 */
-		if (sizeof(size_t)>sizeof(unsigned int) && blocks>(1U<<28))
-			blocks = (1U<<28);
-		/*
-		 * As (*func) operates on 32-bit counter, caller
-		 * has to handle overflow. 'if' below detects the
-		 * overflow, which is then handled by limiting the
-		 * amount of blocks to the exact overflow point...
-		 */
-		ctr32 += (u32)blocks;
-		if (ctr32 < blocks) {
-			blocks -= ctr32;
-			ctr32   = 0;
-		}
-		(*func)(in,out,blocks,key,ivec);
-		/* (*ctr) does not update ivec, caller does: */
-		PUTU32(ivec+12,ctr32);
-		/* ... overflow was detected, propogate carry. */
-		if (ctr32 == 0)	ctr96_inc(ivec);
-		blocks *= 16;
-		len -= blocks;
-		out += blocks;
-		in  += blocks;
-	}
-	if (len) {
-		memset(ecount_buf,0,16);
-		(*func)(ecount_buf,ecount_buf,1,key,ivec);
-		++ctr32;
-		PUTU32(ivec+12,ctr32);
-		if (ctr32 == 0)	ctr96_inc(ivec);
-		while (len--) {
-			out[n] = in[n] ^ ecount_buf[n];
-			++n;
-		}
-	}
-
-	*num=n;
+    unsigned int n, ctr32;
+
+    assert(in && out && key && ecount_buf && num);
+    assert(*num < 16);
+
+    n = *num;
+
+    while (n && len) {
+        *(out++) = *(in++) ^ ecount_buf[n];
+        --len;
+        n = (n + 1) % 16;
+    }
+
+    ctr32 = GETU32(ivec + 12);
+    while (len >= 16) {
+        size_t blocks = len / 16;
+        /*
+         * 1<<28 is just a not-so-small yet not-so-large number...
+         * Below condition is practically never met, but it has to
+         * be checked for code correctness.
+         */
+        if (sizeof(size_t) > sizeof(unsigned int) && blocks > (1U << 28))
+            blocks = (1U << 28);
+        /*
+         * As (*func) operates on 32-bit counter, caller
+         * has to handle overflow. 'if' below detects the
+         * overflow, which is then handled by limiting the
+         * amount of blocks to the exact overflow point...
+         */
+        ctr32 += (u32)blocks;
+        if (ctr32 < blocks) {
+            blocks -= ctr32;
+            ctr32 = 0;
+        }
+        (*func) (in, out, blocks, key, ivec);
+        /* (*ctr) does not update ivec, caller does: */
+        PUTU32(ivec + 12, ctr32);
+        /* ... overflow was detected, propogate carry. */
+        if (ctr32 == 0)
+            ctr96_inc(ivec);
+        blocks *= 16;
+        len -= blocks;
+        out += blocks;
+        in += blocks;
+    }
+    if (len) {
+        memset(ecount_buf, 0, 16);
+        (*func) (ecount_buf, ecount_buf, 1, key, ivec);
+        ++ctr32;
+        PUTU32(ivec + 12, ctr32);
+        if (ctr32 == 0)
+            ctr96_inc(ivec);
+        while (len--) {
+            out[n] = in[n] ^ ecount_buf[n];
+            ++n;
+        }
+    }
+
+    *num = n;
 }
diff --git a/openssl/crypto/modes/cts128.c b/openssl/crypto/modes/cts128.c
index 2d583de6f..137be595a 100644
--- a/openssl/crypto/modes/cts128.c
+++ b/openssl/crypto/modes/cts128.c
@@ -29,425 +29,516 @@
  * compliant with the NIST proposal, both extending CBC mode.
  */
 
-size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], block128_f block)
-{	size_t residue, n;
+size_t CRYPTO_cts128_encrypt_block(const unsigned char *in,
+                                   unsigned char *out, size_t len,
+                                   const void *key, unsigned char ivec[16],
+                                   block128_f block)
+{
+    size_t residue, n;
 
-	assert (in && out && key && ivec);
+    assert(in && out && key && ivec);
 
-	if (len <= 16) return 0;
+    if (len <= 16)
+        return 0;
 
-	if ((residue=len%16) == 0) residue = 16;
+    if ((residue = len % 16) == 0)
+        residue = 16;
 
-	len -= residue;
+    len -= residue;
 
-	CRYPTO_cbc128_encrypt(in,out,len,key,ivec,block);
+    CRYPTO_cbc128_encrypt(in, out, len, key, ivec, block);
 
-	in  += len;
-	out += len;
+    in += len;
+    out += len;
 
-	for (n=0; n<residue; ++n)
-		ivec[n] ^= in[n];
-	(*block)(ivec,ivec,key);
-	memcpy(out,out-16,residue);
-	memcpy(out-16,ivec,16); 
+    for (n = 0; n < residue; ++n)
+        ivec[n] ^= in[n];
+    (*block) (ivec, ivec, key);
+    memcpy(out, out - 16, residue);
+    memcpy(out - 16, ivec, 16);
 
-	return len+residue;
+    return len + residue;
 }
 
-size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], block128_f block)
-{	size_t residue, n;
+size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in,
+                                       unsigned char *out, size_t len,
+                                       const void *key,
+                                       unsigned char ivec[16],
+                                       block128_f block)
+{
+    size_t residue, n;
 
-	assert (in && out && key && ivec);
+    assert(in && out && key && ivec);
 
-	if (len < 16) return 0;
+    if (len < 16)
+        return 0;
 
-	residue=len%16;
+    residue = len % 16;
 
-	len -= residue;
+    len -= residue;
 
-	CRYPTO_cbc128_encrypt(in,out,len,key,ivec,block);
+    CRYPTO_cbc128_encrypt(in, out, len, key, ivec, block);
 
-	if (residue==0)	return len;
+    if (residue == 0)
+        return len;
 
-	in  += len;
-	out += len;
+    in += len;
+    out += len;
 
-	for (n=0; n<residue; ++n)
-		ivec[n] ^= in[n];
-	(*block)(ivec,ivec,key);
-	memcpy(out-16+residue,ivec,16);
+    for (n = 0; n < residue; ++n)
+        ivec[n] ^= in[n];
+    (*block) (ivec, ivec, key);
+    memcpy(out - 16 + residue, ivec, 16);
 
-	return len+residue;
+    return len + residue;
 }
 
 size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], cbc128_f cbc)
-{	size_t residue;
-	union { size_t align; unsigned char c[16]; } tmp;
+                             size_t len, const void *key,
+                             unsigned char ivec[16], cbc128_f cbc)
+{
+    size_t residue;
+    union {
+        size_t align;
+        unsigned char c[16];
+    } tmp;
 
-	assert (in && out && key && ivec);
+    assert(in && out && key && ivec);
 
-	if (len <= 16) return 0;
+    if (len <= 16)
+        return 0;
 
-	if ((residue=len%16) == 0) residue = 16;
+    if ((residue = len % 16) == 0)
+        residue = 16;
 
-	len -= residue;
+    len -= residue;
 
-	(*cbc)(in,out,len,key,ivec,1);
+    (*cbc) (in, out, len, key, ivec, 1);
 
-	in  += len;
-	out += len;
+    in += len;
+    out += len;
 
 #if defined(CBC_HANDLES_TRUNCATED_IO)
-	memcpy(tmp.c,out-16,16);
-	(*cbc)(in,out-16,residue,key,ivec,1);
-	memcpy(out,tmp.c,residue);
+    memcpy(tmp.c, out - 16, 16);
+    (*cbc) (in, out - 16, residue, key, ivec, 1);
+    memcpy(out, tmp.c, residue);
 #else
-	memset(tmp.c,0,sizeof(tmp));
-	memcpy(tmp.c,in,residue);
-	memcpy(out,out-16,residue);
-	(*cbc)(tmp.c,out-16,16,key,ivec,1);
+    memset(tmp.c, 0, sizeof(tmp));
+    memcpy(tmp.c, in, residue);
+    memcpy(out, out - 16, residue);
+    (*cbc) (tmp.c, out - 16, 16, key, ivec, 1);
 #endif
-	return len+residue;
+    return len + residue;
 }
 
 size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], cbc128_f cbc)
-{	size_t residue;
-	union { size_t align; unsigned char c[16]; } tmp;
+                                 size_t len, const void *key,
+                                 unsigned char ivec[16], cbc128_f cbc)
+{
+    size_t residue;
+    union {
+        size_t align;
+        unsigned char c[16];
+    } tmp;
 
-	assert (in && out && key && ivec);
+    assert(in && out && key && ivec);
 
-	if (len < 16) return 0;
+    if (len < 16)
+        return 0;
 
-	residue=len%16;
+    residue = len % 16;
 
-	len -= residue;
+    len -= residue;
 
-	(*cbc)(in,out,len,key,ivec,1);
+    (*cbc) (in, out, len, key, ivec, 1);
 
-	if (residue==0) return len;
+    if (residue == 0)
+        return len;
 
-	in  += len;
-	out += len;
+    in += len;
+    out += len;
 
 #if defined(CBC_HANDLES_TRUNCATED_IO)
-	(*cbc)(in,out-16+residue,residue,key,ivec,1);
+    (*cbc) (in, out - 16 + residue, residue, key, ivec, 1);
 #else
-	memset(tmp.c,0,sizeof(tmp));
-	memcpy(tmp.c,in,residue);
-	(*cbc)(tmp.c,out-16+residue,16,key,ivec,1);
+    memset(tmp.c, 0, sizeof(tmp));
+    memcpy(tmp.c, in, residue);
+    (*cbc) (tmp.c, out - 16 + residue, 16, key, ivec, 1);
 #endif
-	return len+residue;
+    return len + residue;
 }
 
-size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], block128_f block)
-{	size_t residue, n;
-	union { size_t align; unsigned char c[32]; } tmp;
+size_t CRYPTO_cts128_decrypt_block(const unsigned char *in,
+                                   unsigned char *out, size_t len,
+                                   const void *key, unsigned char ivec[16],
+                                   block128_f block)
+{
+    size_t residue, n;
+    union {
+        size_t align;
+        unsigned char c[32];
+    } tmp;
 
-	assert (in && out && key && ivec);
+    assert(in && out && key && ivec);
 
-	if (len<=16) return 0;
+    if (len <= 16)
+        return 0;
 
-	if ((residue=len%16) == 0) residue = 16;
+    if ((residue = len % 16) == 0)
+        residue = 16;
 
-	len -= 16+residue;
+    len -= 16 + residue;
 
-	if (len) {
-		CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block);
-		in  += len;
-		out += len;
-	}
+    if (len) {
+        CRYPTO_cbc128_decrypt(in, out, len, key, ivec, block);
+        in += len;
+        out += len;
+    }
 
-	(*block)(in,tmp.c+16,key);
+    (*block) (in, tmp.c + 16, key);
 
-	memcpy(tmp.c,tmp.c+16,16);
-	memcpy(tmp.c,in+16,residue);
-	(*block)(tmp.c,tmp.c,key);
+    memcpy(tmp.c, tmp.c + 16, 16);
+    memcpy(tmp.c, in + 16, residue);
+    (*block) (tmp.c, tmp.c, key);
 
-	for(n=0; n<16; ++n) {
-		unsigned char c = in[n];
-		out[n] = tmp.c[n] ^ ivec[n];
-		ivec[n] = c;
-	}
-	for(residue+=16; n<residue; ++n)
-		out[n] = tmp.c[n] ^ in[n];
+    for (n = 0; n < 16; ++n) {
+        unsigned char c = in[n];
+        out[n] = tmp.c[n] ^ ivec[n];
+        ivec[n] = c;
+    }
+    for (residue += 16; n < residue; ++n)
+        out[n] = tmp.c[n] ^ in[n];
 
-	return 16+len+residue;
+    return 16 + len + residue;
 }
 
-size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], block128_f block)
-{	size_t residue, n;
-	union { size_t align; unsigned char c[32]; } tmp;
+size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in,
+                                       unsigned char *out, size_t len,
+                                       const void *key,
+                                       unsigned char ivec[16],
+                                       block128_f block)
+{
+    size_t residue, n;
+    union {
+        size_t align;
+        unsigned char c[32];
+    } tmp;
 
-	assert (in && out && key && ivec);
+    assert(in && out && key && ivec);
 
-	if (len<16) return 0;
+    if (len < 16)
+        return 0;
 
-	residue=len%16;
+    residue = len % 16;
 
-	if (residue==0) {
-		CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block);
-		return len;
-	}
+    if (residue == 0) {
+        CRYPTO_cbc128_decrypt(in, out, len, key, ivec, block);
+        return len;
+    }
 
-	len -= 16+residue;
+    len -= 16 + residue;
 
-	if (len) {
-		CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block);
-		in  += len;
-		out += len;
-	}
+    if (len) {
+        CRYPTO_cbc128_decrypt(in, out, len, key, ivec, block);
+        in += len;
+        out += len;
+    }
 
-	(*block)(in+residue,tmp.c+16,key);
+    (*block) (in + residue, tmp.c + 16, key);
 
-	memcpy(tmp.c,tmp.c+16,16);
-	memcpy(tmp.c,in,residue);
-	(*block)(tmp.c,tmp.c,key);
+    memcpy(tmp.c, tmp.c + 16, 16);
+    memcpy(tmp.c, in, residue);
+    (*block) (tmp.c, tmp.c, key);
 
-	for(n=0; n<16; ++n) {
-		unsigned char c = in[n];
-		out[n] = tmp.c[n] ^ ivec[n];
-		ivec[n] = in[n+residue];
-		tmp.c[n] = c;
-	}
-	for(residue+=16; n<residue; ++n)
-		out[n] = tmp.c[n] ^ tmp.c[n-16];
+    for (n = 0; n < 16; ++n) {
+        unsigned char c = in[n];
+        out[n] = tmp.c[n] ^ ivec[n];
+        ivec[n] = in[n + residue];
+        tmp.c[n] = c;
+    }
+    for (residue += 16; n < residue; ++n)
+        out[n] = tmp.c[n] ^ tmp.c[n - 16];
 
-	return 16+len+residue;
+    return 16 + len + residue;
 }
 
 size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], cbc128_f cbc)
-{	size_t residue;
-	union { size_t align; unsigned char c[32]; } tmp;
+                             size_t len, const void *key,
+                             unsigned char ivec[16], cbc128_f cbc)
+{
+    size_t residue;
+    union {
+        size_t align;
+        unsigned char c[32];
+    } tmp;
 
-	assert (in && out && key && ivec);
+    assert(in && out && key && ivec);
 
-	if (len<=16) return 0;
+    if (len <= 16)
+        return 0;
 
-	if ((residue=len%16) == 0) residue = 16;
+    if ((residue = len % 16) == 0)
+        residue = 16;
 
-	len -= 16+residue;
+    len -= 16 + residue;
 
-	if (len) {
-		(*cbc)(in,out,len,key,ivec,0);
-		in  += len;
-		out += len;
-	}
+    if (len) {
+        (*cbc) (in, out, len, key, ivec, 0);
+        in += len;
+        out += len;
+    }
 
-	memset(tmp.c,0,sizeof(tmp));
-	/* this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0] */
-	(*cbc)(in,tmp.c,16,key,tmp.c+16,0);
+    memset(tmp.c, 0, sizeof(tmp));
+    /*
+     * this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0]
+     */
+    (*cbc) (in, tmp.c, 16, key, tmp.c + 16, 0);
 
-	memcpy(tmp.c,in+16,residue);
+    memcpy(tmp.c, in + 16, residue);
 #if defined(CBC_HANDLES_TRUNCATED_IO)
-	(*cbc)(tmp.c,out,16+residue,key,ivec,0);
+    (*cbc) (tmp.c, out, 16 + residue, key, ivec, 0);
 #else
-	(*cbc)(tmp.c,tmp.c,32,key,ivec,0);
-	memcpy(out,tmp.c,16+residue);
+    (*cbc) (tmp.c, tmp.c, 32, key, ivec, 0);
+    memcpy(out, tmp.c, 16 + residue);
 #endif
-	return 16+len+residue;
+    return 16 + len + residue;
 }
 
 size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], cbc128_f cbc)
-{	size_t residue;
-	union { size_t align; unsigned char c[32]; } tmp;
+                                 size_t len, const void *key,
+                                 unsigned char ivec[16], cbc128_f cbc)
+{
+    size_t residue;
+    union {
+        size_t align;
+        unsigned char c[32];
+    } tmp;
 
-	assert (in && out && key && ivec);
+    assert(in && out && key && ivec);
 
-	if (len<16) return 0;
+    if (len < 16)
+        return 0;
 
-	residue=len%16;
+    residue = len % 16;
 
-	if (residue==0) {
-		(*cbc)(in,out,len,key,ivec,0);
-		return len;
-	}
+    if (residue == 0) {
+        (*cbc) (in, out, len, key, ivec, 0);
+        return len;
+    }
 
-	len -= 16+residue;
+    len -= 16 + residue;
 
-	if (len) {
-		(*cbc)(in,out,len,key,ivec,0);
-		in  += len;
-		out += len;
-	}
+    if (len) {
+        (*cbc) (in, out, len, key, ivec, 0);
+        in += len;
+        out += len;
+    }
 
-	memset(tmp.c,0,sizeof(tmp));
-	/* this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0] */
-	(*cbc)(in+residue,tmp.c,16,key,tmp.c+16,0);
+    memset(tmp.c, 0, sizeof(tmp));
+    /*
+     * this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0]
+     */
+    (*cbc) (in + residue, tmp.c, 16, key, tmp.c + 16, 0);
 
-	memcpy(tmp.c,in,residue);
+    memcpy(tmp.c, in, residue);
 #if defined(CBC_HANDLES_TRUNCATED_IO)
-	(*cbc)(tmp.c,out,16+residue,key,ivec,0);
+    (*cbc) (tmp.c, out, 16 + residue, key, ivec, 0);
 #else
-	(*cbc)(tmp.c,tmp.c,32,key,ivec,0);
-	memcpy(out,tmp.c,16+residue);
+    (*cbc) (tmp.c, tmp.c, 32, key, ivec, 0);
+    memcpy(out, tmp.c, 16 + residue);
 #endif
-	return 16+len+residue;
+    return 16 + len + residue;
 }
 
 #if defined(SELFTEST)
-#include <stdio.h>
-#include <openssl/aes.h>
+# include <stdio.h>
+# include <openssl/aes.h>
 
 /* test vectors from RFC 3962 */
 static const unsigned char test_key[16] = "chicken teriyaki";
 static const unsigned char test_input[64] =
-		"I would like the" " General Gau's C"
-		"hicken, please, " "and wonton soup.";
-static const unsigned char test_iv[16] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-
-static const unsigned char vector_17[17] =
-{0xc6,0x35,0x35,0x68,0xf2,0xbf,0x8c,0xb4, 0xd8,0xa5,0x80,0x36,0x2d,0xa7,0xff,0x7f,
- 0x97};
-static const unsigned char vector_31[31] =
-{0xfc,0x00,0x78,0x3e,0x0e,0xfd,0xb2,0xc1, 0xd4,0x45,0xd4,0xc8,0xef,0xf7,0xed,0x22,
- 0x97,0x68,0x72,0x68,0xd6,0xec,0xcc,0xc0, 0xc0,0x7b,0x25,0xe2,0x5e,0xcf,0xe5};
-static const unsigned char vector_32[32] =
-{0x39,0x31,0x25,0x23,0xa7,0x86,0x62,0xd5, 0xbe,0x7f,0xcb,0xcc,0x98,0xeb,0xf5,0xa8,
- 0x97,0x68,0x72,0x68,0xd6,0xec,0xcc,0xc0, 0xc0,0x7b,0x25,0xe2,0x5e,0xcf,0xe5,0x84};
-static const unsigned char vector_47[47] =
-{0x97,0x68,0x72,0x68,0xd6,0xec,0xcc,0xc0, 0xc0,0x7b,0x25,0xe2,0x5e,0xcf,0xe5,0x84,
- 0xb3,0xff,0xfd,0x94,0x0c,0x16,0xa1,0x8c, 0x1b,0x55,0x49,0xd2,0xf8,0x38,0x02,0x9e,
- 0x39,0x31,0x25,0x23,0xa7,0x86,0x62,0xd5, 0xbe,0x7f,0xcb,0xcc,0x98,0xeb,0xf5};
-static const unsigned char vector_48[48] =
-{0x97,0x68,0x72,0x68,0xd6,0xec,0xcc,0xc0, 0xc0,0x7b,0x25,0xe2,0x5e,0xcf,0xe5,0x84,
- 0x9d,0xad,0x8b,0xbb,0x96,0xc4,0xcd,0xc0, 0x3b,0xc1,0x03,0xe1,0xa1,0x94,0xbb,0xd8,
- 0x39,0x31,0x25,0x23,0xa7,0x86,0x62,0xd5, 0xbe,0x7f,0xcb,0xcc,0x98,0xeb,0xf5,0xa8};
-static const unsigned char vector_64[64] =
-{0x97,0x68,0x72,0x68,0xd6,0xec,0xcc,0xc0, 0xc0,0x7b,0x25,0xe2,0x5e,0xcf,0xe5,0x84,
- 0x39,0x31,0x25,0x23,0xa7,0x86,0x62,0xd5, 0xbe,0x7f,0xcb,0xcc,0x98,0xeb,0xf5,0xa8,
- 0x48,0x07,0xef,0xe8,0x36,0xee,0x89,0xa5, 0x26,0x73,0x0d,0xbc,0x2f,0x7b,0xc8,0x40,
- 0x9d,0xad,0x8b,0xbb,0x96,0xc4,0xcd,0xc0, 0x3b,0xc1,0x03,0xe1,0xa1,0x94,0xbb,0xd8};
+    "I would like the" " General Gau's C"
+    "hicken, please, " "and wonton soup.";
+static const unsigned char test_iv[16] =
+    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+static const unsigned char vector_17[17] = {
+    0xc6, 0x35, 0x35, 0x68, 0xf2, 0xbf, 0x8c, 0xb4,
+    0xd8, 0xa5, 0x80, 0x36, 0x2d, 0xa7, 0xff, 0x7f,
+    0x97
+};
+
+static const unsigned char vector_31[31] = {
+    0xfc, 0x00, 0x78, 0x3e, 0x0e, 0xfd, 0xb2, 0xc1,
+    0xd4, 0x45, 0xd4, 0xc8, 0xef, 0xf7, 0xed, 0x22,
+    0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0,
+    0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5
+};
+
+static const unsigned char vector_32[32] = {
+    0x39, 0x31, 0x25, 0x23, 0xa7, 0x86, 0x62, 0xd5,
+    0xbe, 0x7f, 0xcb, 0xcc, 0x98, 0xeb, 0xf5, 0xa8,
+    0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0,
+    0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5, 0x84
+};
+
+static const unsigned char vector_47[47] = {
+    0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0,
+    0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5, 0x84,
+    0xb3, 0xff, 0xfd, 0x94, 0x0c, 0x16, 0xa1, 0x8c,
+    0x1b, 0x55, 0x49, 0xd2, 0xf8, 0x38, 0x02, 0x9e,
+    0x39, 0x31, 0x25, 0x23, 0xa7, 0x86, 0x62, 0xd5,
+    0xbe, 0x7f, 0xcb, 0xcc, 0x98, 0xeb, 0xf5
+};
+
+static const unsigned char vector_48[48] = {
+    0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0,
+    0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5, 0x84,
+    0x9d, 0xad, 0x8b, 0xbb, 0x96, 0xc4, 0xcd, 0xc0,
+    0x3b, 0xc1, 0x03, 0xe1, 0xa1, 0x94, 0xbb, 0xd8,
+    0x39, 0x31, 0x25, 0x23, 0xa7, 0x86, 0x62, 0xd5,
+    0xbe, 0x7f, 0xcb, 0xcc, 0x98, 0xeb, 0xf5, 0xa8
+};
+
+static const unsigned char vector_64[64] = {
+    0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0,
+    0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5, 0x84,
+    0x39, 0x31, 0x25, 0x23, 0xa7, 0x86, 0x62, 0xd5,
+    0xbe, 0x7f, 0xcb, 0xcc, 0x98, 0xeb, 0xf5, 0xa8,
+    0x48, 0x07, 0xef, 0xe8, 0x36, 0xee, 0x89, 0xa5,
+    0x26, 0x73, 0x0d, 0xbc, 0x2f, 0x7b, 0xc8, 0x40,
+    0x9d, 0xad, 0x8b, 0xbb, 0x96, 0xc4, 0xcd, 0xc0,
+    0x3b, 0xc1, 0x03, 0xe1, 0xa1, 0x94, 0xbb, 0xd8
+};
 
 static AES_KEY encks, decks;
 
-void test_vector(const unsigned char *vector,size_t len)
-{	unsigned char iv[sizeof(test_iv)];
-	unsigned char cleartext[64],ciphertext[64];
-	size_t tail;
-
-	printf("vector_%d\n",len); fflush(stdout);
-
-	if ((tail=len%16) == 0) tail = 16;
-	tail += 16;
-
-	/* test block-based encryption */
-	memcpy(iv,test_iv,sizeof(test_iv));
-	CRYPTO_cts128_encrypt_block(test_input,ciphertext,len,&encks,iv,(block128_f)AES_encrypt);
-	if (memcmp(ciphertext,vector,len))
-		fprintf(stderr,"output_%d mismatch\n",len), exit(1);
-	if (memcmp(iv,vector+len-tail,sizeof(iv)))
-		fprintf(stderr,"iv_%d mismatch\n",len), exit(1);
-
-	/* test block-based decryption */
-	memcpy(iv,test_iv,sizeof(test_iv));
-	CRYPTO_cts128_decrypt_block(ciphertext,cleartext,len,&decks,iv,(block128_f)AES_decrypt);
-	if (memcmp(cleartext,test_input,len))
-		fprintf(stderr,"input_%d mismatch\n",len), exit(2);
-	if (memcmp(iv,vector+len-tail,sizeof(iv)))
-		fprintf(stderr,"iv_%d mismatch\n",len), exit(2);
-
-	/* test streamed encryption */
-	memcpy(iv,test_iv,sizeof(test_iv));
-	CRYPTO_cts128_encrypt(test_input,ciphertext,len,&encks,iv,(cbc128_f)AES_cbc_encrypt);
-	if (memcmp(ciphertext,vector,len))
-		fprintf(stderr,"output_%d mismatch\n",len), exit(3);
-	if (memcmp(iv,vector+len-tail,sizeof(iv)))
-		fprintf(stderr,"iv_%d mismatch\n",len), exit(3);
-
-	/* test streamed decryption */
-	memcpy(iv,test_iv,sizeof(test_iv));
-	CRYPTO_cts128_decrypt(ciphertext,cleartext,len,&decks,iv,(cbc128_f)AES_cbc_encrypt);
-	if (memcmp(cleartext,test_input,len))
-		fprintf(stderr,"input_%d mismatch\n",len), exit(4);
-	if (memcmp(iv,vector+len-tail,sizeof(iv)))
-		fprintf(stderr,"iv_%d mismatch\n",len), exit(4);
+void test_vector(const unsigned char *vector, size_t len)
+{
+    unsigned char iv[sizeof(test_iv)];
+    unsigned char cleartext[64], ciphertext[64];
+    size_t tail;
+
+    printf("vector_%d\n", len);
+    fflush(stdout);
+
+    if ((tail = len % 16) == 0)
+        tail = 16;
+    tail += 16;
+
+    /* test block-based encryption */
+    memcpy(iv, test_iv, sizeof(test_iv));
+    CRYPTO_cts128_encrypt_block(test_input, ciphertext, len, &encks, iv,
+                                (block128_f) AES_encrypt);
+    if (memcmp(ciphertext, vector, len))
+        fprintf(stderr, "output_%d mismatch\n", len), exit(1);
+    if (memcmp(iv, vector + len - tail, sizeof(iv)))
+        fprintf(stderr, "iv_%d mismatch\n", len), exit(1);
+
+    /* test block-based decryption */
+    memcpy(iv, test_iv, sizeof(test_iv));
+    CRYPTO_cts128_decrypt_block(ciphertext, cleartext, len, &decks, iv,
+                                (block128_f) AES_decrypt);
+    if (memcmp(cleartext, test_input, len))
+        fprintf(stderr, "input_%d mismatch\n", len), exit(2);
+    if (memcmp(iv, vector + len - tail, sizeof(iv)))
+        fprintf(stderr, "iv_%d mismatch\n", len), exit(2);
+
+    /* test streamed encryption */
+    memcpy(iv, test_iv, sizeof(test_iv));
+    CRYPTO_cts128_encrypt(test_input, ciphertext, len, &encks, iv,
+                          (cbc128_f) AES_cbc_encrypt);
+    if (memcmp(ciphertext, vector, len))
+        fprintf(stderr, "output_%d mismatch\n", len), exit(3);
+    if (memcmp(iv, vector + len - tail, sizeof(iv)))
+        fprintf(stderr, "iv_%d mismatch\n", len), exit(3);
+
+    /* test streamed decryption */
+    memcpy(iv, test_iv, sizeof(test_iv));
+    CRYPTO_cts128_decrypt(ciphertext, cleartext, len, &decks, iv,
+                          (cbc128_f) AES_cbc_encrypt);
+    if (memcmp(cleartext, test_input, len))
+        fprintf(stderr, "input_%d mismatch\n", len), exit(4);
+    if (memcmp(iv, vector + len - tail, sizeof(iv)))
+        fprintf(stderr, "iv_%d mismatch\n", len), exit(4);
 }
 
-void test_nistvector(const unsigned char *vector,size_t len)
-{	unsigned char iv[sizeof(test_iv)];
-	unsigned char cleartext[64],ciphertext[64],nistvector[64];
-	size_t tail;
-
-	printf("nistvector_%d\n",len); fflush(stdout);
-
-	if ((tail=len%16) == 0) tail = 16;
-
-	len -= 16 + tail;
-	memcpy(nistvector,vector,len);
-	/* flip two last blocks */
-	memcpy(nistvector+len,vector+len+16,tail);
-	memcpy(nistvector+len+tail,vector+len,16);
-	len += 16 + tail;
-	tail = 16;
-
-	/* test block-based encryption */
-	memcpy(iv,test_iv,sizeof(test_iv));
-	CRYPTO_nistcts128_encrypt_block(test_input,ciphertext,len,&encks,iv,(block128_f)AES_encrypt);
-	if (memcmp(ciphertext,nistvector,len))
-		fprintf(stderr,"output_%d mismatch\n",len), exit(1);
-	if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
-		fprintf(stderr,"iv_%d mismatch\n",len), exit(1);
-
-	/* test block-based decryption */
-	memcpy(iv,test_iv,sizeof(test_iv));
-	CRYPTO_nistcts128_decrypt_block(ciphertext,cleartext,len,&decks,iv,(block128_f)AES_decrypt);
-	if (memcmp(cleartext,test_input,len))
-		fprintf(stderr,"input_%d mismatch\n",len), exit(2);
-	if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
-		fprintf(stderr,"iv_%d mismatch\n",len), exit(2);
-
-	/* test streamed encryption */
-	memcpy(iv,test_iv,sizeof(test_iv));
-	CRYPTO_nistcts128_encrypt(test_input,ciphertext,len,&encks,iv,(cbc128_f)AES_cbc_encrypt);
-	if (memcmp(ciphertext,nistvector,len))
-		fprintf(stderr,"output_%d mismatch\n",len), exit(3);
-	if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
-		fprintf(stderr,"iv_%d mismatch\n",len), exit(3);
-
-	/* test streamed decryption */
-	memcpy(iv,test_iv,sizeof(test_iv));
-	CRYPTO_nistcts128_decrypt(ciphertext,cleartext,len,&decks,iv,(cbc128_f)AES_cbc_encrypt);
-	if (memcmp(cleartext,test_input,len))
-		fprintf(stderr,"input_%d mismatch\n",len), exit(4);
-	if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
-		fprintf(stderr,"iv_%d mismatch\n",len), exit(4);
+void test_nistvector(const unsigned char *vector, size_t len)
+{
+    unsigned char iv[sizeof(test_iv)];
+    unsigned char cleartext[64], ciphertext[64], nistvector[64];
+    size_t tail;
+
+    printf("nistvector_%d\n", len);
+    fflush(stdout);
+
+    if ((tail = len % 16) == 0)
+        tail = 16;
+
+    len -= 16 + tail;
+    memcpy(nistvector, vector, len);
+    /* flip two last blocks */
+    memcpy(nistvector + len, vector + len + 16, tail);
+    memcpy(nistvector + len + tail, vector + len, 16);
+    len += 16 + tail;
+    tail = 16;
+
+    /* test block-based encryption */
+    memcpy(iv, test_iv, sizeof(test_iv));
+    CRYPTO_nistcts128_encrypt_block(test_input, ciphertext, len, &encks, iv,
+                                    (block128_f) AES_encrypt);
+    if (memcmp(ciphertext, nistvector, len))
+        fprintf(stderr, "output_%d mismatch\n", len), exit(1);
+    if (memcmp(iv, nistvector + len - tail, sizeof(iv)))
+        fprintf(stderr, "iv_%d mismatch\n", len), exit(1);
+
+    /* test block-based decryption */
+    memcpy(iv, test_iv, sizeof(test_iv));
+    CRYPTO_nistcts128_decrypt_block(ciphertext, cleartext, len, &decks, iv,
+                                    (block128_f) AES_decrypt);
+    if (memcmp(cleartext, test_input, len))
+        fprintf(stderr, "input_%d mismatch\n", len), exit(2);
+    if (memcmp(iv, nistvector + len - tail, sizeof(iv)))
+        fprintf(stderr, "iv_%d mismatch\n", len), exit(2);
+
+    /* test streamed encryption */
+    memcpy(iv, test_iv, sizeof(test_iv));
+    CRYPTO_nistcts128_encrypt(test_input, ciphertext, len, &encks, iv,
+                              (cbc128_f) AES_cbc_encrypt);
+    if (memcmp(ciphertext, nistvector, len))
+        fprintf(stderr, "output_%d mismatch\n", len), exit(3);
+    if (memcmp(iv, nistvector + len - tail, sizeof(iv)))
+        fprintf(stderr, "iv_%d mismatch\n", len), exit(3);
+
+    /* test streamed decryption */
+    memcpy(iv, test_iv, sizeof(test_iv));
+    CRYPTO_nistcts128_decrypt(ciphertext, cleartext, len, &decks, iv,
+                              (cbc128_f) AES_cbc_encrypt);
+    if (memcmp(cleartext, test_input, len))
+        fprintf(stderr, "input_%d mismatch\n", len), exit(4);
+    if (memcmp(iv, nistvector + len - tail, sizeof(iv)))
+        fprintf(stderr, "iv_%d mismatch\n", len), exit(4);
 }
 
 int main()
 {
-	AES_set_encrypt_key(test_key,128,&encks);
-	AES_set_decrypt_key(test_key,128,&decks);
-
-	test_vector(vector_17,sizeof(vector_17));
-	test_vector(vector_31,sizeof(vector_31));
-	test_vector(vector_32,sizeof(vector_32));
-	test_vector(vector_47,sizeof(vector_47));
-	test_vector(vector_48,sizeof(vector_48));
-	test_vector(vector_64,sizeof(vector_64));
-
-	test_nistvector(vector_17,sizeof(vector_17));
-	test_nistvector(vector_31,sizeof(vector_31));
-	test_nistvector(vector_32,sizeof(vector_32));
-	test_nistvector(vector_47,sizeof(vector_47));
-	test_nistvector(vector_48,sizeof(vector_48));
-	test_nistvector(vector_64,sizeof(vector_64));
-
-	return 0;
+    AES_set_encrypt_key(test_key, 128, &encks);
+    AES_set_decrypt_key(test_key, 128, &decks);
+
+    test_vector(vector_17, sizeof(vector_17));
+    test_vector(vector_31, sizeof(vector_31));
+    test_vector(vector_32, sizeof(vector_32));
+    test_vector(vector_47, sizeof(vector_47));
+    test_vector(vector_48, sizeof(vector_48));
+    test_vector(vector_64, sizeof(vector_64));
+
+    test_nistvector(vector_17, sizeof(vector_17));
+    test_nistvector(vector_31, sizeof(vector_31));
+    test_nistvector(vector_32, sizeof(vector_32));
+    test_nistvector(vector_47, sizeof(vector_47));
+    test_nistvector(vector_48, sizeof(vector_48));
+    test_nistvector(vector_64, sizeof(vector_64));
+
+    return 0;
 }
 #endif
diff --git a/openssl/crypto/modes/gcm128.c b/openssl/crypto/modes/gcm128.c
index e1dc2b0f4..4debf537f 100644
--- a/openssl/crypto/modes/gcm128.c
+++ b/openssl/crypto/modes/gcm128.c
@@ -6,7 +6,7 @@
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer. 
+ *    notice, this list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
@@ -62,27 +62,27 @@
 
 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
 /* redefine, because alignment is ensured */
-#undef	GETU32
-#define	GETU32(p)	BSWAP4(*(const u32 *)(p))
-#undef	PUTU32
-#define	PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
-#endif
-
-#define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
-#define REDUCE1BIT(V)	do { \
-	if (sizeof(size_t)==8) { \
-		u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
-		V.lo  = (V.hi<<63)|(V.lo>>1); \
-		V.hi  = (V.hi>>1 )^T; \
-	} \
-	else { \
-		u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
-		V.lo  = (V.hi<<63)|(V.lo>>1); \
-		V.hi  = (V.hi>>1 )^((u64)T<<32); \
-	} \
+# undef  GETU32
+# define GETU32(p)       BSWAP4(*(const u32 *)(p))
+# undef  PUTU32
+# define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
+#endif
+
+#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
+#define REDUCE1BIT(V)   do { \
+        if (sizeof(size_t)==8) { \
+                u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
+                V.lo  = (V.hi<<63)|(V.lo>>1); \
+                V.hi  = (V.hi>>1 )^T; \
+        } \
+        else { \
+                u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
+                V.lo  = (V.hi<<63)|(V.lo>>1); \
+                V.hi  = (V.hi>>1 )^((u64)T<<32); \
+        } \
 } while(0)
 
-/*
+/*-
  * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
  * never be set to 8. 8 is effectively reserved for testing purposes.
  * TABLE_BITS>1 are lookup-table-driven implementations referred to as
@@ -116,286 +116,311 @@
  *
  * Value of 1 is not appropriate for performance reasons.
  */
-#if	TABLE_BITS==8
+#if     TABLE_BITS==8
 
 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
 {
-	int  i, j;
-	u128 V;
-
-	Htable[0].hi = 0;
-	Htable[0].lo = 0;
-	V.hi = H[0];
-	V.lo = H[1];
-
-	for (Htable[128]=V, i=64; i>0; i>>=1) {
-		REDUCE1BIT(V);
-		Htable[i] = V;
-	}
-
-	for (i=2; i<256; i<<=1) {
-		u128 *Hi = Htable+i, H0 = *Hi;
-		for (j=1; j<i; ++j) {
-			Hi[j].hi = H0.hi^Htable[j].hi;
-			Hi[j].lo = H0.lo^Htable[j].lo;
-		}
-	}
+    int i, j;
+    u128 V;
+
+    Htable[0].hi = 0;
+    Htable[0].lo = 0;
+    V.hi = H[0];
+    V.lo = H[1];
+
+    for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
+        REDUCE1BIT(V);
+        Htable[i] = V;
+    }
+
+    for (i = 2; i < 256; i <<= 1) {
+        u128 *Hi = Htable + i, H0 = *Hi;
+        for (j = 1; j < i; ++j) {
+            Hi[j].hi = H0.hi ^ Htable[j].hi;
+            Hi[j].lo = H0.lo ^ Htable[j].lo;
+        }
+    }
 }
 
 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
 {
-	u128 Z = { 0, 0};
-	const u8 *xi = (const u8 *)Xi+15;
-	size_t rem, n = *xi;
-	const union { long one; char little; } is_endian = {1};
-	static const size_t rem_8bit[256] = {
-		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
-		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
-		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
-		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
-		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
-		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
-		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
-		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
-		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
-		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
-		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
-		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
-		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
-		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
-		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
-		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
-		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
-		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
-		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
-		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
-		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
-		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
-		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
-		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
-		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
-		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
-		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
-		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
-		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
-		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
-		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
-		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
-		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
-		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
-		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
-		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
-		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
-		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
-		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
-		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
-		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
-		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
-		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
-		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
-		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
-		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
-		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
-		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
-		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
-		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
-		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
-		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
-		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
-		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
-		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
-		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
-		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
-		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
-		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
-		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
-		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
-		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
-		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
-		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
-
-	while (1) {
-		Z.hi ^= Htable[n].hi;
-		Z.lo ^= Htable[n].lo;
-
-		if ((u8 *)Xi==xi)	break;
-
-		n = *(--xi);
-
-		rem  = (size_t)Z.lo&0xff;
-		Z.lo = (Z.hi<<56)|(Z.lo>>8);
-		Z.hi = (Z.hi>>8);
-		if (sizeof(size_t)==8)
-			Z.hi ^= rem_8bit[rem];
-		else
-			Z.hi ^= (u64)rem_8bit[rem]<<32;
-	}
-
-	if (is_endian.little) {
-#ifdef BSWAP8
-		Xi[0] = BSWAP8(Z.hi);
-		Xi[1] = BSWAP8(Z.lo);
-#else
-		u8 *p = (u8 *)Xi;
-		u32 v;
-		v = (u32)(Z.hi>>32);	PUTU32(p,v);
-		v = (u32)(Z.hi);	PUTU32(p+4,v);
-		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
-		v = (u32)(Z.lo);	PUTU32(p+12,v);
-#endif
-	}
-	else {
-		Xi[0] = Z.hi;
-		Xi[1] = Z.lo;
-	}
+    u128 Z = { 0, 0 };
+    const u8 *xi = (const u8 *)Xi + 15;
+    size_t rem, n = *xi;
+    const union {
+        long one;
+        char little;
+    } is_endian = {
+        1
+    };
+    static const size_t rem_8bit[256] = {
+        PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
+        PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
+        PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
+        PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
+        PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
+        PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
+        PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
+        PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
+        PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
+        PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
+        PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
+        PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
+        PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
+        PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
+        PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
+        PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
+        PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
+        PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
+        PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
+        PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
+        PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
+        PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
+        PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
+        PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
+        PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
+        PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
+        PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
+        PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
+        PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
+        PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
+        PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
+        PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
+        PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
+        PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
+        PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
+        PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
+        PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
+        PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
+        PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
+        PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
+        PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
+        PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
+        PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
+        PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
+        PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
+        PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
+        PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
+        PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
+        PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
+        PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
+        PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
+        PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
+        PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
+        PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
+        PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
+        PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
+        PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
+        PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
+        PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
+        PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
+        PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
+        PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
+        PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
+        PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE)
+    };
+
+    while (1) {
+        Z.hi ^= Htable[n].hi;
+        Z.lo ^= Htable[n].lo;
+
+        if ((u8 *)Xi == xi)
+            break;
+
+        n = *(--xi);
+
+        rem = (size_t)Z.lo & 0xff;
+        Z.lo = (Z.hi << 56) | (Z.lo >> 8);
+        Z.hi = (Z.hi >> 8);
+        if (sizeof(size_t) == 8)
+            Z.hi ^= rem_8bit[rem];
+        else
+            Z.hi ^= (u64)rem_8bit[rem] << 32;
+    }
+
+    if (is_endian.little) {
+# ifdef BSWAP8
+        Xi[0] = BSWAP8(Z.hi);
+        Xi[1] = BSWAP8(Z.lo);
+# else
+        u8 *p = (u8 *)Xi;
+        u32 v;
+        v = (u32)(Z.hi >> 32);
+        PUTU32(p, v);
+        v = (u32)(Z.hi);
+        PUTU32(p + 4, v);
+        v = (u32)(Z.lo >> 32);
+        PUTU32(p + 8, v);
+        v = (u32)(Z.lo);
+        PUTU32(p + 12, v);
+# endif
+    } else {
+        Xi[0] = Z.hi;
+        Xi[1] = Z.lo;
+    }
 }
-#define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
 
-#elif	TABLE_BITS==4
+# define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
+
+#elif   TABLE_BITS==4
 
 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
 {
-	u128 V;
-#if defined(OPENSSL_SMALL_FOOTPRINT)
-	int  i;
-#endif
+    u128 V;
+# if defined(OPENSSL_SMALL_FOOTPRINT)
+    int i;
+# endif
 
-	Htable[0].hi = 0;
-	Htable[0].lo = 0;
-	V.hi = H[0];
-	V.lo = H[1];
-
-#if defined(OPENSSL_SMALL_FOOTPRINT)
-	for (Htable[8]=V, i=4; i>0; i>>=1) {
-		REDUCE1BIT(V);
-		Htable[i] = V;
-	}
-
-	for (i=2; i<16; i<<=1) {
-		u128 *Hi = Htable+i;
-		int   j;
-		for (V=*Hi, j=1; j<i; ++j) {
-			Hi[j].hi = V.hi^Htable[j].hi;
-			Hi[j].lo = V.lo^Htable[j].lo;
-		}
-	}
-#else
-	Htable[8] = V;
-	REDUCE1BIT(V);
-	Htable[4] = V;
-	REDUCE1BIT(V);
-	Htable[2] = V;
-	REDUCE1BIT(V);
-	Htable[1] = V;
-	Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
-	V=Htable[4];
-	Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
-	Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
-	Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
-	V=Htable[8];
-	Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
-	Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
-	Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
-	Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
-	Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
-	Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
-	Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
-#endif
-#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
-	/*
-	 * ARM assembler expects specific dword order in Htable.
-	 */
-	{
-	int j;
-	const union { long one; char little; } is_endian = {1};
-
-	if (is_endian.little)
-		for (j=0;j<16;++j) {
-			V = Htable[j];
-			Htable[j].hi = V.lo;
-			Htable[j].lo = V.hi;
-		}
-	else
-		for (j=0;j<16;++j) {
-			V = Htable[j];
-			Htable[j].hi = V.lo<<32|V.lo>>32;
-			Htable[j].lo = V.hi<<32|V.hi>>32;
-		}
-	}
-#endif
+    Htable[0].hi = 0;
+    Htable[0].lo = 0;
+    V.hi = H[0];
+    V.lo = H[1];
+
+# if defined(OPENSSL_SMALL_FOOTPRINT)
+    for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
+        REDUCE1BIT(V);
+        Htable[i] = V;
+    }
+
+    for (i = 2; i < 16; i <<= 1) {
+        u128 *Hi = Htable + i;
+        int j;
+        for (V = *Hi, j = 1; j < i; ++j) {
+            Hi[j].hi = V.hi ^ Htable[j].hi;
+            Hi[j].lo = V.lo ^ Htable[j].lo;
+        }
+    }
+# else
+    Htable[8] = V;
+    REDUCE1BIT(V);
+    Htable[4] = V;
+    REDUCE1BIT(V);
+    Htable[2] = V;
+    REDUCE1BIT(V);
+    Htable[1] = V;
+    Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
+    V = Htable[4];
+    Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
+    Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
+    Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
+    V = Htable[8];
+    Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
+    Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
+    Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
+    Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
+    Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
+    Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
+    Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
+# endif
+# if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
+    /*
+     * ARM assembler expects specific dword order in Htable.
+     */
+    {
+        int j;
+        const union {
+            long one;
+            char little;
+        } is_endian = {
+            1
+        };
+
+        if (is_endian.little)
+            for (j = 0; j < 16; ++j) {
+                V = Htable[j];
+                Htable[j].hi = V.lo;
+                Htable[j].lo = V.hi;
+        } else
+            for (j = 0; j < 16; ++j) {
+                V = Htable[j];
+                Htable[j].hi = V.lo << 32 | V.lo >> 32;
+                Htable[j].lo = V.hi << 32 | V.hi >> 32;
+            }
+    }
+# endif
 }
 
-#ifndef GHASH_ASM
+# ifndef GHASH_ASM
 static const size_t rem_4bit[16] = {
-	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
-	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
-	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
-	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
+    PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
+    PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
+    PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
+    PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)
+};
 
 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
 {
-	u128 Z;
-	int cnt = 15;
-	size_t rem, nlo, nhi;
-	const union { long one; char little; } is_endian = {1};
-
-	nlo  = ((const u8 *)Xi)[15];
-	nhi  = nlo>>4;
-	nlo &= 0xf;
-
-	Z.hi = Htable[nlo].hi;
-	Z.lo = Htable[nlo].lo;
-
-	while (1) {
-		rem  = (size_t)Z.lo&0xf;
-		Z.lo = (Z.hi<<60)|(Z.lo>>4);
-		Z.hi = (Z.hi>>4);
-		if (sizeof(size_t)==8)
-			Z.hi ^= rem_4bit[rem];
-		else
-			Z.hi ^= (u64)rem_4bit[rem]<<32;
-
-		Z.hi ^= Htable[nhi].hi;
-		Z.lo ^= Htable[nhi].lo;
-
-		if (--cnt<0)		break;
-
-		nlo  = ((const u8 *)Xi)[cnt];
-		nhi  = nlo>>4;
-		nlo &= 0xf;
-
-		rem  = (size_t)Z.lo&0xf;
-		Z.lo = (Z.hi<<60)|(Z.lo>>4);
-		Z.hi = (Z.hi>>4);
-		if (sizeof(size_t)==8)
-			Z.hi ^= rem_4bit[rem];
-		else
-			Z.hi ^= (u64)rem_4bit[rem]<<32;
-
-		Z.hi ^= Htable[nlo].hi;
-		Z.lo ^= Htable[nlo].lo;
-	}
-
-	if (is_endian.little) {
-#ifdef BSWAP8
-		Xi[0] = BSWAP8(Z.hi);
-		Xi[1] = BSWAP8(Z.lo);
-#else
-		u8 *p = (u8 *)Xi;
-		u32 v;
-		v = (u32)(Z.hi>>32);	PUTU32(p,v);
-		v = (u32)(Z.hi);	PUTU32(p+4,v);
-		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
-		v = (u32)(Z.lo);	PUTU32(p+12,v);
-#endif
-	}
-	else {
-		Xi[0] = Z.hi;
-		Xi[1] = Z.lo;
-	}
+    u128 Z;
+    int cnt = 15;
+    size_t rem, nlo, nhi;
+    const union {
+        long one;
+        char little;
+    } is_endian = {
+        1
+    };
+
+    nlo = ((const u8 *)Xi)[15];
+    nhi = nlo >> 4;
+    nlo &= 0xf;
+
+    Z.hi = Htable[nlo].hi;
+    Z.lo = Htable[nlo].lo;
+
+    while (1) {
+        rem = (size_t)Z.lo & 0xf;
+        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
+        Z.hi = (Z.hi >> 4);
+        if (sizeof(size_t) == 8)
+            Z.hi ^= rem_4bit[rem];
+        else
+            Z.hi ^= (u64)rem_4bit[rem] << 32;
+
+        Z.hi ^= Htable[nhi].hi;
+        Z.lo ^= Htable[nhi].lo;
+
+        if (--cnt < 0)
+            break;
+
+        nlo = ((const u8 *)Xi)[cnt];
+        nhi = nlo >> 4;
+        nlo &= 0xf;
+
+        rem = (size_t)Z.lo & 0xf;
+        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
+        Z.hi = (Z.hi >> 4);
+        if (sizeof(size_t) == 8)
+            Z.hi ^= rem_4bit[rem];
+        else
+            Z.hi ^= (u64)rem_4bit[rem] << 32;
+
+        Z.hi ^= Htable[nlo].hi;
+        Z.lo ^= Htable[nlo].lo;
+    }
+
+    if (is_endian.little) {
+#  ifdef BSWAP8
+        Xi[0] = BSWAP8(Z.hi);
+        Xi[1] = BSWAP8(Z.lo);
+#  else
+        u8 *p = (u8 *)Xi;
+        u32 v;
+        v = (u32)(Z.hi >> 32);
+        PUTU32(p, v);
+        v = (u32)(Z.hi);
+        PUTU32(p + 4, v);
+        v = (u32)(Z.lo >> 32);
+        PUTU32(p + 8, v);
+        v = (u32)(Z.lo);
+        PUTU32(p + 12, v);
+#  endif
+    } else {
+        Xi[0] = Z.hi;
+        Xi[1] = Z.lo;
+    }
 }
 
-#if !defined(OPENSSL_SMALL_FOOTPRINT)
+#  if !defined(OPENSSL_SMALL_FOOTPRINT)
 /*
  * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
  * details... Compiler-generated code doesn't seem to give any
@@ -403,1503 +428,1936 @@ static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
  * mostly as reference and a placeholder for possible future
  * non-trivial optimization[s]...
  */
-static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
-				const u8 *inp,size_t len)
+static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
+                           const u8 *inp, size_t len)
 {
     u128 Z;
     int cnt;
     size_t rem, nlo, nhi;
-    const union { long one; char little; } is_endian = {1};
-
-#if 1
+    const union {
+        long one;
+        char little;
+    } is_endian = {
+        1
+    };
+
+#   if 1
     do {
-	cnt  = 15;
-	nlo  = ((const u8 *)Xi)[15];
-	nlo ^= inp[15];
-	nhi  = nlo>>4;
-	nlo &= 0xf;
-
-	Z.hi = Htable[nlo].hi;
-	Z.lo = Htable[nlo].lo;
-
-	while (1) {
-		rem  = (size_t)Z.lo&0xf;
-		Z.lo = (Z.hi<<60)|(Z.lo>>4);
-		Z.hi = (Z.hi>>4);
-		if (sizeof(size_t)==8)
-			Z.hi ^= rem_4bit[rem];
-		else
-			Z.hi ^= (u64)rem_4bit[rem]<<32;
-
-		Z.hi ^= Htable[nhi].hi;
-		Z.lo ^= Htable[nhi].lo;
-
-		if (--cnt<0)		break;
-
-		nlo  = ((const u8 *)Xi)[cnt];
-		nlo ^= inp[cnt];
-		nhi  = nlo>>4;
-		nlo &= 0xf;
-
-		rem  = (size_t)Z.lo&0xf;
-		Z.lo = (Z.hi<<60)|(Z.lo>>4);
-		Z.hi = (Z.hi>>4);
-		if (sizeof(size_t)==8)
-			Z.hi ^= rem_4bit[rem];
-		else
-			Z.hi ^= (u64)rem_4bit[rem]<<32;
-
-		Z.hi ^= Htable[nlo].hi;
-		Z.lo ^= Htable[nlo].lo;
-	}
-#else
+        cnt = 15;
+        nlo = ((const u8 *)Xi)[15];
+        nlo ^= inp[15];
+        nhi = nlo >> 4;
+        nlo &= 0xf;
+
+        Z.hi = Htable[nlo].hi;
+        Z.lo = Htable[nlo].lo;
+
+        while (1) {
+            rem = (size_t)Z.lo & 0xf;
+            Z.lo = (Z.hi << 60) | (Z.lo >> 4);
+            Z.hi = (Z.hi >> 4);
+            if (sizeof(size_t) == 8)
+                Z.hi ^= rem_4bit[rem];
+            else
+                Z.hi ^= (u64)rem_4bit[rem] << 32;
+
+            Z.hi ^= Htable[nhi].hi;
+            Z.lo ^= Htable[nhi].lo;
+
+            if (--cnt < 0)
+                break;
+
+            nlo = ((const u8 *)Xi)[cnt];
+            nlo ^= inp[cnt];
+            nhi = nlo >> 4;
+            nlo &= 0xf;
+
+            rem = (size_t)Z.lo & 0xf;
+            Z.lo = (Z.hi << 60) | (Z.lo >> 4);
+            Z.hi = (Z.hi >> 4);
+            if (sizeof(size_t) == 8)
+                Z.hi ^= rem_4bit[rem];
+            else
+                Z.hi ^= (u64)rem_4bit[rem] << 32;
+
+            Z.hi ^= Htable[nlo].hi;
+            Z.lo ^= Htable[nlo].lo;
+        }
+#   else
     /*
      * Extra 256+16 bytes per-key plus 512 bytes shared tables
      * [should] give ~50% improvement... One could have PACK()-ed
      * the rem_8bit even here, but the priority is to minimize
      * cache footprint...
-     */ 
-    u128 Hshr4[16];	/* Htable shifted right by 4 bits */
-    u8   Hshl4[16];	/* Htable shifted left  by 4 bits */
+     */
+    u128 Hshr4[16];             /* Htable shifted right by 4 bits */
+    u8 Hshl4[16];               /* Htable shifted left by 4 bits */
     static const unsigned short rem_8bit[256] = {
-	0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
-	0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
-	0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
-	0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
-	0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
-	0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
-	0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
-	0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
-	0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
-	0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
-	0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
-	0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
-	0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
-	0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
-	0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
-	0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
-	0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
-	0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
-	0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
-	0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
-	0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
-	0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
-	0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
-	0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
-	0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
-	0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
-	0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
-	0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
-	0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
-	0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
-	0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
-	0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
+        0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
+        0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
+        0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
+        0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
+        0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
+        0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
+        0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
+        0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
+        0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
+        0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
+        0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
+        0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
+        0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
+        0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
+        0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
+        0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
+        0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
+        0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
+        0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
+        0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
+        0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
+        0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
+        0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
+        0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
+        0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
+        0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
+        0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
+        0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
+        0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
+        0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
+        0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
+        0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE
+    };
     /*
      * This pre-processing phase slows down procedure by approximately
      * same time as it makes each loop spin faster. In other words
      * single block performance is approximately same as straightforward
      * "4-bit" implementation, and then it goes only faster...
      */
-    for (cnt=0; cnt<16; ++cnt) {
-	Z.hi = Htable[cnt].hi;
-	Z.lo = Htable[cnt].lo;
-	Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
-	Hshr4[cnt].hi = (Z.hi>>4);
-	Hshl4[cnt]    = (u8)(Z.lo<<4);
+    for (cnt = 0; cnt < 16; ++cnt) {
+        Z.hi = Htable[cnt].hi;
+        Z.lo = Htable[cnt].lo;
+        Hshr4[cnt].lo = (Z.hi << 60) | (Z.lo >> 4);
+        Hshr4[cnt].hi = (Z.hi >> 4);
+        Hshl4[cnt] = (u8)(Z.lo << 4);
     }
 
     do {
-	for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
-		nlo  = ((const u8 *)Xi)[cnt];
-		nlo ^= inp[cnt];
-		nhi  = nlo>>4;
-		nlo &= 0xf;
+        for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
+            nlo = ((const u8 *)Xi)[cnt];
+            nlo ^= inp[cnt];
+            nhi = nlo >> 4;
+            nlo &= 0xf;
 
-		Z.hi ^= Htable[nlo].hi;
-		Z.lo ^= Htable[nlo].lo;
+            Z.hi ^= Htable[nlo].hi;
+            Z.lo ^= Htable[nlo].lo;
 
-		rem = (size_t)Z.lo&0xff;
+            rem = (size_t)Z.lo & 0xff;
 
-		Z.lo = (Z.hi<<56)|(Z.lo>>8);
-		Z.hi = (Z.hi>>8);
+            Z.lo = (Z.hi << 56) | (Z.lo >> 8);
+            Z.hi = (Z.hi >> 8);
 
-		Z.hi ^= Hshr4[nhi].hi;
-		Z.lo ^= Hshr4[nhi].lo;
-		Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
-	}
+            Z.hi ^= Hshr4[nhi].hi;
+            Z.lo ^= Hshr4[nhi].lo;
+            Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
+        }
 
-	nlo  = ((const u8 *)Xi)[0];
-	nlo ^= inp[0];
-	nhi  = nlo>>4;
-	nlo &= 0xf;
+        nlo = ((const u8 *)Xi)[0];
+        nlo ^= inp[0];
+        nhi = nlo >> 4;
+        nlo &= 0xf;
 
-	Z.hi ^= Htable[nlo].hi;
-	Z.lo ^= Htable[nlo].lo;
+        Z.hi ^= Htable[nlo].hi;
+        Z.lo ^= Htable[nlo].lo;
 
-	rem = (size_t)Z.lo&0xf;
+        rem = (size_t)Z.lo & 0xf;
 
-	Z.lo = (Z.hi<<60)|(Z.lo>>4);
-	Z.hi = (Z.hi>>4);
+        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
+        Z.hi = (Z.hi >> 4);
 
-	Z.hi ^= Htable[nhi].hi;
-	Z.lo ^= Htable[nhi].lo;
-	Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
-#endif
+        Z.hi ^= Htable[nhi].hi;
+        Z.lo ^= Htable[nhi].lo;
+        Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
+#   endif
 
-	if (is_endian.little) {
-#ifdef BSWAP8
-		Xi[0] = BSWAP8(Z.hi);
-		Xi[1] = BSWAP8(Z.lo);
-#else
-		u8 *p = (u8 *)Xi;
-		u32 v;
-		v = (u32)(Z.hi>>32);	PUTU32(p,v);
-		v = (u32)(Z.hi);	PUTU32(p+4,v);
-		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
-		v = (u32)(Z.lo);	PUTU32(p+12,v);
-#endif
-	}
-	else {
-		Xi[0] = Z.hi;
-		Xi[1] = Z.lo;
-	}
-    } while (inp+=16, len-=16);
+        if (is_endian.little) {
+#   ifdef BSWAP8
+            Xi[0] = BSWAP8(Z.hi);
+            Xi[1] = BSWAP8(Z.lo);
+#   else
+            u8 *p = (u8 *)Xi;
+            u32 v;
+            v = (u32)(Z.hi >> 32);
+            PUTU32(p, v);
+            v = (u32)(Z.hi);
+            PUTU32(p + 4, v);
+            v = (u32)(Z.lo >> 32);
+            PUTU32(p + 8, v);
+            v = (u32)(Z.lo);
+            PUTU32(p + 12, v);
+#   endif
+        } else {
+            Xi[0] = Z.hi;
+            Xi[1] = Z.lo;
+        }
+    } while (inp += 16, len -= 16);
 }
-#endif
-#else
-void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
-void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
-#endif
+#  endif
+# else
+void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+                    size_t len);
+# endif
 
-#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
-#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
-#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
-/* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
- * trashing effect. In other words idea is to hash data while it's
- * still in L1 cache after encryption pass... */
-#define GHASH_CHUNK       (3*1024)
-#endif
+# define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
+# if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
+#  define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
+/*
+ * GHASH_CHUNK is "stride parameter" missioned to mitigate cache trashing
+ * effect. In other words idea is to hash data while it's still in L1 cache
+ * after encryption pass...
+ */
+#  define GHASH_CHUNK       (3*1024)
+# endif
 
-#else	/* TABLE_BITS */
+#else                           /* TABLE_BITS */
 
-static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
+static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
 {
-	u128 V,Z = { 0,0 };
-	long X;
-	int  i,j;
-	const long *xi = (const long *)Xi;
-	const union { long one; char little; } is_endian = {1};
-
-	V.hi = H[0];	/* H is in host byte order, no byte swapping */
-	V.lo = H[1];
-
-	for (j=0; j<16/sizeof(long); ++j) {
-		if (is_endian.little) {
-			if (sizeof(long)==8) {
-#ifdef BSWAP8
-				X = (long)(BSWAP8(xi[j]));
-#else
-				const u8 *p = (const u8 *)(xi+j);
-				X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
-#endif
-			}
-			else {
-				const u8 *p = (const u8 *)(xi+j);
-				X = (long)GETU32(p);
-			}
-		}
-		else
-			X = xi[j];
-
-		for (i=0; i<8*sizeof(long); ++i, X<<=1) {
-			u64 M = (u64)(X>>(8*sizeof(long)-1));
-			Z.hi ^= V.hi&M;
-			Z.lo ^= V.lo&M;
-
-			REDUCE1BIT(V);
-		}
-	}
-
-	if (is_endian.little) {
-#ifdef BSWAP8
-		Xi[0] = BSWAP8(Z.hi);
-		Xi[1] = BSWAP8(Z.lo);
-#else
-		u8 *p = (u8 *)Xi;
-		u32 v;
-		v = (u32)(Z.hi>>32);	PUTU32(p,v);
-		v = (u32)(Z.hi);	PUTU32(p+4,v);
-		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
-		v = (u32)(Z.lo);	PUTU32(p+12,v);
-#endif
-	}
-	else {
-		Xi[0] = Z.hi;
-		Xi[1] = Z.lo;
-	}
+    u128 V, Z = { 0, 0 };
+    long X;
+    int i, j;
+    const long *xi = (const long *)Xi;
+    const union {
+        long one;
+        char little;
+    } is_endian = {
+        1
+    };
+
+    V.hi = H[0];                /* H is in host byte order, no byte swapping */
+    V.lo = H[1];
+
+    for (j = 0; j < 16 / sizeof(long); ++j) {
+        if (is_endian.little) {
+            if (sizeof(long) == 8) {
+# ifdef BSWAP8
+                X = (long)(BSWAP8(xi[j]));
+# else
+                const u8 *p = (const u8 *)(xi + j);
+                X = (long)((u64)GETU32(p) << 32 | GETU32(p + 4));
+# endif
+            } else {
+                const u8 *p = (const u8 *)(xi + j);
+                X = (long)GETU32(p);
+            }
+        } else
+            X = xi[j];
+
+        for (i = 0; i < 8 * sizeof(long); ++i, X <<= 1) {
+            u64 M = (u64)(X >> (8 * sizeof(long) - 1));
+            Z.hi ^= V.hi & M;
+            Z.lo ^= V.lo & M;
+
+            REDUCE1BIT(V);
+        }
+    }
+
+    if (is_endian.little) {
+# ifdef BSWAP8
+        Xi[0] = BSWAP8(Z.hi);
+        Xi[1] = BSWAP8(Z.lo);
+# else
+        u8 *p = (u8 *)Xi;
+        u32 v;
+        v = (u32)(Z.hi >> 32);
+        PUTU32(p, v);
+        v = (u32)(Z.hi);
+        PUTU32(p + 4, v);
+        v = (u32)(Z.lo >> 32);
+        PUTU32(p + 8, v);
+        v = (u32)(Z.lo);
+        PUTU32(p + 12, v);
+# endif
+    } else {
+        Xi[0] = Z.hi;
+        Xi[1] = Z.lo;
+    }
 }
-#define GCM_MUL(ctx,Xi)	  gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
+
+# define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
 
 #endif
 
-#if	TABLE_BITS==4 && defined(GHASH_ASM)
-# if	!defined(I386_ONLY) && \
-	(defined(__i386)	|| defined(__i386__)	|| \
-	 defined(__x86_64)	|| defined(__x86_64__)	|| \
-	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
+#if     TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
+# if    !defined(I386_ONLY) && \
+        (defined(__i386)        || defined(__i386__)    || \
+         defined(__x86_64)      || defined(__x86_64__)  || \
+         defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
 #  define GHASH_ASM_X86_OR_64
 #  define GCM_FUNCREF_4BIT
 extern unsigned int OPENSSL_ia32cap_P[2];
 
-void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
-void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
-void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
+void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+                     size_t len);
 
-#  if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
+#  if defined(__i386) || defined(__i386__) || defined(_M_IX86)
+#   define gcm_init_avx   gcm_init_clmul
+#   define gcm_gmult_avx  gcm_gmult_clmul
+#   define gcm_ghash_avx  gcm_ghash_clmul
+#  else
+void gcm_init_avx(u128 Htable[16], const u64 Xi[2]);
+void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+                   size_t len);
+#  endif
+
+#  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
 #   define GHASH_ASM_X86
-void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
-void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+                        size_t len);
 
-void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
-void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+                        size_t len);
 #  endif
-# elif defined(__arm__) || defined(__arm)
+# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
 #  include "arm_arch.h"
-#  if __ARM_ARCH__>=7
+#  if __ARM_MAX_ARCH__>=7
 #   define GHASH_ASM_ARM
 #   define GCM_FUNCREF_4BIT
-void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
-void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+#   define PMULL_CAPABLE        (OPENSSL_armcap_P & ARMV8_PMULL)
+#   if defined(__arm__) || defined(__arm)
+#    define NEON_CAPABLE        (OPENSSL_armcap_P & ARMV7_NEON)
+#   endif
+void gcm_init_neon(u128 Htable[16], const u64 Xi[2]);
+void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+                    size_t len);
+void gcm_init_v8(u128 Htable[16], const u64 Xi[2]);
+void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+                  size_t len);
 #  endif
+# elif defined(__sparc__) || defined(__sparc)
+#  include "sparc_arch.h"
+#  define GHASH_ASM_SPARC
+#  define GCM_FUNCREF_4BIT
+extern unsigned int OPENSSL_sparcv9cap_P[];
+void gcm_init_vis3(u128 Htable[16], const u64 Xi[2]);
+void gcm_gmult_vis3(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_vis3(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+                    size_t len);
+# elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
+#  include "ppc_arch.h"
+#  define GHASH_ASM_PPC
+#  define GCM_FUNCREF_4BIT
+void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
+void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+                  size_t len);
 # endif
 #endif
 
 #ifdef GCM_FUNCREF_4BIT
 # undef  GCM_MUL
-# define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
+# define GCM_MUL(ctx,Xi)        (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
 # ifdef GHASH
 #  undef  GHASH
-#  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
+#  define GHASH(ctx,in,len)     (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
 # endif
 #endif
 
-void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
+void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
 {
-	const union { long one; char little; } is_endian = {1};
+    const union {
+        long one;
+        char little;
+    } is_endian = {
+        1
+    };
 
-	memset(ctx,0,sizeof(*ctx));
-	ctx->block = block;
-	ctx->key   = key;
+    memset(ctx, 0, sizeof(*ctx));
+    ctx->block = block;
+    ctx->key = key;
 
-	(*block)(ctx->H.c,ctx->H.c,key);
+    (*block) (ctx->H.c, ctx->H.c, key);
 
-	if (is_endian.little) {
-		/* H is stored in host byte order */
+    if (is_endian.little) {
+        /* H is stored in host byte order */
 #ifdef BSWAP8
-		ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
-		ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
+        ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
+        ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
 #else
-		u8 *p = ctx->H.c;
-		u64 hi,lo;
-		hi = (u64)GETU32(p)  <<32|GETU32(p+4);
-		lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
-		ctx->H.u[0] = hi;
-		ctx->H.u[1] = lo;
+        u8 *p = ctx->H.c;
+        u64 hi, lo;
+        hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
+        lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
+        ctx->H.u[0] = hi;
+        ctx->H.u[1] = lo;
 #endif
-	}
-
-#if	TABLE_BITS==8
-	gcm_init_8bit(ctx->Htable,ctx->H.u);
-#elif	TABLE_BITS==4
-# if	defined(GHASH_ASM_X86_OR_64)
-#  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
-	if (OPENSSL_ia32cap_P[0]&(1<<24) &&	/* check FXSR bit */
-	    OPENSSL_ia32cap_P[1]&(1<<1) ) {	/* check PCLMULQDQ bit */
-		gcm_init_clmul(ctx->Htable,ctx->H.u);
-		ctx->gmult = gcm_gmult_clmul;
-		ctx->ghash = gcm_ghash_clmul;
-		return;
-	}
+    }
+#if     TABLE_BITS==8
+    gcm_init_8bit(ctx->Htable, ctx->H.u);
+#elif   TABLE_BITS==4
+# if    defined(GHASH_ASM_X86_OR_64)
+#  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
+    if (OPENSSL_ia32cap_P[0] & (1 << 24) && /* check FXSR bit */
+        OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */
+        if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
+            gcm_init_avx(ctx->Htable, ctx->H.u);
+            ctx->gmult = gcm_gmult_avx;
+            ctx->ghash = gcm_ghash_avx;
+        } else {
+            gcm_init_clmul(ctx->Htable, ctx->H.u);
+            ctx->gmult = gcm_gmult_clmul;
+            ctx->ghash = gcm_ghash_clmul;
+        }
+        return;
+    }
 #  endif
-	gcm_init_4bit(ctx->Htable,ctx->H.u);
-#  if	defined(GHASH_ASM_X86)			/* x86 only */
-#   if	defined(OPENSSL_IA32_SSE2)
-	if (OPENSSL_ia32cap_P[0]&(1<<25)) {	/* check SSE bit */
+    gcm_init_4bit(ctx->Htable, ctx->H.u);
+#  if   defined(GHASH_ASM_X86)  /* x86 only */
+#   if  defined(OPENSSL_IA32_SSE2)
+    if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
 #   else
-	if (OPENSSL_ia32cap_P[0]&(1<<23)) {	/* check MMX bit */
+    if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */
 #   endif
-		ctx->gmult = gcm_gmult_4bit_mmx;
-		ctx->ghash = gcm_ghash_4bit_mmx;
-	} else {
-		ctx->gmult = gcm_gmult_4bit_x86;
-		ctx->ghash = gcm_ghash_4bit_x86;
-	}
+        ctx->gmult = gcm_gmult_4bit_mmx;
+        ctx->ghash = gcm_ghash_4bit_mmx;
+    } else {
+        ctx->gmult = gcm_gmult_4bit_x86;
+        ctx->ghash = gcm_ghash_4bit_x86;
+    }
 #  else
-	ctx->gmult = gcm_gmult_4bit;
-	ctx->ghash = gcm_ghash_4bit;
+    ctx->gmult = gcm_gmult_4bit;
+    ctx->ghash = gcm_ghash_4bit;
+#  endif
+# elif  defined(GHASH_ASM_ARM)
+#  ifdef PMULL_CAPABLE
+    if (PMULL_CAPABLE) {
+        gcm_init_v8(ctx->Htable, ctx->H.u);
+        ctx->gmult = gcm_gmult_v8;
+        ctx->ghash = gcm_ghash_v8;
+    } else
 #  endif
-# elif	defined(GHASH_ASM_ARM)
-	if (OPENSSL_armcap_P & ARMV7_NEON) {
-		ctx->gmult = gcm_gmult_neon;
-		ctx->ghash = gcm_ghash_neon;
-	} else {
-		gcm_init_4bit(ctx->Htable,ctx->H.u);
-		ctx->gmult = gcm_gmult_4bit;
-		ctx->ghash = gcm_ghash_4bit;
-	}
+#  ifdef NEON_CAPABLE
+    if (NEON_CAPABLE) {
+        gcm_init_neon(ctx->Htable, ctx->H.u);
+        ctx->gmult = gcm_gmult_neon;
+        ctx->ghash = gcm_ghash_neon;
+    } else
+#  endif
+    {
+        gcm_init_4bit(ctx->Htable, ctx->H.u);
+        ctx->gmult = gcm_gmult_4bit;
+        ctx->ghash = gcm_ghash_4bit;
+    }
+# elif  defined(GHASH_ASM_SPARC)
+    if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
+        gcm_init_vis3(ctx->Htable, ctx->H.u);
+        ctx->gmult = gcm_gmult_vis3;
+        ctx->ghash = gcm_ghash_vis3;
+    } else {
+        gcm_init_4bit(ctx->Htable, ctx->H.u);
+        ctx->gmult = gcm_gmult_4bit;
+        ctx->ghash = gcm_ghash_4bit;
+    }
+# elif  defined(GHASH_ASM_PPC)
+    if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
+        gcm_init_p8(ctx->Htable, ctx->H.u);
+        ctx->gmult = gcm_gmult_p8;
+        ctx->ghash = gcm_ghash_p8;
+    } else {
+        gcm_init_4bit(ctx->Htable, ctx->H.u);
+        ctx->gmult = gcm_gmult_4bit;
+        ctx->ghash = gcm_ghash_4bit;
+    }
 # else
-	gcm_init_4bit(ctx->Htable,ctx->H.u);
+    gcm_init_4bit(ctx->Htable, ctx->H.u);
 # endif
 #endif
 }
 
-void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
+void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
+                         size_t len)
 {
-	const union { long one; char little; } is_endian = {1};
-	unsigned int ctr;
+    const union {
+        long one;
+        char little;
+    } is_endian = {
+        1
+    };
+    unsigned int ctr;
 #ifdef GCM_FUNCREF_4BIT
-	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
-#endif
-
-	ctx->Yi.u[0]  = 0;
-	ctx->Yi.u[1]  = 0;
-	ctx->Xi.u[0]  = 0;
-	ctx->Xi.u[1]  = 0;
-	ctx->len.u[0] = 0;	/* AAD length */
-	ctx->len.u[1] = 0;	/* message length */
-	ctx->ares = 0;
-	ctx->mres = 0;
-
-	if (len==12) {
-		memcpy(ctx->Yi.c,iv,12);
-		ctx->Yi.c[15]=1;
-		ctr=1;
-	}
-	else {
-		size_t i;
-		u64 len0 = len;
-
-		while (len>=16) {
-			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
-			GCM_MUL(ctx,Yi);
-			iv += 16;
-			len -= 16;
-		}
-		if (len) {
-			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
-			GCM_MUL(ctx,Yi);
-		}
-		len0 <<= 3;
-		if (is_endian.little) {
+    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
+#endif
+
+    ctx->Yi.u[0] = 0;
+    ctx->Yi.u[1] = 0;
+    ctx->Xi.u[0] = 0;
+    ctx->Xi.u[1] = 0;
+    ctx->len.u[0] = 0;          /* AAD length */
+    ctx->len.u[1] = 0;          /* message length */
+    ctx->ares = 0;
+    ctx->mres = 0;
+
+    if (len == 12) {
+        memcpy(ctx->Yi.c, iv, 12);
+        ctx->Yi.c[15] = 1;
+        ctr = 1;
+    } else {
+        size_t i;
+        u64 len0 = len;
+
+        while (len >= 16) {
+            for (i = 0; i < 16; ++i)
+                ctx->Yi.c[i] ^= iv[i];
+            GCM_MUL(ctx, Yi);
+            iv += 16;
+            len -= 16;
+        }
+        if (len) {
+            for (i = 0; i < len; ++i)
+                ctx->Yi.c[i] ^= iv[i];
+            GCM_MUL(ctx, Yi);
+        }
+        len0 <<= 3;
+        if (is_endian.little) {
 #ifdef BSWAP8
-			ctx->Yi.u[1]  ^= BSWAP8(len0);
+            ctx->Yi.u[1] ^= BSWAP8(len0);
 #else
-			ctx->Yi.c[8]  ^= (u8)(len0>>56);
-			ctx->Yi.c[9]  ^= (u8)(len0>>48);
-			ctx->Yi.c[10] ^= (u8)(len0>>40);
-			ctx->Yi.c[11] ^= (u8)(len0>>32);
-			ctx->Yi.c[12] ^= (u8)(len0>>24);
-			ctx->Yi.c[13] ^= (u8)(len0>>16);
-			ctx->Yi.c[14] ^= (u8)(len0>>8);
-			ctx->Yi.c[15] ^= (u8)(len0);
+            ctx->Yi.c[8] ^= (u8)(len0 >> 56);
+            ctx->Yi.c[9] ^= (u8)(len0 >> 48);
+            ctx->Yi.c[10] ^= (u8)(len0 >> 40);
+            ctx->Yi.c[11] ^= (u8)(len0 >> 32);
+            ctx->Yi.c[12] ^= (u8)(len0 >> 24);
+            ctx->Yi.c[13] ^= (u8)(len0 >> 16);
+            ctx->Yi.c[14] ^= (u8)(len0 >> 8);
+            ctx->Yi.c[15] ^= (u8)(len0);
 #endif
-		}
-		else
-			ctx->Yi.u[1]  ^= len0;
+        } else
+            ctx->Yi.u[1] ^= len0;
 
-		GCM_MUL(ctx,Yi);
+        GCM_MUL(ctx, Yi);
 
-		if (is_endian.little)
+        if (is_endian.little)
 #ifdef BSWAP4
-			ctr = BSWAP4(ctx->Yi.d[3]);
+            ctr = BSWAP4(ctx->Yi.d[3]);
 #else
-			ctr = GETU32(ctx->Yi.c+12);
+            ctr = GETU32(ctx->Yi.c + 12);
 #endif
-		else
-			ctr = ctx->Yi.d[3];
-	}
+        else
+            ctr = ctx->Yi.d[3];
+    }
 
-	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
-	++ctr;
-	if (is_endian.little)
+    (*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key);
+    ++ctr;
+    if (is_endian.little)
 #ifdef BSWAP4
-		ctx->Yi.d[3] = BSWAP4(ctr);
+        ctx->Yi.d[3] = BSWAP4(ctr);
 #else
-		PUTU32(ctx->Yi.c+12,ctr);
+        PUTU32(ctx->Yi.c + 12, ctr);
 #endif
-	else
-		ctx->Yi.d[3] = ctr;
+    else
+        ctx->Yi.d[3] = ctr;
 }
 
-int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
+int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
+                      size_t len)
 {
-	size_t i;
-	unsigned int n;
-	u64 alen = ctx->len.u[0];
+    size_t i;
+    unsigned int n;
+    u64 alen = ctx->len.u[0];
 #ifdef GCM_FUNCREF_4BIT
-	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
+    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
 # ifdef GHASH
-	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
-				const u8 *inp,size_t len)	= ctx->ghash;
+    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
+                         const u8 *inp, size_t len) = ctx->ghash;
 # endif
 #endif
 
-	if (ctx->len.u[1]) return -2;
-
-	alen += len;
-	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
-		return -1;
-	ctx->len.u[0] = alen;
-
-	n = ctx->ares;
-	if (n) {
-		while (n && len) {
-			ctx->Xi.c[n] ^= *(aad++);
-			--len;
-			n = (n+1)%16;
-		}
-		if (n==0) GCM_MUL(ctx,Xi);
-		else {
-			ctx->ares = n;
-			return 0;
-		}
-	}
-
+    if (ctx->len.u[1])
+        return -2;
+
+    alen += len;
+    if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
+        return -1;
+    ctx->len.u[0] = alen;
+
+    n = ctx->ares;
+    if (n) {
+        while (n && len) {
+            ctx->Xi.c[n] ^= *(aad++);
+            --len;
+            n = (n + 1) % 16;
+        }
+        if (n == 0)
+            GCM_MUL(ctx, Xi);
+        else {
+            ctx->ares = n;
+            return 0;
+        }
+    }
 #ifdef GHASH
-	if ((i = (len&(size_t)-16))) {
-		GHASH(ctx,aad,i);
-		aad += i;
-		len -= i;
-	}
+    if ((i = (len & (size_t)-16))) {
+        GHASH(ctx, aad, i);
+        aad += i;
+        len -= i;
+    }
 #else
-	while (len>=16) {
-		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
-		GCM_MUL(ctx,Xi);
-		aad += 16;
-		len -= 16;
-	}
+    while (len >= 16) {
+        for (i = 0; i < 16; ++i)
+            ctx->Xi.c[i] ^= aad[i];
+        GCM_MUL(ctx, Xi);
+        aad += 16;
+        len -= 16;
+    }
 #endif
-	if (len) {
-		n = (unsigned int)len;
-		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
-	}
+    if (len) {
+        n = (unsigned int)len;
+        for (i = 0; i < len; ++i)
+            ctx->Xi.c[i] ^= aad[i];
+    }
 
-	ctx->ares = n;
-	return 0;
+    ctx->ares = n;
+    return 0;
 }
 
 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
-		const unsigned char *in, unsigned char *out,
-		size_t len)
+                          const unsigned char *in, unsigned char *out,
+                          size_t len)
 {
-	const union { long one; char little; } is_endian = {1};
-	unsigned int n, ctr;
-	size_t i;
-	u64        mlen  = ctx->len.u[1];
-	block128_f block = ctx->block;
-	void      *key   = ctx->key;
+    const union {
+        long one;
+        char little;
+    } is_endian = {
+        1
+    };
+    unsigned int n, ctr;
+    size_t i;
+    u64 mlen = ctx->len.u[1];
+    block128_f block = ctx->block;
+    void *key = ctx->key;
 #ifdef GCM_FUNCREF_4BIT
-	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
+    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
 # ifdef GHASH
-	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
-				const u8 *inp,size_t len)	= ctx->ghash;
+    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
+                         const u8 *inp, size_t len) = ctx->ghash;
 # endif
 #endif
 
 #if 0
-	n = (unsigned int)mlen%16; /* alternative to ctx->mres */
+    n = (unsigned int)mlen % 16; /* alternative to ctx->mres */
 #endif
-	mlen += len;
-	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
-		return -1;
-	ctx->len.u[1] = mlen;
-
-	if (ctx->ares) {
-		/* First call to encrypt finalizes GHASH(AAD) */
-		GCM_MUL(ctx,Xi);
-		ctx->ares = 0;
-	}
-
-	if (is_endian.little)
-#ifdef BSWAP4
-		ctr = BSWAP4(ctx->Yi.d[3]);
-#else
-		ctr = GETU32(ctx->Yi.c+12);
-#endif
-	else
-		ctr = ctx->Yi.d[3];
+    mlen += len;
+    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
+        return -1;
+    ctx->len.u[1] = mlen;
 
-	n = ctx->mres;
-#if !defined(OPENSSL_SMALL_FOOTPRINT)
-	if (16%sizeof(size_t) == 0) do {	/* always true actually */
-		if (n) {
-			while (n && len) {
-				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
-				--len;
-				n = (n+1)%16;
-			}
-			if (n==0) GCM_MUL(ctx,Xi);
-			else {
-				ctx->mres = n;
-				return 0;
-			}
-		}
-#if defined(STRICT_ALIGNMENT)
-		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
-			break;
-#endif
-#if defined(GHASH) && defined(GHASH_CHUNK)
-		while (len>=GHASH_CHUNK) {
-		    size_t j=GHASH_CHUNK;
-
-		    while (j) {
-		    	size_t *out_t=(size_t *)out;
-		    	const size_t *in_t=(const size_t *)in;
+    if (ctx->ares) {
+        /* First call to encrypt finalizes GHASH(AAD) */
+        GCM_MUL(ctx, Xi);
+        ctx->ares = 0;
+    }
 
-			(*block)(ctx->Yi.c,ctx->EKi.c,key);
-			++ctr;
-			if (is_endian.little)
+    if (is_endian.little)
 #ifdef BSWAP4
-				ctx->Yi.d[3] = BSWAP4(ctr);
+        ctr = BSWAP4(ctx->Yi.d[3]);
 #else
-				PUTU32(ctx->Yi.c+12,ctr);
+        ctr = GETU32(ctx->Yi.c + 12);
 #endif
-			else
-				ctx->Yi.d[3] = ctr;
-			for (i=0; i<16/sizeof(size_t); ++i)
-				out_t[i] = in_t[i] ^ ctx->EKi.t[i];
-			out += 16;
-			in  += 16;
-			j   -= 16;
-		    }
-		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
-		    len -= GHASH_CHUNK;
-		}
-		if ((i = (len&(size_t)-16))) {
-		    size_t j=i;
-
-		    while (len>=16) {
-		    	size_t *out_t=(size_t *)out;
-		    	const size_t *in_t=(const size_t *)in;
-
-			(*block)(ctx->Yi.c,ctx->EKi.c,key);
-			++ctr;
-			if (is_endian.little)
-#ifdef BSWAP4
-				ctx->Yi.d[3] = BSWAP4(ctr);
-#else
-				PUTU32(ctx->Yi.c+12,ctr);
-#endif
-			else
-				ctx->Yi.d[3] = ctr;
-			for (i=0; i<16/sizeof(size_t); ++i)
-				out_t[i] = in_t[i] ^ ctx->EKi.t[i];
-			out += 16;
-			in  += 16;
-			len -= 16;
-		    }
-		    GHASH(ctx,out-j,j);
-		}
-#else
-		while (len>=16) {
-		    	size_t *out_t=(size_t *)out;
-		    	const size_t *in_t=(const size_t *)in;
+    else
+        ctr = ctx->Yi.d[3];
 
-			(*block)(ctx->Yi.c,ctx->EKi.c,key);
-			++ctr;
-			if (is_endian.little)
-#ifdef BSWAP4
-				ctx->Yi.d[3] = BSWAP4(ctr);
-#else
-				PUTU32(ctx->Yi.c+12,ctr);
-#endif
-			else
-				ctx->Yi.d[3] = ctr;
-			for (i=0; i<16/sizeof(size_t); ++i)
-				ctx->Xi.t[i] ^=
-				out_t[i] = in_t[i]^ctx->EKi.t[i];
-			GCM_MUL(ctx,Xi);
-			out += 16;
-			in  += 16;
-			len -= 16;
-		}
-#endif
-		if (len) {
-			(*block)(ctx->Yi.c,ctx->EKi.c,key);
-			++ctr;
-			if (is_endian.little)
-#ifdef BSWAP4
-				ctx->Yi.d[3] = BSWAP4(ctr);
-#else
-				PUTU32(ctx->Yi.c+12,ctr);
-#endif
-			else
-				ctx->Yi.d[3] = ctr;
-			while (len--) {
-				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
-				++n;
-			}
-		}
-
-		ctx->mres = n;
-		return 0;
-	} while(0);
+    n = ctx->mres;
+#if !defined(OPENSSL_SMALL_FOOTPRINT)
+    if (16 % sizeof(size_t) == 0) { /* always true actually */
+        do {
+            if (n) {
+                while (n && len) {
+                    ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
+                    --len;
+                    n = (n + 1) % 16;
+                }
+                if (n == 0)
+                    GCM_MUL(ctx, Xi);
+                else {
+                    ctx->mres = n;
+                    return 0;
+                }
+            }
+# if defined(STRICT_ALIGNMENT)
+            if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
+                break;
+# endif
+# if defined(GHASH) && defined(GHASH_CHUNK)
+            while (len >= GHASH_CHUNK) {
+                size_t j = GHASH_CHUNK;
+
+                while (j) {
+                    size_t *out_t = (size_t *)out;
+                    const size_t *in_t = (const size_t *)in;
+
+                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
+                    ++ctr;
+                    if (is_endian.little)
+#  ifdef BSWAP4
+                        ctx->Yi.d[3] = BSWAP4(ctr);
+#  else
+                        PUTU32(ctx->Yi.c + 12, ctr);
+#  endif
+                    else
+                        ctx->Yi.d[3] = ctr;
+                    for (i = 0; i < 16 / sizeof(size_t); ++i)
+                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+                    out += 16;
+                    in += 16;
+                    j -= 16;
+                }
+                GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
+                len -= GHASH_CHUNK;
+            }
+            if ((i = (len & (size_t)-16))) {
+                size_t j = i;
+
+                while (len >= 16) {
+                    size_t *out_t = (size_t *)out;
+                    const size_t *in_t = (const size_t *)in;
+
+                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
+                    ++ctr;
+                    if (is_endian.little)
+#  ifdef BSWAP4
+                        ctx->Yi.d[3] = BSWAP4(ctr);
+#  else
+                        PUTU32(ctx->Yi.c + 12, ctr);
+#  endif
+                    else
+                        ctx->Yi.d[3] = ctr;
+                    for (i = 0; i < 16 / sizeof(size_t); ++i)
+                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+                    out += 16;
+                    in += 16;
+                    len -= 16;
+                }
+                GHASH(ctx, out - j, j);
+            }
+# else
+            while (len >= 16) {
+                size_t *out_t = (size_t *)out;
+                const size_t *in_t = (const size_t *)in;
+
+                (*block) (ctx->Yi.c, ctx->EKi.c, key);
+                ++ctr;
+                if (is_endian.little)
+#  ifdef BSWAP4
+                    ctx->Yi.d[3] = BSWAP4(ctr);
+#  else
+                    PUTU32(ctx->Yi.c + 12, ctr);
+#  endif
+                else
+                    ctx->Yi.d[3] = ctr;
+                for (i = 0; i < 16 / sizeof(size_t); ++i)
+                    ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+                GCM_MUL(ctx, Xi);
+                out += 16;
+                in += 16;
+                len -= 16;
+            }
+# endif
+            if (len) {
+                (*block) (ctx->Yi.c, ctx->EKi.c, key);
+                ++ctr;
+                if (is_endian.little)
+# ifdef BSWAP4
+                    ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+                    PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+                else
+                    ctx->Yi.d[3] = ctr;
+                while (len--) {
+                    ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
+                    ++n;
+                }
+            }
+
+            ctx->mres = n;
+            return 0;
+        } while (0);
+    }
 #endif
-	for (i=0;i<len;++i) {
-		if (n==0) {
-			(*block)(ctx->Yi.c,ctx->EKi.c,key);
-			++ctr;
-			if (is_endian.little)
+    for (i = 0; i < len; ++i) {
+        if (n == 0) {
+            (*block) (ctx->Yi.c, ctx->EKi.c, key);
+            ++ctr;
+            if (is_endian.little)
 #ifdef BSWAP4
-				ctx->Yi.d[3] = BSWAP4(ctr);
+                ctx->Yi.d[3] = BSWAP4(ctr);
 #else
-				PUTU32(ctx->Yi.c+12,ctr);
-#endif
-			else
-				ctx->Yi.d[3] = ctr;
-		}
-		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
-		n = (n+1)%16;
-		if (n==0)
-			GCM_MUL(ctx,Xi);
-	}
-
-	ctx->mres = n;
-	return 0;
+                PUTU32(ctx->Yi.c + 12, ctr);
+#endif
+            else
+                ctx->Yi.d[3] = ctr;
+        }
+        ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
+        n = (n + 1) % 16;
+        if (n == 0)
+            GCM_MUL(ctx, Xi);
+    }
+
+    ctx->mres = n;
+    return 0;
 }
 
 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
-		const unsigned char *in, unsigned char *out,
-		size_t len)
+                          const unsigned char *in, unsigned char *out,
+                          size_t len)
 {
-	const union { long one; char little; } is_endian = {1};
-	unsigned int n, ctr;
-	size_t i;
-	u64        mlen  = ctx->len.u[1];
-	block128_f block = ctx->block;
-	void      *key   = ctx->key;
+    const union {
+        long one;
+        char little;
+    } is_endian = {
+        1
+    };
+    unsigned int n, ctr;
+    size_t i;
+    u64 mlen = ctx->len.u[1];
+    block128_f block = ctx->block;
+    void *key = ctx->key;
 #ifdef GCM_FUNCREF_4BIT
-	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
+    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
 # ifdef GHASH
-	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
-				const u8 *inp,size_t len)	= ctx->ghash;
+    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
+                         const u8 *inp, size_t len) = ctx->ghash;
 # endif
 #endif
 
-	mlen += len;
-	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
-		return -1;
-	ctx->len.u[1] = mlen;
+    mlen += len;
+    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
+        return -1;
+    ctx->len.u[1] = mlen;
 
-	if (ctx->ares) {
-		/* First call to decrypt finalizes GHASH(AAD) */
-		GCM_MUL(ctx,Xi);
-		ctx->ares = 0;
-	}
+    if (ctx->ares) {
+        /* First call to decrypt finalizes GHASH(AAD) */
+        GCM_MUL(ctx, Xi);
+        ctx->ares = 0;
+    }
 
-	if (is_endian.little)
+    if (is_endian.little)
 #ifdef BSWAP4
-		ctr = BSWAP4(ctx->Yi.d[3]);
+        ctr = BSWAP4(ctx->Yi.d[3]);
 #else
-		ctr = GETU32(ctx->Yi.c+12);
+        ctr = GETU32(ctx->Yi.c + 12);
 #endif
-	else
-		ctr = ctx->Yi.d[3];
+    else
+        ctr = ctx->Yi.d[3];
 
-	n = ctx->mres;
+    n = ctx->mres;
 #if !defined(OPENSSL_SMALL_FOOTPRINT)
-	if (16%sizeof(size_t) == 0) do {	/* always true actually */
-		if (n) {
-			while (n && len) {
-				u8 c = *(in++);
-				*(out++) = c^ctx->EKi.c[n];
-				ctx->Xi.c[n] ^= c;
-				--len;
-				n = (n+1)%16;
-			}
-			if (n==0) GCM_MUL (ctx,Xi);
-			else {
-				ctx->mres = n;
-				return 0;
-			}
-		}
-#if defined(STRICT_ALIGNMENT)
-		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
-			break;
-#endif
-#if defined(GHASH) && defined(GHASH_CHUNK)
-		while (len>=GHASH_CHUNK) {
-		    size_t j=GHASH_CHUNK;
-
-		    GHASH(ctx,in,GHASH_CHUNK);
-		    while (j) {
-		    	size_t *out_t=(size_t *)out;
-		    	const size_t *in_t=(const size_t *)in;
-
-			(*block)(ctx->Yi.c,ctx->EKi.c,key);
-			++ctr;
-			if (is_endian.little)
-#ifdef BSWAP4
-				ctx->Yi.d[3] = BSWAP4(ctr);
-#else
-				PUTU32(ctx->Yi.c+12,ctr);
+    if (16 % sizeof(size_t) == 0) { /* always true actually */
+        do {
+            if (n) {
+                while (n && len) {
+                    u8 c = *(in++);
+                    *(out++) = c ^ ctx->EKi.c[n];
+                    ctx->Xi.c[n] ^= c;
+                    --len;
+                    n = (n + 1) % 16;
+                }
+                if (n == 0)
+                    GCM_MUL(ctx, Xi);
+                else {
+                    ctx->mres = n;
+                    return 0;
+                }
+            }
+# if defined(STRICT_ALIGNMENT)
+            if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
+                break;
+# endif
+# if defined(GHASH) && defined(GHASH_CHUNK)
+            while (len >= GHASH_CHUNK) {
+                size_t j = GHASH_CHUNK;
+
+                GHASH(ctx, in, GHASH_CHUNK);
+                while (j) {
+                    size_t *out_t = (size_t *)out;
+                    const size_t *in_t = (const size_t *)in;
+
+                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
+                    ++ctr;
+                    if (is_endian.little)
+#  ifdef BSWAP4
+                        ctx->Yi.d[3] = BSWAP4(ctr);
+#  else
+                        PUTU32(ctx->Yi.c + 12, ctr);
+#  endif
+                    else
+                        ctx->Yi.d[3] = ctr;
+                    for (i = 0; i < 16 / sizeof(size_t); ++i)
+                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+                    out += 16;
+                    in += 16;
+                    j -= 16;
+                }
+                len -= GHASH_CHUNK;
+            }
+            if ((i = (len & (size_t)-16))) {
+                GHASH(ctx, in, i);
+                while (len >= 16) {
+                    size_t *out_t = (size_t *)out;
+                    const size_t *in_t = (const size_t *)in;
+
+                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
+                    ++ctr;
+                    if (is_endian.little)
+#  ifdef BSWAP4
+                        ctx->Yi.d[3] = BSWAP4(ctr);
+#  else
+                        PUTU32(ctx->Yi.c + 12, ctr);
+#  endif
+                    else
+                        ctx->Yi.d[3] = ctr;
+                    for (i = 0; i < 16 / sizeof(size_t); ++i)
+                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+                    out += 16;
+                    in += 16;
+                    len -= 16;
+                }
+            }
+# else
+            while (len >= 16) {
+                size_t *out_t = (size_t *)out;
+                const size_t *in_t = (const size_t *)in;
+
+                (*block) (ctx->Yi.c, ctx->EKi.c, key);
+                ++ctr;
+                if (is_endian.little)
+#  ifdef BSWAP4
+                    ctx->Yi.d[3] = BSWAP4(ctr);
+#  else
+                    PUTU32(ctx->Yi.c + 12, ctr);
+#  endif
+                else
+                    ctx->Yi.d[3] = ctr;
+                for (i = 0; i < 16 / sizeof(size_t); ++i) {
+                    size_t c = in[i];
+                    out[i] = c ^ ctx->EKi.t[i];
+                    ctx->Xi.t[i] ^= c;
+                }
+                GCM_MUL(ctx, Xi);
+                out += 16;
+                in += 16;
+                len -= 16;
+            }
+# endif
+            if (len) {
+                (*block) (ctx->Yi.c, ctx->EKi.c, key);
+                ++ctr;
+                if (is_endian.little)
+# ifdef BSWAP4
+                    ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+                    PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+                else
+                    ctx->Yi.d[3] = ctr;
+                while (len--) {
+                    u8 c = in[n];
+                    ctx->Xi.c[n] ^= c;
+                    out[n] = c ^ ctx->EKi.c[n];
+                    ++n;
+                }
+            }
+
+            ctx->mres = n;
+            return 0;
+        } while (0);
+    }
 #endif
-			else
-				ctx->Yi.d[3] = ctr;
-			for (i=0; i<16/sizeof(size_t); ++i)
-				out_t[i] = in_t[i]^ctx->EKi.t[i];
-			out += 16;
-			in  += 16;
-			j   -= 16;
-		    }
-		    len -= GHASH_CHUNK;
-		}
-		if ((i = (len&(size_t)-16))) {
-		    GHASH(ctx,in,i);
-		    while (len>=16) {
-		    	size_t *out_t=(size_t *)out;
-		    	const size_t *in_t=(const size_t *)in;
-
-			(*block)(ctx->Yi.c,ctx->EKi.c,key);
-			++ctr;
-			if (is_endian.little)
+    for (i = 0; i < len; ++i) {
+        u8 c;
+        if (n == 0) {
+            (*block) (ctx->Yi.c, ctx->EKi.c, key);
+            ++ctr;
+            if (is_endian.little)
 #ifdef BSWAP4
-				ctx->Yi.d[3] = BSWAP4(ctr);
+                ctx->Yi.d[3] = BSWAP4(ctr);
 #else
-				PUTU32(ctx->Yi.c+12,ctr);
-#endif
-			else
-				ctx->Yi.d[3] = ctr;
-			for (i=0; i<16/sizeof(size_t); ++i)
-				out_t[i] = in_t[i]^ctx->EKi.t[i];
-			out += 16;
-			in  += 16;
-			len -= 16;
-		    }
-		}
-#else
-		while (len>=16) {
-		    	size_t *out_t=(size_t *)out;
-		    	const size_t *in_t=(const size_t *)in;
+                PUTU32(ctx->Yi.c + 12, ctr);
+#endif
+            else
+                ctx->Yi.d[3] = ctr;
+        }
+        c = in[i];
+        out[i] = c ^ ctx->EKi.c[n];
+        ctx->Xi.c[n] ^= c;
+        n = (n + 1) % 16;
+        if (n == 0)
+            GCM_MUL(ctx, Xi);
+    }
 
-			(*block)(ctx->Yi.c,ctx->EKi.c,key);
-			++ctr;
-			if (is_endian.little)
-#ifdef BSWAP4
-				ctx->Yi.d[3] = BSWAP4(ctr);
-#else
-				PUTU32(ctx->Yi.c+12,ctr);
-#endif
-			else
-				ctx->Yi.d[3] = ctr;
-			for (i=0; i<16/sizeof(size_t); ++i) {
-				size_t c = in[i];
-				out[i] = c^ctx->EKi.t[i];
-				ctx->Xi.t[i] ^= c;
-			}
-			GCM_MUL(ctx,Xi);
-			out += 16;
-			in  += 16;
-			len -= 16;
-		}
-#endif
-		if (len) {
-			(*block)(ctx->Yi.c,ctx->EKi.c,key);
-			++ctr;
-			if (is_endian.little)
-#ifdef BSWAP4
-				ctx->Yi.d[3] = BSWAP4(ctr);
-#else
-				PUTU32(ctx->Yi.c+12,ctr);
-#endif
-			else
-				ctx->Yi.d[3] = ctr;
-			while (len--) {
-				u8 c = in[n];
-				ctx->Xi.c[n] ^= c;
-				out[n] = c^ctx->EKi.c[n];
-				++n;
-			}
-		}
-
-		ctx->mres = n;
-		return 0;
-	} while(0);
-#endif
-	for (i=0;i<len;++i) {
-		u8 c;
-		if (n==0) {
-			(*block)(ctx->Yi.c,ctx->EKi.c,key);
-			++ctr;
-			if (is_endian.little)
-#ifdef BSWAP4
-				ctx->Yi.d[3] = BSWAP4(ctr);
-#else
-				PUTU32(ctx->Yi.c+12,ctr);
-#endif
-			else
-				ctx->Yi.d[3] = ctr;
-		}
-		c = in[i];
-		out[i] = c^ctx->EKi.c[n];
-		ctx->Xi.c[n] ^= c;
-		n = (n+1)%16;
-		if (n==0)
-			GCM_MUL(ctx,Xi);
-	}
-
-	ctx->mres = n;
-	return 0;
+    ctx->mres = n;
+    return 0;
 }
 
 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
-		const unsigned char *in, unsigned char *out,
-		size_t len, ctr128_f stream)
+                                const unsigned char *in, unsigned char *out,
+                                size_t len, ctr128_f stream)
 {
-	const union { long one; char little; } is_endian = {1};
-	unsigned int n, ctr;
-	size_t i;
-	u64   mlen = ctx->len.u[1];
-	void *key  = ctx->key;
+    const union {
+        long one;
+        char little;
+    } is_endian = {
+        1
+    };
+    unsigned int n, ctr;
+    size_t i;
+    u64 mlen = ctx->len.u[1];
+    void *key = ctx->key;
 #ifdef GCM_FUNCREF_4BIT
-	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
+    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
 # ifdef GHASH
-	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
-				const u8 *inp,size_t len)	= ctx->ghash;
+    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
+                         const u8 *inp, size_t len) = ctx->ghash;
 # endif
 #endif
 
-	mlen += len;
-	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
-		return -1;
-	ctx->len.u[1] = mlen;
+    mlen += len;
+    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
+        return -1;
+    ctx->len.u[1] = mlen;
 
-	if (ctx->ares) {
-		/* First call to encrypt finalizes GHASH(AAD) */
-		GCM_MUL(ctx,Xi);
-		ctx->ares = 0;
-	}
+    if (ctx->ares) {
+        /* First call to encrypt finalizes GHASH(AAD) */
+        GCM_MUL(ctx, Xi);
+        ctx->ares = 0;
+    }
 
-	if (is_endian.little)
+    if (is_endian.little)
 #ifdef BSWAP4
-		ctr = BSWAP4(ctx->Yi.d[3]);
+        ctr = BSWAP4(ctx->Yi.d[3]);
 #else
-		ctr = GETU32(ctx->Yi.c+12);
-#endif
-	else
-		ctr = ctx->Yi.d[3];
-
-	n = ctx->mres;
-	if (n) {
-		while (n && len) {
-			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
-			--len;
-			n = (n+1)%16;
-		}
-		if (n==0) GCM_MUL(ctx,Xi);
-		else {
-			ctx->mres = n;
-			return 0;
-		}
-	}
+        ctr = GETU32(ctx->Yi.c + 12);
+#endif
+    else
+        ctr = ctx->Yi.d[3];
+
+    n = ctx->mres;
+    if (n) {
+        while (n && len) {
+            ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
+            --len;
+            n = (n + 1) % 16;
+        }
+        if (n == 0)
+            GCM_MUL(ctx, Xi);
+        else {
+            ctx->mres = n;
+            return 0;
+        }
+    }
 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
-	while (len>=GHASH_CHUNK) {
-		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
-		ctr += GHASH_CHUNK/16;
-		if (is_endian.little)
-#ifdef BSWAP4
-			ctx->Yi.d[3] = BSWAP4(ctr);
-#else
-			PUTU32(ctx->Yi.c+12,ctr);
-#endif
-		else
-			ctx->Yi.d[3] = ctr;
-		GHASH(ctx,out,GHASH_CHUNK);
-		out += GHASH_CHUNK;
-		in  += GHASH_CHUNK;
-		len -= GHASH_CHUNK;
-	}
+    while (len >= GHASH_CHUNK) {
+        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
+        ctr += GHASH_CHUNK / 16;
+        if (is_endian.little)
+# ifdef BSWAP4
+            ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+            PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+        else
+            ctx->Yi.d[3] = ctr;
+        GHASH(ctx, out, GHASH_CHUNK);
+        out += GHASH_CHUNK;
+        in += GHASH_CHUNK;
+        len -= GHASH_CHUNK;
+    }
 #endif
-	if ((i = (len&(size_t)-16))) {
-		size_t j=i/16;
+    if ((i = (len & (size_t)-16))) {
+        size_t j = i / 16;
 
-		(*stream)(in,out,j,key,ctx->Yi.c);
-		ctr += (unsigned int)j;
-		if (is_endian.little)
+        (*stream) (in, out, j, key, ctx->Yi.c);
+        ctr += (unsigned int)j;
+        if (is_endian.little)
 #ifdef BSWAP4
-			ctx->Yi.d[3] = BSWAP4(ctr);
+            ctx->Yi.d[3] = BSWAP4(ctr);
 #else
-			PUTU32(ctx->Yi.c+12,ctr);
+            PUTU32(ctx->Yi.c + 12, ctr);
 #endif
-		else
-			ctx->Yi.d[3] = ctr;
-		in  += i;
-		len -= i;
+        else
+            ctx->Yi.d[3] = ctr;
+        in += i;
+        len -= i;
 #if defined(GHASH)
-		GHASH(ctx,out,i);
-		out += i;
+        GHASH(ctx, out, i);
+        out += i;
 #else
-		while (j--) {
-			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
-			GCM_MUL(ctx,Xi);
-			out += 16;
-		}
+        while (j--) {
+            for (i = 0; i < 16; ++i)
+                ctx->Xi.c[i] ^= out[i];
+            GCM_MUL(ctx, Xi);
+            out += 16;
+        }
 #endif
-	}
-	if (len) {
-		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
-		++ctr;
-		if (is_endian.little)
+    }
+    if (len) {
+        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
+        ++ctr;
+        if (is_endian.little)
 #ifdef BSWAP4
-			ctx->Yi.d[3] = BSWAP4(ctr);
+            ctx->Yi.d[3] = BSWAP4(ctr);
 #else
-			PUTU32(ctx->Yi.c+12,ctr);
-#endif
-		else
-			ctx->Yi.d[3] = ctr;
-		while (len--) {
-			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
-			++n;
-		}
-	}
-
-	ctx->mres = n;
-	return 0;
+            PUTU32(ctx->Yi.c + 12, ctr);
+#endif
+        else
+            ctx->Yi.d[3] = ctr;
+        while (len--) {
+            ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
+            ++n;
+        }
+    }
+
+    ctx->mres = n;
+    return 0;
 }
 
 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
-		const unsigned char *in, unsigned char *out,
-		size_t len,ctr128_f stream)
+                                const unsigned char *in, unsigned char *out,
+                                size_t len, ctr128_f stream)
 {
-	const union { long one; char little; } is_endian = {1};
-	unsigned int n, ctr;
-	size_t i;
-	u64   mlen = ctx->len.u[1];
-	void *key  = ctx->key;
+    const union {
+        long one;
+        char little;
+    } is_endian = {
+        1
+    };
+    unsigned int n, ctr;
+    size_t i;
+    u64 mlen = ctx->len.u[1];
+    void *key = ctx->key;
 #ifdef GCM_FUNCREF_4BIT
-	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
+    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
 # ifdef GHASH
-	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
-				const u8 *inp,size_t len)	= ctx->ghash;
+    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
+                         const u8 *inp, size_t len) = ctx->ghash;
 # endif
 #endif
 
-	mlen += len;
-	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
-		return -1;
-	ctx->len.u[1] = mlen;
+    mlen += len;
+    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
+        return -1;
+    ctx->len.u[1] = mlen;
 
-	if (ctx->ares) {
-		/* First call to decrypt finalizes GHASH(AAD) */
-		GCM_MUL(ctx,Xi);
-		ctx->ares = 0;
-	}
+    if (ctx->ares) {
+        /* First call to decrypt finalizes GHASH(AAD) */
+        GCM_MUL(ctx, Xi);
+        ctx->ares = 0;
+    }
 
-	if (is_endian.little)
+    if (is_endian.little)
 #ifdef BSWAP4
-		ctr = BSWAP4(ctx->Yi.d[3]);
+        ctr = BSWAP4(ctx->Yi.d[3]);
 #else
-		ctr = GETU32(ctx->Yi.c+12);
-#endif
-	else
-		ctr = ctx->Yi.d[3];
-
-	n = ctx->mres;
-	if (n) {
-		while (n && len) {
-			u8 c = *(in++);
-			*(out++) = c^ctx->EKi.c[n];
-			ctx->Xi.c[n] ^= c;
-			--len;
-			n = (n+1)%16;
-		}
-		if (n==0) GCM_MUL (ctx,Xi);
-		else {
-			ctx->mres = n;
-			return 0;
-		}
-	}
+        ctr = GETU32(ctx->Yi.c + 12);
+#endif
+    else
+        ctr = ctx->Yi.d[3];
+
+    n = ctx->mres;
+    if (n) {
+        while (n && len) {
+            u8 c = *(in++);
+            *(out++) = c ^ ctx->EKi.c[n];
+            ctx->Xi.c[n] ^= c;
+            --len;
+            n = (n + 1) % 16;
+        }
+        if (n == 0)
+            GCM_MUL(ctx, Xi);
+        else {
+            ctx->mres = n;
+            return 0;
+        }
+    }
 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
-	while (len>=GHASH_CHUNK) {
-		GHASH(ctx,in,GHASH_CHUNK);
-		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
-		ctr += GHASH_CHUNK/16;
-		if (is_endian.little)
-#ifdef BSWAP4
-			ctx->Yi.d[3] = BSWAP4(ctr);
-#else
-			PUTU32(ctx->Yi.c+12,ctr);
-#endif
-		else
-			ctx->Yi.d[3] = ctr;
-		out += GHASH_CHUNK;
-		in  += GHASH_CHUNK;
-		len -= GHASH_CHUNK;
-	}
+    while (len >= GHASH_CHUNK) {
+        GHASH(ctx, in, GHASH_CHUNK);
+        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
+        ctr += GHASH_CHUNK / 16;
+        if (is_endian.little)
+# ifdef BSWAP4
+            ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+            PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+        else
+            ctx->Yi.d[3] = ctr;
+        out += GHASH_CHUNK;
+        in += GHASH_CHUNK;
+        len -= GHASH_CHUNK;
+    }
 #endif
-	if ((i = (len&(size_t)-16))) {
-		size_t j=i/16;
+    if ((i = (len & (size_t)-16))) {
+        size_t j = i / 16;
 
 #if defined(GHASH)
-		GHASH(ctx,in,i);
+        GHASH(ctx, in, i);
 #else
-		while (j--) {
-			size_t k;
-			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
-			GCM_MUL(ctx,Xi);
-			in += 16;
-		}
-		j   = i/16;
-		in -= i;
-#endif
-		(*stream)(in,out,j,key,ctx->Yi.c);
-		ctr += (unsigned int)j;
-		if (is_endian.little)
+        while (j--) {
+            size_t k;
+            for (k = 0; k < 16; ++k)
+                ctx->Xi.c[k] ^= in[k];
+            GCM_MUL(ctx, Xi);
+            in += 16;
+        }
+        j = i / 16;
+        in -= i;
+#endif
+        (*stream) (in, out, j, key, ctx->Yi.c);
+        ctr += (unsigned int)j;
+        if (is_endian.little)
 #ifdef BSWAP4
-			ctx->Yi.d[3] = BSWAP4(ctr);
+            ctx->Yi.d[3] = BSWAP4(ctr);
 #else
-			PUTU32(ctx->Yi.c+12,ctr);
+            PUTU32(ctx->Yi.c + 12, ctr);
 #endif
-		else
-			ctx->Yi.d[3] = ctr;
-		out += i;
-		in  += i;
-		len -= i;
-	}
-	if (len) {
-		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
-		++ctr;
-		if (is_endian.little)
+        else
+            ctx->Yi.d[3] = ctr;
+        out += i;
+        in += i;
+        len -= i;
+    }
+    if (len) {
+        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
+        ++ctr;
+        if (is_endian.little)
 #ifdef BSWAP4
-			ctx->Yi.d[3] = BSWAP4(ctr);
+            ctx->Yi.d[3] = BSWAP4(ctr);
 #else
-			PUTU32(ctx->Yi.c+12,ctr);
-#endif
-		else
-			ctx->Yi.d[3] = ctr;
-		while (len--) {
-			u8 c = in[n];
-			ctx->Xi.c[n] ^= c;
-			out[n] = c^ctx->EKi.c[n];
-			++n;
-		}
-	}
-
-	ctx->mres = n;
-	return 0;
+            PUTU32(ctx->Yi.c + 12, ctr);
+#endif
+        else
+            ctx->Yi.d[3] = ctr;
+        while (len--) {
+            u8 c = in[n];
+            ctx->Xi.c[n] ^= c;
+            out[n] = c ^ ctx->EKi.c[n];
+            ++n;
+        }
+    }
+
+    ctx->mres = n;
+    return 0;
 }
 
-int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
-			size_t len)
+int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
+                         size_t len)
 {
-	const union { long one; char little; } is_endian = {1};
-	u64 alen = ctx->len.u[0]<<3;
-	u64 clen = ctx->len.u[1]<<3;
+    const union {
+        long one;
+        char little;
+    } is_endian = {
+        1
+    };
+    u64 alen = ctx->len.u[0] << 3;
+    u64 clen = ctx->len.u[1] << 3;
 #ifdef GCM_FUNCREF_4BIT
-	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
+    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
 #endif
 
-	if (ctx->mres || ctx->ares)
-		GCM_MUL(ctx,Xi);
+    if (ctx->mres || ctx->ares)
+        GCM_MUL(ctx, Xi);
 
-	if (is_endian.little) {
+    if (is_endian.little) {
 #ifdef BSWAP8
-		alen = BSWAP8(alen);
-		clen = BSWAP8(clen);
+        alen = BSWAP8(alen);
+        clen = BSWAP8(clen);
 #else
-		u8 *p = ctx->len.c;
+        u8 *p = ctx->len.c;
 
-		ctx->len.u[0] = alen;
-		ctx->len.u[1] = clen;
+        ctx->len.u[0] = alen;
+        ctx->len.u[1] = clen;
 
-		alen = (u64)GETU32(p)  <<32|GETU32(p+4);
-		clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
+        alen = (u64)GETU32(p) << 32 | GETU32(p + 4);
+        clen = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
 #endif
-	}
+    }
 
-	ctx->Xi.u[0] ^= alen;
-	ctx->Xi.u[1] ^= clen;
-	GCM_MUL(ctx,Xi);
+    ctx->Xi.u[0] ^= alen;
+    ctx->Xi.u[1] ^= clen;
+    GCM_MUL(ctx, Xi);
 
-	ctx->Xi.u[0] ^= ctx->EK0.u[0];
-	ctx->Xi.u[1] ^= ctx->EK0.u[1];
+    ctx->Xi.u[0] ^= ctx->EK0.u[0];
+    ctx->Xi.u[1] ^= ctx->EK0.u[1];
 
-	if (tag && len<=sizeof(ctx->Xi))
-		return memcmp(ctx->Xi.c,tag,len);
-	else
-		return -1;
+    if (tag && len <= sizeof(ctx->Xi))
+        return memcmp(ctx->Xi.c, tag, len);
+    else
+        return -1;
 }
 
 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
 {
-	CRYPTO_gcm128_finish(ctx, NULL, 0);
-	memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
+    CRYPTO_gcm128_finish(ctx, NULL, 0);
+    memcpy(tag, ctx->Xi.c,
+           len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
 }
 
 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
 {
-	GCM128_CONTEXT *ret;
+    GCM128_CONTEXT *ret;
 
-	if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
-		CRYPTO_gcm128_init(ret,key,block);
+    if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
+        CRYPTO_gcm128_init(ret, key, block);
 
-	return ret;
+    return ret;
 }
 
 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
 {
-	if (ctx) {
-		OPENSSL_cleanse(ctx,sizeof(*ctx));
-		OPENSSL_free(ctx);
-	}
+    if (ctx) {
+        OPENSSL_cleanse(ctx, sizeof(*ctx));
+        OPENSSL_free(ctx);
+    }
 }
 
 #if defined(SELFTEST)
-#include <stdio.h>
-#include <openssl/aes.h>
+# include <stdio.h>
+# include <openssl/aes.h>
 
 /* Test Case 1 */
-static const u8	K1[16],
-		*P1=NULL,
-		*A1=NULL,
-		IV1[12],
-		*C1=NULL,
-		T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
+static const u8 K1[16], *P1 = NULL, *A1 = NULL, IV1[12], *C1 = NULL;
+static const u8 T1[] = {
+    0x58, 0xe2, 0xfc, 0xce, 0xfa, 0x7e, 0x30, 0x61,
+    0x36, 0x7f, 0x1d, 0x57, 0xa4, 0xe7, 0x45, 0x5a
+};
 
 /* Test Case 2 */
-#define K2 K1
-#define A2 A1
-#define IV2 IV1
-static const u8	P2[16],
-		C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
-		T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
+# define K2 K1
+# define A2 A1
+# define IV2 IV1
+static const u8 P2[16];
+static const u8 C2[] = {
+    0x03, 0x88, 0xda, 0xce, 0x60, 0xb6, 0xa3, 0x92,
+    0xf3, 0x28, 0xc2, 0xb9, 0x71, 0xb2, 0xfe, 0x78
+};
+
+static const u8 T2[] = {
+    0xab, 0x6e, 0x47, 0xd4, 0x2c, 0xec, 0x13, 0xbd,
+    0xf5, 0x3a, 0x67, 0xb2, 0x12, 0x57, 0xbd, 0xdf
+};
 
 /* Test Case 3 */
-#define A3 A2
-static const u8	K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
-		P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
-			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
-			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
-			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
-		IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
-		C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
-			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
-			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
-			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
-		T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
+# define A3 A2
+static const u8 K3[] = {
+    0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c,
+    0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08
+};
+
+static const u8 P3[] = {
+    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
+    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
+    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
+    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
+    0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55
+};
+
+static const u8 IV3[] = {
+    0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad,
+    0xde, 0xca, 0xf8, 0x88
+};
+
+static const u8 C3[] = {
+    0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24,
+    0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c,
+    0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0,
+    0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e,
+    0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c,
+    0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05,
+    0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97,
+    0x3d, 0x58, 0xe0, 0x91, 0x47, 0x3f, 0x59, 0x85
+};
+
+static const u8 T3[] = {
+    0x4d, 0x5c, 0x2a, 0xf3, 0x27, 0xcd, 0x64, 0xa6,
+    0x2c, 0xf3, 0x5a, 0xbd, 0x2b, 0xa6, 0xfa, 0xb4
+};
 
 /* Test Case 4 */
-#define K4 K3
-#define IV4 IV3
-static const u8	P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
-			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
-			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
-			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
-		A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
-			0xab,0xad,0xda,0xd2},
-		C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
-			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
-			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
-			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
-		T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
+# define K4 K3
+# define IV4 IV3
+static const u8 P4[] = {
+    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
+    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
+    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
+    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
+    0xba, 0x63, 0x7b, 0x39
+};
+
+static const u8 A4[] = {
+    0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+    0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+    0xab, 0xad, 0xda, 0xd2
+};
+
+static const u8 C4[] = {
+    0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24,
+    0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c,
+    0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0,
+    0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e,
+    0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c,
+    0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05,
+    0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97,
+    0x3d, 0x58, 0xe0, 0x91
+};
+
+static const u8 T4[] = {
+    0x5b, 0xc9, 0x4f, 0xbc, 0x32, 0x21, 0xa5, 0xdb,
+    0x94, 0xfa, 0xe9, 0x5a, 0xe7, 0x12, 0x1a, 0x47
+};
 
 /* Test Case 5 */
-#define K5 K4
-#define P5 P4
-#define A5 A4
-static const u8	IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
-		C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
-			0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
-			0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
-			0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
-		T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
+# define K5 K4
+# define P5 P4
+# define A5 A4
+static const u8 IV5[] = {
+    0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad
+};
+
+static const u8 C5[] = {
+    0x61, 0x35, 0x3b, 0x4c, 0x28, 0x06, 0x93, 0x4a,
+    0x77, 0x7f, 0xf5, 0x1f, 0xa2, 0x2a, 0x47, 0x55,
+    0x69, 0x9b, 0x2a, 0x71, 0x4f, 0xcd, 0xc6, 0xf8,
+    0x37, 0x66, 0xe5, 0xf9, 0x7b, 0x6c, 0x74, 0x23,
+    0x73, 0x80, 0x69, 0x00, 0xe4, 0x9f, 0x24, 0xb2,
+    0x2b, 0x09, 0x75, 0x44, 0xd4, 0x89, 0x6b, 0x42,
+    0x49, 0x89, 0xb5, 0xe1, 0xeb, 0xac, 0x0f, 0x07,
+    0xc2, 0x3f, 0x45, 0x98
+};
+
+static const u8 T5[] = {
+    0x36, 0x12, 0xd2, 0xe7, 0x9e, 0x3b, 0x07, 0x85,
+    0x56, 0x1b, 0xe1, 0x4a, 0xac, 0xa2, 0xfc, 0xcb
+};
 
 /* Test Case 6 */
-#define K6 K5
-#define P6 P5
-#define A6 A5
-static const u8	IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
-			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
-			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
-			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
-		C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
-			0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
-			0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
-			0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
-		T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
+# define K6 K5
+# define P6 P5
+# define A6 A5
+static const u8 IV6[] = {
+    0x93, 0x13, 0x22, 0x5d, 0xf8, 0x84, 0x06, 0xe5,
+    0x55, 0x90, 0x9c, 0x5a, 0xff, 0x52, 0x69, 0xaa,
+    0x6a, 0x7a, 0x95, 0x38, 0x53, 0x4f, 0x7d, 0xa1,
+    0xe4, 0xc3, 0x03, 0xd2, 0xa3, 0x18, 0xa7, 0x28,
+    0xc3, 0xc0, 0xc9, 0x51, 0x56, 0x80, 0x95, 0x39,
+    0xfc, 0xf0, 0xe2, 0x42, 0x9a, 0x6b, 0x52, 0x54,
+    0x16, 0xae, 0xdb, 0xf5, 0xa0, 0xde, 0x6a, 0x57,
+    0xa6, 0x37, 0xb3, 0x9b
+};
+
+static const u8 C6[] = {
+    0x8c, 0xe2, 0x49, 0x98, 0x62, 0x56, 0x15, 0xb6,
+    0x03, 0xa0, 0x33, 0xac, 0xa1, 0x3f, 0xb8, 0x94,
+    0xbe, 0x91, 0x12, 0xa5, 0xc3, 0xa2, 0x11, 0xa8,
+    0xba, 0x26, 0x2a, 0x3c, 0xca, 0x7e, 0x2c, 0xa7,
+    0x01, 0xe4, 0xa9, 0xa4, 0xfb, 0xa4, 0x3c, 0x90,
+    0xcc, 0xdc, 0xb2, 0x81, 0xd4, 0x8c, 0x7c, 0x6f,
+    0xd6, 0x28, 0x75, 0xd2, 0xac, 0xa4, 0x17, 0x03,
+    0x4c, 0x34, 0xae, 0xe5
+};
+
+static const u8 T6[] = {
+    0x61, 0x9c, 0xc5, 0xae, 0xff, 0xfe, 0x0b, 0xfa,
+    0x46, 0x2a, 0xf4, 0x3c, 0x16, 0x99, 0xd0, 0x50
+};
 
 /* Test Case 7 */
-static const u8 K7[24],
-		*P7=NULL,
-		*A7=NULL,
-		IV7[12],
-		*C7=NULL,
-		T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
+static const u8 K7[24], *P7 = NULL, *A7 = NULL, IV7[12], *C7 = NULL;
+static const u8 T7[] = {
+    0xcd, 0x33, 0xb2, 0x8a, 0xc7, 0x73, 0xf7, 0x4b,
+    0xa0, 0x0e, 0xd1, 0xf3, 0x12, 0x57, 0x24, 0x35
+};
 
 /* Test Case 8 */
-#define K8 K7
-#define IV8 IV7
-#define A8 A7
-static const u8	P8[16],
-		C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
-		T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
+# define K8 K7
+# define IV8 IV7
+# define A8 A7
+static const u8 P8[16];
+static const u8 C8[] = {
+    0x98, 0xe7, 0x24, 0x7c, 0x07, 0xf0, 0xfe, 0x41,
+    0x1c, 0x26, 0x7e, 0x43, 0x84, 0xb0, 0xf6, 0x00
+};
+
+static const u8 T8[] = {
+    0x2f, 0xf5, 0x8d, 0x80, 0x03, 0x39, 0x27, 0xab,
+    0x8e, 0xf4, 0xd4, 0x58, 0x75, 0x14, 0xf0, 0xfb
+};
 
 /* Test Case 9 */
-#define A9 A8
-static const u8	K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
-			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
-		P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
-			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
-			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
-			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
-		IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
-		C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
-			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
-			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
-			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
-		T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
+# define A9 A8
+static const u8 K9[] = {
+    0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c,
+    0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08,
+    0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c
+};
+
+static const u8 P9[] = {
+    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
+    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
+    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
+    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
+    0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55
+};
+
+static const u8 IV9[] = {
+    0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad,
+    0xde, 0xca, 0xf8, 0x88
+};
+
+static const u8 C9[] = {
+    0x39, 0x80, 0xca, 0x0b, 0x3c, 0x00, 0xe8, 0x41,
+    0xeb, 0x06, 0xfa, 0xc4, 0x87, 0x2a, 0x27, 0x57,
+    0x85, 0x9e, 0x1c, 0xea, 0xa6, 0xef, 0xd9, 0x84,
+    0x62, 0x85, 0x93, 0xb4, 0x0c, 0xa1, 0xe1, 0x9c,
+    0x7d, 0x77, 0x3d, 0x00, 0xc1, 0x44, 0xc5, 0x25,
+    0xac, 0x61, 0x9d, 0x18, 0xc8, 0x4a, 0x3f, 0x47,
+    0x18, 0xe2, 0x44, 0x8b, 0x2f, 0xe3, 0x24, 0xd9,
+    0xcc, 0xda, 0x27, 0x10, 0xac, 0xad, 0xe2, 0x56
+};
+
+static const u8 T9[] = {
+    0x99, 0x24, 0xa7, 0xc8, 0x58, 0x73, 0x36, 0xbf,
+    0xb1, 0x18, 0x02, 0x4d, 0xb8, 0x67, 0x4a, 0x14
+};
 
 /* Test Case 10 */
-#define K10 K9
-#define IV10 IV9
-static const u8	P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
-			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
-			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
-			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
-		A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
-			0xab,0xad,0xda,0xd2},
-		C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
-			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
-			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
-			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
-		T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
+# define K10 K9
+# define IV10 IV9
+static const u8 P10[] = {
+    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
+    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
+    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
+    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
+    0xba, 0x63, 0x7b, 0x39
+};
+
+static const u8 A10[] = {
+    0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+    0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+    0xab, 0xad, 0xda, 0xd2
+};
+
+static const u8 C10[] = {
+    0x39, 0x80, 0xca, 0x0b, 0x3c, 0x00, 0xe8, 0x41,
+    0xeb, 0x06, 0xfa, 0xc4, 0x87, 0x2a, 0x27, 0x57,
+    0x85, 0x9e, 0x1c, 0xea, 0xa6, 0xef, 0xd9, 0x84,
+    0x62, 0x85, 0x93, 0xb4, 0x0c, 0xa1, 0xe1, 0x9c,
+    0x7d, 0x77, 0x3d, 0x00, 0xc1, 0x44, 0xc5, 0x25,
+    0xac, 0x61, 0x9d, 0x18, 0xc8, 0x4a, 0x3f, 0x47,
+    0x18, 0xe2, 0x44, 0x8b, 0x2f, 0xe3, 0x24, 0xd9,
+    0xcc, 0xda, 0x27, 0x10
+};
+
+static const u8 T10[] = {
+    0x25, 0x19, 0x49, 0x8e, 0x80, 0xf1, 0x47, 0x8f,
+    0x37, 0xba, 0x55, 0xbd, 0x6d, 0x27, 0x61, 0x8c
+};
 
 /* Test Case 11 */
-#define K11 K10
-#define P11 P10
-#define A11 A10
-static const u8	IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
-		C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
-			0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
-			0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
-			0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
-		T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
+# define K11 K10
+# define P11 P10
+# define A11 A10
+static const u8 IV11[] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad };
+
+static const u8 C11[] = {
+    0x0f, 0x10, 0xf5, 0x99, 0xae, 0x14, 0xa1, 0x54,
+    0xed, 0x24, 0xb3, 0x6e, 0x25, 0x32, 0x4d, 0xb8,
+    0xc5, 0x66, 0x63, 0x2e, 0xf2, 0xbb, 0xb3, 0x4f,
+    0x83, 0x47, 0x28, 0x0f, 0xc4, 0x50, 0x70, 0x57,
+    0xfd, 0xdc, 0x29, 0xdf, 0x9a, 0x47, 0x1f, 0x75,
+    0xc6, 0x65, 0x41, 0xd4, 0xd4, 0xda, 0xd1, 0xc9,
+    0xe9, 0x3a, 0x19, 0xa5, 0x8e, 0x8b, 0x47, 0x3f,
+    0xa0, 0xf0, 0x62, 0xf7
+};
+
+static const u8 T11[] = {
+    0x65, 0xdc, 0xc5, 0x7f, 0xcf, 0x62, 0x3a, 0x24,
+    0x09, 0x4f, 0xcc, 0xa4, 0x0d, 0x35, 0x33, 0xf8
+};
 
 /* Test Case 12 */
-#define K12 K11
-#define P12 P11
-#define A12 A11
-static const u8	IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
-			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
-			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
-			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
-		C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
-			0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
-			0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
-			0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
-		T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
+# define K12 K11
+# define P12 P11
+# define A12 A11
+static const u8 IV12[] = {
+    0x93, 0x13, 0x22, 0x5d, 0xf8, 0x84, 0x06, 0xe5,
+    0x55, 0x90, 0x9c, 0x5a, 0xff, 0x52, 0x69, 0xaa,
+    0x6a, 0x7a, 0x95, 0x38, 0x53, 0x4f, 0x7d, 0xa1,
+    0xe4, 0xc3, 0x03, 0xd2, 0xa3, 0x18, 0xa7, 0x28,
+    0xc3, 0xc0, 0xc9, 0x51, 0x56, 0x80, 0x95, 0x39,
+    0xfc, 0xf0, 0xe2, 0x42, 0x9a, 0x6b, 0x52, 0x54,
+    0x16, 0xae, 0xdb, 0xf5, 0xa0, 0xde, 0x6a, 0x57,
+    0xa6, 0x37, 0xb3, 0x9b
+};
+
+static const u8 C12[] = {
+    0xd2, 0x7e, 0x88, 0x68, 0x1c, 0xe3, 0x24, 0x3c,
+    0x48, 0x30, 0x16, 0x5a, 0x8f, 0xdc, 0xf9, 0xff,
+    0x1d, 0xe9, 0xa1, 0xd8, 0xe6, 0xb4, 0x47, 0xef,
+    0x6e, 0xf7, 0xb7, 0x98, 0x28, 0x66, 0x6e, 0x45,
+    0x81, 0xe7, 0x90, 0x12, 0xaf, 0x34, 0xdd, 0xd9,
+    0xe2, 0xf0, 0x37, 0x58, 0x9b, 0x29, 0x2d, 0xb3,
+    0xe6, 0x7c, 0x03, 0x67, 0x45, 0xfa, 0x22, 0xe7,
+    0xe9, 0xb7, 0x37, 0x3b
+};
+
+static const u8 T12[] = {
+    0xdc, 0xf5, 0x66, 0xff, 0x29, 0x1c, 0x25, 0xbb,
+    0xb8, 0x56, 0x8f, 0xc3, 0xd3, 0x76, 0xa6, 0xd9
+};
 
 /* Test Case 13 */
-static const u8	K13[32],
-		*P13=NULL,
-		*A13=NULL,
-		IV13[12],
-		*C13=NULL,
-		T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
+static const u8 K13[32], *P13 = NULL, *A13 = NULL, IV13[12], *C13 = NULL;
+static const u8 T13[] = {
+    0x53, 0x0f, 0x8a, 0xfb, 0xc7, 0x45, 0x36, 0xb9,
+    0xa9, 0x63, 0xb4, 0xf1, 0xc4, 0xcb, 0x73, 0x8b
+};
 
 /* Test Case 14 */
-#define K14 K13
-#define A14 A13
-static const u8	P14[16],
-		IV14[12],
-		C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
-		T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
+# define K14 K13
+# define A14 A13
+static const u8 P14[16], IV14[12];
+static const u8 C14[] = {
+    0xce, 0xa7, 0x40, 0x3d, 0x4d, 0x60, 0x6b, 0x6e,
+    0x07, 0x4e, 0xc5, 0xd3, 0xba, 0xf3, 0x9d, 0x18
+};
+
+static const u8 T14[] = {
+    0xd0, 0xd1, 0xc8, 0xa7, 0x99, 0x99, 0x6b, 0xf0,
+    0x26, 0x5b, 0x98, 0xb5, 0xd4, 0x8a, 0xb9, 0x19
+};
 
 /* Test Case 15 */
-#define A15 A14
-static const u8	K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
-			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
-		P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
-			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
-			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
-			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
-		IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
-		C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
-			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
-			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
-			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
-		T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
+# define A15 A14
+static const u8 K15[] = {
+    0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c,
+    0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08,
+    0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c,
+    0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08
+};
+
+static const u8 P15[] = {
+    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
+    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
+    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
+    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
+    0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55
+};
+
+static const u8 IV15[] = {
+    0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad,
+    0xde, 0xca, 0xf8, 0x88
+};
+
+static const u8 C15[] = {
+    0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07,
+    0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d,
+    0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9,
+    0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa,
+    0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d,
+    0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38,
+    0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a,
+    0xbc, 0xc9, 0xf6, 0x62, 0x89, 0x80, 0x15, 0xad
+};
+
+static const u8 T15[] = {
+    0xb0, 0x94, 0xda, 0xc5, 0xd9, 0x34, 0x71, 0xbd,
+    0xec, 0x1a, 0x50, 0x22, 0x70, 0xe3, 0xcc, 0x6c
+};
 
 /* Test Case 16 */
-#define K16 K15
-#define IV16 IV15
-static const u8	P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
-			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
-			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
-			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
-		A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
-			0xab,0xad,0xda,0xd2},
-		C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
-			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
-			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
-			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
-		T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
+# define K16 K15
+# define IV16 IV15
+static const u8 P16[] = {
+    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
+    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
+    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
+    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
+    0xba, 0x63, 0x7b, 0x39
+};
+
+static const u8 A16[] = {
+    0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+    0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+    0xab, 0xad, 0xda, 0xd2
+};
+
+static const u8 C16[] = {
+    0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07,
+    0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d,
+    0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9,
+    0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa,
+    0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d,
+    0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38,
+    0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a,
+    0xbc, 0xc9, 0xf6, 0x62
+};
+
+static const u8 T16[] = {
+    0x76, 0xfc, 0x6e, 0xce, 0x0f, 0x4e, 0x17, 0x68,
+    0xcd, 0xdf, 0x88, 0x53, 0xbb, 0x2d, 0x55, 0x1b
+};
 
 /* Test Case 17 */
-#define K17 K16
-#define P17 P16
-#define A17 A16
-static const u8	IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
-		C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
-			0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
-			0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
-			0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
-		T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
+# define K17 K16
+# define P17 P16
+# define A17 A16
+static const u8 IV17[] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad };
+
+static const u8 C17[] = {
+    0xc3, 0x76, 0x2d, 0xf1, 0xca, 0x78, 0x7d, 0x32,
+    0xae, 0x47, 0xc1, 0x3b, 0xf1, 0x98, 0x44, 0xcb,
+    0xaf, 0x1a, 0xe1, 0x4d, 0x0b, 0x97, 0x6a, 0xfa,
+    0xc5, 0x2f, 0xf7, 0xd7, 0x9b, 0xba, 0x9d, 0xe0,
+    0xfe, 0xb5, 0x82, 0xd3, 0x39, 0x34, 0xa4, 0xf0,
+    0x95, 0x4c, 0xc2, 0x36, 0x3b, 0xc7, 0x3f, 0x78,
+    0x62, 0xac, 0x43, 0x0e, 0x64, 0xab, 0xe4, 0x99,
+    0xf4, 0x7c, 0x9b, 0x1f
+};
+
+static const u8 T17[] = {
+    0x3a, 0x33, 0x7d, 0xbf, 0x46, 0xa7, 0x92, 0xc4,
+    0x5e, 0x45, 0x49, 0x13, 0xfe, 0x2e, 0xa8, 0xf2
+};
 
 /* Test Case 18 */
-#define K18 K17
-#define P18 P17
-#define A18 A17
-static const u8	IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
-			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
-			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
-			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
-		C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
-			0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
-			0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
-			0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
-		T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
+# define K18 K17
+# define P18 P17
+# define A18 A17
+static const u8 IV18[] = {
+    0x93, 0x13, 0x22, 0x5d, 0xf8, 0x84, 0x06, 0xe5,
+    0x55, 0x90, 0x9c, 0x5a, 0xff, 0x52, 0x69, 0xaa,
+    0x6a, 0x7a, 0x95, 0x38, 0x53, 0x4f, 0x7d, 0xa1,
+    0xe4, 0xc3, 0x03, 0xd2, 0xa3, 0x18, 0xa7, 0x28,
+    0xc3, 0xc0, 0xc9, 0x51, 0x56, 0x80, 0x95, 0x39,
+    0xfc, 0xf0, 0xe2, 0x42, 0x9a, 0x6b, 0x52, 0x54,
+    0x16, 0xae, 0xdb, 0xf5, 0xa0, 0xde, 0x6a, 0x57,
+    0xa6, 0x37, 0xb3, 0x9b
+};
+
+static const u8 C18[] = {
+    0x5a, 0x8d, 0xef, 0x2f, 0x0c, 0x9e, 0x53, 0xf1,
+    0xf7, 0x5d, 0x78, 0x53, 0x65, 0x9e, 0x2a, 0x20,
+    0xee, 0xb2, 0xb2, 0x2a, 0xaf, 0xde, 0x64, 0x19,
+    0xa0, 0x58, 0xab, 0x4f, 0x6f, 0x74, 0x6b, 0xf4,
+    0x0f, 0xc0, 0xc3, 0xb7, 0x80, 0xf2, 0x44, 0x45,
+    0x2d, 0xa3, 0xeb, 0xf1, 0xc5, 0xd8, 0x2c, 0xde,
+    0xa2, 0x41, 0x89, 0x97, 0x20, 0x0e, 0xf8, 0x2e,
+    0x44, 0xae, 0x7e, 0x3f
+};
+
+static const u8 T18[] = {
+    0xa4, 0x4a, 0x82, 0x66, 0xee, 0x1c, 0x8e, 0xb0,
+    0xc8, 0xb5, 0xd4, 0xcf, 0x5a, 0xe9, 0xf1, 0x9a
+};
 
 /* Test Case 19 */
-#define K19 K1
-#define P19 P1
-#define IV19 IV1
-#define C19 C1
-static const u8 A19[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
-			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
-			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
-			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55,
-			0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
-			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
-			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
-			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
-		T19[]= {0x5f,0xea,0x79,0x3a,0x2d,0x6f,0x97,0x4d,0x37,0xe6,0x8e,0x0c,0xb8,0xff,0x94,0x92};
+# define K19 K1
+# define P19 P1
+# define IV19 IV1
+# define C19 C1
+static const u8 A19[] = {
+    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
+    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
+    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
+    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
+    0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55,
+    0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07,
+    0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d,
+    0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9,
+    0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa,
+    0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d,
+    0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38,
+    0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a,
+    0xbc, 0xc9, 0xf6, 0x62, 0x89, 0x80, 0x15, 0xad
+};
+
+static const u8 T19[] = {
+    0x5f, 0xea, 0x79, 0x3a, 0x2d, 0x6f, 0x97, 0x4d,
+    0x37, 0xe6, 0x8e, 0x0c, 0xb8, 0xff, 0x94, 0x92
+};
 
 /* Test Case 20 */
-#define K20 K1
-#define A20 A1
-static const u8 IV20[64]={0xff,0xff,0xff,0xff},	/* this results in 0xff in counter LSB */
-		P20[288],
-		C20[]= {0x56,0xb3,0x37,0x3c,0xa9,0xef,0x6e,0x4a,0x2b,0x64,0xfe,0x1e,0x9a,0x17,0xb6,0x14,
-			0x25,0xf1,0x0d,0x47,0xa7,0x5a,0x5f,0xce,0x13,0xef,0xc6,0xbc,0x78,0x4a,0xf2,0x4f,
-			0x41,0x41,0xbd,0xd4,0x8c,0xf7,0xc7,0x70,0x88,0x7a,0xfd,0x57,0x3c,0xca,0x54,0x18,
-			0xa9,0xae,0xff,0xcd,0x7c,0x5c,0xed,0xdf,0xc6,0xa7,0x83,0x97,0xb9,0xa8,0x5b,0x49,
-			0x9d,0xa5,0x58,0x25,0x72,0x67,0xca,0xab,0x2a,0xd0,0xb2,0x3c,0xa4,0x76,0xa5,0x3c,
-			0xb1,0x7f,0xb4,0x1c,0x4b,0x8b,0x47,0x5c,0xb4,0xf3,0xf7,0x16,0x50,0x94,0xc2,0x29,
-			0xc9,0xe8,0xc4,0xdc,0x0a,0x2a,0x5f,0xf1,0x90,0x3e,0x50,0x15,0x11,0x22,0x13,0x76,
-			0xa1,0xcd,0xb8,0x36,0x4c,0x50,0x61,0xa2,0x0c,0xae,0x74,0xbc,0x4a,0xcd,0x76,0xce,
-			0xb0,0xab,0xc9,0xfd,0x32,0x17,0xef,0x9f,0x8c,0x90,0xbe,0x40,0x2d,0xdf,0x6d,0x86,
-			0x97,0xf4,0xf8,0x80,0xdf,0xf1,0x5b,0xfb,0x7a,0x6b,0x28,0x24,0x1e,0xc8,0xfe,0x18,
-			0x3c,0x2d,0x59,0xe3,0xf9,0xdf,0xff,0x65,0x3c,0x71,0x26,0xf0,0xac,0xb9,0xe6,0x42,
-			0x11,0xf4,0x2b,0xae,0x12,0xaf,0x46,0x2b,0x10,0x70,0xbe,0xf1,0xab,0x5e,0x36,0x06,
-			0x87,0x2c,0xa1,0x0d,0xee,0x15,0xb3,0x24,0x9b,0x1a,0x1b,0x95,0x8f,0x23,0x13,0x4c,
-			0x4b,0xcc,0xb7,0xd0,0x32,0x00,0xbc,0xe4,0x20,0xa2,0xf8,0xeb,0x66,0xdc,0xf3,0x64,
-			0x4d,0x14,0x23,0xc1,0xb5,0x69,0x90,0x03,0xc1,0x3e,0xce,0xf4,0xbf,0x38,0xa3,0xb6,
-			0x0e,0xed,0xc3,0x40,0x33,0xba,0xc1,0x90,0x27,0x83,0xdc,0x6d,0x89,0xe2,0xe7,0x74,
-			0x18,0x8a,0x43,0x9c,0x7e,0xbc,0xc0,0x67,0x2d,0xbd,0xa4,0xdd,0xcf,0xb2,0x79,0x46,
-			0x13,0xb0,0xbe,0x41,0x31,0x5e,0xf7,0x78,0x70,0x8a,0x70,0xee,0x7d,0x75,0x16,0x5c},
-		T20[]= {0x8b,0x30,0x7f,0x6b,0x33,0x28,0x6d,0x0a,0xb0,0x26,0xa9,0xed,0x3f,0xe1,0xe8,0x5f};
-
-#define TEST_CASE(n)	do {					\
-	u8 out[sizeof(P##n)];					\
-	AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);		\
-	CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);	\
-	CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));		\
-	memset(out,0,sizeof(out));				\
-	if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));	\
-	if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));	\
-	if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||		\
-	    (C##n && memcmp(out,C##n,sizeof(out))))		\
-		ret++, printf ("encrypt test#%d failed.\n",n);	\
-	CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));		\
-	memset(out,0,sizeof(out));				\
-	if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));	\
-	if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));	\
-	if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||		\
-	    (P##n && memcmp(out,P##n,sizeof(out))))		\
-		ret++, printf ("decrypt test#%d failed.\n",n);	\
-	} while(0)
+# define K20 K1
+# define A20 A1
+/* this results in 0xff in counter LSB */
+static const u8 IV20[64] = { 0xff, 0xff, 0xff, 0xff };
+
+static const u8 P20[288];
+static const u8 C20[] = {
+    0x56, 0xb3, 0x37, 0x3c, 0xa9, 0xef, 0x6e, 0x4a,
+    0x2b, 0x64, 0xfe, 0x1e, 0x9a, 0x17, 0xb6, 0x14,
+    0x25, 0xf1, 0x0d, 0x47, 0xa7, 0x5a, 0x5f, 0xce,
+    0x13, 0xef, 0xc6, 0xbc, 0x78, 0x4a, 0xf2, 0x4f,
+    0x41, 0x41, 0xbd, 0xd4, 0x8c, 0xf7, 0xc7, 0x70,
+    0x88, 0x7a, 0xfd, 0x57, 0x3c, 0xca, 0x54, 0x18,
+    0xa9, 0xae, 0xff, 0xcd, 0x7c, 0x5c, 0xed, 0xdf,
+    0xc6, 0xa7, 0x83, 0x97, 0xb9, 0xa8, 0x5b, 0x49,
+    0x9d, 0xa5, 0x58, 0x25, 0x72, 0x67, 0xca, 0xab,
+    0x2a, 0xd0, 0xb2, 0x3c, 0xa4, 0x76, 0xa5, 0x3c,
+    0xb1, 0x7f, 0xb4, 0x1c, 0x4b, 0x8b, 0x47, 0x5c,
+    0xb4, 0xf3, 0xf7, 0x16, 0x50, 0x94, 0xc2, 0x29,
+    0xc9, 0xe8, 0xc4, 0xdc, 0x0a, 0x2a, 0x5f, 0xf1,
+    0x90, 0x3e, 0x50, 0x15, 0x11, 0x22, 0x13, 0x76,
+    0xa1, 0xcd, 0xb8, 0x36, 0x4c, 0x50, 0x61, 0xa2,
+    0x0c, 0xae, 0x74, 0xbc, 0x4a, 0xcd, 0x76, 0xce,
+    0xb0, 0xab, 0xc9, 0xfd, 0x32, 0x17, 0xef, 0x9f,
+    0x8c, 0x90, 0xbe, 0x40, 0x2d, 0xdf, 0x6d, 0x86,
+    0x97, 0xf4, 0xf8, 0x80, 0xdf, 0xf1, 0x5b, 0xfb,
+    0x7a, 0x6b, 0x28, 0x24, 0x1e, 0xc8, 0xfe, 0x18,
+    0x3c, 0x2d, 0x59, 0xe3, 0xf9, 0xdf, 0xff, 0x65,
+    0x3c, 0x71, 0x26, 0xf0, 0xac, 0xb9, 0xe6, 0x42,
+    0x11, 0xf4, 0x2b, 0xae, 0x12, 0xaf, 0x46, 0x2b,
+    0x10, 0x70, 0xbe, 0xf1, 0xab, 0x5e, 0x36, 0x06,
+    0x87, 0x2c, 0xa1, 0x0d, 0xee, 0x15, 0xb3, 0x24,
+    0x9b, 0x1a, 0x1b, 0x95, 0x8f, 0x23, 0x13, 0x4c,
+    0x4b, 0xcc, 0xb7, 0xd0, 0x32, 0x00, 0xbc, 0xe4,
+    0x20, 0xa2, 0xf8, 0xeb, 0x66, 0xdc, 0xf3, 0x64,
+    0x4d, 0x14, 0x23, 0xc1, 0xb5, 0x69, 0x90, 0x03,
+    0xc1, 0x3e, 0xce, 0xf4, 0xbf, 0x38, 0xa3, 0xb6,
+    0x0e, 0xed, 0xc3, 0x40, 0x33, 0xba, 0xc1, 0x90,
+    0x27, 0x83, 0xdc, 0x6d, 0x89, 0xe2, 0xe7, 0x74,
+    0x18, 0x8a, 0x43, 0x9c, 0x7e, 0xbc, 0xc0, 0x67,
+    0x2d, 0xbd, 0xa4, 0xdd, 0xcf, 0xb2, 0x79, 0x46,
+    0x13, 0xb0, 0xbe, 0x41, 0x31, 0x5e, 0xf7, 0x78,
+    0x70, 0x8a, 0x70, 0xee, 0x7d, 0x75, 0x16, 0x5c
+};
+
+static const u8 T20[] = {
+    0x8b, 0x30, 0x7f, 0x6b, 0x33, 0x28, 0x6d, 0x0a,
+    0xb0, 0x26, 0xa9, 0xed, 0x3f, 0xe1, 0xe8, 0x5f
+};
+
+# define TEST_CASE(n)    do {                                    \
+        u8 out[sizeof(P##n)];                                   \
+        AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);          \
+        CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);  \
+        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
+        memset(out,0,sizeof(out));                              \
+        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
+        if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));     \
+        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
+            (C##n && memcmp(out,C##n,sizeof(out))))             \
+                ret++, printf ("encrypt test#%d failed.\n",n);  \
+        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
+        memset(out,0,sizeof(out));                              \
+        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
+        if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));     \
+        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
+            (P##n && memcmp(out,P##n,sizeof(out))))             \
+                ret++, printf ("decrypt test#%d failed.\n",n);  \
+        } while(0)
 
 int main()
 {
-	GCM128_CONTEXT ctx;
-	AES_KEY key;
-	int ret=0;
-
-	TEST_CASE(1);
-	TEST_CASE(2);
-	TEST_CASE(3);
-	TEST_CASE(4);
-	TEST_CASE(5);
-	TEST_CASE(6);
-	TEST_CASE(7);
-	TEST_CASE(8);
-	TEST_CASE(9);
-	TEST_CASE(10);
-	TEST_CASE(11);
-	TEST_CASE(12);
-	TEST_CASE(13);
-	TEST_CASE(14);
-	TEST_CASE(15);
-	TEST_CASE(16);
-	TEST_CASE(17);
-	TEST_CASE(18);
-	TEST_CASE(19);
-	TEST_CASE(20);
-
-#ifdef OPENSSL_CPUID_OBJ
-	{
-	size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
-	union { u64 u; u8 c[1024]; } buf;
-	int i;
-
-	AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
-	CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
-	CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
-
-	CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
-	start = OPENSSL_rdtsc();
-	CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
-	gcm_t = OPENSSL_rdtsc() - start;
-
-	CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
-			&key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
-			(block128_f)AES_encrypt);
-	start = OPENSSL_rdtsc();
-	CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
-			&key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
-			(block128_f)AES_encrypt);
-	ctr_t = OPENSSL_rdtsc() - start;
-
-	printf("%.2f-%.2f=%.2f\n",
-			gcm_t/(double)sizeof(buf),
-			ctr_t/(double)sizeof(buf),
-			(gcm_t-ctr_t)/(double)sizeof(buf));
-#ifdef GHASH
-	{
-	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
-				const u8 *inp,size_t len)	= ctx.ghash;
-
-	GHASH((&ctx),buf.c,sizeof(buf));
-	start = OPENSSL_rdtsc();
-	for (i=0;i<100;++i) GHASH((&ctx),buf.c,sizeof(buf));
-	gcm_t = OPENSSL_rdtsc() - start;
-	printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
-	}
-#endif
-	}
-#endif
+    GCM128_CONTEXT ctx;
+    AES_KEY key;
+    int ret = 0;
+
+    TEST_CASE(1);
+    TEST_CASE(2);
+    TEST_CASE(3);
+    TEST_CASE(4);
+    TEST_CASE(5);
+    TEST_CASE(6);
+    TEST_CASE(7);
+    TEST_CASE(8);
+    TEST_CASE(9);
+    TEST_CASE(10);
+    TEST_CASE(11);
+    TEST_CASE(12);
+    TEST_CASE(13);
+    TEST_CASE(14);
+    TEST_CASE(15);
+    TEST_CASE(16);
+    TEST_CASE(17);
+    TEST_CASE(18);
+    TEST_CASE(19);
+    TEST_CASE(20);
+
+# ifdef OPENSSL_CPUID_OBJ
+    {
+        size_t start, stop, gcm_t, ctr_t, OPENSSL_rdtsc();
+        union {
+            u64 u;
+            u8 c[1024];
+        } buf;
+        int i;
+
+        AES_set_encrypt_key(K1, sizeof(K1) * 8, &key);
+        CRYPTO_gcm128_init(&ctx, &key, (block128_f) AES_encrypt);
+        CRYPTO_gcm128_setiv(&ctx, IV1, sizeof(IV1));
+
+        CRYPTO_gcm128_encrypt(&ctx, buf.c, buf.c, sizeof(buf));
+        start = OPENSSL_rdtsc();
+        CRYPTO_gcm128_encrypt(&ctx, buf.c, buf.c, sizeof(buf));
+        gcm_t = OPENSSL_rdtsc() - start;
+
+        CRYPTO_ctr128_encrypt(buf.c, buf.c, sizeof(buf),
+                              &key, ctx.Yi.c, ctx.EKi.c, &ctx.mres,
+                              (block128_f) AES_encrypt);
+        start = OPENSSL_rdtsc();
+        CRYPTO_ctr128_encrypt(buf.c, buf.c, sizeof(buf),
+                              &key, ctx.Yi.c, ctx.EKi.c, &ctx.mres,
+                              (block128_f) AES_encrypt);
+        ctr_t = OPENSSL_rdtsc() - start;
+
+        printf("%.2f-%.2f=%.2f\n",
+               gcm_t / (double)sizeof(buf),
+               ctr_t / (double)sizeof(buf),
+               (gcm_t - ctr_t) / (double)sizeof(buf));
+#  ifdef GHASH
+        {
+            void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
+                                 const u8 *inp, size_t len) = ctx.ghash;
+
+            GHASH((&ctx), buf.c, sizeof(buf));
+            start = OPENSSL_rdtsc();
+            for (i = 0; i < 100; ++i)
+                GHASH((&ctx), buf.c, sizeof(buf));
+            gcm_t = OPENSSL_rdtsc() - start;
+            printf("%.2f\n", gcm_t / (double)sizeof(buf) / (double)i);
+        }
+#  endif
+    }
+# endif
 
-	return ret;
+    return ret;
 }
 #endif
diff --git a/openssl/crypto/modes/modes.h b/openssl/crypto/modes/modes.h
index 7773c2542..fd488499a 100644
--- a/openssl/crypto/modes/modes.h
+++ b/openssl/crypto/modes/modes.h
@@ -10,132 +10,154 @@
 #ifdef  __cplusplus
 extern "C" {
 #endif
-typedef void (*block128_f)(const unsigned char in[16],
-			unsigned char out[16],
-			const void *key);
+typedef void (*block128_f) (const unsigned char in[16],
+                            unsigned char out[16], const void *key);
 
-typedef void (*cbc128_f)(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], int enc);
+typedef void (*cbc128_f) (const unsigned char *in, unsigned char *out,
+                          size_t len, const void *key,
+                          unsigned char ivec[16], int enc);
 
-typedef void (*ctr128_f)(const unsigned char *in, unsigned char *out,
-			size_t blocks, const void *key,
-			const unsigned char ivec[16]);
+typedef void (*ctr128_f) (const unsigned char *in, unsigned char *out,
+                          size_t blocks, const void *key,
+                          const unsigned char ivec[16]);
 
-typedef void (*ccm128_f)(const unsigned char *in, unsigned char *out,
-			size_t blocks, const void *key,
-			const unsigned char ivec[16],unsigned char cmac[16]);
+typedef void (*ccm128_f) (const unsigned char *in, unsigned char *out,
+                          size_t blocks, const void *key,
+                          const unsigned char ivec[16],
+                          unsigned char cmac[16]);
 
 void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], block128_f block);
+                           size_t len, const void *key,
+                           unsigned char ivec[16], block128_f block);
 void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], block128_f block);
+                           size_t len, const void *key,
+                           unsigned char ivec[16], block128_f block);
 
 void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], unsigned char ecount_buf[16],
-			unsigned int *num, block128_f block);
+                           size_t len, const void *key,
+                           unsigned char ivec[16],
+                           unsigned char ecount_buf[16], unsigned int *num,
+                           block128_f block);
 
 void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], unsigned char ecount_buf[16],
-			unsigned int *num, ctr128_f ctr);
+                                 size_t len, const void *key,
+                                 unsigned char ivec[16],
+                                 unsigned char ecount_buf[16],
+                                 unsigned int *num, ctr128_f ctr);
 
 void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], int *num,
-			block128_f block);
+                           size_t len, const void *key,
+                           unsigned char ivec[16], int *num,
+                           block128_f block);
 
 void CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], int *num,
-			int enc, block128_f block);
+                           size_t len, const void *key,
+                           unsigned char ivec[16], int *num,
+                           int enc, block128_f block);
 void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out,
-			size_t length, const void *key,
-			unsigned char ivec[16], int *num,
-			int enc, block128_f block);
+                             size_t length, const void *key,
+                             unsigned char ivec[16], int *num,
+                             int enc, block128_f block);
 void CRYPTO_cfb128_1_encrypt(const unsigned char *in, unsigned char *out,
-			size_t bits, const void *key,
-			unsigned char ivec[16], int *num,
-			int enc, block128_f block);
-
-size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], block128_f block);
+                             size_t bits, const void *key,
+                             unsigned char ivec[16], int *num,
+                             int enc, block128_f block);
+
+size_t CRYPTO_cts128_encrypt_block(const unsigned char *in,
+                                   unsigned char *out, size_t len,
+                                   const void *key, unsigned char ivec[16],
+                                   block128_f block);
 size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], cbc128_f cbc);
-size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], block128_f block);
+                             size_t len, const void *key,
+                             unsigned char ivec[16], cbc128_f cbc);
+size_t CRYPTO_cts128_decrypt_block(const unsigned char *in,
+                                   unsigned char *out, size_t len,
+                                   const void *key, unsigned char ivec[16],
+                                   block128_f block);
 size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], cbc128_f cbc);
-
-size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], block128_f block);
+                             size_t len, const void *key,
+                             unsigned char ivec[16], cbc128_f cbc);
+
+size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in,
+                                       unsigned char *out, size_t len,
+                                       const void *key,
+                                       unsigned char ivec[16],
+                                       block128_f block);
 size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], cbc128_f cbc);
-size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], block128_f block);
+                                 size_t len, const void *key,
+                                 unsigned char ivec[16], cbc128_f cbc);
+size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in,
+                                       unsigned char *out, size_t len,
+                                       const void *key,
+                                       unsigned char ivec[16],
+                                       block128_f block);
 size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], cbc128_f cbc);
+                                 size_t len, const void *key,
+                                 unsigned char ivec[16], cbc128_f cbc);
 
 typedef struct gcm128_context GCM128_CONTEXT;
 
 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block);
-void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block);
+void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block);
 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
-			size_t len);
+                         size_t len);
 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
-			size_t len);
+                      size_t len);
 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
-			const unsigned char *in, unsigned char *out,
-			size_t len);
+                          const unsigned char *in, unsigned char *out,
+                          size_t len);
 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
-			const unsigned char *in, unsigned char *out,
-			size_t len);
+                          const unsigned char *in, unsigned char *out,
+                          size_t len);
 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
-			const unsigned char *in, unsigned char *out,
-			size_t len, ctr128_f stream);
+                                const unsigned char *in, unsigned char *out,
+                                size_t len, ctr128_f stream);
 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
-			const unsigned char *in, unsigned char *out,
-			size_t len, ctr128_f stream);
-int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
-			size_t len);
+                                const unsigned char *in, unsigned char *out,
+                                size_t len, ctr128_f stream);
+int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
+                         size_t len);
 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len);
 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx);
 
 typedef struct ccm128_context CCM128_CONTEXT;
 
 void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
-	unsigned int M, unsigned int L, void *key,block128_f block);
-int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
-	const unsigned char *nonce, size_t nlen, size_t mlen);
-void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
-	const unsigned char *aad, size_t alen);
-int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
-	const unsigned char *inp, unsigned char *out, size_t len);
-int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
-	const unsigned char *inp, unsigned char *out, size_t len);
-int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
-	const unsigned char *inp, unsigned char *out, size_t len,
-	ccm128_f stream);
-int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
-	const unsigned char *inp, unsigned char *out, size_t len,
-	ccm128_f stream);
+                        unsigned int M, unsigned int L, void *key,
+                        block128_f block);
+int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx, const unsigned char *nonce,
+                        size_t nlen, size_t mlen);
+void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx, const unsigned char *aad,
+                       size_t alen);
+int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx, const unsigned char *inp,
+                          unsigned char *out, size_t len);
+int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx, const unsigned char *inp,
+                          unsigned char *out, size_t len);
+int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx, const unsigned char *inp,
+                                unsigned char *out, size_t len,
+                                ccm128_f stream);
+int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx, const unsigned char *inp,
+                                unsigned char *out, size_t len,
+                                ccm128_f stream);
 size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len);
 
 typedef struct xts128_context XTS128_CONTEXT;
 
-int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
-	const unsigned char *inp, unsigned char *out, size_t len, int enc);
+int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx,
+                          const unsigned char iv[16],
+                          const unsigned char *inp, unsigned char *out,
+                          size_t len, int enc);
+
+size_t CRYPTO_128_wrap(void *key, const unsigned char *iv,
+                       unsigned char *out,
+                       const unsigned char *in, size_t inlen,
+                       block128_f block);
+
+size_t CRYPTO_128_unwrap(void *key, const unsigned char *iv,
+                         unsigned char *out,
+                         const unsigned char *in, size_t inlen,
+                         block128_f block);
+
 #ifdef  __cplusplus
 }
 #endif
diff --git a/openssl/crypto/modes/modes_lcl.h b/openssl/crypto/modes/modes_lcl.h
index 9d83e1284..900f54ca2 100644
--- a/openssl/crypto/modes/modes_lcl.h
+++ b/openssl/crypto/modes/modes_lcl.h
@@ -7,122 +7,137 @@
 
 #include <openssl/modes.h>
 
-
 #if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
 typedef __int64 i64;
 typedef unsigned __int64 u64;
-#define U64(C) C##UI64
+# define U64(C) C##UI64
 #elif defined(__arch64__)
 typedef long i64;
 typedef unsigned long u64;
-#define U64(C) C##UL
+# define U64(C) C##UL
 #else
 typedef long long i64;
 typedef unsigned long long u64;
-#define U64(C) C##ULL
+# define U64(C) C##ULL
 #endif
 
 typedef unsigned int u32;
 typedef unsigned char u8;
 
 #define STRICT_ALIGNMENT 1
-#if defined(__i386)	|| defined(__i386__)	|| \
-    defined(__x86_64)	|| defined(__x86_64__)	|| \
-    defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64) || \
-    defined(__s390__)	|| defined(__s390x__)
-# undef STRICT_ALIGNMENT
+#ifndef PEDANTIC
+# if defined(__i386)    || defined(__i386__)    || \
+     defined(__x86_64)  || defined(__x86_64__)  || \
+     defined(_M_IX86)   || defined(_M_AMD64)    || defined(_M_X64) || \
+     defined(__aarch64__)                       || \
+     defined(__s390__)  || defined(__s390x__)
+#  undef STRICT_ALIGNMENT
+# endif
 #endif
 
 #if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
-#if defined(__GNUC__) && __GNUC__>=2
-# if defined(__x86_64) || defined(__x86_64__)
-#  define BSWAP8(x) ({	u64 ret=(x);			\
-			asm ("bswapq %0"		\
-			: "+r"(ret));	ret;		})
-#  define BSWAP4(x) ({	u32 ret=(x);			\
-			asm ("bswapl %0"		\
-			: "+r"(ret));	ret;		})
-# elif (defined(__i386) || defined(__i386__)) && !defined(I386_ONLY)
-#  define BSWAP8(x) ({	u32 lo=(u64)(x)>>32,hi=(x);	\
-			asm ("bswapl %0; bswapl %1"	\
-			: "+r"(hi),"+r"(lo));		\
-			(u64)hi<<32|lo;			})
-#  define BSWAP4(x) ({	u32 ret=(x);			\
-			asm ("bswapl %0"		\
-			: "+r"(ret));	ret;		})
-# elif (defined(__arm__) || defined(__arm)) && !defined(STRICT_ALIGNMENT)
-#  define BSWAP8(x) ({	u32 lo=(u64)(x)>>32,hi=(x);	\
-			asm ("rev %0,%0; rev %1,%1"	\
-			: "+r"(hi),"+r"(lo));		\
-			(u64)hi<<32|lo;			})
-#  define BSWAP4(x) ({	u32 ret;			\
-			asm ("rev %0,%1"		\
-			: "=r"(ret) : "r"((u32)(x)));	\
-			ret;				})
+# if defined(__GNUC__) && __GNUC__>=2
+#  if defined(__x86_64) || defined(__x86_64__)
+#   define BSWAP8(x) ({ u64 ret=(x);                    \
+                        asm ("bswapq %0"                \
+                        : "+r"(ret));   ret;            })
+#   define BSWAP4(x) ({ u32 ret=(x);                    \
+                        asm ("bswapl %0"                \
+                        : "+r"(ret));   ret;            })
+#  elif (defined(__i386) || defined(__i386__)) && !defined(I386_ONLY)
+#   define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x);     \
+                        asm ("bswapl %0; bswapl %1"     \
+                        : "+r"(hi),"+r"(lo));           \
+                        (u64)hi<<32|lo;                 })
+#   define BSWAP4(x) ({ u32 ret=(x);                    \
+                        asm ("bswapl %0"                \
+                        : "+r"(ret));   ret;            })
+#  elif defined(__aarch64__)
+#   define BSWAP8(x) ({ u64 ret;                        \
+                        asm ("rev %0,%1"                \
+                        : "=r"(ret) : "r"(x)); ret;     })
+#   define BSWAP4(x) ({ u32 ret;                        \
+                        asm ("rev %w0,%w1"              \
+                        : "=r"(ret) : "r"(x)); ret;     })
+#  elif (defined(__arm__) || defined(__arm)) && !defined(STRICT_ALIGNMENT)
+#   define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x);     \
+                        asm ("rev %0,%0; rev %1,%1"     \
+                        : "+r"(hi),"+r"(lo));           \
+                        (u64)hi<<32|lo;                 })
+#   define BSWAP4(x) ({ u32 ret;                        \
+                        asm ("rev %0,%1"                \
+                        : "=r"(ret) : "r"((u32)(x)));   \
+                        ret;                            })
+#  endif
+# elif defined(_MSC_VER)
+#  if _MSC_VER>=1300
+#   pragma intrinsic(_byteswap_uint64,_byteswap_ulong)
+#   define BSWAP8(x)    _byteswap_uint64((u64)(x))
+#   define BSWAP4(x)    _byteswap_ulong((u32)(x))
+#  elif defined(_M_IX86)
+__inline u32 _bswap4(u32 val)
+{
+_asm mov eax, val _asm bswap eax}
+#   define BSWAP4(x)    _bswap4(x)
+#  endif
 # endif
-#elif defined(_MSC_VER)
-# if _MSC_VER>=1300
-#  pragma intrinsic(_byteswap_uint64,_byteswap_ulong)
-#  define BSWAP8(x)	_byteswap_uint64((u64)(x))
-#  define BSWAP4(x)	_byteswap_ulong((u32)(x))
-# elif defined(_M_IX86)
-   __inline u32 _bswap4(u32 val) {
-	_asm mov eax,val
-	_asm bswap eax
-   }
-#  define BSWAP4(x)	_bswap4(x)
-# endif
-#endif
 #endif
-
 #if defined(BSWAP4) && !defined(STRICT_ALIGNMENT)
-#define GETU32(p)	BSWAP4(*(const u32 *)(p))
-#define PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
+# define GETU32(p)       BSWAP4(*(const u32 *)(p))
+# define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
 #else
-#define GETU32(p)	((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
-#define PUTU32(p,v)	((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))
+# define GETU32(p)       ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
+# define PUTU32(p,v)     ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))
 #endif
+/*- GCM definitions */ typedef struct {
+    u64 hi, lo;
+} u128;
 
-/* GCM definitions */
-
-typedef struct { u64 hi,lo; } u128;
-
-#ifdef	TABLE_BITS
-#undef	TABLE_BITS
+#ifdef  TABLE_BITS
+# undef  TABLE_BITS
 #endif
 /*
  * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
  * never be set to 8 [or 1]. For further information see gcm128.c.
  */
-#define	TABLE_BITS 4
+#define TABLE_BITS 4
 
 struct gcm128_context {
-	/* Following 6 names follow names in GCM specification */
-	union { u64 u[2]; u32 d[4]; u8 c[16]; size_t t[16/sizeof(size_t)]; }
-	  Yi,EKi,EK0,len,Xi,H;
-	/* Relative position of Xi, H and pre-computed Htable is used
-	 * in some assembler modules, i.e. don't change the order! */
+    /* Following 6 names follow names in GCM specification */
+    union {
+        u64 u[2];
+        u32 d[4];
+        u8 c[16];
+        size_t t[16 / sizeof(size_t)];
+    } Yi, EKi, EK0, len, Xi, H;
+    /*
+     * Relative position of Xi, H and pre-computed Htable is used in some
+     * assembler modules, i.e. don't change the order!
+     */
 #if TABLE_BITS==8
-	u128 Htable[256];
+    u128 Htable[256];
 #else
-	u128 Htable[16];
-	void (*gmult)(u64 Xi[2],const u128 Htable[16]);
-	void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+    u128 Htable[16];
+    void (*gmult) (u64 Xi[2], const u128 Htable[16]);
+    void (*ghash) (u64 Xi[2], const u128 Htable[16], const u8 *inp,
+                   size_t len);
 #endif
-	unsigned int mres, ares;
-	block128_f block;
-	void *key;
+    unsigned int mres, ares;
+    block128_f block;
+    void *key;
 };
 
 struct xts128_context {
-	void      *key1, *key2;
-	block128_f block1,block2;
+    void *key1, *key2;
+    block128_f block1, block2;
 };
 
 struct ccm128_context {
-	union { u64 u[2]; u8 c[16]; } nonce, cmac;
-	u64 blocks;
-	block128_f block;
-	void *key;
+    union {
+        u64 u[2];
+        u8 c[16];
+    } nonce, cmac;
+    u64 blocks;
+    block128_f block;
+    void *key;
 };
-
diff --git a/openssl/crypto/modes/ofb128.c b/openssl/crypto/modes/ofb128.c
index 01c01702c..4dbaccd7a 100644
--- a/openssl/crypto/modes/ofb128.c
+++ b/openssl/crypto/modes/ofb128.c
@@ -6,7 +6,7 @@
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer. 
+ *    notice, this list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
@@ -59,63 +59,66 @@
 #endif
 #include <assert.h>
 
-/* The input and output encrypted as though 128bit ofb mode is being
- * used.  The extra state information to record how much of the
- * 128bit block we have used is contained in *num;
+/*
+ * The input and output encrypted as though 128bit ofb mode is being used.
+ * The extra state information to record how much of the 128bit block we have
+ * used is contained in *num;
  */
 void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out,
-			size_t len, const void *key,
-			unsigned char ivec[16], int *num,
-			block128_f block)
+                           size_t len, const void *key,
+                           unsigned char ivec[16], int *num, block128_f block)
 {
-	unsigned int n;
-	size_t l=0;
+    unsigned int n;
+    size_t l = 0;
 
-	assert(in && out && key && ivec && num);
+    assert(in && out && key && ivec && num);
 
-	n = *num;
+    n = *num;
 
 #if !defined(OPENSSL_SMALL_FOOTPRINT)
-	if (16%sizeof(size_t) == 0) do { /* always true actually */
-		while (n && len) {
-			*(out++) = *(in++) ^ ivec[n];
-			--len;
-			n = (n+1) % 16;
-		}
-#if defined(STRICT_ALIGNMENT)
-		if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0)
-			break;
-#endif
-		while (len>=16) {
-			(*block)(ivec, ivec, key);
-			for (; n<16; n+=sizeof(size_t))
-				*(size_t*)(out+n) =
-				*(size_t*)(in+n) ^ *(size_t*)(ivec+n);
-			len -= 16;
-			out += 16;
-			in  += 16;
-			n = 0;
-		}
-		if (len) {
-			(*block)(ivec, ivec, key);
-			while (len--) {
-				out[n] = in[n] ^ ivec[n];
-				++n;
-			}
-		}
-		*num = n;
-		return;
-	} while(0);
-	/* the rest would be commonly eliminated by x86* compiler */
+    if (16 % sizeof(size_t) == 0) { /* always true actually */
+        do {
+            while (n && len) {
+                *(out++) = *(in++) ^ ivec[n];
+                --len;
+                n = (n + 1) % 16;
+            }
+# if defined(STRICT_ALIGNMENT)
+            if (((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) !=
+                0)
+                break;
+# endif
+            while (len >= 16) {
+                (*block) (ivec, ivec, key);
+                for (; n < 16; n += sizeof(size_t))
+                    *(size_t *)(out + n) =
+                        *(size_t *)(in + n) ^ *(size_t *)(ivec + n);
+                len -= 16;
+                out += 16;
+                in += 16;
+                n = 0;
+            }
+            if (len) {
+                (*block) (ivec, ivec, key);
+                while (len--) {
+                    out[n] = in[n] ^ ivec[n];
+                    ++n;
+                }
+            }
+            *num = n;
+            return;
+        } while (0);
+    }
+    /* the rest would be commonly eliminated by x86* compiler */
 #endif
-	while (l<len) {
-		if (n==0) {
-			(*block)(ivec, ivec, key);
-		}
-		out[l] = in[l] ^ ivec[n];
-		++l;
-		n = (n+1) % 16;
-	}
+    while (l < len) {
+        if (n == 0) {
+            (*block) (ivec, ivec, key);
+        }
+        out[l] = in[l] ^ ivec[n];
+        ++l;
+        n = (n + 1) % 16;
+    }
 
-	*num=n;
+    *num = n;
 }
diff --git a/openssl/crypto/modes/wrap128.c b/openssl/crypto/modes/wrap128.c
new file mode 100755
index 000000000..4dcaf0326
--- /dev/null
+++ b/openssl/crypto/modes/wrap128.c
@@ -0,0 +1,138 @@
+/* crypto/modes/wrap128.c */
+/*
+ * Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2013 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include "cryptlib.h"
+#include <openssl/modes.h>
+
+static const unsigned char default_iv[] = {
+    0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6,
+};
+
+/*
+ * Input size limit: lower than maximum of standards but far larger than
+ * anything that will be used in practice.
+ */
+#define CRYPTO128_WRAP_MAX (1UL << 31)
+
+size_t CRYPTO_128_wrap(void *key, const unsigned char *iv,
+                       unsigned char *out,
+                       const unsigned char *in, size_t inlen,
+                       block128_f block)
+{
+    unsigned char *A, B[16], *R;
+    size_t i, j, t;
+    if ((inlen & 0x7) || (inlen < 8) || (inlen > CRYPTO128_WRAP_MAX))
+        return 0;
+    A = B;
+    t = 1;
+    memcpy(out + 8, in, inlen);
+    if (!iv)
+        iv = default_iv;
+
+    memcpy(A, iv, 8);
+
+    for (j = 0; j < 6; j++) {
+        R = out + 8;
+        for (i = 0; i < inlen; i += 8, t++, R += 8) {
+            memcpy(B + 8, R, 8);
+            block(B, B, key);
+            A[7] ^= (unsigned char)(t & 0xff);
+            if (t > 0xff) {
+                A[6] ^= (unsigned char)((t >> 8) & 0xff);
+                A[5] ^= (unsigned char)((t >> 16) & 0xff);
+                A[4] ^= (unsigned char)((t >> 24) & 0xff);
+            }
+            memcpy(R, B + 8, 8);
+        }
+    }
+    memcpy(out, A, 8);
+    return inlen + 8;
+}
+
+size_t CRYPTO_128_unwrap(void *key, const unsigned char *iv,
+                         unsigned char *out,
+                         const unsigned char *in, size_t inlen,
+                         block128_f block)
+{
+    unsigned char *A, B[16], *R;
+    size_t i, j, t;
+    inlen -= 8;
+    if ((inlen & 0x7) || (inlen < 16) || (inlen > CRYPTO128_WRAP_MAX))
+        return 0;
+    A = B;
+    t = 6 * (inlen >> 3);
+    memcpy(A, in, 8);
+    memcpy(out, in + 8, inlen);
+    for (j = 0; j < 6; j++) {
+        R = out + inlen - 8;
+        for (i = 0; i < inlen; i += 8, t--, R -= 8) {
+            A[7] ^= (unsigned char)(t & 0xff);
+            if (t > 0xff) {
+                A[6] ^= (unsigned char)((t >> 8) & 0xff);
+                A[5] ^= (unsigned char)((t >> 16) & 0xff);
+                A[4] ^= (unsigned char)((t >> 24) & 0xff);
+            }
+            memcpy(B + 8, R, 8);
+            block(B, B, key);
+            memcpy(R, B + 8, 8);
+        }
+    }
+    if (!iv)
+        iv = default_iv;
+    if (memcmp(A, iv, 8)) {
+        OPENSSL_cleanse(out, inlen);
+        return 0;
+    }
+    return inlen;
+}
diff --git a/openssl/crypto/modes/xts128.c b/openssl/crypto/modes/xts128.c
index 9cf27a25e..8f2af588b 100644
--- a/openssl/crypto/modes/xts128.c
+++ b/openssl/crypto/modes/xts128.c
@@ -6,7 +6,7 @@
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer. 
+ *    notice, this list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
@@ -58,130 +58,147 @@
 #endif
 #include <assert.h>
 
-int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
-	const unsigned char *inp, unsigned char *out,
-	size_t len, int enc)
+int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx,
+                          const unsigned char iv[16],
+                          const unsigned char *inp, unsigned char *out,
+                          size_t len, int enc)
 {
-	const union { long one; char little; } is_endian = {1};
-	union { u64 u[2]; u32 d[4]; u8 c[16]; } tweak, scratch;
-	unsigned int i;
-
-	if (len<16) return -1;
-
-	memcpy(tweak.c, iv, 16);
-
-	(*ctx->block2)(tweak.c,tweak.c,ctx->key2);
-
-	if (!enc && (len%16)) len-=16;
-
-	while (len>=16) {
+    const union {
+        long one;
+        char little;
+    } is_endian = {
+        1
+    };
+    union {
+        u64 u[2];
+        u32 d[4];
+        u8 c[16];
+    } tweak, scratch;
+    unsigned int i;
+
+    if (len < 16)
+        return -1;
+
+    memcpy(tweak.c, iv, 16);
+
+    (*ctx->block2) (tweak.c, tweak.c, ctx->key2);
+
+    if (!enc && (len % 16))
+        len -= 16;
+
+    while (len >= 16) {
 #if defined(STRICT_ALIGNMENT)
-		memcpy(scratch.c,inp,16);
-		scratch.u[0] ^= tweak.u[0];
-		scratch.u[1] ^= tweak.u[1];
+        memcpy(scratch.c, inp, 16);
+        scratch.u[0] ^= tweak.u[0];
+        scratch.u[1] ^= tweak.u[1];
 #else
-		scratch.u[0] = ((u64*)inp)[0]^tweak.u[0];
-		scratch.u[1] = ((u64*)inp)[1]^tweak.u[1];
+        scratch.u[0] = ((u64 *)inp)[0] ^ tweak.u[0];
+        scratch.u[1] = ((u64 *)inp)[1] ^ tweak.u[1];
 #endif
-		(*ctx->block1)(scratch.c,scratch.c,ctx->key1);
+        (*ctx->block1) (scratch.c, scratch.c, ctx->key1);
 #if defined(STRICT_ALIGNMENT)
-		scratch.u[0] ^= tweak.u[0];
-		scratch.u[1] ^= tweak.u[1];
-		memcpy(out,scratch.c,16);
+        scratch.u[0] ^= tweak.u[0];
+        scratch.u[1] ^= tweak.u[1];
+        memcpy(out, scratch.c, 16);
 #else
-		((u64*)out)[0] = scratch.u[0]^=tweak.u[0];
-		((u64*)out)[1] = scratch.u[1]^=tweak.u[1];
+        ((u64 *)out)[0] = scratch.u[0] ^= tweak.u[0];
+        ((u64 *)out)[1] = scratch.u[1] ^= tweak.u[1];
 #endif
-		inp += 16;
-		out += 16;
-		len -= 16;
-
-		if (len==0)	return 0;
-
-		if (is_endian.little) {
-			unsigned int carry,res;
-			
-			res = 0x87&(((int)tweak.d[3])>>31);
-			carry = (unsigned int)(tweak.u[0]>>63);
-			tweak.u[0] = (tweak.u[0]<<1)^res;
-			tweak.u[1] = (tweak.u[1]<<1)|carry;
-		}
-		else {
-			size_t c;
-
-			for (c=0,i=0;i<16;++i) {
-				/*+ substitutes for |, because c is 1 bit */ 
-				c += ((size_t)tweak.c[i])<<1;
-				tweak.c[i] = (u8)c;
-				c = c>>8;
-			}
-			tweak.c[0] ^= (u8)(0x87&(0-c));
-		}
-	}
-	if (enc) {
-		for (i=0;i<len;++i) {
-			u8 c = inp[i];
-			out[i] = scratch.c[i];
-			scratch.c[i] = c;
-		}
-		scratch.u[0] ^= tweak.u[0];
-		scratch.u[1] ^= tweak.u[1];
-		(*ctx->block1)(scratch.c,scratch.c,ctx->key1);
-		scratch.u[0] ^= tweak.u[0];
-		scratch.u[1] ^= tweak.u[1];
-		memcpy(out-16,scratch.c,16);
-	}
-	else {
-		union { u64 u[2]; u8 c[16]; } tweak1;
-
-		if (is_endian.little) {
-			unsigned int carry,res;
-
-			res = 0x87&(((int)tweak.d[3])>>31);
-			carry = (unsigned int)(tweak.u[0]>>63);
-			tweak1.u[0] = (tweak.u[0]<<1)^res;
-			tweak1.u[1] = (tweak.u[1]<<1)|carry;
-		}
-		else {
-			size_t c;
-
-			for (c=0,i=0;i<16;++i) {
-				/*+ substitutes for |, because c is 1 bit */ 
-				c += ((size_t)tweak.c[i])<<1;
-				tweak1.c[i] = (u8)c;
-				c = c>>8;
-			}
-			tweak1.c[0] ^= (u8)(0x87&(0-c));
-		}
+        inp += 16;
+        out += 16;
+        len -= 16;
+
+        if (len == 0)
+            return 0;
+
+        if (is_endian.little) {
+            unsigned int carry, res;
+
+            res = 0x87 & (((int)tweak.d[3]) >> 31);
+            carry = (unsigned int)(tweak.u[0] >> 63);
+            tweak.u[0] = (tweak.u[0] << 1) ^ res;
+            tweak.u[1] = (tweak.u[1] << 1) | carry;
+        } else {
+            size_t c;
+
+            for (c = 0, i = 0; i < 16; ++i) {
+                /*
+                 * + substitutes for |, because c is 1 bit
+                 */
+                c += ((size_t)tweak.c[i]) << 1;
+                tweak.c[i] = (u8)c;
+                c = c >> 8;
+            }
+            tweak.c[0] ^= (u8)(0x87 & (0 - c));
+        }
+    }
+    if (enc) {
+        for (i = 0; i < len; ++i) {
+            u8 c = inp[i];
+            out[i] = scratch.c[i];
+            scratch.c[i] = c;
+        }
+        scratch.u[0] ^= tweak.u[0];
+        scratch.u[1] ^= tweak.u[1];
+        (*ctx->block1) (scratch.c, scratch.c, ctx->key1);
+        scratch.u[0] ^= tweak.u[0];
+        scratch.u[1] ^= tweak.u[1];
+        memcpy(out - 16, scratch.c, 16);
+    } else {
+        union {
+            u64 u[2];
+            u8 c[16];
+        } tweak1;
+
+        if (is_endian.little) {
+            unsigned int carry, res;
+
+            res = 0x87 & (((int)tweak.d[3]) >> 31);
+            carry = (unsigned int)(tweak.u[0] >> 63);
+            tweak1.u[0] = (tweak.u[0] << 1) ^ res;
+            tweak1.u[1] = (tweak.u[1] << 1) | carry;
+        } else {
+            size_t c;
+
+            for (c = 0, i = 0; i < 16; ++i) {
+                /*
+                 * + substitutes for |, because c is 1 bit
+                 */
+                c += ((size_t)tweak.c[i]) << 1;
+                tweak1.c[i] = (u8)c;
+                c = c >> 8;
+            }
+            tweak1.c[0] ^= (u8)(0x87 & (0 - c));
+        }
 #if defined(STRICT_ALIGNMENT)
-		memcpy(scratch.c,inp,16);
-		scratch.u[0] ^= tweak1.u[0];
-		scratch.u[1] ^= tweak1.u[1];
+        memcpy(scratch.c, inp, 16);
+        scratch.u[0] ^= tweak1.u[0];
+        scratch.u[1] ^= tweak1.u[1];
 #else
-		scratch.u[0] = ((u64*)inp)[0]^tweak1.u[0];
-		scratch.u[1] = ((u64*)inp)[1]^tweak1.u[1];
+        scratch.u[0] = ((u64 *)inp)[0] ^ tweak1.u[0];
+        scratch.u[1] = ((u64 *)inp)[1] ^ tweak1.u[1];
 #endif
-		(*ctx->block1)(scratch.c,scratch.c,ctx->key1);
-		scratch.u[0] ^= tweak1.u[0];
-		scratch.u[1] ^= tweak1.u[1];
-
-		for (i=0;i<len;++i) {
-			u8 c = inp[16+i];
-			out[16+i] = scratch.c[i];
-			scratch.c[i] = c;
-		}
-		scratch.u[0] ^= tweak.u[0];
-		scratch.u[1] ^= tweak.u[1];
-		(*ctx->block1)(scratch.c,scratch.c,ctx->key1);
+        (*ctx->block1) (scratch.c, scratch.c, ctx->key1);
+        scratch.u[0] ^= tweak1.u[0];
+        scratch.u[1] ^= tweak1.u[1];
+
+        for (i = 0; i < len; ++i) {
+            u8 c = inp[16 + i];
+            out[16 + i] = scratch.c[i];
+            scratch.c[i] = c;
+        }
+        scratch.u[0] ^= tweak.u[0];
+        scratch.u[1] ^= tweak.u[1];
+        (*ctx->block1) (scratch.c, scratch.c, ctx->key1);
 #if defined(STRICT_ALIGNMENT)
-		scratch.u[0] ^= tweak.u[0];
-		scratch.u[1] ^= tweak.u[1];
-		memcpy (out,scratch.c,16);
+        scratch.u[0] ^= tweak.u[0];
+        scratch.u[1] ^= tweak.u[1];
+        memcpy(out, scratch.c, 16);
 #else
-		((u64*)out)[0] = scratch.u[0]^tweak.u[0];
-		((u64*)out)[1] = scratch.u[1]^tweak.u[1];
+        ((u64 *)out)[0] = scratch.u[0] ^ tweak.u[0];
+        ((u64 *)out)[1] = scratch.u[1] ^ tweak.u[1];
 #endif
-	}
+    }
 
-	return 0;
+    return 0;
 }