diff options
| author | marha <marha@users.sourceforge.net> | 2015-02-22 21:39:56 +0100 | 
|---|---|---|
| committer | marha <marha@users.sourceforge.net> | 2015-02-22 21:39:56 +0100 | 
| commit | 462f18c7b25fe3e467f837647d07ab0a78aa8d2b (patch) | |
| tree | fc8013c0a1bac05a1945846c1697e973f4c35013 /openssl/crypto/modes | |
| parent | 36f711ee12b6dd5184198abed3aa551efb585587 (diff) | |
| download | vcxsrv-462f18c7b25fe3e467f837647d07ab0a78aa8d2b.tar.gz vcxsrv-462f18c7b25fe3e467f837647d07ab0a78aa8d2b.tar.bz2 vcxsrv-462f18c7b25fe3e467f837647d07ab0a78aa8d2b.zip | |
Merged origin/release (checked in because wanted to merge new stuff)
Diffstat (limited to 'openssl/crypto/modes')
| -rw-r--r-- | openssl/crypto/modes/Makefile | 24 | ||||
| -rwxr-xr-x | openssl/crypto/modes/asm/aesni-gcm-x86_64.pl | 1057 | ||||
| -rw-r--r-- | openssl/crypto/modes/asm/ghash-armv4.pl | 232 | ||||
| -rw-r--r-- | openssl/crypto/modes/asm/ghash-s390x.pl | 6 | ||||
| -rw-r--r-- | openssl/crypto/modes/asm/ghash-sparcv9.pl | 247 | ||||
| -rw-r--r-- | openssl/crypto/modes/asm/ghash-x86.pl | 199 | ||||
| -rw-r--r-- | openssl/crypto/modes/asm/ghash-x86_64.pl | 1149 | ||||
| -rwxr-xr-x | openssl/crypto/modes/asm/ghashp8-ppc.pl | 234 | ||||
| -rwxr-xr-x | openssl/crypto/modes/asm/ghashv8-armx.pl | 241 | ||||
| -rw-r--r-- | openssl/crypto/modes/cbc128.c | 252 | ||||
| -rw-r--r-- | openssl/crypto/modes/ccm128.c | 682 | ||||
| -rw-r--r-- | openssl/crypto/modes/cfb128.c | 292 | ||||
| -rw-r--r-- | openssl/crypto/modes/ctr128.c | 354 | ||||
| -rw-r--r-- | openssl/crypto/modes/cts128.c | 707 | ||||
| -rw-r--r-- | openssl/crypto/modes/gcm128.c | 3478 | ||||
| -rw-r--r-- | openssl/crypto/modes/modes.h | 194 | ||||
| -rw-r--r-- | openssl/crypto/modes/modes_lcl.h | 171 | ||||
| -rw-r--r-- | openssl/crypto/modes/ofb128.c | 105 | ||||
| -rwxr-xr-x | openssl/crypto/modes/wrap128.c | 138 | ||||
| -rw-r--r-- | openssl/crypto/modes/xts128.c | 243 | 
20 files changed, 6834 insertions, 3171 deletions
| diff --git a/openssl/crypto/modes/Makefile b/openssl/crypto/modes/Makefile index 3d8bafd57..cbcbfad4b 100644 --- a/openssl/crypto/modes/Makefile +++ b/openssl/crypto/modes/Makefile @@ -22,9 +22,9 @@ APPS=  LIB=$(TOP)/libcrypto.a  LIBSRC= cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c \ -	ccm128.c xts128.c +	ccm128.c xts128.c wrap128.c  LIBOBJ= cbc128.o ctr128.o cts128.o cfb128.o ofb128.o gcm128.o \ -	ccm128.o xts128.o $(MODES_ASM_OBJ) +	ccm128.o xts128.o wrap128.o $(MODES_ASM_OBJ)  SRC= $(LIBSRC) @@ -50,20 +50,26 @@ ghash-x86.s:	asm/ghash-x86.pl  	$(PERL) asm/ghash-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@  ghash-x86_64.s:	asm/ghash-x86_64.pl  	$(PERL) asm/ghash-x86_64.pl $(PERLASM_SCHEME) > $@ +aesni-gcm-x86_64.s:	asm/aesni-gcm-x86_64.pl +	$(PERL) asm/aesni-gcm-x86_64.pl $(PERLASM_SCHEME) > $@  ghash-sparcv9.s:	asm/ghash-sparcv9.pl  	$(PERL) asm/ghash-sparcv9.pl $@ $(CFLAGS)  ghash-alpha.s:	asm/ghash-alpha.pl -	(preproc=/tmp/$$$$.$@; trap "rm $$preproc" INT; \ +	(preproc=$$$$.$@.S; trap "rm $$preproc" INT; \  	$(PERL) asm/ghash-alpha.pl > $$preproc && \ -	$(CC) -E $$preproc > $@ && rm $$preproc) - +	$(CC) -E -P $$preproc > $@ && rm $$preproc)  ghash-parisc.s:	asm/ghash-parisc.pl  	$(PERL) asm/ghash-parisc.pl $(PERLASM_SCHEME) $@ +ghashv8-armx.S:	asm/ghashv8-armx.pl +	$(PERL) asm/ghashv8-armx.pl $(PERLASM_SCHEME) $@ +ghashp8-ppc.s:	asm/ghashp8-ppc.pl +	$(PERL) asm/ghashp8-ppc.pl $(PERLASM_SCHEME) $@  # GNU make "catch all"  ghash-%.S:	asm/ghash-%.pl;	$(PERL) $< $(PERLASM_SCHEME) $@  ghash-armv4.o:	ghash-armv4.S +ghashv8-armx.o:	ghashv8-armx.S  files:  	$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO @@ -137,6 +143,14 @@ ofb128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h  ofb128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h  ofb128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h  ofb128.o: ../../include/openssl/symhacks.h modes_lcl.h ofb128.c +wrap128.o: ../../e_os.h 
../../include/openssl/bio.h +wrap128.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h +wrap128.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h +wrap128.o: ../../include/openssl/lhash.h ../../include/openssl/modes.h +wrap128.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h +wrap128.o: ../../include/openssl/ossl_typ.h ../../include/openssl/safestack.h +wrap128.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h +wrap128.o: ../cryptlib.h wrap128.c  xts128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h  xts128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h  xts128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h diff --git a/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl b/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl new file mode 100755 index 000000000..7e4e04ea2 --- /dev/null +++ b/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl @@ -0,0 +1,1057 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# +# AES-NI-CTR+GHASH stitch. +# +# February 2013 +# +# OpenSSL GCM implementation is organized in such way that its +# performance is rather close to the sum of its streamed components, +# in the context parallelized AES-NI CTR and modulo-scheduled +# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation +# was observed to perform significantly better than the sum of the +# components on contemporary CPUs, the effort was deemed impossible to +# justify. 
This module is based on combination of Intel submissions, +# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max +# Locktyukhin of Intel Corp. who verified that it reduces shuffles +# pressure with notable relative improvement, achieving 1.0 cycle per +# byte processed with 128-bit key on Haswell processor, and 0.74 - +# on Broadwell. [Mentioned results are raw profiled measurements for +# favourable packet size, one divisible by 96. Applications using the +# EVP interface will observe a few percent worse performance.] +# +# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest +# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf + +$flavour = shift; +$output  = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` +		=~ /GNU assembler version ([2-9]\.[0-9]+)/) { +	$avx = ($1>=2.19) + ($1>=2.22); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && +	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { +	$avx = ($1>=2.09) + ($1>=2.10); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && +	    `ml64 2>&1` =~ /Version ([0-9]+)\./) { +	$avx = ($1>=10) + ($1>=11); +} + +if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) { +	$avx = ($2>=3.0) + ($2>3.0); +} + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +if ($avx>1) {{{ + +($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); + +($Ii,$T1,$T2,$Hkey, + $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8)); + 
+($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15)); + +($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15"); + +$code=<<___; +.text + +.type	_aesni_ctr32_ghash_6x,\@abi-omnipotent +.align	32 +_aesni_ctr32_ghash_6x: +	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb +	sub		\$6,$len +	vpxor		$Z0,$Z0,$Z0		# $Z0   = 0 +	vmovdqu		0x00-0x80($key),$rndkey +	vpaddb		$T2,$T1,$inout1 +	vpaddb		$T2,$inout1,$inout2 +	vpaddb		$T2,$inout2,$inout3 +	vpaddb		$T2,$inout3,$inout4 +	vpaddb		$T2,$inout4,$inout5 +	vpxor		$rndkey,$T1,$inout0 +	vmovdqu		$Z0,16+8(%rsp)		# "$Z3" = 0 +	jmp		.Loop6x + +.align	32 +.Loop6x: +	add		\$`6<<24`,$counter +	jc		.Lhandle_ctr32		# discard $inout[1-5]? +	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1 +	  vpaddb	$T2,$inout5,$T1		# next counter value +	  vpxor		$rndkey,$inout1,$inout1 +	  vpxor		$rndkey,$inout2,$inout2 + +.Lresume_ctr32: +	vmovdqu		$T1,($ivp)		# save next counter value +	vpclmulqdq	\$0x10,$Hkey,$Z3,$Z1 +	  vpxor		$rndkey,$inout3,$inout3 +	  vmovups	0x10-0x80($key),$T2	# borrow $T2 for $rndkey +	vpclmulqdq	\$0x01,$Hkey,$Z3,$Z2 +	xor		%r12,%r12 +	cmp		$in0,$end0 + +	  vaesenc	$T2,$inout0,$inout0 +	vmovdqu		0x30+8(%rsp),$Ii	# I[4] +	  vpxor		$rndkey,$inout4,$inout4 +	vpclmulqdq	\$0x00,$Hkey,$Z3,$T1 +	  vaesenc	$T2,$inout1,$inout1 +	  vpxor		$rndkey,$inout5,$inout5 +	setnc		%r12b +	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3 +	  vaesenc	$T2,$inout2,$inout2 +	vmovdqu		0x10-0x20($Xip),$Hkey	# $Hkey^2 +	neg		%r12 +	  vaesenc	$T2,$inout3,$inout3 +	 vpxor		$Z1,$Z2,$Z2 +	vpclmulqdq	\$0x00,$Hkey,$Ii,$Z1 +	 vpxor		$Z0,$Xi,$Xi		# modulo-scheduled +	  vaesenc	$T2,$inout4,$inout4 +	 vpxor		$Z1,$T1,$Z0 +	and		\$0x60,%r12 +	  vmovups	0x20-0x80($key),$rndkey +	vpclmulqdq	\$0x10,$Hkey,$Ii,$T1 +	  vaesenc	$T2,$inout5,$inout5 + +	vpclmulqdq	\$0x01,$Hkey,$Ii,$T2 +	lea		($in0,%r12),$in0 +	  vaesenc	$rndkey,$inout0,$inout0 +	 vpxor		16+8(%rsp),$Xi,$Xi	# modulo-scheduled [vpxor $Z3,$Xi,$Xi] +	vpclmulqdq	
\$0x11,$Hkey,$Ii,$Hkey +	 vmovdqu	0x40+8(%rsp),$Ii	# I[3] +	  vaesenc	$rndkey,$inout1,$inout1 +	movbe		0x58($in0),%r13 +	  vaesenc	$rndkey,$inout2,$inout2 +	movbe		0x50($in0),%r12 +	  vaesenc	$rndkey,$inout3,$inout3 +	mov		%r13,0x20+8(%rsp) +	  vaesenc	$rndkey,$inout4,$inout4 +	mov		%r12,0x28+8(%rsp) +	vmovdqu		0x30-0x20($Xip),$Z1	# borrow $Z1 for $Hkey^3 +	  vaesenc	$rndkey,$inout5,$inout5 + +	  vmovups	0x30-0x80($key),$rndkey +	 vpxor		$T1,$Z2,$Z2 +	vpclmulqdq	\$0x00,$Z1,$Ii,$T1 +	  vaesenc	$rndkey,$inout0,$inout0 +	 vpxor		$T2,$Z2,$Z2 +	vpclmulqdq	\$0x10,$Z1,$Ii,$T2 +	  vaesenc	$rndkey,$inout1,$inout1 +	 vpxor		$Hkey,$Z3,$Z3 +	vpclmulqdq	\$0x01,$Z1,$Ii,$Hkey +	  vaesenc	$rndkey,$inout2,$inout2 +	vpclmulqdq	\$0x11,$Z1,$Ii,$Z1 +	 vmovdqu	0x50+8(%rsp),$Ii	# I[2] +	  vaesenc	$rndkey,$inout3,$inout3 +	  vaesenc	$rndkey,$inout4,$inout4 +	 vpxor		$T1,$Z0,$Z0 +	vmovdqu		0x40-0x20($Xip),$T1	# borrow $T1 for $Hkey^4 +	  vaesenc	$rndkey,$inout5,$inout5 + +	  vmovups	0x40-0x80($key),$rndkey +	 vpxor		$T2,$Z2,$Z2 +	vpclmulqdq	\$0x00,$T1,$Ii,$T2 +	  vaesenc	$rndkey,$inout0,$inout0 +	 vpxor		$Hkey,$Z2,$Z2 +	vpclmulqdq	\$0x10,$T1,$Ii,$Hkey +	  vaesenc	$rndkey,$inout1,$inout1 +	movbe		0x48($in0),%r13 +	 vpxor		$Z1,$Z3,$Z3 +	vpclmulqdq	\$0x01,$T1,$Ii,$Z1 +	  vaesenc	$rndkey,$inout2,$inout2 +	movbe		0x40($in0),%r12 +	vpclmulqdq	\$0x11,$T1,$Ii,$T1 +	 vmovdqu	0x60+8(%rsp),$Ii	# I[1] +	  vaesenc	$rndkey,$inout3,$inout3 +	mov		%r13,0x30+8(%rsp) +	  vaesenc	$rndkey,$inout4,$inout4 +	mov		%r12,0x38+8(%rsp) +	 vpxor		$T2,$Z0,$Z0 +	vmovdqu		0x60-0x20($Xip),$T2	# borrow $T2 for $Hkey^5 +	  vaesenc	$rndkey,$inout5,$inout5 + +	  vmovups	0x50-0x80($key),$rndkey +	 vpxor		$Hkey,$Z2,$Z2 +	vpclmulqdq	\$0x00,$T2,$Ii,$Hkey +	  vaesenc	$rndkey,$inout0,$inout0 +	 vpxor		$Z1,$Z2,$Z2 +	vpclmulqdq	\$0x10,$T2,$Ii,$Z1 +	  vaesenc	$rndkey,$inout1,$inout1 +	movbe		0x38($in0),%r13 +	 vpxor		$T1,$Z3,$Z3 +	vpclmulqdq	\$0x01,$T2,$Ii,$T1 +	 vpxor		0x70+8(%rsp),$Xi,$Xi	# accumulate I[0] +	  vaesenc	
$rndkey,$inout2,$inout2 +	movbe		0x30($in0),%r12 +	vpclmulqdq	\$0x11,$T2,$Ii,$T2 +	  vaesenc	$rndkey,$inout3,$inout3 +	mov		%r13,0x40+8(%rsp) +	  vaesenc	$rndkey,$inout4,$inout4 +	mov		%r12,0x48+8(%rsp) +	 vpxor		$Hkey,$Z0,$Z0 +	 vmovdqu	0x70-0x20($Xip),$Hkey	# $Hkey^6 +	  vaesenc	$rndkey,$inout5,$inout5 + +	  vmovups	0x60-0x80($key),$rndkey +	 vpxor		$Z1,$Z2,$Z2 +	vpclmulqdq	\$0x10,$Hkey,$Xi,$Z1 +	  vaesenc	$rndkey,$inout0,$inout0 +	 vpxor		$T1,$Z2,$Z2 +	vpclmulqdq	\$0x01,$Hkey,$Xi,$T1 +	  vaesenc	$rndkey,$inout1,$inout1 +	movbe		0x28($in0),%r13 +	 vpxor		$T2,$Z3,$Z3 +	vpclmulqdq	\$0x00,$Hkey,$Xi,$T2 +	  vaesenc	$rndkey,$inout2,$inout2 +	movbe		0x20($in0),%r12 +	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xi +	  vaesenc	$rndkey,$inout3,$inout3 +	mov		%r13,0x50+8(%rsp) +	  vaesenc	$rndkey,$inout4,$inout4 +	mov		%r12,0x58+8(%rsp) +	vpxor		$Z1,$Z2,$Z2 +	  vaesenc	$rndkey,$inout5,$inout5 +	vpxor		$T1,$Z2,$Z2 + +	  vmovups	0x70-0x80($key),$rndkey +	vpslldq		\$8,$Z2,$Z1 +	vpxor		$T2,$Z0,$Z0 +	vmovdqu		0x10($const),$Hkey	# .Lpoly + +	  vaesenc	$rndkey,$inout0,$inout0 +	vpxor		$Xi,$Z3,$Z3 +	  vaesenc	$rndkey,$inout1,$inout1 +	vpxor		$Z1,$Z0,$Z0 +	movbe		0x18($in0),%r13 +	  vaesenc	$rndkey,$inout2,$inout2 +	movbe		0x10($in0),%r12 +	vpalignr	\$8,$Z0,$Z0,$Ii		# 1st phase +	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0 +	mov		%r13,0x60+8(%rsp) +	  vaesenc	$rndkey,$inout3,$inout3 +	mov		%r12,0x68+8(%rsp) +	  vaesenc	$rndkey,$inout4,$inout4 +	  vmovups	0x80-0x80($key),$T1	# borrow $T1 for $rndkey +	  vaesenc	$rndkey,$inout5,$inout5 + +	  vaesenc	$T1,$inout0,$inout0 +	  vmovups	0x90-0x80($key),$rndkey +	  vaesenc	$T1,$inout1,$inout1 +	vpsrldq		\$8,$Z2,$Z2 +	  vaesenc	$T1,$inout2,$inout2 +	vpxor		$Z2,$Z3,$Z3 +	  vaesenc	$T1,$inout3,$inout3 +	vpxor		$Ii,$Z0,$Z0 +	movbe		0x08($in0),%r13 +	  vaesenc	$T1,$inout4,$inout4 +	movbe		0x00($in0),%r12 +	  vaesenc	$T1,$inout5,$inout5 +	  vmovups	0xa0-0x80($key),$T1 +	  cmp		\$11,$rounds +	  jb		.Lenc_tail		# 128-bit key + +	  vaesenc	$rndkey,$inout0,$inout0 +	  vaesenc	
$rndkey,$inout1,$inout1 +	  vaesenc	$rndkey,$inout2,$inout2 +	  vaesenc	$rndkey,$inout3,$inout3 +	  vaesenc	$rndkey,$inout4,$inout4 +	  vaesenc	$rndkey,$inout5,$inout5 + +	  vaesenc	$T1,$inout0,$inout0 +	  vaesenc	$T1,$inout1,$inout1 +	  vaesenc	$T1,$inout2,$inout2 +	  vaesenc	$T1,$inout3,$inout3 +	  vaesenc	$T1,$inout4,$inout4 +	  vmovups	0xb0-0x80($key),$rndkey +	  vaesenc	$T1,$inout5,$inout5 +	  vmovups	0xc0-0x80($key),$T1 +	  je		.Lenc_tail		# 192-bit key + +	  vaesenc	$rndkey,$inout0,$inout0 +	  vaesenc	$rndkey,$inout1,$inout1 +	  vaesenc	$rndkey,$inout2,$inout2 +	  vaesenc	$rndkey,$inout3,$inout3 +	  vaesenc	$rndkey,$inout4,$inout4 +	  vaesenc	$rndkey,$inout5,$inout5 + +	  vaesenc	$T1,$inout0,$inout0 +	  vaesenc	$T1,$inout1,$inout1 +	  vaesenc	$T1,$inout2,$inout2 +	  vaesenc	$T1,$inout3,$inout3 +	  vaesenc	$T1,$inout4,$inout4 +	  vmovups	0xd0-0x80($key),$rndkey +	  vaesenc	$T1,$inout5,$inout5 +	  vmovups	0xe0-0x80($key),$T1 +	  jmp		.Lenc_tail		# 256-bit key + +.align	32 +.Lhandle_ctr32: +	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask +	  vpshufb	$Ii,$T1,$Z2		# byte-swap counter +	  vmovdqu	0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb +	  vpaddd	0x40($const),$Z2,$inout1	# .Lone_lsb +	  vpaddd	$Z1,$Z2,$inout2 +	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1 +	  vpaddd	$Z1,$inout1,$inout3 +	  vpshufb	$Ii,$inout1,$inout1 +	  vpaddd	$Z1,$inout2,$inout4 +	  vpshufb	$Ii,$inout2,$inout2 +	  vpxor		$rndkey,$inout1,$inout1 +	  vpaddd	$Z1,$inout3,$inout5 +	  vpshufb	$Ii,$inout3,$inout3 +	  vpxor		$rndkey,$inout2,$inout2 +	  vpaddd	$Z1,$inout4,$T1		# byte-swapped next counter value +	  vpshufb	$Ii,$inout4,$inout4 +	  vpshufb	$Ii,$inout5,$inout5 +	  vpshufb	$Ii,$T1,$T1		# next counter value +	jmp		.Lresume_ctr32 + +.align	32 +.Lenc_tail: +	  vaesenc	$rndkey,$inout0,$inout0 +	vmovdqu		$Z3,16+8(%rsp)		# postpone vpxor $Z3,$Xi,$Xi +	vpalignr	\$8,$Z0,$Z0,$Xi		# 2nd phase +	  vaesenc	$rndkey,$inout1,$inout1 +	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0 +	  vpxor		0x00($inp),$T1,$T2 +	  
vaesenc	$rndkey,$inout2,$inout2 +	  vpxor		0x10($inp),$T1,$Ii +	  vaesenc	$rndkey,$inout3,$inout3 +	  vpxor		0x20($inp),$T1,$Z1 +	  vaesenc	$rndkey,$inout4,$inout4 +	  vpxor		0x30($inp),$T1,$Z2 +	  vaesenc	$rndkey,$inout5,$inout5 +	  vpxor		0x40($inp),$T1,$Z3 +	  vpxor		0x50($inp),$T1,$Hkey +	  vmovdqu	($ivp),$T1		# load next counter value + +	  vaesenclast	$T2,$inout0,$inout0 +	  vmovdqu	0x20($const),$T2	# borrow $T2, .Lone_msb +	  vaesenclast	$Ii,$inout1,$inout1 +	 vpaddb		$T2,$T1,$Ii +	mov		%r13,0x70+8(%rsp) +	lea		0x60($inp),$inp +	  vaesenclast	$Z1,$inout2,$inout2 +	 vpaddb		$T2,$Ii,$Z1 +	mov		%r12,0x78+8(%rsp) +	lea		0x60($out),$out +	  vmovdqu	0x00-0x80($key),$rndkey +	  vaesenclast	$Z2,$inout3,$inout3 +	 vpaddb		$T2,$Z1,$Z2 +	  vaesenclast	$Z3, $inout4,$inout4 +	 vpaddb		$T2,$Z2,$Z3 +	  vaesenclast	$Hkey,$inout5,$inout5 +	 vpaddb		$T2,$Z3,$Hkey + +	add		\$0x60,$ret +	sub		\$0x6,$len +	jc		.L6x_done + +	  vmovups	$inout0,-0x60($out)	# save output +	 vpxor		$rndkey,$T1,$inout0 +	  vmovups	$inout1,-0x50($out) +	 vmovdqa	$Ii,$inout1		# 0 latency +	  vmovups	$inout2,-0x40($out) +	 vmovdqa	$Z1,$inout2		# 0 latency +	  vmovups	$inout3,-0x30($out) +	 vmovdqa	$Z2,$inout3		# 0 latency +	  vmovups	$inout4,-0x20($out) +	 vmovdqa	$Z3,$inout4		# 0 latency +	  vmovups	$inout5,-0x10($out) +	 vmovdqa	$Hkey,$inout5		# 0 latency +	vmovdqu		0x20+8(%rsp),$Z3	# I[5] +	jmp		.Loop6x + +.L6x_done: +	vpxor		16+8(%rsp),$Xi,$Xi	# modulo-scheduled +	vpxor		$Z0,$Xi,$Xi		# modulo-scheduled + +	ret +.size	_aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x +___ +###################################################################### +# +# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len, +#		const AES_KEY *key, unsigned char iv[16], +#		struct { u128 Xi,H,Htbl[9]; } *Xip); +$code.=<<___; +.globl	aesni_gcm_decrypt +.type	aesni_gcm_decrypt,\@function,6 +.align	32 +aesni_gcm_decrypt: +	xor	$ret,$ret +	cmp	\$0x60,$len			# minimal accepted length +	jb	.Lgcm_dec_abort + +	lea	
(%rsp),%rax			# save stack pointer +	push	%rbx +	push	%rbp +	push	%r12 +	push	%r13 +	push	%r14 +	push	%r15 +___ +$code.=<<___ if ($win64); +	lea	-0xa8(%rsp),%rsp +	movaps	%xmm6,-0xd8(%rax) +	movaps	%xmm7,-0xc8(%rax) +	movaps	%xmm8,-0xb8(%rax) +	movaps	%xmm9,-0xa8(%rax) +	movaps	%xmm10,-0x98(%rax) +	movaps	%xmm11,-0x88(%rax) +	movaps	%xmm12,-0x78(%rax) +	movaps	%xmm13,-0x68(%rax) +	movaps	%xmm14,-0x58(%rax) +	movaps	%xmm15,-0x48(%rax) +.Lgcm_dec_body: +___ +$code.=<<___; +	vzeroupper + +	vmovdqu		($ivp),$T1		# input counter value +	add		\$-128,%rsp +	mov		12($ivp),$counter +	lea		.Lbswap_mask(%rip),$const +	lea		-0x80($key),$in0	# borrow $in0 +	mov		\$0xf80,$end0		# borrow $end0 +	vmovdqu		($Xip),$Xi		# load Xi +	and		\$-128,%rsp		# ensure stack alignment +	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask +	lea		0x80($key),$key		# size optimization +	lea		0x20+0x20($Xip),$Xip	# size optimization +	mov		0xf0-0x80($key),$rounds +	vpshufb		$Ii,$Xi,$Xi + +	and		$end0,$in0 +	and		%rsp,$end0 +	sub		$in0,$end0 +	jc		.Ldec_no_key_aliasing +	cmp		\$768,$end0 +	jnc		.Ldec_no_key_aliasing +	sub		$end0,%rsp		# avoid aliasing with key +.Ldec_no_key_aliasing: + +	vmovdqu		0x50($inp),$Z3		# I[5] +	lea		($inp),$in0 +	vmovdqu		0x40($inp),$Z0 +	lea		-0xc0($inp,$len),$end0 +	vmovdqu		0x30($inp),$Z1 +	shr		\$4,$len +	xor		$ret,$ret +	vmovdqu		0x20($inp),$Z2 +	 vpshufb	$Ii,$Z3,$Z3		# passed to _aesni_ctr32_ghash_6x +	vmovdqu		0x10($inp),$T2 +	 vpshufb	$Ii,$Z0,$Z0 +	vmovdqu		($inp),$Hkey +	 vpshufb	$Ii,$Z1,$Z1 +	vmovdqu		$Z0,0x30(%rsp) +	 vpshufb	$Ii,$Z2,$Z2 +	vmovdqu		$Z1,0x40(%rsp) +	 vpshufb	$Ii,$T2,$T2 +	vmovdqu		$Z2,0x50(%rsp) +	 vpshufb	$Ii,$Hkey,$Hkey +	vmovdqu		$T2,0x60(%rsp) +	vmovdqu		$Hkey,0x70(%rsp) + +	call		_aesni_ctr32_ghash_6x + +	vmovups		$inout0,-0x60($out)	# save output +	vmovups		$inout1,-0x50($out) +	vmovups		$inout2,-0x40($out) +	vmovups		$inout3,-0x30($out) +	vmovups		$inout4,-0x20($out) +	vmovups		$inout5,-0x10($out) + +	vpshufb		($const),$Xi,$Xi	# 
.Lbswap_mask +	vmovdqu		$Xi,-0x40($Xip)		# output Xi + +	vzeroupper +___ +$code.=<<___ if ($win64); +	movaps	-0xd8(%rax),%xmm6 +	movaps	-0xc8(%rax),%xmm7		# own save slot: prologue stores %xmm7 at -0xc8, not -0xd8 +	movaps	-0xb8(%rax),%xmm8 +	movaps	-0xa8(%rax),%xmm9 +	movaps	-0x98(%rax),%xmm10 +	movaps	-0x88(%rax),%xmm11 +	movaps	-0x78(%rax),%xmm12 +	movaps	-0x68(%rax),%xmm13 +	movaps	-0x58(%rax),%xmm14 +	movaps	-0x48(%rax),%xmm15 +___ +$code.=<<___; +	mov	-48(%rax),%r15 +	mov	-40(%rax),%r14 +	mov	-32(%rax),%r13 +	mov	-24(%rax),%r12 +	mov	-16(%rax),%rbp +	mov	-8(%rax),%rbx +	lea	(%rax),%rsp		# restore %rsp +.Lgcm_dec_abort: +	mov	$ret,%rax		# return value +	ret +.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt +___ + +$code.=<<___; +.type	_aesni_ctr32_6x,\@abi-omnipotent +.align	32 +_aesni_ctr32_6x: +	vmovdqu		0x00-0x80($key),$Z0	# borrow $Z0 for $rndkey +	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb +	lea		-1($rounds),%r13 +	vmovups		0x10-0x80($key),$rndkey +	lea		0x20-0x80($key),%r12 +	vpxor		$Z0,$T1,$inout0 +	add		\$`6<<24`,$counter +	jc		.Lhandle_ctr32_2 +	vpaddb		$T2,$T1,$inout1 +	vpaddb		$T2,$inout1,$inout2 +	vpxor		$Z0,$inout1,$inout1 +	vpaddb		$T2,$inout2,$inout3 +	vpxor		$Z0,$inout2,$inout2 +	vpaddb		$T2,$inout3,$inout4 +	vpxor		$Z0,$inout3,$inout3 +	vpaddb		$T2,$inout4,$inout5 +	vpxor		$Z0,$inout4,$inout4 +	vpaddb		$T2,$inout5,$T1 +	vpxor		$Z0,$inout5,$inout5 +	jmp		.Loop_ctr32 + +.align	16 +.Loop_ctr32: +	vaesenc		$rndkey,$inout0,$inout0 +	vaesenc		$rndkey,$inout1,$inout1 +	vaesenc		$rndkey,$inout2,$inout2 +	vaesenc		$rndkey,$inout3,$inout3 +	vaesenc		$rndkey,$inout4,$inout4 +	vaesenc		$rndkey,$inout5,$inout5 +	vmovups		(%r12),$rndkey +	lea		0x10(%r12),%r12 +	dec		%r13d +	jnz		.Loop_ctr32 + +	vmovdqu		(%r12),$Hkey		# last round key +	vaesenc		$rndkey,$inout0,$inout0 +	vpxor		0x00($inp),$Hkey,$Z0 +	vaesenc		$rndkey,$inout1,$inout1 +	vpxor		0x10($inp),$Hkey,$Z1 +	vaesenc		$rndkey,$inout2,$inout2 +	vpxor		0x20($inp),$Hkey,$Z2 +	vaesenc		$rndkey,$inout3,$inout3 +	vpxor		0x30($inp),$Hkey,$Xi +	vaesenc		
$rndkey,$inout4,$inout4 +	vpxor		0x40($inp),$Hkey,$T2 +	vaesenc		$rndkey,$inout5,$inout5 +	vpxor		0x50($inp),$Hkey,$Hkey +	lea		0x60($inp),$inp + +	vaesenclast	$Z0,$inout0,$inout0 +	vaesenclast	$Z1,$inout1,$inout1 +	vaesenclast	$Z2,$inout2,$inout2 +	vaesenclast	$Xi,$inout3,$inout3 +	vaesenclast	$T2,$inout4,$inout4 +	vaesenclast	$Hkey,$inout5,$inout5 +	vmovups		$inout0,0x00($out) +	vmovups		$inout1,0x10($out) +	vmovups		$inout2,0x20($out) +	vmovups		$inout3,0x30($out) +	vmovups		$inout4,0x40($out) +	vmovups		$inout5,0x50($out) +	lea		0x60($out),$out + +	ret +.align	32 +.Lhandle_ctr32_2: +	vpshufb		$Ii,$T1,$Z2		# byte-swap counter +	vmovdqu		0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb +	vpaddd		0x40($const),$Z2,$inout1	# .Lone_lsb +	vpaddd		$Z1,$Z2,$inout2 +	vpaddd		$Z1,$inout1,$inout3 +	vpshufb		$Ii,$inout1,$inout1 +	vpaddd		$Z1,$inout2,$inout4 +	vpshufb		$Ii,$inout2,$inout2 +	vpxor		$Z0,$inout1,$inout1 +	vpaddd		$Z1,$inout3,$inout5 +	vpshufb		$Ii,$inout3,$inout3 +	vpxor		$Z0,$inout2,$inout2 +	vpaddd		$Z1,$inout4,$T1		# byte-swapped next counter value +	vpshufb		$Ii,$inout4,$inout4 +	vpxor		$Z0,$inout3,$inout3 +	vpshufb		$Ii,$inout5,$inout5 +	vpxor		$Z0,$inout4,$inout4 +	vpshufb		$Ii,$T1,$T1		# next counter value +	vpxor		$Z0,$inout5,$inout5 +	jmp	.Loop_ctr32 +.size	_aesni_ctr32_6x,.-_aesni_ctr32_6x + +.globl	aesni_gcm_encrypt +.type	aesni_gcm_encrypt,\@function,6 +.align	32 +aesni_gcm_encrypt: +	xor	$ret,$ret +	cmp	\$0x60*3,$len			# minimal accepted length +	jb	.Lgcm_enc_abort + +	lea	(%rsp),%rax			# save stack pointer +	push	%rbx +	push	%rbp +	push	%r12 +	push	%r13 +	push	%r14 +	push	%r15 +___ +$code.=<<___ if ($win64); +	lea	-0xa8(%rsp),%rsp +	movaps	%xmm6,-0xd8(%rax) +	movaps	%xmm7,-0xc8(%rax) +	movaps	%xmm8,-0xb8(%rax) +	movaps	%xmm9,-0xa8(%rax) +	movaps	%xmm10,-0x98(%rax) +	movaps	%xmm11,-0x88(%rax) +	movaps	%xmm12,-0x78(%rax) +	movaps	%xmm13,-0x68(%rax) +	movaps	%xmm14,-0x58(%rax) +	movaps	%xmm15,-0x48(%rax) +.Lgcm_enc_body: +___ +$code.=<<___; +	vzeroupper + +	
vmovdqu		($ivp),$T1		# input counter value +	add		\$-128,%rsp +	mov		12($ivp),$counter +	lea		.Lbswap_mask(%rip),$const +	lea		-0x80($key),$in0	# borrow $in0 +	mov		\$0xf80,$end0		# borrow $end0 +	lea		0x80($key),$key		# size optimization +	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask +	and		\$-128,%rsp		# ensure stack alignment +	mov		0xf0-0x80($key),$rounds + +	and		$end0,$in0 +	and		%rsp,$end0 +	sub		$in0,$end0 +	jc		.Lenc_no_key_aliasing +	cmp		\$768,$end0 +	jnc		.Lenc_no_key_aliasing +	sub		$end0,%rsp		# avoid aliasing with key +.Lenc_no_key_aliasing: + +	lea		($out),$in0 +	lea		-0xc0($out,$len),$end0 +	shr		\$4,$len + +	call		_aesni_ctr32_6x +	vpshufb		$Ii,$inout0,$Xi		# save bswapped output on stack +	vpshufb		$Ii,$inout1,$T2 +	vmovdqu		$Xi,0x70(%rsp) +	vpshufb		$Ii,$inout2,$Z0 +	vmovdqu		$T2,0x60(%rsp) +	vpshufb		$Ii,$inout3,$Z1 +	vmovdqu		$Z0,0x50(%rsp) +	vpshufb		$Ii,$inout4,$Z2 +	vmovdqu		$Z1,0x40(%rsp) +	vpshufb		$Ii,$inout5,$Z3		# passed to _aesni_ctr32_ghash_6x +	vmovdqu		$Z2,0x30(%rsp) + +	call		_aesni_ctr32_6x + +	vmovdqu		($Xip),$Xi		# load Xi +	lea		0x20+0x20($Xip),$Xip	# size optimization +	sub		\$12,$len +	mov		\$0x60*2,$ret +	vpshufb		$Ii,$Xi,$Xi + +	call		_aesni_ctr32_ghash_6x +	vmovdqu		0x20(%rsp),$Z3		# I[5] +	 vmovdqu	($const),$Ii		# borrow $Ii for .Lbswap_mask +	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1 +	vpunpckhqdq	$Z3,$Z3,$T1 +	vmovdqu		0x20-0x20($Xip),$rndkey	# borrow $rndkey for $HK +	 vmovups	$inout0,-0x60($out)	# save output +	 vpshufb	$Ii,$inout0,$inout0	# but keep bswapped copy +	vpxor		$Z3,$T1,$T1 +	 vmovups	$inout1,-0x50($out) +	 vpshufb	$Ii,$inout1,$inout1 +	 vmovups	$inout2,-0x40($out) +	 vpshufb	$Ii,$inout2,$inout2 +	 vmovups	$inout3,-0x30($out) +	 vpshufb	$Ii,$inout3,$inout3 +	 vmovups	$inout4,-0x20($out) +	 vpshufb	$Ii,$inout4,$inout4 +	 vmovups	$inout5,-0x10($out) +	 vpshufb	$Ii,$inout5,$inout5 +	 vmovdqu	$inout0,0x10(%rsp)	# free $inout0 +___ +{ my ($HK,$T3)=($rndkey,$inout0); + +$code.=<<___; +	 vmovdqu	
0x30(%rsp),$Z2		# I[4] +	 vmovdqu	0x10-0x20($Xip),$Ii	# borrow $Ii for $Hkey^2 +	 vpunpckhqdq	$Z2,$Z2,$T2 +	vpclmulqdq	\$0x00,$Hkey,$Z3,$Z1 +	 vpxor		$Z2,$T2,$T2 +	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3 +	vpclmulqdq	\$0x00,$HK,$T1,$T1 + +	 vmovdqu	0x40(%rsp),$T3		# I[3] +	vpclmulqdq	\$0x00,$Ii,$Z2,$Z0 +	 vmovdqu	0x30-0x20($Xip),$Hkey	# $Hkey^3 +	vpxor		$Z1,$Z0,$Z0 +	 vpunpckhqdq	$T3,$T3,$Z1 +	vpclmulqdq	\$0x11,$Ii,$Z2,$Z2 +	 vpxor		$T3,$Z1,$Z1 +	vpxor		$Z3,$Z2,$Z2 +	vpclmulqdq	\$0x10,$HK,$T2,$T2 +	 vmovdqu	0x50-0x20($Xip),$HK +	vpxor		$T1,$T2,$T2 + +	 vmovdqu	0x50(%rsp),$T1		# I[2] +	vpclmulqdq	\$0x00,$Hkey,$T3,$Z3 +	 vmovdqu	0x40-0x20($Xip),$Ii	# borrow $Ii for $Hkey^4 +	vpxor		$Z0,$Z3,$Z3 +	 vpunpckhqdq	$T1,$T1,$Z0 +	vpclmulqdq	\$0x11,$Hkey,$T3,$T3 +	 vpxor		$T1,$Z0,$Z0 +	vpxor		$Z2,$T3,$T3 +	vpclmulqdq	\$0x00,$HK,$Z1,$Z1 +	vpxor		$T2,$Z1,$Z1 + +	 vmovdqu	0x60(%rsp),$T2		# I[1] +	vpclmulqdq	\$0x00,$Ii,$T1,$Z2 +	 vmovdqu	0x60-0x20($Xip),$Hkey	# $Hkey^5 +	vpxor		$Z3,$Z2,$Z2 +	 vpunpckhqdq	$T2,$T2,$Z3 +	vpclmulqdq	\$0x11,$Ii,$T1,$T1 +	 vpxor		$T2,$Z3,$Z3 +	vpxor		$T3,$T1,$T1 +	vpclmulqdq	\$0x10,$HK,$Z0,$Z0 +	 vmovdqu	0x80-0x20($Xip),$HK +	vpxor		$Z1,$Z0,$Z0 + +	 vpxor		0x70(%rsp),$Xi,$Xi	# accumulate I[0] +	vpclmulqdq	\$0x00,$Hkey,$T2,$Z1 +	 vmovdqu	0x70-0x20($Xip),$Ii	# borrow $Ii for $Hkey^6 +	 vpunpckhqdq	$Xi,$Xi,$T3 +	vpxor		$Z2,$Z1,$Z1 +	vpclmulqdq	\$0x11,$Hkey,$T2,$T2 +	 vpxor		$Xi,$T3,$T3 +	vpxor		$T1,$T2,$T2 +	vpclmulqdq	\$0x00,$HK,$Z3,$Z3 +	vpxor		$Z0,$Z3,$Z0 + +	vpclmulqdq	\$0x00,$Ii,$Xi,$Z2 +	 vmovdqu	0x00-0x20($Xip),$Hkey	# $Hkey^1 +	 vpunpckhqdq	$inout5,$inout5,$T1 +	vpclmulqdq	\$0x11,$Ii,$Xi,$Xi +	 vpxor		$inout5,$T1,$T1 +	vpxor		$Z1,$Z2,$Z1 +	vpclmulqdq	\$0x10,$HK,$T3,$T3 +	 vmovdqu	0x20-0x20($Xip),$HK +	vpxor		$T2,$Xi,$Z3 +	vpxor		$Z0,$T3,$Z2 + +	 vmovdqu	0x10-0x20($Xip),$Ii	# borrow $Ii for $Hkey^2 +	  vpxor		$Z1,$Z3,$T3		# aggregated Karatsuba post-processing +	vpclmulqdq	\$0x00,$Hkey,$inout5,$Z0 +	  vpxor		$T3,$Z2,$Z2 +	 vpunpckhqdq	$inout4,$inout4,$T2 +	
vpclmulqdq	\$0x11,$Hkey,$inout5,$inout5 +	 vpxor		$inout4,$T2,$T2 +	  vpslldq	\$8,$Z2,$T3 +	vpclmulqdq	\$0x00,$HK,$T1,$T1 +	  vpxor		$T3,$Z1,$Xi +	  vpsrldq	\$8,$Z2,$Z2 +	  vpxor		$Z2,$Z3,$Z3 + +	vpclmulqdq	\$0x00,$Ii,$inout4,$Z1 +	 vmovdqu	0x30-0x20($Xip),$Hkey	# $Hkey^3 +	vpxor		$Z0,$Z1,$Z1 +	 vpunpckhqdq	$inout3,$inout3,$T3 +	vpclmulqdq	\$0x11,$Ii,$inout4,$inout4 +	 vpxor		$inout3,$T3,$T3 +	vpxor		$inout5,$inout4,$inout4 +	  vpalignr	\$8,$Xi,$Xi,$inout5	# 1st phase +	vpclmulqdq	\$0x10,$HK,$T2,$T2 +	 vmovdqu	0x50-0x20($Xip),$HK +	vpxor		$T1,$T2,$T2 + +	vpclmulqdq	\$0x00,$Hkey,$inout3,$Z0 +	 vmovdqu	0x40-0x20($Xip),$Ii	# borrow $Ii for $Hkey^4 +	vpxor		$Z1,$Z0,$Z0 +	 vpunpckhqdq	$inout2,$inout2,$T1 +	vpclmulqdq	\$0x11,$Hkey,$inout3,$inout3 +	 vpxor		$inout2,$T1,$T1 +	vpxor		$inout4,$inout3,$inout3 +	  vxorps	0x10(%rsp),$Z3,$Z3	# accumulate $inout0 +	vpclmulqdq	\$0x00,$HK,$T3,$T3 +	vpxor		$T2,$T3,$T3 + +	  vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi +	  vxorps	$inout5,$Xi,$Xi + +	vpclmulqdq	\$0x00,$Ii,$inout2,$Z1 +	 vmovdqu	0x60-0x20($Xip),$Hkey	# $Hkey^5 +	vpxor		$Z0,$Z1,$Z1 +	 vpunpckhqdq	$inout1,$inout1,$T2 +	vpclmulqdq	\$0x11,$Ii,$inout2,$inout2 +	 vpxor		$inout1,$T2,$T2 +	  vpalignr	\$8,$Xi,$Xi,$inout5	# 2nd phase +	vpxor		$inout3,$inout2,$inout2 +	vpclmulqdq	\$0x10,$HK,$T1,$T1 +	 vmovdqu	0x80-0x20($Xip),$HK +	vpxor		$T3,$T1,$T1 + +	  vxorps	$Z3,$inout5,$inout5 +	  vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi +	  vxorps	$inout5,$Xi,$Xi + +	vpclmulqdq	\$0x00,$Hkey,$inout1,$Z0 +	 vmovdqu	0x70-0x20($Xip),$Ii	# borrow $Ii for $Hkey^6 +	vpxor		$Z1,$Z0,$Z0 +	 vpunpckhqdq	$Xi,$Xi,$T3 +	vpclmulqdq	\$0x11,$Hkey,$inout1,$inout1 +	 vpxor		$Xi,$T3,$T3 +	vpxor		$inout2,$inout1,$inout1 +	vpclmulqdq	\$0x00,$HK,$T2,$T2 +	vpxor		$T1,$T2,$T2 + +	vpclmulqdq	\$0x00,$Ii,$Xi,$Z1 +	vpclmulqdq	\$0x11,$Ii,$Xi,$Z3 +	vpxor		$Z0,$Z1,$Z1 +	vpclmulqdq	\$0x10,$HK,$T3,$Z2 +	vpxor		$inout1,$Z3,$Z3 +	vpxor		$T2,$Z2,$Z2 + +	vpxor		$Z1,$Z3,$Z0		# aggregated Karatsuba post-processing +	vpxor		$Z0,$Z2,$Z2 +	
vpslldq		\$8,$Z2,$T1 +	vmovdqu		0x10($const),$Hkey	# .Lpoly +	vpsrldq		\$8,$Z2,$Z2 +	vpxor		$T1,$Z1,$Xi +	vpxor		$Z2,$Z3,$Z3 + +	vpalignr	\$8,$Xi,$Xi,$T2		# 1st phase +	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi +	vpxor		$T2,$Xi,$Xi + +	vpalignr	\$8,$Xi,$Xi,$T2		# 2nd phase +	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi +	vpxor		$Z3,$T2,$T2 +	vpxor		$T2,$Xi,$Xi +___ +} +$code.=<<___; +	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask +	vmovdqu		$Xi,-0x40($Xip)		# output Xi + +	vzeroupper +___ +$code.=<<___ if ($win64); +	movaps	-0xd8(%rax),%xmm6 +	movaps	-0xc8(%rax),%xmm7 +	movaps	-0xb8(%rax),%xmm8 +	movaps	-0xa8(%rax),%xmm9 +	movaps	-0x98(%rax),%xmm10 +	movaps	-0x88(%rax),%xmm11 +	movaps	-0x78(%rax),%xmm12 +	movaps	-0x68(%rax),%xmm13 +	movaps	-0x58(%rax),%xmm14 +	movaps	-0x48(%rax),%xmm15 +___ +$code.=<<___; +	mov	-48(%rax),%r15 +	mov	-40(%rax),%r14 +	mov	-32(%rax),%r13 +	mov	-24(%rax),%r12 +	mov	-16(%rax),%rbp +	mov	-8(%rax),%rbx +	lea	(%rax),%rsp		# restore %rsp +.Lgcm_enc_abort: +	mov	$ret,%rax		# return value +	ret +.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt +___ + +$code.=<<___; +.align	64 +.Lbswap_mask: +	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.Lpoly: +	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.Lone_msb: +	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +.Ltwo_lsb: +	.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.Lone_lsb: +	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.asciz	"AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>" +.align	64 +___ +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___ +.extern	__imp_RtlVirtualUnwind +.type	gcm_se_handler,\@abi-omnipotent +.align	16 +gcm_se_handler: +	push	%rsi +	push	%rdi +	push	%rbx +	push	%rbp +	push	%r12 +	push	%r13 +	push	%r14 +	push	%r15 +	pushfq +	sub	\$64,%rsp + +	mov	120($context),%rax	# pull context->Rax +	mov	248($context),%rbx	# pull context->Rip + +	mov	8($disp),%rsi		# disp->ImageBase +	mov	56($disp),%r11		# disp->HandlerData + +	mov	0(%r11),%r10d		# HandlerData[0] +	lea	(%rsi,%r10),%r10	# 
prologue label +	cmp	%r10,%rbx		# context->Rip<prologue label +	jb	.Lcommon_seh_tail + +	mov	152($context),%rax	# pull context->Rsp + +	mov	4(%r11),%r10d		# HandlerData[1] +	lea	(%rsi,%r10),%r10	# epilogue label +	cmp	%r10,%rbx		# context->Rip>=epilogue label +	jae	.Lcommon_seh_tail + +	mov	120($context),%rax	# pull context->Rax + +	mov	-48(%rax),%r15 +	mov	-40(%rax),%r14 +	mov	-32(%rax),%r13 +	mov	-24(%rax),%r12 +	mov	-16(%rax),%rbp +	mov	-8(%rax),%rbx +	mov	%r15,240($context) +	mov	%r14,232($context) +	mov	%r13,224($context) +	mov	%r12,216($context) +	mov	%rbp,160($context) +	mov	%rbx,144($context) + +	lea	-0xd8(%rax),%rsi	# %xmm save area +	lea	512($context),%rdi	# & context.Xmm6 +	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax) +	.long	0xa548f3fc		# cld; rep movsq + +.Lcommon_seh_tail: +	mov	8(%rax),%rdi +	mov	16(%rax),%rsi +	mov	%rax,152($context)	# restore context->Rsp +	mov	%rsi,168($context)	# restore context->Rsi +	mov	%rdi,176($context)	# restore context->Rdi + +	mov	40($disp),%rdi		# disp->ContextRecord +	mov	$context,%rsi		# context +	mov	\$154,%ecx		# sizeof(CONTEXT) +	.long	0xa548f3fc		# cld; rep movsq + +	mov	$disp,%rsi +	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER +	mov	8(%rsi),%rdx		# arg2, disp->ImageBase +	mov	0(%rsi),%r8		# arg3, disp->ControlPc +	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry +	mov	40(%rsi),%r10		# disp->ContextRecord +	lea	56(%rsi),%r11		# &disp->HandlerData +	lea	24(%rsi),%r12		# &disp->EstablisherFrame +	mov	%r10,32(%rsp)		# arg5 +	mov	%r11,40(%rsp)		# arg6 +	mov	%r12,48(%rsp)		# arg7 +	mov	%rcx,56(%rsp)		# arg8, (NULL) +	call	*__imp_RtlVirtualUnwind(%rip) + +	mov	\$1,%eax		# ExceptionContinueSearch +	add	\$64,%rsp +	popfq +	pop	%r15 +	pop	%r14 +	pop	%r13 +	pop	%r12 +	pop	%rbp +	pop	%rbx +	pop	%rdi +	pop	%rsi +	ret +.size	gcm_se_handler,.-gcm_se_handler + +.section	.pdata +.align	4 +	.rva	.LSEH_begin_aesni_gcm_decrypt +	.rva	.LSEH_end_aesni_gcm_decrypt +	.rva	.LSEH_gcm_dec_info + +	.rva	.LSEH_begin_aesni_gcm_encrypt +	.rva	
.LSEH_end_aesni_gcm_encrypt +	.rva	.LSEH_gcm_enc_info +.section	.xdata +.align	8 +.LSEH_gcm_dec_info: +	.byte	9,0,0,0 +	.rva	gcm_se_handler +	.rva	.Lgcm_dec_body,.Lgcm_dec_abort +.LSEH_gcm_enc_info: +	.byte	9,0,0,0 +	.rva	gcm_se_handler +	.rva	.Lgcm_enc_body,.Lgcm_enc_abort +___ +} +}}} else {{{ +$code=<<___;	# assembler is too old +.text + +.globl	aesni_gcm_encrypt +.type	aesni_gcm_encrypt,\@abi-omnipotent +aesni_gcm_encrypt: +	xor	%eax,%eax +	ret +.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt + +.globl	aesni_gcm_decrypt +.type	aesni_gcm_decrypt,\@abi-omnipotent +aesni_gcm_decrypt: +	xor	%eax,%eax +	ret +.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt +___ +}}} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +print $code; + +close STDOUT; diff --git a/openssl/crypto/modes/asm/ghash-armv4.pl b/openssl/crypto/modes/asm/ghash-armv4.pl index d91586ee2..77fbf3446 100644 --- a/openssl/crypto/modes/asm/ghash-armv4.pl +++ b/openssl/crypto/modes/asm/ghash-armv4.pl @@ -35,6 +35,20 @@  # Add NEON implementation featuring polynomial multiplication, i.e. no  # lookup tables involved. On Cortex A8 it was measured to process one  # byte in 15 cycles or 55% faster than integer-only code. +# +# April 2014 +# +# Switch to multiplication algorithm suggested in paper referred +# below and combine it with reduction algorithm from x86 module. +# Performance improvement over previous version varies from 65% on +# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8 +# processes one byte in 8.45 cycles, A9 - in 10.2, Snapdragon S4 - +# in 9.33. +# +# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software +# Polynomial Multiplication on ARM Processors using the NEON Engine. +#  +# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf  # ====================================================================  # Note about "528B" variant. 
In ARM case it makes lesser sense to @@ -303,117 +317,161 @@ $code.=<<___;  .size	gcm_gmult_4bit,.-gcm_gmult_4bit  ___  { -my $cnt=$Htbl;	# $Htbl is used once in the very beginning - -my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7)); -my ($Qhi, $Qlo, $Z,  $R, $zero, $Qpost, $IN) = map("q$_",(8..15)); - -# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit -# in Zo. Or should I say "top bit", because GHASH is specified in -# reverse bit order? Otherwise straightforward 128-bt H by one input -# byte multiplication and modulo-reduction, times 16. +my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); +my ($t0,$t1,$t2,$t3)=map("q$_",(8..12)); +my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31)); -sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     } -sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   } -sub Q()     { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; } +sub clmul64x64 { +my ($r,$a,$b)=@_; +$code.=<<___; +	vext.8		$t0#lo, $a, $a, #1	@ A1 +	vmull.p8	$t0, $t0#lo, $b		@ F = A1*B +	vext.8		$r#lo, $b, $b, #1	@ B1 +	vmull.p8	$r, $a, $r#lo		@ E = A*B1 +	vext.8		$t1#lo, $a, $a, #2	@ A2 +	vmull.p8	$t1, $t1#lo, $b		@ H = A2*B +	vext.8		$t3#lo, $b, $b, #2	@ B2 +	vmull.p8	$t3, $a, $t3#lo		@ G = A*B2 +	vext.8		$t2#lo, $a, $a, #3	@ A3 +	veor		$t0, $t0, $r		@ L = E + F +	vmull.p8	$t2, $t2#lo, $b		@ J = A3*B +	vext.8		$r#lo, $b, $b, #3	@ B3 +	veor		$t1, $t1, $t3		@ M = G + H +	vmull.p8	$r, $a, $r#lo		@ I = A*B3 +	veor		$t0#lo, $t0#lo, $t0#hi	@ t0 = (L) (P0 + P1) << 8 +	vand		$t0#hi, $t0#hi, $k48 +	vext.8		$t3#lo, $b, $b, #4	@ B4 +	veor		$t1#lo, $t1#lo, $t1#hi	@ t1 = (M) (P2 + P3) << 16 +	vand		$t1#hi, $t1#hi, $k32 +	vmull.p8	$t3, $a, $t3#lo		@ K = A*B4 +	veor		$t2, $t2, $r		@ N = I + J +	veor		$t0#lo, $t0#lo, $t0#hi +	veor		$t1#lo, $t1#lo, $t1#hi +	veor		$t2#lo, $t2#lo, $t2#hi	@ t2 = (N) (P4 + P5) << 24 +	vand		$t2#hi, $t2#hi, $k16 +	vext.8		$t0, $t0, $t0, #15 +	veor		$t3#lo, $t3#lo, $t3#hi	@ t3 = (K) (P6 + P7) << 32 +	vmov.i64	$t3#hi, #0 +	vext.8		$t1, 
$t1, $t1, #14 +	veor		$t2#lo, $t2#lo, $t2#hi +	vmull.p8	$r, $a, $b		@ D = A*B +	vext.8		$t3, $t3, $t3, #12 +	vext.8		$t2, $t2, $t2, #13 +	veor		$t0, $t0, $t1 +	veor		$t2, $t2, $t3 +	veor		$r, $r, $t0 +	veor		$r, $r, $t2 +___ +}  $code.=<<___; -#if __ARM_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 +.arch	armv7-a  .fpu	neon +.global	gcm_init_neon +.type	gcm_init_neon,%function +.align	4 +gcm_init_neon: +	vld1.64		$IN#hi,[r1,:64]!	@ load H +	vmov.i8		$t0,#0xe1 +	vld1.64		$IN#lo,[r1,:64] +	vshl.i64	$t0#hi,#57 +	vshr.u64	$t0#lo,#63		@ t0=0xc2....01 +	vdup.8		$t1,$IN#hi[7] +	vshr.u64	$Hlo,$IN#lo,#63 +	vshr.s8		$t1,#7			@ broadcast carry bit +	vshl.i64	$IN,$IN,#1 +	vand		$t0,$t0,$t1 +	vorr		$IN#hi,$Hlo		@ H<<<=1 +	veor		$IN,$IN,$t0		@ twisted H +	vstmia		r0,{$IN} + +	ret					@ bx lr +.size	gcm_init_neon,.-gcm_init_neon +  .global	gcm_gmult_neon  .type	gcm_gmult_neon,%function  .align	4  gcm_gmult_neon: -	sub		$Htbl,#16		@ point at H in GCM128_CTX -	vld1.64		`&Dhi("$IN")`,[$Xi,:64]!@ load Xi -	vmov.i32	$mod,#0xe1		@ our irreducible polynomial -	vld1.64		`&Dlo("$IN")`,[$Xi,:64]! -	vshr.u64	$mod,#32 -	vldmia		$Htbl,{$Hhi-$Hlo}	@ load H -	veor		$zero,$zero +	vld1.64		$IN#hi,[$Xi,:64]!	@ load Xi +	vld1.64		$IN#lo,[$Xi,:64]! +	vmov.i64	$k48,#0x0000ffffffffffff +	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H +	vmov.i64	$k32,#0x00000000ffffffff  #ifdef __ARMEL__  	vrev64.8	$IN,$IN  #endif -	veor		$Qpost,$Qpost -	veor		$R,$R -	mov		$cnt,#16 -	veor		$Z,$Z +	vmov.i64	$k16,#0x000000000000ffff +	veor		$Hhl,$Hlo,$Hhi		@ Karatsuba pre-processing  	mov		$len,#16 -	veor		$Zo,$Zo -	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte -	b		.Linner_neon +	b		.Lgmult_neon  .size	gcm_gmult_neon,.-gcm_gmult_neon  .global	gcm_ghash_neon  .type	gcm_ghash_neon,%function  .align	4  gcm_ghash_neon: -	vld1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ load Xi -	vmov.i32	$mod,#0xe1		@ our irreducible polynomial -	vld1.64		`&Dlo("$Z")`,[$Xi,:64]! 
-	vshr.u64	$mod,#32 -	vldmia		$Xi,{$Hhi-$Hlo}		@ load H -	veor		$zero,$zero -	nop +	vld1.64		$Xl#hi,[$Xi,:64]!	@ load Xi +	vld1.64		$Xl#lo,[$Xi,:64]! +	vmov.i64	$k48,#0x0000ffffffffffff +	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H +	vmov.i64	$k32,#0x00000000ffffffff  #ifdef __ARMEL__ -	vrev64.8	$Z,$Z +	vrev64.8	$Xl,$Xl  #endif -.Louter_neon: -	vld1.64		`&Dhi($IN)`,[$inp]!	@ load inp -	veor		$Qpost,$Qpost -	vld1.64		`&Dlo($IN)`,[$inp]! -	veor		$R,$R -	mov		$cnt,#16 +	vmov.i64	$k16,#0x000000000000ffff +	veor		$Hhl,$Hlo,$Hhi		@ Karatsuba pre-processing + +.Loop_neon: +	vld1.64		$IN#hi,[$inp]!		@ load inp +	vld1.64		$IN#lo,[$inp]!  #ifdef __ARMEL__  	vrev64.8	$IN,$IN  #endif -	veor		$Zo,$Zo -	veor		$IN,$Z			@ inp^=Xi -	veor		$Z,$Z -	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte -.Linner_neon: -	subs		$cnt,$cnt,#1 -	vmull.p8	$Qlo,$Hlo,$xi		@ H.lo·Xi[i] -	vmull.p8	$Qhi,$Hhi,$xi		@ H.hi·Xi[i] -	vext.8		$IN,$zero,#1		@ IN>>=8 - -	veor		$Z,$Qpost		@ modulo-scheduled part -	vshl.i64	`&Dlo("$R")`,#48 -	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte -	veor		$T,`&Dlo("$Qlo")`,`&Dlo("$Z")` - -	veor		`&Dhi("$Z")`,`&Dlo("$R")` -	vuzp.8		$Qlo,$Qhi -	vsli.8		$Zo,$T,#1		@ compose the "carry" byte -	vext.8		$Z,$zero,#1		@ Z>>=8 - -	vmull.p8	$R,$Zo,$mod		@ "carry"·0xe1 -	vshr.u8		$Zo,$T,#7		@ save Z's bottom bit -	vext.8		$Qpost,$Qlo,$zero,#1	@ Qlo>>=8 -	veor		$Z,$Qhi -	bne		.Linner_neon - -	veor		$Z,$Qpost		@ modulo-scheduled artefact -	vshl.i64	`&Dlo("$R")`,#48 -	veor		`&Dhi("$Z")`,`&Dlo("$R")` - -	@ finalization, normalize Z:Zo -	vand		$Zo,$mod		@ suffices to mask the bit -	vshr.u64	`&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63 -	vshl.i64	$Z,#1 +	veor		$IN,$Xl			@ inp^=Xi +.Lgmult_neon: +___ +	&clmul64x64	($Xl,$Hlo,"$IN#lo");	# H.lo·Xi.lo +$code.=<<___; +	veor		$IN#lo,$IN#lo,$IN#hi	@ Karatsuba pre-processing +___ +	&clmul64x64	($Xm,$Hhl,"$IN#lo");	# (H.lo+H.hi)·(Xi.lo+Xi.hi) +	&clmul64x64	($Xh,$Hhi,"$IN#hi");	# H.hi·Xi.hi +$code.=<<___; +	veor		$Xm,$Xm,$Xl		@ Karatsuba 
post-processing +	veor		$Xm,$Xm,$Xh +	veor		$Xl#hi,$Xl#hi,$Xm#lo +	veor		$Xh#lo,$Xh#lo,$Xm#hi	@ Xh|Xl - 256-bit result + +	@ equivalent of reduction_avx from ghash-x86_64.pl +	vshl.i64	$t1,$Xl,#57		@ 1st phase +	vshl.i64	$t2,$Xl,#62 +	veor		$t2,$t2,$t1		@ +	vshl.i64	$t1,$Xl,#63 +	veor		$t2, $t2, $t1		@ + 	veor		$Xl#hi,$Xl#hi,$t2#lo	@ +	veor		$Xh#lo,$Xh#lo,$t2#hi + +	vshr.u64	$t2,$Xl,#1		@ 2nd phase +	veor		$Xh,$Xh,$Xl +	veor		$Xl,$Xl,$t2		@ +	vshr.u64	$t2,$t2,#6 +	vshr.u64	$Xl,$Xl,#1		@ +	veor		$Xl,$Xl,$Xh		@ +	veor		$Xl,$Xl,$t2		@ +  	subs		$len,#16 -	vorr		$Z,`&Q("$Zo")`		@ Z=Z:Zo<<1 -	bne		.Louter_neon +	bne		.Loop_neon  #ifdef __ARMEL__ -	vrev64.8	$Z,$Z +	vrev64.8	$Xl,$Xl  #endif  	sub		$Xi,#16	 -	vst1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ write out Xi -	vst1.64		`&Dlo("$Z")`,[$Xi,:64] +	vst1.64		$Xl#hi,[$Xi,:64]!	@ write out Xi +	vst1.64		$Xl#lo,[$Xi,:64] -	bx	lr +	ret					@ bx lr  .size	gcm_ghash_neon,.-gcm_ghash_neon  #endif  ___ @@ -423,7 +481,13 @@ $code.=<<___;  .align  2  ___ -$code =~ s/\`([^\`]*)\`/eval $1/gem; -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4 -print $code; +foreach (split("\n",$code)) { +	s/\`([^\`]*)\`/eval $1/geo; + +	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or +	s/\bret\b/bx	lr/go		or +	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;    # make it possible to compile with -march=armv4 + +	print $_,"\n"; +}  close STDOUT; # enforce flush diff --git a/openssl/crypto/modes/asm/ghash-s390x.pl b/openssl/crypto/modes/asm/ghash-s390x.pl index 6a40d5d89..39096b423 100644 --- a/openssl/crypto/modes/asm/ghash-s390x.pl +++ b/openssl/crypto/modes/asm/ghash-s390x.pl @@ -186,13 +186,13 @@ $code.=<<___;  	sllg	$rem1,$Zlo,3  	xgr	$Zlo,$tmp  	ngr	$rem1,$x78 +	sllg	$tmp,$Zhi,60  	j	.Lghash_inner  .align	16  .Lghash_inner:  	srlg	$Zlo,$Zlo,4 -	sllg	$tmp,$Zhi,60 -	xg	$Zlo,8($nlo,$Htbl)  	srlg	$Zhi,$Zhi,4 +	xg	$Zlo,8($nlo,$Htbl)  	llgc	$xi,0($cnt,$Xi)  	xg	$Zhi,0($nlo,$Htbl)  	sllg	$nlo,$xi,4 @@ -213,9 +213,9 @@ 
$code.=<<___;  	sllg	$rem1,$Zlo,3  	xgr	$Zlo,$tmp  	ngr	$rem1,$x78 +	sllg	$tmp,$Zhi,60  	brct	$cnt,.Lghash_inner -	sllg	$tmp,$Zhi,60  	srlg	$Zlo,$Zlo,4  	srlg	$Zhi,$Zhi,4  	xg	$Zlo,8($nlo,$Htbl) diff --git a/openssl/crypto/modes/asm/ghash-sparcv9.pl b/openssl/crypto/modes/asm/ghash-sparcv9.pl index 70e7b044a..0365e0f1f 100644 --- a/openssl/crypto/modes/asm/ghash-sparcv9.pl +++ b/openssl/crypto/modes/asm/ghash-sparcv9.pl @@ -36,6 +36,15 @@  # references to input data and Z.hi updates to achieve 12 cycles  # timing. To anchor to something else, sha1-sparcv9.pl spends 11.6  # cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1. +# +# October 2012 +# +# Add VIS3 lookup-table-free implementation using polynomial +# multiplication xmulx[hi] and extended addition addxc[cc] +# instructions. 4.52/7.63x improvement on T3/T4 or in absolute +# terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark +# saturates at ~15.5x single-process result on 8-core processor, +# or ~20.5GBps per 2.85GHz socket.  $bits=32;  for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } @@ -66,6 +75,10 @@ $Htbl="%i1";  $inp="%i2";  $len="%i3"; +$code.=<<___ if ($bits==64); +.register	%g2,#scratch +.register	%g3,#scratch +___  $code.=<<___;  .section	".text",#alloc,#execinstr @@ -321,10 +334,238 @@ gcm_gmult_4bit:  	restore  .type	gcm_gmult_4bit,#function  .size	gcm_gmult_4bit,(.-gcm_gmult_4bit) -.asciz	"GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" +___ + +{{{ +# Straightforward 128x128-bit multiplication using Karatsuba algorithm +# followed by pair of 64-bit reductions [with a shortcut in first one, +# which allowed to break dependency between reductions and remove one +# multiplication from critical path]. While it might be suboptimal +# with regard to sheer number of multiplications, other methods [such +# as aggregate reduction] would require more 64-bit registers, which +# we don't have in 32-bit application context. 
+ +($Xip,$Htable,$inp,$len)=map("%i$_",(0..3)); + +($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)= +	(map("%o$_",(0..5,7)),map("%g$_",(1..5))); + +($shl,$shr)=map("%l$_",(0..7)); + +# For details regarding "twisted H" see ghash-x86.pl. +$code.=<<___; +.globl	gcm_init_vis3 +.align	32 +gcm_init_vis3: +	save	%sp,-$frame,%sp + +	ldx	[%i1+0],$Hhi +	ldx	[%i1+8],$Hlo +	mov	0xE1,$Xhi +	mov	1,$Xlo +	sllx	$Xhi,57,$Xhi +	srax	$Hhi,63,$C0		! broadcast carry +	addcc	$Hlo,$Hlo,$Hlo		! H<<=1 +	addxc	$Hhi,$Hhi,$Hhi +	and	$C0,$Xlo,$Xlo +	and	$C0,$Xhi,$Xhi +	xor	$Xlo,$Hlo,$Hlo +	xor	$Xhi,$Hhi,$Hhi +	stx	$Hlo,[%i0+8]		! save twisted H +	stx	$Hhi,[%i0+0] + +	sethi	%hi(0xA0406080),$V +	sethi	%hi(0x20C0E000),%l0 +	or	$V,%lo(0xA0406080),$V +	or	%l0,%lo(0x20C0E000),%l0 +	sllx	$V,32,$V +	or	%l0,$V,$V		! (0xE0·i)&0xff=0xA040608020C0E000 +	stx	$V,[%i0+16] + +	ret +	restore +.type	gcm_init_vis3,#function +.size	gcm_init_vis3,.-gcm_init_vis3 + +.globl	gcm_gmult_vis3 +.align	32 +gcm_gmult_vis3: +	save	%sp,-$frame,%sp + +	ldx	[$Xip+8],$Xlo		! load Xi +	ldx	[$Xip+0],$Xhi +	ldx	[$Htable+8],$Hlo	! load twisted H +	ldx	[$Htable+0],$Hhi + +	mov	0xE1,%l7 +	sllx	%l7,57,$xE1		! 57 is not a typo +	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000 + +	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing +	xmulx	$Xlo,$Hlo,$C0 +	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing +	xmulx	$C2,$Hhl,$C1 +	xmulxhi	$Xlo,$Hlo,$Xlo +	xmulxhi	$C2,$Hhl,$C2 +	xmulxhi	$Xhi,$Hhi,$C3 +	xmulx	$Xhi,$Hhi,$Xhi + +	sll	$C0,3,$sqr +	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)] +	xor	$C0,$sqr,$sqr +	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f] + +	xor	$C0,$C1,$C1		! Karatsuba post-processing +	xor	$Xlo,$C2,$C2 +	 xor	$sqr,$Xlo,$Xlo		! real destination is $C1 +	xor	$C3,$C2,$C2 +	xor	$Xlo,$C1,$C1 +	xor	$Xhi,$C2,$C2 +	xor	$Xhi,$C1,$C1 + +	xmulxhi	$C0,$xE1,$Xlo		! 
·0xE1<<1<<56 +	 xor	$C0,$C2,$C2 +	xmulx	$C1,$xE1,$C0 +	 xor	$C1,$C3,$C3 +	xmulxhi	$C1,$xE1,$C1 + +	xor	$Xlo,$C2,$C2 +	xor	$C0,$C2,$C2 +	xor	$C1,$C3,$C3 + +	stx	$C2,[$Xip+8]		! save Xi +	stx	$C3,[$Xip+0] + +	ret +	restore +.type	gcm_gmult_vis3,#function +.size	gcm_gmult_vis3,.-gcm_gmult_vis3 + +.globl	gcm_ghash_vis3 +.align	32 +gcm_ghash_vis3: +	save	%sp,-$frame,%sp + +	ldx	[$Xip+8],$C2		! load Xi +	ldx	[$Xip+0],$C3 +	ldx	[$Htable+8],$Hlo	! load twisted H +	ldx	[$Htable+0],$Hhi + +	mov	0xE1,%l7 +	sllx	%l7,57,$xE1		! 57 is not a typo +	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000 + +	and	$inp,7,$shl +	andn	$inp,7,$inp +	sll	$shl,3,$shl +	prefetch [$inp+63], 20 +	sub	%g0,$shl,$shr + +	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing +.Loop: +	ldx	[$inp+8],$Xlo +	brz,pt	$shl,1f +	ldx	[$inp+0],$Xhi + +	ldx	[$inp+16],$C1		! align data +	srlx	$Xlo,$shr,$C0 +	sllx	$Xlo,$shl,$Xlo +	sllx	$Xhi,$shl,$Xhi +	srlx	$C1,$shr,$C1 +	or	$C0,$Xhi,$Xhi +	or	$C1,$Xlo,$Xlo +1: +	add	$inp,16,$inp +	sub	$len,16,$len +	xor	$C2,$Xlo,$Xlo +	xor	$C3,$Xhi,$Xhi +	prefetch [$inp+63], 20 + +	xmulx	$Xlo,$Hlo,$C0 +	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing +	xmulx	$C2,$Hhl,$C1 +	xmulxhi	$Xlo,$Hlo,$Xlo +	xmulxhi	$C2,$Hhl,$C2 +	xmulxhi	$Xhi,$Hhi,$C3 +	xmulx	$Xhi,$Hhi,$Xhi + +	sll	$C0,3,$sqr +	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)] +	xor	$C0,$sqr,$sqr +	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f] + +	xor	$C0,$C1,$C1		! Karatsuba post-processing +	xor	$Xlo,$C2,$C2 +	 xor	$sqr,$Xlo,$Xlo		! real destination is $C1 +	xor	$C3,$C2,$C2 +	xor	$Xlo,$C1,$C1 +	xor	$Xhi,$C2,$C2 +	xor	$Xhi,$C1,$C1 + +	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56 +	 xor	$C0,$C2,$C2 +	xmulx	$C1,$xE1,$C0 +	 xor	$C1,$C3,$C3 +	xmulxhi	$C1,$xE1,$C1 + +	xor	$Xlo,$C2,$C2 +	xor	$C0,$C2,$C2 +	brnz,pt	$len,.Loop +	xor	$C1,$C3,$C3 + +	stx	$C2,[$Xip+8]		! 
save Xi +	stx	$C3,[$Xip+0] + +	ret +	restore +.type	gcm_ghash_vis3,#function +.size	gcm_ghash_vis3,.-gcm_ghash_vis3 +___ +}}} +$code.=<<___; +.asciz	"GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"  .align	4  ___ -$code =~ s/\`([^\`]*)\`/eval $1/gem; -print $code; + +# Purpose of these subroutines is to explicitly encode VIS instructions, +# so that one can compile the module without having to specify VIS +# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. +# Idea is to reserve for option to produce "universal" binary and let +# programmer detect if current CPU is VIS capable at run-time. +sub unvis3 { +my ($mnemonic,$rs1,$rs2,$rd)=@_; +my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 ); +my ($ref,$opf); +my %visopf = (	"addxc"		=> 0x011, +		"addxccc"	=> 0x013, +		"xmulx"		=> 0x115, +		"xmulxhi"	=> 0x116	); + +    $ref = "$mnemonic\t$rs1,$rs2,$rd"; + +    if ($opf=$visopf{$mnemonic}) { +	foreach ($rs1,$rs2,$rd) { +	    return $ref if (!/%([goli])([0-9])/); +	    $_=$bias{$1}+$2; +	} + +	return	sprintf ".word\t0x%08x !%s", +			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2, +			$ref; +    } else { +	return $ref; +    } +} + +foreach (split("\n",$code)) { +	s/\`([^\`]*)\`/eval $1/ge; + +	s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/ +		&unvis3($1,$2,$3,$4) +	 /ge; + +	print $_,"\n"; +} +  close STDOUT; diff --git a/openssl/crypto/modes/asm/ghash-x86.pl b/openssl/crypto/modes/asm/ghash-x86.pl index 83c727e07..23a5527b3 100644 --- a/openssl/crypto/modes/asm/ghash-x86.pl +++ b/openssl/crypto/modes/asm/ghash-x86.pl @@ -12,25 +12,27 @@  # The module implements "4-bit" GCM GHASH function and underlying  # single multiplication operation in GF(2^128). "4-bit" means that it  # uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two -# code paths: vanilla x86 and vanilla MMX. Former will be executed on -# 486 and Pentium, latter on all others. 
MMX GHASH features so called +# code paths: vanilla x86 and vanilla SSE. Former will be executed on +# 486 and Pentium, latter on all others. SSE GHASH features so called  # "528B" variant of "4-bit" method utilizing additional 256+16 bytes  # of per-key storage [+512 bytes shared table]. Performance results  # are for streamed GHASH subroutine and are expressed in cycles per  # processed byte, less is better:  # -#		gcc 2.95.3(*)	MMX assembler	x86 assembler +#		gcc 2.95.3(*)	SSE assembler	x86 assembler  #  # Pentium	105/111(**)	-		50  # PIII		68 /75		12.2		24  # P4		125/125		17.8		84(***)  # Opteron	66 /70		10.1		30  # Core2		54 /67		8.4		18 +# Atom		105/105		16.8		53 +# VIA Nano	69 /71		13.0		27  #  # (*)	gcc 3.4.x was observed to generate few percent slower code,  #	which is one of reasons why 2.95.3 results were chosen,  #	another reason is lack of 3.4.x results for older CPUs; -#	comparison with MMX results is not completely fair, because C +#	comparison with SSE results is not completely fair, because C  #	results are for vanilla "256B" implementation, while  #	assembler results are for "528B";-)  # (**)	second number is result for code compiled with -fPIC flag, @@ -40,8 +42,8 @@  #  # To summarize, it's >2-5 times faster than gcc-generated code. To  # anchor it to something else SHA1 assembler processes one byte in -# 11-13 cycles on contemporary x86 cores. As for choice of MMX in -# particular, see comment at the end of the file... +# ~7 cycles on contemporary x86 cores. As for choice of MMX/SSE +# in particular, see comment at the end of the file...  # May 2010  # @@ -113,6 +115,16 @@  # similar manner resulted in almost 20% degradation on Sandy Bridge,  # where original 64-bit code processes one byte in 1.95 cycles. +##################################################################### +# For reference, AMD Bulldozer processes one byte in 1.98 cycles in +# 32-bit mode and 1.89 in 64-bit. 
+ +# February 2013 +# +# Overhaul: aggregate Karatsuba post-processing, improve ILP in +# reduction_alg9. Resulting performance is 1.96 cycles per byte on +# Westmere, 1.95 - on Sandy/Ivy Bridge, 1.76 - on Bulldozer. +  $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;  push(@INC,"${dir}","${dir}../../perlasm");  require "x86asm.pl"; @@ -822,17 +834,18 @@ $len="ebx";  &static_label("bswap");  sub clmul64x64_T2 {	# minimal "register" pressure -my ($Xhi,$Xi,$Hkey)=@_; +my ($Xhi,$Xi,$Hkey,$HK)=@_;  	&movdqa		($Xhi,$Xi);		#  	&pshufd		($T1,$Xi,0b01001110); -	&pshufd		($T2,$Hkey,0b01001110); +	&pshufd		($T2,$Hkey,0b01001110)	if (!defined($HK));  	&pxor		($T1,$Xi);		# -	&pxor		($T2,$Hkey); +	&pxor		($T2,$Hkey)		if (!defined($HK)); +			$HK=$T2			if (!defined($HK));  	&pclmulqdq	($Xi,$Hkey,0x00);	#######  	&pclmulqdq	($Xhi,$Hkey,0x11);	####### -	&pclmulqdq	($T1,$T2,0x00);		####### +	&pclmulqdq	($T1,$HK,0x00);		#######  	&xorps		($T1,$Xi);		#  	&xorps		($T1,$Xhi);		# @@ -879,31 +892,32 @@ if (1) {		# Algorithm 9 with <<1 twist.  			# below. Algorithm 9 was therefore chosen for  			# further optimization... 
-sub reduction_alg9 {	# 17/13 times faster than Intel version +sub reduction_alg9 {	# 17/11 times faster than Intel version  my ($Xhi,$Xi) = @_;  	# 1st phase -	&movdqa		($T1,$Xi);		# +	&movdqa		($T2,$Xi);		# +	&movdqa		($T1,$Xi); +	&psllq		($Xi,5); +	&pxor		($T1,$Xi);		#  	&psllq		($Xi,1);  	&pxor		($Xi,$T1);		# -	&psllq		($Xi,5);		# -	&pxor		($Xi,$T1);		#  	&psllq		($Xi,57);		# -	&movdqa		($T2,$Xi);		# +	&movdqa		($T1,$Xi);		#  	&pslldq		($Xi,8); -	&psrldq		($T2,8);		# -	&pxor		($Xi,$T1); -	&pxor		($Xhi,$T2);		# +	&psrldq		($T1,8);		#	 +	&pxor		($Xi,$T2); +	&pxor		($Xhi,$T1);		#  	# 2nd phase  	&movdqa		($T2,$Xi); +	&psrlq		($Xi,1); +	&pxor		($Xhi,$T2);		# +	&pxor		($T2,$Xi);  	&psrlq		($Xi,5);  	&pxor		($Xi,$T2);		#  	&psrlq		($Xi,1);		# -	&pxor		($Xi,$T2);		# -	&pxor		($T2,$Xhi); -	&psrlq		($Xi,1);		# -	&pxor		($Xi,$T2);		# +	&pxor		($Xi,$Xhi)		#  }  &function_begin_B("gcm_init_clmul"); @@ -937,8 +951,14 @@ my ($Xhi,$Xi) = @_;  	&clmul64x64_T2	($Xhi,$Xi,$Hkey);  	&reduction_alg9	($Xhi,$Xi); +	&pshufd		($T1,$Hkey,0b01001110); +	&pshufd		($T2,$Xi,0b01001110); +	&pxor		($T1,$Hkey);		# Karatsuba pre-processing  	&movdqu		(&QWP(0,$Htbl),$Hkey);	# save H +	&pxor		($T2,$Xi);		# Karatsuba pre-processing  	&movdqu		(&QWP(16,$Htbl),$Xi);	# save H^2 +	&palignr	($T2,$T1,8);		# low part is H.lo^H.hi +	&movdqu		(&QWP(32,$Htbl),$T2);	# save Karatsuba "salt"  	&ret		();  &function_end_B("gcm_init_clmul"); @@ -956,8 +976,9 @@ my ($Xhi,$Xi) = @_;  	&movdqa		($T3,&QWP(0,$const));  	&movups		($Hkey,&QWP(0,$Htbl));  	&pshufb		($Xi,$T3); +	&movups		($T2,&QWP(32,$Htbl)); -	&clmul64x64_T2	($Xhi,$Xi,$Hkey); +	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$T2);  	&reduction_alg9	($Xhi,$Xi);  	&pshufb		($Xi,$T3); @@ -994,79 +1015,109 @@ my ($Xhi,$Xi) = @_;  	&movdqu		($Xn,&QWP(16,$inp));	# Ii+1  	&pshufb		($T1,$T3);  	&pshufb		($Xn,$T3); +	&movdqu		($T3,&QWP(32,$Htbl));  	&pxor		($Xi,$T1);		# Ii+Xi -	&clmul64x64_T2	($Xhn,$Xn,$Hkey);	# H*Ii+1 +	&pshufd		($T1,$Xn,0b01001110);	# H*Ii+1 +	&movdqa		
($Xhn,$Xn); +	&pxor		($T1,$Xn);		# +	&lea		($inp,&DWP(32,$inp));	# i+=2 + +	&pclmulqdq	($Xn,$Hkey,0x00);	####### +	&pclmulqdq	($Xhn,$Hkey,0x11);	####### +	&pclmulqdq	($T1,$T3,0x00);		#######  	&movups		($Hkey,&QWP(16,$Htbl));	# load H^2 +	&nop		(); -	&lea		($inp,&DWP(32,$inp));	# i+=2  	&sub		($len,0x20);  	&jbe		(&label("even_tail")); +	&jmp		(&label("mod_loop")); -&set_label("mod_loop"); -	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H^2*(Ii+Xi) -	&movdqu		($T1,&QWP(0,$inp));	# Ii -	&movups		($Hkey,&QWP(0,$Htbl));	# load H +&set_label("mod_loop",32); +	&pshufd		($T2,$Xi,0b01001110);	# H^2*(Ii+Xi) +	&movdqa		($Xhi,$Xi); +	&pxor		($T2,$Xi);		# +	&nop		(); -	&pxor		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi) -	&pxor		($Xhi,$Xhn); +	&pclmulqdq	($Xi,$Hkey,0x00);	####### +	&pclmulqdq	($Xhi,$Hkey,0x11);	####### +	&pclmulqdq	($T2,$T3,0x10);		####### +	&movups		($Hkey,&QWP(0,$Htbl));	# load H -	&movdqu		($Xn,&QWP(16,$inp));	# Ii+1 -	&pshufb		($T1,$T3); -	&pshufb		($Xn,$T3); +	&xorps		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi) +	&movdqa		($T3,&QWP(0,$const)); +	&xorps		($Xhi,$Xhn); +	 &movdqu	($Xhn,&QWP(0,$inp));	# Ii +	&pxor		($T1,$Xi);		# aggregated Karatsuba post-processing +	 &movdqu	($Xn,&QWP(16,$inp));	# Ii+1 +	&pxor		($T1,$Xhi);		# -	&movdqa		($T3,$Xn);		#&clmul64x64_TX	($Xhn,$Xn,$Hkey); H*Ii+1 -	&movdqa		($Xhn,$Xn); -	 &pxor		($Xhi,$T1);		# "Ii+Xi", consume early +	 &pshufb	($Xhn,$T3); +	&pxor		($T2,$T1);		# -	  &movdqa	($T1,$Xi);		#&reduction_alg9($Xhi,$Xi); 1st phase +	&movdqa		($T1,$T2);		# +	&psrldq		($T2,8); +	&pslldq		($T1,8);		# +	&pxor		($Xhi,$T2); +	&pxor		($Xi,$T1);		# +	 &pshufb	($Xn,$T3); +	 &pxor		($Xhi,$Xhn);		# "Ii+Xi", consume early + +	&movdqa		($Xhn,$Xn);		#&clmul64x64_TX	($Xhn,$Xn,$Hkey); H*Ii+1 +	  &movdqa	($T2,$Xi);		#&reduction_alg9($Xhi,$Xi); 1st phase +	  &movdqa	($T1,$Xi); +	  &psllq	($Xi,5); +	  &pxor		($T1,$Xi);		#  	  &psllq	($Xi,1);  	  &pxor		($Xi,$T1);		# -	  &psllq	($Xi,5);		# -	  &pxor		($Xi,$T1);		#  	&pclmulqdq	($Xn,$Hkey,0x00);	####### +	&movups		
($T3,&QWP(32,$Htbl));  	  &psllq	($Xi,57);		# -	  &movdqa	($T2,$Xi);		# +	  &movdqa	($T1,$Xi);		#  	  &pslldq	($Xi,8); -	  &psrldq	($T2,8);		#	 -	  &pxor		($Xi,$T1); -	&pshufd		($T1,$T3,0b01001110); +	  &psrldq	($T1,8);		#	 +	  &pxor		($Xi,$T2); +	  &pxor		($Xhi,$T1);		# +	&pshufd		($T1,$Xhn,0b01001110); +	  &movdqa	($T2,$Xi);		# 2nd phase +	  &psrlq	($Xi,1); +	&pxor		($T1,$Xhn);  	  &pxor		($Xhi,$T2);		# -	&pxor		($T1,$T3); -	&pshufd		($T3,$Hkey,0b01001110); -	&pxor		($T3,$Hkey);		# -  	&pclmulqdq	($Xhn,$Hkey,0x11);	####### -	  &movdqa	($T2,$Xi);		# 2nd phase +	&movups		($Hkey,&QWP(16,$Htbl));	# load H^2 +	  &pxor		($T2,$Xi);  	  &psrlq	($Xi,5);  	  &pxor		($Xi,$T2);		#  	  &psrlq	($Xi,1);		# -	  &pxor		($Xi,$T2);		# -	  &pxor		($T2,$Xhi); -	  &psrlq	($Xi,1);		# -	  &pxor		($Xi,$T2);		# - +	  &pxor		($Xi,$Xhi)		#  	&pclmulqdq	($T1,$T3,0x00);		####### -	&movups		($Hkey,&QWP(16,$Htbl));	# load H^2 -	&xorps		($T1,$Xn);		# -	&xorps		($T1,$Xhn);		# - -	&movdqa		($T3,$T1);		# -	&psrldq		($T1,8); -	&pslldq		($T3,8);		# -	&pxor		($Xhn,$T1); -	&pxor		($Xn,$T3);		# -	&movdqa		($T3,&QWP(0,$const));  	&lea		($inp,&DWP(32,$inp));  	&sub		($len,0x20);  	&ja		(&label("mod_loop"));  &set_label("even_tail"); -	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H^2*(Ii+Xi) +	&pshufd		($T2,$Xi,0b01001110);	# H^2*(Ii+Xi) +	&movdqa		($Xhi,$Xi); +	&pxor		($T2,$Xi);		# -	&pxor		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi) -	&pxor		($Xhi,$Xhn); +	&pclmulqdq	($Xi,$Hkey,0x00);	####### +	&pclmulqdq	($Xhi,$Hkey,0x11);	####### +	&pclmulqdq	($T2,$T3,0x10);		####### +	&movdqa		($T3,&QWP(0,$const)); + +	&xorps		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi) +	&xorps		($Xhi,$Xhn); +	&pxor		($T1,$Xi);		# aggregated Karatsuba post-processing +	&pxor		($T1,$Xhi);		# + +	&pxor		($T2,$T1);		# + +	&movdqa		($T1,$T2);		# +	&psrldq		($T2,8); +	&pslldq		($T1,8);		# +	&pxor		($Xhi,$T2); +	&pxor		($Xi,$T1);		#  	&reduction_alg9	($Xhi,$Xi); @@ -1273,13 +1324,6 @@ my ($Xhi,$Xi)=@_;  &set_label("bswap",64);  	
&data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);  	&data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2);	# 0x1c2_polynomial -}}	# $sse2 - -&set_label("rem_4bit",64); -	&data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S); -	&data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S); -	&data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S); -	&data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);  &set_label("rem_8bit",64);  	&data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);  	&data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E); @@ -1313,6 +1357,13 @@ my ($Xhi,$Xi)=@_;  	&data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);  	&data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);  	&data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE); +}}	# $sse2 + +&set_label("rem_4bit",64); +	&data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S); +	&data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S); +	&data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S); +	&data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);  }}}	# !$x86only  &asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>"); diff --git a/openssl/crypto/modes/asm/ghash-x86_64.pl b/openssl/crypto/modes/asm/ghash-x86_64.pl index 38d779edb..6e656ca13 100644 --- a/openssl/crypto/modes/asm/ghash-x86_64.pl +++ b/openssl/crypto/modes/asm/ghash-x86_64.pl @@ -22,6 +22,8 @@  # P4		28.6		14.0		+100%  # Opteron	19.3		7.7		+150%  # Core2		17.8		8.1(**)		+120% +# Atom		31.6		16.8		+88% +# VIA Nano	21.8		10.1		+115%  #  # (*)	comparison is not completely fair, because C results are  #	for vanilla "256B" implementation, while assembler results @@ -39,6 +41,44 @@  # providing access to a Westmere-based system on behalf of Intel  # Open Source Technology Centre. 
+# December 2012 +# +# Overhaul: aggregate Karatsuba post-processing, improve ILP in +# reduction_alg9, increase reduction aggregate factor to 4x. As for +# the latter. ghash-x86.pl discusses that it makes lesser sense to +# increase aggregate factor. Then why increase here? Critical path +# consists of 3 independent pclmulqdq instructions, Karatsuba post- +# processing and reduction. "On top" of this we lay down aggregated +# multiplication operations, triplets of independent pclmulqdq's. As +# issue rate for pclmulqdq is limited, it makes lesser sense to +# aggregate more multiplications than it takes to perform remaining +# non-multiplication operations. 2x is near-optimal coefficient for +# contemporary Intel CPUs (therefore modest improvement coefficient), +# but not for Bulldozer. Latter is because logical SIMD operations +# are twice as slow in comparison to Intel, so that critical path is +# longer. A CPU with higher pclmulqdq issue rate would also benefit +# from higher aggregate factor... +# +# Westmere	1.78(+13%) +# Sandy Bridge	1.80(+8%) +# Ivy Bridge	1.80(+7%) +# Haswell	0.55(+93%) (if system doesn't support AVX) +# Broadwell	0.45(+110%)(if system doesn't support AVX) +# Bulldozer	1.49(+27%) +# Silvermont	2.88(+13%) + +# March 2013 +# +# ... 8x aggregate factor AVX code path is using reduction algorithm +# suggested by Shay Gueron[1]. Even though contemporary AVX-capable +# CPUs such as Sandy and Ivy Bridge can execute it, the code performs +# sub-optimally in comparison to above mentioned version. But thanks +# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that +# it performs in 0.41 cycles per byte on Haswell processor, and in +# 0.29 on Broadwell. 
+# +# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest +  $flavour = shift;  $output  = shift;  if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } @@ -50,9 +90,30 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;  ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or  die "can't locate x86_64-xlate.pl"; +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` +		=~ /GNU assembler version ([2-9]\.[0-9]+)/) { +	$avx = ($1>=2.19) + ($1>=2.22); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && +	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { +	$avx = ($1>=2.09) + ($1>=2.10); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && +	    `ml64 2>&1` =~ /Version ([0-9]+)\./) { +	$avx = ($1>=10) + ($1>=11); +} + +if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) { +	$avx = ($2>=3.0) + ($2>3.0); +} +  open OUT,"| \"$^X\" $xlate $flavour $output";  *STDOUT=*OUT; +$do4xaggr=1; +  # common register layout  $nlo="%rax";  $nhi="%rbx"; @@ -160,6 +221,7 @@ ___  $code=<<___;  .text +.extern	OPENSSL_ia32cap_P  .globl	gcm_gmult_4bit  .type	gcm_gmult_4bit,\@function,2 @@ -352,19 +414,27 @@ ___  ($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");  sub clmul64x64_T2 {	# minimal register pressure -my ($Xhi,$Xi,$Hkey,$modulo)=@_; +my ($Xhi,$Xi,$Hkey,$HK)=@_; -$code.=<<___ if (!defined($modulo)); +if (!defined($HK)) {	$HK = $T2; +$code.=<<___;  	movdqa		$Xi,$Xhi		#  	pshufd		\$0b01001110,$Xi,$T1  	pshufd		\$0b01001110,$Hkey,$T2  	pxor		$Xi,$T1			#  	pxor		$Hkey,$T2  ___ +} else { +$code.=<<___; +	movdqa		$Xi,$Xhi		# +	pshufd		\$0b01001110,$Xi,$T1 +	pxor		$Xi,$T1			# +___ +}  $code.=<<___;  	pclmulqdq	\$0x00,$Hkey,$Xi	#######  	pclmulqdq	\$0x11,$Hkey,$Xhi	####### -	pclmulqdq	\$0x00,$T2,$T1		####### +	pclmulqdq	\$0x00,$HK,$T1		#######  	pxor		$Xi,$T1			#  	pxor		$Xhi,$T1		# @@ -376,42 +446,53 @@ $code.=<<___;  ___  } -sub reduction_alg9 {	# 17/13 times faster than 
Intel version +sub reduction_alg9 {	# 17/11 times faster than Intel version  my ($Xhi,$Xi) = @_;  $code.=<<___;  	# 1st phase -	movdqa		$Xi,$T1			# +	movdqa		$Xi,$T2			# +	movdqa		$Xi,$T1 +	psllq		\$5,$Xi +	pxor		$Xi,$T1			#  	psllq		\$1,$Xi  	pxor		$T1,$Xi			# -	psllq		\$5,$Xi			# -	pxor		$T1,$Xi			#  	psllq		\$57,$Xi		# -	movdqa		$Xi,$T2			# +	movdqa		$Xi,$T1			#  	pslldq		\$8,$Xi -	psrldq		\$8,$T2			#	 -	pxor		$T1,$Xi -	pxor		$T2,$Xhi		# +	psrldq		\$8,$T1			#	 +	pxor		$T2,$Xi +	pxor		$T1,$Xhi		#  	# 2nd phase  	movdqa		$Xi,$T2 +	psrlq		\$1,$Xi +	pxor		$T2,$Xhi		# +	pxor		$Xi,$T2  	psrlq		\$5,$Xi  	pxor		$T2,$Xi			#  	psrlq		\$1,$Xi			# -	pxor		$T2,$Xi			# -	pxor		$Xhi,$T2 -	psrlq		\$1,$Xi			# -	pxor		$T2,$Xi			# +	pxor		$Xhi,$Xi		#  ___  }  { my ($Htbl,$Xip)=@_4args; +  my $HK="%xmm6";  $code.=<<___;  .globl	gcm_init_clmul  .type	gcm_init_clmul,\@abi-omnipotent  .align	16  gcm_init_clmul: +.L_init_clmul: +___ +$code.=<<___ if ($win64); +.LSEH_begin_gcm_init_clmul: +	# I can't trust assembler to use specific encoding:-( +	.byte	0x48,0x83,0xec,0x18		#sub	$0x18,%rsp +	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp) +___ +$code.=<<___;  	movdqu		($Xip),$Hkey  	pshufd		\$0b01001110,$Hkey,$Hkey	# dword swap @@ -430,13 +511,47 @@ gcm_init_clmul:  	pxor		$T3,$Hkey		# if(carry) H^=0x1c2_polynomial  	# calculate H^2 +	pshufd		\$0b01001110,$Hkey,$HK  	movdqa		$Hkey,$Xi +	pxor		$Hkey,$HK  ___ -	&clmul64x64_T2	($Xhi,$Xi,$Hkey); +	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);  	&reduction_alg9	($Xhi,$Xi);  $code.=<<___; -	movdqu		$Hkey,($Htbl)		# save H -	movdqu		$Xi,16($Htbl)		# save H^2 +	pshufd		\$0b01001110,$Hkey,$T1 +	pshufd		\$0b01001110,$Xi,$T2 +	pxor		$Hkey,$T1		# Karatsuba pre-processing +	movdqu		$Hkey,0x00($Htbl)	# save H +	pxor		$Xi,$T2			# Karatsuba pre-processing +	movdqu		$Xi,0x10($Htbl)		# save H^2 +	palignr		\$8,$T1,$T2		# low part is H.lo^H.hi... 
+	movdqu		$T2,0x20($Htbl)		# save Karatsuba "salt" +___ +if ($do4xaggr) { +	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^3 +	&reduction_alg9	($Xhi,$Xi); +$code.=<<___; +	movdqa		$Xi,$T3 +___ +	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^4 +	&reduction_alg9	($Xhi,$Xi); +$code.=<<___; +	pshufd		\$0b01001110,$T3,$T1 +	pshufd		\$0b01001110,$Xi,$T2 +	pxor		$T3,$T1			# Karatsuba pre-processing +	movdqu		$T3,0x30($Htbl)		# save H^3 +	pxor		$Xi,$T2			# Karatsuba pre-processing +	movdqu		$Xi,0x40($Htbl)		# save H^4 +	palignr		\$8,$T1,$T2		# low part is H^3.lo^H^3.hi... +	movdqu		$T2,0x50($Htbl)		# save Karatsuba "salt" +___ +} +$code.=<<___ if ($win64); +	movaps	(%rsp),%xmm6 +	lea	0x18(%rsp),%rsp +.LSEH_end_gcm_init_clmul: +___ +$code.=<<___;  	ret  .size	gcm_init_clmul,.-gcm_init_clmul  ___ @@ -449,13 +564,38 @@ $code.=<<___;  .type	gcm_gmult_clmul,\@abi-omnipotent  .align	16  gcm_gmult_clmul: +.L_gmult_clmul:  	movdqu		($Xip),$Xi  	movdqa		.Lbswap_mask(%rip),$T3  	movdqu		($Htbl),$Hkey +	movdqu		0x20($Htbl),$T2  	pshufb		$T3,$Xi  ___ -	&clmul64x64_T2	($Xhi,$Xi,$Hkey); -	&reduction_alg9	($Xhi,$Xi); +	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$T2); +$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0)); +	# experimental alternative. special thing about is that there +	# no dependency between the two multiplications...  
+	mov		\$`0xE1<<1`,%eax +	mov		\$0xA040608020C0E000,%r10	# ((7..0)·0xE0)&0xff +	mov		\$0x07,%r11d +	movq		%rax,$T1 +	movq		%r10,$T2 +	movq		%r11,$T3		# borrow $T3 +	pand		$Xi,$T3 +	pshufb		$T3,$T2			# ($Xi&7)·0xE0 +	movq		%rax,$T3 +	pclmulqdq	\$0x00,$Xi,$T1		# ·(0xE1<<1) +	pxor		$Xi,$T2 +	pslldq		\$15,$T2 +	paddd		$T2,$T2			# <<(64+56+1) +	pxor		$T2,$Xi +	pclmulqdq	\$0x01,$T3,$Xi +	movdqa		.Lbswap_mask(%rip),$T3	# reload $T3 +	psrldq		\$1,$T1 +	pxor		$T1,$Xhi +	pslldq		\$7,$Xi +	pxor		$Xhi,$Xi +___  $code.=<<___;  	pshufb		$T3,$Xi  	movdqu		$Xi,($Xip) @@ -465,129 +605,327 @@ ___  }  { my ($Xip,$Htbl,$inp,$len)=@_4args; -  my $Xn="%xmm6"; -  my $Xhn="%xmm7"; -  my $Hkey2="%xmm8"; -  my $T1n="%xmm9"; -  my $T2n="%xmm10"; +  my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7)); +  my ($T1,$T2,$T3)=map("%xmm$_",(8..10));  $code.=<<___;  .globl	gcm_ghash_clmul  .type	gcm_ghash_clmul,\@abi-omnipotent -.align	16 +.align	32  gcm_ghash_clmul: +.L_ghash_clmul:  ___  $code.=<<___ if ($win64); +	lea	-0x88(%rsp),%rax  .LSEH_begin_gcm_ghash_clmul:  	# I can't trust assembler to use specific encoding:-( -	.byte	0x48,0x83,0xec,0x58		#sub	\$0x58,%rsp -	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp) -	.byte	0x0f,0x29,0x7c,0x24,0x10	#movdqa	%xmm7,0x10(%rsp) -	.byte	0x44,0x0f,0x29,0x44,0x24,0x20	#movaps	%xmm8,0x20(%rsp) -	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30	#movaps	%xmm9,0x30(%rsp) -	.byte	0x44,0x0f,0x29,0x54,0x24,0x40	#movaps	%xmm10,0x40(%rsp) +	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp +	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6,-0x20(%rax) +	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7,-0x10(%rax) +	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8,0(%rax) +	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9,0x10(%rax) +	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10,0x20(%rax) +	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11,0x30(%rax) +	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12,0x40(%rax) +	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13,0x50(%rax) +	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	
%xmm14,0x60(%rax) +	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15,0x70(%rax)  ___  $code.=<<___;  	movdqa		.Lbswap_mask(%rip),$T3  	movdqu		($Xip),$Xi  	movdqu		($Htbl),$Hkey +	movdqu		0x20($Htbl),$HK  	pshufb		$T3,$Xi  	sub		\$0x10,$len  	jz		.Lodd_tail -	movdqu		16($Htbl),$Hkey2 +	movdqu		0x10($Htbl),$Hkey2 +___ +if ($do4xaggr) { +my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15)); + +$code.=<<___; +	mov		OPENSSL_ia32cap_P+4(%rip),%eax +	cmp		\$0x30,$len +	jb		.Lskip4x + +	and		\$`1<<26|1<<22`,%eax	# isolate MOVBE+XSAVE +	cmp		\$`1<<22`,%eax		# check for MOVBE without XSAVE +	je		.Lskip4x + +	sub		\$0x30,$len +	mov		\$0xA040608020C0E000,%rax	# ((7..0)·0xE0)&0xff +	movdqu		0x30($Htbl),$Hkey3 +	movdqu		0x40($Htbl),$Hkey4 + +	####### +	# Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P +	# +	movdqu		0x30($inp),$Xln +	 movdqu		0x20($inp),$Xl +	pshufb		$T3,$Xln +	 pshufb		$T3,$Xl +	movdqa		$Xln,$Xhn +	pshufd		\$0b01001110,$Xln,$Xmn +	pxor		$Xln,$Xmn +	pclmulqdq	\$0x00,$Hkey,$Xln +	pclmulqdq	\$0x11,$Hkey,$Xhn +	pclmulqdq	\$0x00,$HK,$Xmn + +	movdqa		$Xl,$Xh +	pshufd		\$0b01001110,$Xl,$Xm +	pxor		$Xl,$Xm +	pclmulqdq	\$0x00,$Hkey2,$Xl +	pclmulqdq	\$0x11,$Hkey2,$Xh +	pclmulqdq	\$0x10,$HK,$Xm +	xorps		$Xl,$Xln +	xorps		$Xh,$Xhn +	movups		0x50($Htbl),$HK +	xorps		$Xm,$Xmn + +	movdqu		0x10($inp),$Xl +	 movdqu		0($inp),$T1 +	pshufb		$T3,$Xl +	 pshufb		$T3,$T1 +	movdqa		$Xl,$Xh +	pshufd		\$0b01001110,$Xl,$Xm +	 pxor		$T1,$Xi +	pxor		$Xl,$Xm +	pclmulqdq	\$0x00,$Hkey3,$Xl +	 movdqa		$Xi,$Xhi +	 pshufd		\$0b01001110,$Xi,$T1 +	 pxor		$Xi,$T1 +	pclmulqdq	\$0x11,$Hkey3,$Xh +	pclmulqdq	\$0x00,$HK,$Xm +	xorps		$Xl,$Xln +	xorps		$Xh,$Xhn + +	lea	0x40($inp),$inp +	sub	\$0x40,$len +	jc	.Ltail4x + +	jmp	.Lmod4_loop +.align	32 +.Lmod4_loop: +	pclmulqdq	\$0x00,$Hkey4,$Xi +	xorps		$Xm,$Xmn +	 movdqu		0x30($inp),$Xl +	 pshufb		$T3,$Xl +	pclmulqdq	\$0x11,$Hkey4,$Xhi +	xorps		$Xln,$Xi +	 movdqu		0x20($inp),$Xln +	 movdqa		$Xl,$Xh +	pclmulqdq	\$0x10,$HK,$T1 +	 pshufd		
\$0b01001110,$Xl,$Xm +	xorps		$Xhn,$Xhi +	 pxor		$Xl,$Xm +	 pshufb		$T3,$Xln +	movups		0x20($Htbl),$HK +	xorps		$Xmn,$T1 +	 pclmulqdq	\$0x00,$Hkey,$Xl +	 pshufd		\$0b01001110,$Xln,$Xmn + +	pxor		$Xi,$T1			# aggregated Karatsuba post-processing +	 movdqa		$Xln,$Xhn +	pxor		$Xhi,$T1		# +	 pxor		$Xln,$Xmn +	movdqa		$T1,$T2			# +	 pclmulqdq	\$0x11,$Hkey,$Xh +	pslldq		\$8,$T1 +	psrldq		\$8,$T2			# +	pxor		$T1,$Xi +	movdqa		.L7_mask(%rip),$T1 +	pxor		$T2,$Xhi		# +	movq		%rax,$T2 + +	pand		$Xi,$T1			# 1st phase +	pshufb		$T1,$T2			# +	pxor		$Xi,$T2			# +	 pclmulqdq	\$0x00,$HK,$Xm +	psllq		\$57,$T2		# +	movdqa		$T2,$T1			# +	pslldq		\$8,$T2 +	 pclmulqdq	\$0x00,$Hkey2,$Xln +	psrldq		\$8,$T1			#	 +	pxor		$T2,$Xi +	pxor		$T1,$Xhi		# +	movdqu		0($inp),$T1 + +	movdqa		$Xi,$T2			# 2nd phase +	psrlq		\$1,$Xi +	 pclmulqdq	\$0x11,$Hkey2,$Xhn +	 xorps		$Xl,$Xln +	 movdqu		0x10($inp),$Xl +	 pshufb		$T3,$Xl +	 pclmulqdq	\$0x10,$HK,$Xmn +	 xorps		$Xh,$Xhn +	 movups		0x50($Htbl),$HK +	pshufb		$T3,$T1 +	pxor		$T2,$Xhi		# +	pxor		$Xi,$T2 +	psrlq		\$5,$Xi + +	 movdqa		$Xl,$Xh +	 pxor		$Xm,$Xmn +	 pshufd		\$0b01001110,$Xl,$Xm +	pxor		$T2,$Xi			# +	pxor		$T1,$Xhi +	 pxor		$Xl,$Xm +	 pclmulqdq	\$0x00,$Hkey3,$Xl +	psrlq		\$1,$Xi			# +	pxor		$Xhi,$Xi		# +	movdqa		$Xi,$Xhi +	 pclmulqdq	\$0x11,$Hkey3,$Xh +	 xorps		$Xl,$Xln +	pshufd		\$0b01001110,$Xi,$T1 +	pxor		$Xi,$T1 + +	 pclmulqdq	\$0x00,$HK,$Xm +	 xorps		$Xh,$Xhn + +	lea	0x40($inp),$inp +	sub	\$0x40,$len +	jnc	.Lmod4_loop + +.Ltail4x: +	pclmulqdq	\$0x00,$Hkey4,$Xi +	pclmulqdq	\$0x11,$Hkey4,$Xhi +	pclmulqdq	\$0x10,$HK,$T1 +	xorps		$Xm,$Xmn +	xorps		$Xln,$Xi +	xorps		$Xhn,$Xhi +	pxor		$Xi,$Xhi		# aggregated Karatsuba post-processing +	pxor		$Xmn,$T1 + +	pxor		$Xhi,$T1		# +	pxor		$Xi,$Xhi + +	movdqa		$T1,$T2			# +	psrldq		\$8,$T1 +	pslldq		\$8,$T2			# +	pxor		$T1,$Xhi +	pxor		$T2,$Xi			# +___ +	&reduction_alg9($Xhi,$Xi); +$code.=<<___; +	add	\$0x40,$len +	jz	.Ldone +	movdqu	0x20($Htbl),$HK +	sub	\$0x10,$len +	jz	.Lodd_tail +.Lskip4x: +___ +} 
+$code.=<<___;  	#######  	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =  	#	[(H*Ii+1) + (H*Xi+1)] mod P =  	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P  	#  	movdqu		($inp),$T1		# Ii -	movdqu		16($inp),$Xn		# Ii+1 +	movdqu		16($inp),$Xln		# Ii+1  	pshufb		$T3,$T1 -	pshufb		$T3,$Xn +	pshufb		$T3,$Xln  	pxor		$T1,$Xi			# Ii+Xi -___ -	&clmul64x64_T2	($Xhn,$Xn,$Hkey);	# H*Ii+1 -$code.=<<___; -	movdqa		$Xi,$Xhi		# -	pshufd		\$0b01001110,$Xi,$T1 -	pshufd		\$0b01001110,$Hkey2,$T2 -	pxor		$Xi,$T1			# -	pxor		$Hkey2,$T2 + +	movdqa		$Xln,$Xhn +	pshufd		\$0b01001110,$Xln,$Xmn +	pxor		$Xln,$Xmn +	pclmulqdq	\$0x00,$Hkey,$Xln +	pclmulqdq	\$0x11,$Hkey,$Xhn +	pclmulqdq	\$0x00,$HK,$Xmn  	lea		32($inp),$inp		# i+=2 +	nop  	sub		\$0x20,$len  	jbe		.Leven_tail +	nop +	jmp		.Lmod_loop +.align	32  .Lmod_loop: -___ -	&clmul64x64_T2	($Xhi,$Xi,$Hkey2,1);	# H^2*(Ii+Xi) -$code.=<<___; -	movdqu		($inp),$T1		# Ii -	pxor		$Xn,$Xi			# (H*Ii+1) + H^2*(Ii+Xi) -	pxor		$Xhn,$Xhi +	movdqa		$Xi,$Xhi +	movdqa		$Xmn,$T1 +	pshufd		\$0b01001110,$Xi,$Xmn	# +	pxor		$Xi,$Xmn		# -	movdqu		16($inp),$Xn		# Ii+1 -	pshufb		$T3,$T1 -	pshufb		$T3,$Xn +	pclmulqdq	\$0x00,$Hkey2,$Xi +	pclmulqdq	\$0x11,$Hkey2,$Xhi +	pclmulqdq	\$0x10,$HK,$Xmn -	movdqa		$Xn,$Xhn		# -	pshufd		\$0b01001110,$Xn,$T1n -	pshufd		\$0b01001110,$Hkey,$T2n -	pxor		$Xn,$T1n		# -	pxor		$Hkey,$T2n -	 pxor		$T1,$Xhi		# "Ii+Xi", consume early +	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi) +	pxor		$Xhn,$Xhi +	  movdqu	($inp),$T2		# Ii +	pxor		$Xi,$T1			# aggregated Karatsuba post-processing +	  pshufb	$T3,$T2 +	  movdqu	16($inp),$Xln		# Ii+1 + +	pxor		$Xhi,$T1 +	  pxor		$T2,$Xhi		# "Ii+Xi", consume early +	pxor		$T1,$Xmn +	 pshufb		$T3,$Xln +	movdqa		$Xmn,$T1		# +	psrldq		\$8,$T1 +	pslldq		\$8,$Xmn		# +	pxor		$T1,$Xhi +	pxor		$Xmn,$Xi		# + +	movdqa		$Xln,$Xhn		# -	  movdqa	$Xi,$T1			# 1st phase +	  movdqa	$Xi,$T2			# 1st phase +	  movdqa	$Xi,$T1 +	  psllq		\$5,$Xi +	  pxor		$Xi,$T1			# +	pclmulqdq	\$0x00,$Hkey,$Xln	#######  	  psllq		\$1,$Xi  	  pxor		$T1,$Xi			# -	  psllq		
\$5,$Xi			# -	  pxor		$T1,$Xi			# -	pclmulqdq	\$0x00,$Hkey,$Xn	#######  	  psllq		\$57,$Xi		# -	  movdqa	$Xi,$T2			# +	  movdqa	$Xi,$T1			#  	  pslldq	\$8,$Xi -	  psrldq	\$8,$T2			#	 -	  pxor		$T1,$Xi -	  pxor		$T2,$Xhi		# +	  psrldq	\$8,$T1			#	 +	  pxor		$T2,$Xi +	pshufd		\$0b01001110,$Xhn,$Xmn +	  pxor		$T1,$Xhi		# +	pxor		$Xhn,$Xmn		# -	pclmulqdq	\$0x11,$Hkey,$Xhn	#######  	  movdqa	$Xi,$T2			# 2nd phase +	  psrlq		\$1,$Xi +	pclmulqdq	\$0x11,$Hkey,$Xhn	####### +	  pxor		$T2,$Xhi		# +	  pxor		$Xi,$T2  	  psrlq		\$5,$Xi  	  pxor		$T2,$Xi			# +	lea		32($inp),$inp  	  psrlq		\$1,$Xi			# -	  pxor		$T2,$Xi			# -	  pxor		$Xhi,$T2 -	  psrlq		\$1,$Xi			# -	  pxor		$T2,$Xi			# +	pclmulqdq	\$0x00,$HK,$Xmn		####### +	  pxor		$Xhi,$Xi		# -	pclmulqdq	\$0x00,$T2n,$T1n	####### -	 movdqa		$Xi,$Xhi		# -	 pshufd		\$0b01001110,$Xi,$T1 -	 pshufd		\$0b01001110,$Hkey2,$T2 -	 pxor		$Xi,$T1			# -	 pxor		$Hkey2,$T2 - -	pxor		$Xn,$T1n		# -	pxor		$Xhn,$T1n		# -	movdqa		$T1n,$T2n		# -	psrldq		\$8,$T1n -	pslldq		\$8,$T2n		# -	pxor		$T1n,$Xhn -	pxor		$T2n,$Xn		# - -	lea		32($inp),$inp  	sub		\$0x20,$len  	ja		.Lmod_loop  .Leven_tail: -___ -	&clmul64x64_T2	($Xhi,$Xi,$Hkey2,1);	# H^2*(Ii+Xi) -$code.=<<___; -	pxor		$Xn,$Xi			# (H*Ii+1) + H^2*(Ii+Xi) +	 movdqa		$Xi,$Xhi +	 movdqa		$Xmn,$T1 +	 pshufd		\$0b01001110,$Xi,$Xmn	# +	 pxor		$Xi,$Xmn		# + +	pclmulqdq	\$0x00,$Hkey2,$Xi +	pclmulqdq	\$0x11,$Hkey2,$Xhi +	pclmulqdq	\$0x10,$HK,$Xmn + +	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)  	pxor		$Xhn,$Xhi +	pxor		$Xi,$T1 +	pxor		$Xhi,$T1 +	pxor		$T1,$Xmn +	movdqa		$Xmn,$T1		# +	psrldq		\$8,$T1 +	pslldq		\$8,$Xmn		# +	pxor		$T1,$Xhi +	pxor		$Xmn,$Xi		#  ___  	&reduction_alg9	($Xhi,$Xi);  $code.=<<___; @@ -599,7 +937,7 @@ $code.=<<___;  	pshufb		$T3,$T1  	pxor		$T1,$Xi			# Ii+Xi  ___ -	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H*(Ii+Xi) +	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H*(Ii+Xi)  	&reduction_alg9	($Xhi,$Xi);  $code.=<<___;  .Ldone: @@ -612,21 +950,607 @@ $code.=<<___ if ($win64);  	movaps	0x20(%rsp),%xmm8  
	movaps	0x30(%rsp),%xmm9  	movaps	0x40(%rsp),%xmm10 -	add	\$0x58,%rsp +	movaps	0x50(%rsp),%xmm11 +	movaps	0x60(%rsp),%xmm12 +	movaps	0x70(%rsp),%xmm13 +	movaps	0x80(%rsp),%xmm14 +	movaps	0x90(%rsp),%xmm15 +	lea	0xa8(%rsp),%rsp +.LSEH_end_gcm_ghash_clmul:  ___  $code.=<<___;  	ret -.LSEH_end_gcm_ghash_clmul:  .size	gcm_ghash_clmul,.-gcm_ghash_clmul  ___  } + +$code.=<<___; +.globl	gcm_init_avx +.type	gcm_init_avx,\@abi-omnipotent +.align	32 +gcm_init_avx: +___ +if ($avx) { +my ($Htbl,$Xip)=@_4args; +my $HK="%xmm6"; + +$code.=<<___ if ($win64); +.LSEH_begin_gcm_init_avx: +	# I can't trust assembler to use specific encoding:-( +	.byte	0x48,0x83,0xec,0x18		#sub	$0x18,%rsp +	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp) +___ +$code.=<<___; +	vzeroupper + +	vmovdqu		($Xip),$Hkey +	vpshufd		\$0b01001110,$Hkey,$Hkey	# dword swap + +	# <<1 twist +	vpshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword +	vpsrlq		\$63,$Hkey,$T1 +	vpsllq		\$1,$Hkey,$Hkey +	vpxor		$T3,$T3,$T3		# +	vpcmpgtd	$T2,$T3,$T3		# broadcast carry bit +	vpslldq		\$8,$T1,$T1 +	vpor		$T1,$Hkey,$Hkey		# H<<=1 + +	# magic reduction +	vpand		.L0x1c2_polynomial(%rip),$T3,$T3 +	vpxor		$T3,$Hkey,$Hkey		# if(carry) H^=0x1c2_polynomial + +	vpunpckhqdq	$Hkey,$Hkey,$HK +	vmovdqa		$Hkey,$Xi +	vpxor		$Hkey,$HK,$HK +	mov		\$4,%r10		# up to H^8 +	jmp		.Linit_start_avx +___ + +sub clmul64x64_avx { +my ($Xhi,$Xi,$Hkey,$HK)=@_; + +if (!defined($HK)) {	$HK = $T2; +$code.=<<___; +	vpunpckhqdq	$Xi,$Xi,$T1 +	vpunpckhqdq	$Hkey,$Hkey,$T2 +	vpxor		$Xi,$T1,$T1		# +	vpxor		$Hkey,$T2,$T2 +___ +} else { +$code.=<<___; +	vpunpckhqdq	$Xi,$Xi,$T1 +	vpxor		$Xi,$T1,$T1		# +___ +} +$code.=<<___; +	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xhi	####### +	vpclmulqdq	\$0x00,$Hkey,$Xi,$Xi	####### +	vpclmulqdq	\$0x00,$HK,$T1,$T1	####### +	vpxor		$Xi,$Xhi,$T2		# +	vpxor		$T2,$T1,$T1		# + +	vpslldq		\$8,$T1,$T2		# +	vpsrldq		\$8,$T1,$T1 +	vpxor		$T2,$Xi,$Xi		# +	vpxor		$T1,$Xhi,$Xhi +___ +} + +sub reduction_avx { +my ($Xhi,$Xi) = @_; + +$code.=<<___; +	
vpsllq		\$57,$Xi,$T1		# 1st phase +	vpsllq		\$62,$Xi,$T2 +	vpxor		$T1,$T2,$T2		# +	vpsllq		\$63,$Xi,$T1 +	vpxor		$T1,$T2,$T2		# +	vpslldq		\$8,$T2,$T1		# +	vpsrldq		\$8,$T2,$T2 +	vpxor		$T1,$Xi,$Xi		# +	vpxor		$T2,$Xhi,$Xhi + +	vpsrlq		\$1,$Xi,$T2		# 2nd phase +	vpxor		$Xi,$Xhi,$Xhi +	vpxor		$T2,$Xi,$Xi		# +	vpsrlq		\$5,$T2,$T2 +	vpxor		$T2,$Xi,$Xi		# +	vpsrlq		\$1,$Xi,$Xi		# +	vpxor		$Xhi,$Xi,$Xi		# +___ +}  $code.=<<___; +.align	32 +.Linit_loop_avx: +	vpalignr	\$8,$T1,$T2,$T3		# low part is H.lo^H.hi... +	vmovdqu		$T3,-0x10($Htbl)	# save Karatsuba "salt" +___ +	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^3,5,7 +	&reduction_avx	($Xhi,$Xi); +$code.=<<___; +.Linit_start_avx: +	vmovdqa		$Xi,$T3 +___ +	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^2,4,6,8 +	&reduction_avx	($Xhi,$Xi); +$code.=<<___; +	vpshufd		\$0b01001110,$T3,$T1 +	vpshufd		\$0b01001110,$Xi,$T2 +	vpxor		$T3,$T1,$T1		# Karatsuba pre-processing +	vmovdqu		$T3,0x00($Htbl)		# save H^1,3,5,7 +	vpxor		$Xi,$T2,$T2		# Karatsuba pre-processing +	vmovdqu		$Xi,0x10($Htbl)		# save H^2,4,6,8 +	lea		0x30($Htbl),$Htbl +	sub		\$1,%r10 +	jnz		.Linit_loop_avx + +	vpalignr	\$8,$T2,$T1,$T3		# last "salt" is flipped +	vmovdqu		$T3,-0x10($Htbl) + +	vzeroupper +___ +$code.=<<___ if ($win64); +	movaps	(%rsp),%xmm6 +	lea	0x18(%rsp),%rsp +.LSEH_end_gcm_init_avx: +___ +$code.=<<___; +	ret +.size	gcm_init_avx,.-gcm_init_avx +___ +} else { +$code.=<<___; +	jmp	.L_init_clmul +.size	gcm_init_avx,.-gcm_init_avx +___ +} + +$code.=<<___; +.globl	gcm_gmult_avx +.type	gcm_gmult_avx,\@abi-omnipotent +.align	32 +gcm_gmult_avx: +	jmp	.L_gmult_clmul +.size	gcm_gmult_avx,.-gcm_gmult_avx +___ + +$code.=<<___; +.globl	gcm_ghash_avx +.type	gcm_ghash_avx,\@abi-omnipotent +.align	32 +gcm_ghash_avx: +___ +if ($avx) { +my ($Xip,$Htbl,$inp,$len)=@_4args; +my ($Xlo,$Xhi,$Xmi, +    $Zlo,$Zhi,$Zmi, +    $Hkey,$HK,$T1,$T2, +    $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15)); + +$code.=<<___ if ($win64); +	lea	-0x88(%rsp),%rax 
+.LSEH_begin_gcm_ghash_avx: +	# I can't trust assembler to use specific encoding:-( +	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp +	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6,-0x20(%rax) +	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7,-0x10(%rax) +	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8,0(%rax) +	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9,0x10(%rax) +	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10,0x20(%rax) +	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11,0x30(%rax) +	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12,0x40(%rax) +	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13,0x50(%rax) +	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14,0x60(%rax) +	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15,0x70(%rax) +___ +$code.=<<___; +	vzeroupper + +	vmovdqu		($Xip),$Xi		# load $Xi +	lea		.L0x1c2_polynomial(%rip),%r10 +	lea		0x40($Htbl),$Htbl	# size optimization +	vmovdqu		.Lbswap_mask(%rip),$bswap +	vpshufb		$bswap,$Xi,$Xi +	cmp		\$0x80,$len +	jb		.Lshort_avx +	sub		\$0x80,$len + +	vmovdqu		0x70($inp),$Ii		# I[7] +	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1 +	vpshufb		$bswap,$Ii,$Ii +	vmovdqu		0x20-0x40($Htbl),$HK + +	vpunpckhqdq	$Ii,$Ii,$T2 +	 vmovdqu	0x60($inp),$Ij		# I[6] +	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo +	vpxor		$Ii,$T2,$T2 +	 vpshufb	$bswap,$Ij,$Ij +	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi +	 vmovdqu	0x10-0x40($Htbl),$Hkey	# $Hkey^2 +	 vpunpckhqdq	$Ij,$Ij,$T1 +	 vmovdqu	0x50($inp),$Ii		# I[5] +	vpclmulqdq	\$0x00,$HK,$T2,$Xmi +	 vpxor		$Ij,$T1,$T1 + +	 vpshufb	$bswap,$Ii,$Ii +	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo +	 vpunpckhqdq	$Ii,$Ii,$T2 +	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi +	 vmovdqu	0x30-0x40($Htbl),$Hkey	# $Hkey^3 +	 vpxor		$Ii,$T2,$T2 +	 vmovdqu	0x40($inp),$Ij		# I[4] +	vpclmulqdq	\$0x10,$HK,$T1,$Zmi +	 vmovdqu	0x50-0x40($Htbl),$HK + +	 vpshufb	$bswap,$Ij,$Ij +	vpxor		$Xlo,$Zlo,$Zlo +	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo +	vpxor		$Xhi,$Zhi,$Zhi +	 vpunpckhqdq	$Ij,$Ij,$T1 +	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi +	 vmovdqu	0x40-0x40($Htbl),$Hkey	# $Hkey^4 +	vpxor		$Xmi,$Zmi,$Zmi +	vpclmulqdq	
\$0x00,$HK,$T2,$Xmi +	 vpxor		$Ij,$T1,$T1 + +	 vmovdqu	0x30($inp),$Ii		# I[3] +	vpxor		$Zlo,$Xlo,$Xlo +	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo +	vpxor		$Zhi,$Xhi,$Xhi +	 vpshufb	$bswap,$Ii,$Ii +	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi +	 vmovdqu	0x60-0x40($Htbl),$Hkey	# $Hkey^5 +	vpxor		$Zmi,$Xmi,$Xmi +	 vpunpckhqdq	$Ii,$Ii,$T2 +	vpclmulqdq	\$0x10,$HK,$T1,$Zmi +	 vmovdqu	0x80-0x40($Htbl),$HK +	 vpxor		$Ii,$T2,$T2 + +	 vmovdqu	0x20($inp),$Ij		# I[2] +	vpxor		$Xlo,$Zlo,$Zlo +	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo +	vpxor		$Xhi,$Zhi,$Zhi +	 vpshufb	$bswap,$Ij,$Ij +	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi +	 vmovdqu	0x70-0x40($Htbl),$Hkey	# $Hkey^6 +	vpxor		$Xmi,$Zmi,$Zmi +	 vpunpckhqdq	$Ij,$Ij,$T1 +	vpclmulqdq	\$0x00,$HK,$T2,$Xmi +	 vpxor		$Ij,$T1,$T1 + +	 vmovdqu	0x10($inp),$Ii		# I[1] +	vpxor		$Zlo,$Xlo,$Xlo +	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo +	vpxor		$Zhi,$Xhi,$Xhi +	 vpshufb	$bswap,$Ii,$Ii +	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi +	 vmovdqu	0x90-0x40($Htbl),$Hkey	# $Hkey^7 +	vpxor		$Zmi,$Xmi,$Xmi +	 vpunpckhqdq	$Ii,$Ii,$T2 +	vpclmulqdq	\$0x10,$HK,$T1,$Zmi +	 vmovdqu	0xb0-0x40($Htbl),$HK +	 vpxor		$Ii,$T2,$T2 + +	 vmovdqu	($inp),$Ij		# I[0] +	vpxor		$Xlo,$Zlo,$Zlo +	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo +	vpxor		$Xhi,$Zhi,$Zhi +	 vpshufb	$bswap,$Ij,$Ij +	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi +	 vmovdqu	0xa0-0x40($Htbl),$Hkey	# $Hkey^8 +	vpxor		$Xmi,$Zmi,$Zmi +	vpclmulqdq	\$0x10,$HK,$T2,$Xmi + +	lea		0x80($inp),$inp +	cmp		\$0x80,$len +	jb		.Ltail_avx + +	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi +	sub		\$0x80,$len +	jmp		.Loop8x_avx + +.align	32 +.Loop8x_avx: +	vpunpckhqdq	$Ij,$Ij,$T1 +	 vmovdqu	0x70($inp),$Ii		# I[7] +	vpxor		$Xlo,$Zlo,$Zlo +	vpxor		$Ij,$T1,$T1 +	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xi +	 vpshufb	$bswap,$Ii,$Ii +	vpxor		$Xhi,$Zhi,$Zhi +	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xo +	 vmovdqu	0x00-0x40($Htbl),$Hkey	# $Hkey^1 +	 vpunpckhqdq	$Ii,$Ii,$T2 +	vpxor		$Xmi,$Zmi,$Zmi +	vpclmulqdq	\$0x00,$HK,$T1,$Tred +	 vmovdqu	0x20-0x40($Htbl),$HK +	 vpxor		$Ii,$T2,$T2 + +	  vmovdqu	0x60($inp),$Ij		# I[6] +	 vpclmulqdq	
\$0x00,$Hkey,$Ii,$Xlo +	vpxor		$Zlo,$Xi,$Xi		# collect result +	  vpshufb	$bswap,$Ij,$Ij +	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi +	vxorps		$Zhi,$Xo,$Xo +	  vmovdqu	0x10-0x40($Htbl),$Hkey	# $Hkey^2 +	 vpunpckhqdq	$Ij,$Ij,$T1 +	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi +	vpxor		$Zmi,$Tred,$Tred +	 vxorps		$Ij,$T1,$T1 + +	  vmovdqu	0x50($inp),$Ii		# I[5] +	vpxor		$Xi,$Tred,$Tred		# aggregated Karatsuba post-processing +	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo +	vpxor		$Xo,$Tred,$Tred +	vpslldq		\$8,$Tred,$T2 +	 vpxor		$Xlo,$Zlo,$Zlo +	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi +	vpsrldq		\$8,$Tred,$Tred +	vpxor		$T2, $Xi, $Xi +	  vmovdqu	0x30-0x40($Htbl),$Hkey	# $Hkey^3 +	  vpshufb	$bswap,$Ii,$Ii +	vxorps		$Tred,$Xo, $Xo +	 vpxor		$Xhi,$Zhi,$Zhi +	 vpunpckhqdq	$Ii,$Ii,$T2 +	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi +	  vmovdqu	0x50-0x40($Htbl),$HK +	 vpxor		$Ii,$T2,$T2 +	 vpxor		$Xmi,$Zmi,$Zmi + +	  vmovdqu	0x40($inp),$Ij		# I[4] +	vpalignr	\$8,$Xi,$Xi,$Tred	# 1st phase +	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo +	  vpshufb	$bswap,$Ij,$Ij +	 vpxor		$Zlo,$Xlo,$Xlo +	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi +	  vmovdqu	0x40-0x40($Htbl),$Hkey	# $Hkey^4 +	 vpunpckhqdq	$Ij,$Ij,$T1 +	 vpxor		$Zhi,$Xhi,$Xhi +	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi +	 vxorps		$Ij,$T1,$T1 +	 vpxor		$Zmi,$Xmi,$Xmi + +	  vmovdqu	0x30($inp),$Ii		# I[3] +	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi +	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo +	  vpshufb	$bswap,$Ii,$Ii +	 vpxor		$Xlo,$Zlo,$Zlo +	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi +	  vmovdqu	0x60-0x40($Htbl),$Hkey	# $Hkey^5 +	 vpunpckhqdq	$Ii,$Ii,$T2 +	 vpxor		$Xhi,$Zhi,$Zhi +	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi +	  vmovdqu	0x80-0x40($Htbl),$HK +	 vpxor		$Ii,$T2,$T2 +	 vpxor		$Xmi,$Zmi,$Zmi + +	  vmovdqu	0x20($inp),$Ij		# I[2] +	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo +	  vpshufb	$bswap,$Ij,$Ij +	 vpxor		$Zlo,$Xlo,$Xlo +	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi +	  vmovdqu	0x70-0x40($Htbl),$Hkey	# $Hkey^6 +	 vpunpckhqdq	$Ij,$Ij,$T1 +	 vpxor		$Zhi,$Xhi,$Xhi +	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi +	 vpxor		$Ij,$T1,$T1 +	 vpxor		
$Zmi,$Xmi,$Xmi +	vxorps		$Tred,$Xi,$Xi + +	  vmovdqu	0x10($inp),$Ii		# I[1] +	vpalignr	\$8,$Xi,$Xi,$Tred	# 2nd phase +	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo +	  vpshufb	$bswap,$Ii,$Ii +	 vpxor		$Xlo,$Zlo,$Zlo +	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi +	  vmovdqu	0x90-0x40($Htbl),$Hkey	# $Hkey^7 +	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi +	vxorps		$Xo,$Tred,$Tred +	 vpunpckhqdq	$Ii,$Ii,$T2 +	 vpxor		$Xhi,$Zhi,$Zhi +	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi +	  vmovdqu	0xb0-0x40($Htbl),$HK +	 vpxor		$Ii,$T2,$T2 +	 vpxor		$Xmi,$Zmi,$Zmi + +	  vmovdqu	($inp),$Ij		# I[0] +	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo +	  vpshufb	$bswap,$Ij,$Ij +	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi +	  vmovdqu	0xa0-0x40($Htbl),$Hkey	# $Hkey^8 +	vpxor		$Tred,$Ij,$Ij +	 vpclmulqdq	\$0x10,$HK,  $T2,$Xmi +	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi + +	lea		0x80($inp),$inp +	sub		\$0x80,$len +	jnc		.Loop8x_avx + +	add		\$0x80,$len +	jmp		.Ltail_no_xor_avx + +.align	32 +.Lshort_avx: +	vmovdqu		-0x10($inp,$len),$Ii	# very last word +	lea		($inp,$len),$inp +	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1 +	vmovdqu		0x20-0x40($Htbl),$HK +	vpshufb		$bswap,$Ii,$Ij + +	vmovdqa		$Xlo,$Zlo		# subtle way to zero $Zlo, +	vmovdqa		$Xhi,$Zhi		# $Zhi and +	vmovdqa		$Xmi,$Zmi		# $Zmi +	sub		\$0x10,$len +	jz		.Ltail_avx + +	vpunpckhqdq	$Ij,$Ij,$T1 +	vpxor		$Xlo,$Zlo,$Zlo +	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo +	vpxor		$Ij,$T1,$T1 +	 vmovdqu	-0x20($inp),$Ii +	vpxor		$Xhi,$Zhi,$Zhi +	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi +	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2 +	 vpshufb	$bswap,$Ii,$Ij +	vpxor		$Xmi,$Zmi,$Zmi +	vpclmulqdq	\$0x00,$HK,$T1,$Xmi +	vpsrldq		\$8,$HK,$HK +	sub		\$0x10,$len +	jz		.Ltail_avx + +	vpunpckhqdq	$Ij,$Ij,$T1 +	vpxor		$Xlo,$Zlo,$Zlo +	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo +	vpxor		$Ij,$T1,$T1 +	 vmovdqu	-0x30($inp),$Ii +	vpxor		$Xhi,$Zhi,$Zhi +	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi +	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3 +	 vpshufb	$bswap,$Ii,$Ij +	vpxor		$Xmi,$Zmi,$Zmi +	vpclmulqdq	\$0x00,$HK,$T1,$Xmi +	vmovdqu		0x50-0x40($Htbl),$HK +	sub		
\$0x10,$len +	jz		.Ltail_avx + +	vpunpckhqdq	$Ij,$Ij,$T1 +	vpxor		$Xlo,$Zlo,$Zlo +	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo +	vpxor		$Ij,$T1,$T1 +	 vmovdqu	-0x40($inp),$Ii +	vpxor		$Xhi,$Zhi,$Zhi +	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi +	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4 +	 vpshufb	$bswap,$Ii,$Ij +	vpxor		$Xmi,$Zmi,$Zmi +	vpclmulqdq	\$0x00,$HK,$T1,$Xmi +	vpsrldq		\$8,$HK,$HK +	sub		\$0x10,$len +	jz		.Ltail_avx + +	vpunpckhqdq	$Ij,$Ij,$T1 +	vpxor		$Xlo,$Zlo,$Zlo +	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo +	vpxor		$Ij,$T1,$T1 +	 vmovdqu	-0x50($inp),$Ii +	vpxor		$Xhi,$Zhi,$Zhi +	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi +	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5 +	 vpshufb	$bswap,$Ii,$Ij +	vpxor		$Xmi,$Zmi,$Zmi +	vpclmulqdq	\$0x00,$HK,$T1,$Xmi +	vmovdqu		0x80-0x40($Htbl),$HK +	sub		\$0x10,$len +	jz		.Ltail_avx + +	vpunpckhqdq	$Ij,$Ij,$T1 +	vpxor		$Xlo,$Zlo,$Zlo +	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo +	vpxor		$Ij,$T1,$T1 +	 vmovdqu	-0x60($inp),$Ii +	vpxor		$Xhi,$Zhi,$Zhi +	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi +	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6 +	 vpshufb	$bswap,$Ii,$Ij +	vpxor		$Xmi,$Zmi,$Zmi +	vpclmulqdq	\$0x00,$HK,$T1,$Xmi +	vpsrldq		\$8,$HK,$HK +	sub		\$0x10,$len +	jz		.Ltail_avx + +	vpunpckhqdq	$Ij,$Ij,$T1 +	vpxor		$Xlo,$Zlo,$Zlo +	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo +	vpxor		$Ij,$T1,$T1 +	 vmovdqu	-0x70($inp),$Ii +	vpxor		$Xhi,$Zhi,$Zhi +	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi +	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7 +	 vpshufb	$bswap,$Ii,$Ij +	vpxor		$Xmi,$Zmi,$Zmi +	vpclmulqdq	\$0x00,$HK,$T1,$Xmi +	vmovq		0xb8-0x40($Htbl),$HK +	sub		\$0x10,$len +	jmp		.Ltail_avx + +.align	32 +.Ltail_avx: +	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi +.Ltail_no_xor_avx: +	vpunpckhqdq	$Ij,$Ij,$T1 +	vpxor		$Xlo,$Zlo,$Zlo +	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo +	vpxor		$Ij,$T1,$T1 +	vpxor		$Xhi,$Zhi,$Zhi +	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi +	vpxor		$Xmi,$Zmi,$Zmi +	vpclmulqdq	\$0x00,$HK,$T1,$Xmi + +	vmovdqu		(%r10),$Tred + +	vpxor		$Xlo,$Zlo,$Xi +	vpxor		$Xhi,$Zhi,$Xo +	vpxor		$Xmi,$Zmi,$Zmi + +	vpxor		$Xi, $Zmi,$Zmi		# 
aggregated Karatsuba post-processing +	vpxor		$Xo, $Zmi,$Zmi +	vpslldq		\$8, $Zmi,$T2 +	vpsrldq		\$8, $Zmi,$Zmi +	vpxor		$T2, $Xi, $Xi +	vpxor		$Zmi,$Xo, $Xo + +	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 1st phase +	vpalignr	\$8,$Xi,$Xi,$Xi +	vpxor		$T2,$Xi,$Xi + +	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 2nd phase +	vpalignr	\$8,$Xi,$Xi,$Xi +	vpxor		$Xo,$Xi,$Xi +	vpxor		$T2,$Xi,$Xi + +	cmp		\$0,$len +	jne		.Lshort_avx + +	vpshufb		$bswap,$Xi,$Xi +	vmovdqu		$Xi,($Xip) +	vzeroupper +___ +$code.=<<___ if ($win64); +	movaps	(%rsp),%xmm6 +	movaps	0x10(%rsp),%xmm7 +	movaps	0x20(%rsp),%xmm8 +	movaps	0x30(%rsp),%xmm9 +	movaps	0x40(%rsp),%xmm10 +	movaps	0x50(%rsp),%xmm11 +	movaps	0x60(%rsp),%xmm12 +	movaps	0x70(%rsp),%xmm13 +	movaps	0x80(%rsp),%xmm14 +	movaps	0x90(%rsp),%xmm15 +	lea	0xa8(%rsp),%rsp +.LSEH_end_gcm_ghash_avx: +___ +$code.=<<___; +	ret +.size	gcm_ghash_avx,.-gcm_ghash_avx +___ +} else { +$code.=<<___; +	jmp	.L_ghash_clmul +.size	gcm_ghash_avx,.-gcm_ghash_avx +___ +} + +$code.=<<___;  .align	64  .Lbswap_mask:  	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0  .L0x1c2_polynomial:  	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.L7_mask: +	.long	7,0,7,0 +.L7_mask_poly: +	.long	7,0,`0xE1<<1`,0  .align	64  .type	.Lrem_4bit,\@object  .Lrem_4bit: @@ -774,10 +1698,24 @@ se_handler:  	.rva	.LSEH_end_gcm_ghash_4bit  	.rva	.LSEH_info_gcm_ghash_4bit +	.rva	.LSEH_begin_gcm_init_clmul +	.rva	.LSEH_end_gcm_init_clmul +	.rva	.LSEH_info_gcm_init_clmul +  	.rva	.LSEH_begin_gcm_ghash_clmul  	.rva	.LSEH_end_gcm_ghash_clmul  	.rva	.LSEH_info_gcm_ghash_clmul +___ +$code.=<<___	if ($avx); +	.rva	.LSEH_begin_gcm_init_avx +	.rva	.LSEH_end_gcm_init_avx +	.rva	.LSEH_info_gcm_init_clmul +	.rva	.LSEH_begin_gcm_ghash_avx +	.rva	.LSEH_end_gcm_ghash_avx +	.rva	.LSEH_info_gcm_ghash_clmul +___ +$code.=<<___;  .section	.xdata  .align	8  .LSEH_info_gcm_gmult_4bit: @@ -788,14 +1726,23 @@ se_handler:  	.byte	9,0,0,0  	.rva	se_handler  	.rva	.Lghash_prologue,.Lghash_epilogue	# HandlerData +.LSEH_info_gcm_init_clmul: +	
.byte	0x01,0x08,0x03,0x00 +	.byte	0x08,0x68,0x00,0x00	#movaps	0x00(rsp),xmm6 +	.byte	0x04,0x22,0x00,0x00	#sub	rsp,0x18  .LSEH_info_gcm_ghash_clmul: -	.byte	0x01,0x1f,0x0b,0x00 -	.byte	0x1f,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10 -	.byte	0x19,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9 -	.byte	0x13,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8 -	.byte	0x0d,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7 -	.byte	0x08,0x68,0x00,0x00	#movaps (rsp),xmm6 -	.byte	0x04,0xa2,0x00,0x00	#sub	rsp,0x58 +	.byte	0x01,0x33,0x16,0x00 +	.byte	0x33,0xf8,0x09,0x00	#movaps 0x90(rsp),xmm15 +	.byte	0x2e,0xe8,0x08,0x00	#movaps 0x80(rsp),xmm14 +	.byte	0x29,0xd8,0x07,0x00	#movaps 0x70(rsp),xmm13 +	.byte	0x24,0xc8,0x06,0x00	#movaps 0x60(rsp),xmm12 +	.byte	0x1f,0xb8,0x05,0x00	#movaps 0x50(rsp),xmm11 +	.byte	0x1a,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10 +	.byte	0x15,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9 +	.byte	0x10,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8 +	.byte	0x0c,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7 +	.byte	0x08,0x68,0x00,0x00	#movaps 0x00(rsp),xmm6 +	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8  ___  } diff --git a/openssl/crypto/modes/asm/ghashp8-ppc.pl b/openssl/crypto/modes/asm/ghashp8-ppc.pl new file mode 100755 index 000000000..e76a58c34 --- /dev/null +++ b/openssl/crypto/modes/asm/ghashp8-ppc.pl @@ -0,0 +1,234 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# GHASH for for PowerISA v2.07. +# +# July 2014 +# +# Accurate performance measurements are problematic, because it's +# always virtualized setup with possibly throttled processor. +# Relative comparison is therefore more informative. 
This initial +# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x +# faster than "4-bit" integer-only compiler-generated 64-bit code. +# "Initial version" means that there is room for futher improvement. + +$flavour=shift; +$output =shift; + +if ($flavour =~ /64/) { +	$SIZE_T=8; +	$LRSAVE=2*$SIZE_T; +	$STU="stdu"; +	$POP="ld"; +	$PUSH="std"; +} elsif ($flavour =~ /32/) { +	$SIZE_T=4; +	$LRSAVE=$SIZE_T; +	$STU="stwu"; +	$POP="lwz"; +	$PUSH="stw"; +} else { die "nonsense $flavour"; } + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!"; + +my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block + +my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3)); +my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12)); +my $vrsave="r12"; + +$code=<<___; +.machine	"any" + +.text + +.globl	.gcm_init_p8 +.align	5 +.gcm_init_p8: +	lis		r0,0xfff0 +	li		r8,0x10 +	mfspr		$vrsave,256 +	li		r9,0x20 +	mtspr		256,r0 +	li		r10,0x30 +	lvx_u		$H,0,r4			# load H + +	vspltisb	$xC2,-16		# 0xf0 +	vspltisb	$t0,1			# one +	vaddubm		$xC2,$xC2,$xC2		# 0xe0 +	vxor		$zero,$zero,$zero +	vor		$xC2,$xC2,$t0		# 0xe1 +	vsldoi		$xC2,$xC2,$zero,15	# 0xe1... +	vsldoi		$t1,$zero,$t0,1		# ...1 +	vaddubm		$xC2,$xC2,$xC2		# 0xc2... +	vspltisb	$t2,7 +	vor		$xC2,$xC2,$t1		# 0xc2....01 +	vspltb		$t1,$H,0		# most significant byte +	vsl		$H,$H,$t0		# H<<=1 +	vsrab		$t1,$t1,$t2		# broadcast carry bit +	vand		$t1,$t1,$xC2 +	vxor		$H,$H,$t1		# twisted H + +	vsldoi		$H,$H,$H,8		# twist even more ... +	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0 +	vsldoi		$Hl,$zero,$H,8		# ... 
and split +	vsldoi		$Hh,$H,$zero,8 + +	stvx_u		$xC2,0,r3		# save pre-computed table +	stvx_u		$Hl,r8,r3 +	stvx_u		$H, r9,r3 +	stvx_u		$Hh,r10,r3 + +	mtspr		256,$vrsave +	blr +	.long		0 +	.byte		0,12,0x14,0,0,0,2,0 +	.long		0 +.size	.gcm_init_p8,.-.gcm_init_p8 + +.globl	.gcm_gmult_p8 +.align	5 +.gcm_gmult_p8: +	lis		r0,0xfff8 +	li		r8,0x10 +	mfspr		$vrsave,256 +	li		r9,0x20 +	mtspr		256,r0 +	li		r10,0x30 +	lvx_u		$IN,0,$Xip		# load Xi + +	lvx_u		$Hl,r8,$Htbl		# load pre-computed table +	 le?lvsl	$lemask,r0,r0 +	lvx_u		$H, r9,$Htbl +	 le?vspltisb	$t0,0x07 +	lvx_u		$Hh,r10,$Htbl +	 le?vxor	$lemask,$lemask,$t0 +	lvx_u		$xC2,0,$Htbl +	 le?vperm	$IN,$IN,$IN,$lemask +	vxor		$zero,$zero,$zero + +	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo +	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi +	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi + +	vpmsumd		$t2,$Xl,$xC2		# 1st phase + +	vsldoi		$t0,$Xm,$zero,8 +	vsldoi		$t1,$zero,$Xm,8 +	vxor		$Xl,$Xl,$t0 +	vxor		$Xh,$Xh,$t1 + +	vsldoi		$Xl,$Xl,$Xl,8 +	vxor		$Xl,$Xl,$t2 + +	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase +	vpmsumd		$Xl,$Xl,$xC2 +	vxor		$t1,$t1,$Xh +	vxor		$Xl,$Xl,$t1 + +	le?vperm	$Xl,$Xl,$Xl,$lemask +	stvx_u		$Xl,0,$Xip		# write out Xi + +	mtspr		256,$vrsave +	blr +	.long		0 +	.byte		0,12,0x14,0,0,0,2,0 +	.long		0 +.size	.gcm_gmult_p8,.-.gcm_gmult_p8 + +.globl	.gcm_ghash_p8 +.align	5 +.gcm_ghash_p8: +	lis		r0,0xfff8 +	li		r8,0x10 +	mfspr		$vrsave,256 +	li		r9,0x20 +	mtspr		256,r0 +	li		r10,0x30 +	lvx_u		$Xl,0,$Xip		# load Xi + +	lvx_u		$Hl,r8,$Htbl		# load pre-computed table +	 le?lvsl	$lemask,r0,r0 +	lvx_u		$H, r9,$Htbl +	 le?vspltisb	$t0,0x07 +	lvx_u		$Hh,r10,$Htbl +	 le?vxor	$lemask,$lemask,$t0 +	lvx_u		$xC2,0,$Htbl +	 le?vperm	$Xl,$Xl,$Xl,$lemask +	vxor		$zero,$zero,$zero + +	lvx_u		$IN,0,$inp +	addi		$inp,$inp,16 +	subi		$len,$len,16 +	 le?vperm	$IN,$IN,$IN,$lemask +	vxor		$IN,$IN,$Xl +	b		Loop + +.align	5 +Loop: +	 subic		$len,$len,16 +	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo +	 subfe.		
r0,r0,r0		# borrow?-1:0 +	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi +	 and		r0,r0,$len +	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi +	 add		$inp,$inp,r0 + +	vpmsumd		$t2,$Xl,$xC2		# 1st phase + +	vsldoi		$t0,$Xm,$zero,8 +	vsldoi		$t1,$zero,$Xm,8 +	vxor		$Xl,$Xl,$t0 +	vxor		$Xh,$Xh,$t1 + +	vsldoi		$Xl,$Xl,$Xl,8 +	vxor		$Xl,$Xl,$t2 +	 lvx_u		$IN,0,$inp +	 addi		$inp,$inp,16 + +	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase +	vpmsumd		$Xl,$Xl,$xC2 +	 le?vperm	$IN,$IN,$IN,$lemask +	vxor		$t1,$t1,$Xh +	vxor		$IN,$IN,$t1 +	vxor		$IN,$IN,$Xl +	beq		Loop			# did $len-=16 borrow? + +	vxor		$Xl,$Xl,$t1 +	le?vperm	$Xl,$Xl,$Xl,$lemask +	stvx_u		$Xl,0,$Xip		# write out Xi + +	mtspr		256,$vrsave +	blr +	.long		0 +	.byte		0,12,0x14,0,0,0,4,0 +	.long		0 +.size	.gcm_ghash_p8,.-.gcm_ghash_p8 + +.asciz  "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>" +.align  2 +___ + +foreach (split("\n",$code)) { +	if ($flavour =~ /le$/o) {	# little-endian +	    s/le\?//o		or +	    s/be\?/#be#/o; +	} else { +	    s/le\?/#le#/o	or +	    s/be\?//o; +	} +	print $_,"\n"; +} + +close STDOUT; # enforce flush diff --git a/openssl/crypto/modes/asm/ghashv8-armx.pl b/openssl/crypto/modes/asm/ghashv8-armx.pl new file mode 100755 index 000000000..54a1ac4db --- /dev/null +++ b/openssl/crypto/modes/asm/ghashv8-armx.pl @@ -0,0 +1,241 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication. +# +# June 2014 +# +# Initial version was developed in tight cooperation with Ard +# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from +# other assembly modules. 
Just like aesv8-armx.pl this module +# supports both AArch32 and AArch64 execution modes. +# +# Current performance in cycles per processed byte: +# +#		PMULL[2]	32-bit NEON(*) +# Apple A7	1.76		5.62 +# Cortex-A53	1.45		8.39 +# Cortex-A57	2.22		7.61 +# +# (*)	presented for reference/comparison purposes; + +$flavour = shift; +open STDOUT,">".shift; + +$Xi="x0";	# argument block +$Htbl="x1"; +$inp="x2"; +$len="x3"; + +$inc="x12"; + +{ +my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); +my ($t0,$t1,$t2,$t3,$H,$Hhl)=map("q$_",(8..14)); + +$code=<<___; +#include "arm_arch.h" + +.text +___ +$code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/); +$code.=".fpu	neon\n.code	32\n"	if ($flavour !~ /64/); + +$code.=<<___; +.global	gcm_init_v8 +.type	gcm_init_v8,%function +.align	4 +gcm_init_v8: +	vld1.64		{$t1},[x1]		@ load H +	vmov.i8		$t0,#0xe1 +	vext.8		$IN,$t1,$t1,#8 +	vshl.i64	$t0,$t0,#57 +	vshr.u64	$t2,$t0,#63 +	vext.8		$t0,$t2,$t0,#8		@ t0=0xc2....01 +	vdup.32		$t1,${t1}[1] +	vshr.u64	$t3,$IN,#63 +	vshr.s32	$t1,$t1,#31		@ broadcast carry bit +	vand		$t3,$t3,$t0 +	vshl.i64	$IN,$IN,#1 +	vext.8		$t3,$t3,$t3,#8 +	vand		$t0,$t0,$t1 +	vorr		$IN,$IN,$t3		@ H<<<=1 +	veor		$IN,$IN,$t0		@ twisted H +	vst1.64		{$IN},[x0] + +	ret +.size	gcm_init_v8,.-gcm_init_v8 + +.global	gcm_gmult_v8 +.type	gcm_gmult_v8,%function +.align	4 +gcm_gmult_v8: +	vld1.64		{$t1},[$Xi]		@ load Xi +	vmov.i8		$t3,#0xe1 +	vld1.64		{$H},[$Htbl]		@ load twisted H +	vshl.u64	$t3,$t3,#57 +#ifndef __ARMEB__ +	vrev64.8	$t1,$t1 +#endif +	vext.8		$Hhl,$H,$H,#8 +	mov		$len,#0 +	vext.8		$IN,$t1,$t1,#8 +	mov		$inc,#0 +	veor		$Hhl,$Hhl,$H		@ Karatsuba pre-processing +	mov		$inp,$Xi +	b		.Lgmult_v8 +.size	gcm_gmult_v8,.-gcm_gmult_v8 + +.global	gcm_ghash_v8 +.type	gcm_ghash_v8,%function +.align	4 +gcm_ghash_v8: +	vld1.64		{$Xl},[$Xi]		@ load [rotated] Xi +	subs		$len,$len,#16 +	vmov.i8		$t3,#0xe1 +	mov		$inc,#16 +	vld1.64		{$H},[$Htbl]		@ load twisted H +	cclr		$inc,eq +	vext.8		$Xl,$Xl,$Xl,#8 +	vshl.u64	$t3,$t3,#57 +	vld1.64		
{$t1},[$inp],$inc	@ load [rotated] inp +	vext.8		$Hhl,$H,$H,#8 +#ifndef __ARMEB__ +	vrev64.8	$Xl,$Xl +	vrev64.8	$t1,$t1 +#endif +	veor		$Hhl,$Hhl,$H		@ Karatsuba pre-processing +	vext.8		$IN,$t1,$t1,#8 +	b		.Loop_v8 + +.align	4 +.Loop_v8: +	vext.8		$t2,$Xl,$Xl,#8 +	veor		$IN,$IN,$Xl		@ inp^=Xi +	veor		$t1,$t1,$t2		@ $t1 is rotated inp^Xi + +.Lgmult_v8: +	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo +	veor		$t1,$t1,$IN		@ Karatsuba pre-processing +	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi +	subs		$len,$len,#16 +	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi) +	cclr		$inc,eq + +	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing +	veor		$t2,$Xl,$Xh +	veor		$Xm,$Xm,$t1 +	 vld1.64	{$t1},[$inp],$inc	@ load [rotated] inp +	veor		$Xm,$Xm,$t2 +	vpmull.p64	$t2,$Xl,$t3		@ 1st phase + +	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result +	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl +#ifndef __ARMEB__ +	 vrev64.8	$t1,$t1 +#endif +	veor		$Xl,$Xm,$t2 +	 vext.8		$IN,$t1,$t1,#8 + +	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase +	vpmull.p64	$Xl,$Xl,$t3 +	veor		$t2,$t2,$Xh +	veor		$Xl,$Xl,$t2 +	b.hs		.Loop_v8 + +#ifndef __ARMEB__ +	vrev64.8	$Xl,$Xl +#endif +	vext.8		$Xl,$Xl,$Xl,#8 +	vst1.64		{$Xl},[$Xi]		@ write out Xi + +	ret +.size	gcm_ghash_v8,.-gcm_ghash_v8 +___ +} +$code.=<<___; +.asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" +.align  2 +___ + +if ($flavour =~ /64/) {			######## 64-bit code +    sub unvmov { +	my $arg=shift; + +	$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o && +	sprintf	"ins	v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1; +    } +    foreach(split("\n",$code)) { +	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or +	s/vmov\.i8/movi/o		or	# fix up legacy mnemonics +	s/vmov\s+(.*)/unvmov($1)/geo	or +	s/vext\.8/ext/o			or +	s/vshr\.s/sshr\.s/o		or +	s/vshr/ushr/o			or +	s/^(\s+)v/$1/o			or	# strip off v prefix +	s/\bbx\s+lr\b/ret/o; + +	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers +	s/@\s/\/\//o;				# old->new style 
commentary + +	# fix up remainig legacy suffixes +	s/\.[ui]?8(\s)/$1/o; +	s/\.[uis]?32//o and s/\.16b/\.4s/go; +	m/\.p64/o and s/\.16b/\.1q/o;		# 1st pmull argument +	m/l\.p64/o and s/\.16b/\.1d/go;		# 2nd and 3rd pmull arguments +	s/\.[uisp]?64//o and s/\.16b/\.2d/go; +	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; + +	print $_,"\n"; +    } +} else {				######## 32-bit code +    sub unvdup32 { +	my $arg=shift; + +	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && +	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; +    } +    sub unvpmullp64 { +	my ($mnemonic,$arg)=@_; + +	if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) { +	    my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19) +				 |(($2&7)<<17)|(($2&8)<<4) +				 |(($3&7)<<1) |(($3&8)<<2); +	    $word |= 0x00010001	 if ($mnemonic =~ "2"); +	    # since ARMv7 instructions are always encoded little-endian. +	    # correct solution is to use .inst directive, but older +	    # assemblers don't implement it:-( +	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", +			$word&0xff,($word>>8)&0xff, +			($word>>16)&0xff,($word>>24)&0xff, +			$mnemonic,$arg; +	} +    } + +    foreach(split("\n",$code)) { +	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers +	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers +        s/\/\/\s?/@ /o;				# new->old style commentary + +	# fix up remainig new-style suffixes +	s/\],#[0-9]+/]!/o; + +	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o			or +	s/vdup\.32\s+(.*)/unvdup32($1)/geo				or +	s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo		or +	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or +	s/^(\s+)b\./$1b/o						or +	s/^(\s+)ret/$1bx\tlr/o; + +        print $_,"\n"; +    } +} + +close STDOUT; # enforce flush diff --git a/openssl/crypto/modes/cbc128.c b/openssl/crypto/modes/cbc128.c index 0e54f7547..c13caea53 100644 --- a/openssl/crypto/modes/cbc128.c +++ b/openssl/crypto/modes/cbc128.c @@ -6,7 +6,7 @@   * are met:   *   * 1. 
Redistributions of source code must retain the above copyright - *    notice, this list of conditions and the following disclaimer.  + *    notice, this list of conditions and the following disclaimer.   *   * 2. Redistributions in binary form must reproduce the above copyright   *    notice, this list of conditions and the following disclaimer in @@ -59,147 +59,149 @@  #endif  #include <assert.h> -#ifndef STRICT_ALIGNMENT -#  define STRICT_ALIGNMENT 0 +#if !defined(STRICT_ALIGNMENT) && !defined(PEDANTIC) +# define STRICT_ALIGNMENT 0  #endif  void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], block128_f block) +                           size_t len, const void *key, +                           unsigned char ivec[16], block128_f block)  { -	size_t n; -	const unsigned char *iv = ivec; +    size_t n; +    const unsigned char *iv = ivec; -	assert(in && out && key && ivec); +    assert(in && out && key && ivec);  #if !defined(OPENSSL_SMALL_FOOTPRINT) -	if (STRICT_ALIGNMENT && -	    ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0) { -		while (len>=16) { -			for(n=0; n<16; ++n) -				out[n] = in[n] ^ iv[n]; -			(*block)(out, out, key); -			iv = out; -			len -= 16; -			in  += 16; -			out += 16; -		} -	} else { -		while (len>=16) { -			for(n=0; n<16; n+=sizeof(size_t)) -				*(size_t*)(out+n) = -				*(size_t*)(in+n) ^ *(size_t*)(iv+n); -			(*block)(out, out, key); -			iv = out; -			len -= 16; -			in  += 16; -			out += 16; -		} -	} +    if (STRICT_ALIGNMENT && +        ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) { +        while (len >= 16) { +            for (n = 0; n < 16; ++n) +                out[n] = in[n] ^ iv[n]; +            (*block) (out, out, key); +            iv = out; +            len -= 16; +            in += 16; +            out += 16; +        } +    } else { +        while (len >= 16) { +            for (n = 0; n < 16; n += sizeof(size_t)) +         
       *(size_t *)(out + n) = +                    *(size_t *)(in + n) ^ *(size_t *)(iv + n); +            (*block) (out, out, key); +            iv = out; +            len -= 16; +            in += 16; +            out += 16; +        } +    }  #endif -	while (len) { -		for(n=0; n<16 && n<len; ++n) -			out[n] = in[n] ^ iv[n]; -		for(; n<16; ++n) -			out[n] = iv[n]; -		(*block)(out, out, key); -		iv = out; -		if (len<=16) break; -		len -= 16; -		in  += 16; -		out += 16; -	} -	memcpy(ivec,iv,16); +    while (len) { +        for (n = 0; n < 16 && n < len; ++n) +            out[n] = in[n] ^ iv[n]; +        for (; n < 16; ++n) +            out[n] = iv[n]; +        (*block) (out, out, key); +        iv = out; +        if (len <= 16) +            break; +        len -= 16; +        in += 16; +        out += 16; +    } +    memcpy(ivec, iv, 16);  }  void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], block128_f block) +                           size_t len, const void *key, +                           unsigned char ivec[16], block128_f block)  { -	size_t n; -	union { size_t t[16/sizeof(size_t)]; unsigned char c[16]; } tmp; +    size_t n; +    union { +        size_t t[16 / sizeof(size_t)]; +        unsigned char c[16]; +    } tmp; -	assert(in && out && key && ivec); +    assert(in && out && key && ivec);  #if !defined(OPENSSL_SMALL_FOOTPRINT) -	if (in != out) { -		const unsigned char *iv = ivec; +    if (in != out) { +        const unsigned char *iv = ivec; -		if (STRICT_ALIGNMENT && -		    ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0) { -			while (len>=16) { -				(*block)(in, out, key); -				for(n=0; n<16; ++n) -					out[n] ^= iv[n]; -				iv = in; -				len -= 16; -				in  += 16; -				out += 16; -			} -		} -		else  if (16%sizeof(size_t) == 0) { /* always true */ -			while (len>=16) { -				size_t *out_t=(size_t *)out, *iv_t=(size_t *)iv; +        if (STRICT_ALIGNMENT && +          
  ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) { +            while (len >= 16) { +                (*block) (in, out, key); +                for (n = 0; n < 16; ++n) +                    out[n] ^= iv[n]; +                iv = in; +                len -= 16; +                in += 16; +                out += 16; +            } +        } else if (16 % sizeof(size_t) == 0) { /* always true */ +            while (len >= 16) { +                size_t *out_t = (size_t *)out, *iv_t = (size_t *)iv; -				(*block)(in, out, key); -				for(n=0; n<16/sizeof(size_t); n++) -					out_t[n] ^= iv_t[n]; -				iv = in; -				len -= 16; -				in  += 16; -				out += 16; -			} -		} -		memcpy(ivec,iv,16); -	} else { -		if (STRICT_ALIGNMENT && -		    ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0) { -			unsigned char c; -			while (len>=16) { -				(*block)(in, tmp.c, key); -				for(n=0; n<16; ++n) { -					c = in[n]; -					out[n] = tmp.c[n] ^ ivec[n]; -					ivec[n] = c; -				} -				len -= 16; -				in  += 16; -				out += 16; -			} -		} -		else if (16%sizeof(size_t) == 0) { /* always true */ -			while (len>=16) { -				size_t c, *out_t=(size_t *)out, *ivec_t=(size_t *)ivec; -				const size_t *in_t=(const size_t *)in; +                (*block) (in, out, key); +                for (n = 0; n < 16 / sizeof(size_t); n++) +                    out_t[n] ^= iv_t[n]; +                iv = in; +                len -= 16; +                in += 16; +                out += 16; +            } +        } +        memcpy(ivec, iv, 16); +    } else { +        if (STRICT_ALIGNMENT && +            ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) { +            unsigned char c; +            while (len >= 16) { +                (*block) (in, tmp.c, key); +                for (n = 0; n < 16; ++n) { +                    c = in[n]; +                    out[n] = tmp.c[n] ^ ivec[n]; +                    ivec[n] = c; +                } +                len -= 16; +             
   in += 16; +                out += 16; +            } +        } else if (16 % sizeof(size_t) == 0) { /* always true */ +            while (len >= 16) { +                size_t c, *out_t = (size_t *)out, *ivec_t = (size_t *)ivec; +                const size_t *in_t = (const size_t *)in; -				(*block)(in, tmp.c, key); -				for(n=0; n<16/sizeof(size_t); n++) { -					c = in_t[n]; -					out_t[n] = tmp.t[n] ^ ivec_t[n]; -					ivec_t[n] = c; -				} -				len -= 16; -				in  += 16; -				out += 16; -			} -		} -	} +                (*block) (in, tmp.c, key); +                for (n = 0; n < 16 / sizeof(size_t); n++) { +                    c = in_t[n]; +                    out_t[n] = tmp.t[n] ^ ivec_t[n]; +                    ivec_t[n] = c; +                } +                len -= 16; +                in += 16; +                out += 16; +            } +        } +    }  #endif -	while (len) { -		unsigned char c; -		(*block)(in, tmp.c, key); -		for(n=0; n<16 && n<len; ++n) { -			c = in[n]; -			out[n] = tmp.c[n] ^ ivec[n]; -			ivec[n] = c; -		} -		if (len<=16) { -			for (; n<16; ++n) -				ivec[n] = in[n]; -			break; -		} -		len -= 16; -		in  += 16; -		out += 16; -	} +    while (len) { +        unsigned char c; +        (*block) (in, tmp.c, key); +        for (n = 0; n < 16 && n < len; ++n) { +            c = in[n]; +            out[n] = tmp.c[n] ^ ivec[n]; +            ivec[n] = c; +        } +        if (len <= 16) { +            for (; n < 16; ++n) +                ivec[n] = in[n]; +            break; +        } +        len -= 16; +        in += 16; +        out += 16; +    }  } diff --git a/openssl/crypto/modes/ccm128.c b/openssl/crypto/modes/ccm128.c index 3ce11d0d9..c1ded0f91 100644 --- a/openssl/crypto/modes/ccm128.c +++ b/openssl/crypto/modes/ccm128.c @@ -6,7 +6,7 @@   * are met:   *   * 1. Redistributions of source code must retain the above copyright - *    notice, this list of conditions and the following disclaimer.  
+ *    notice, this list of conditions and the following disclaimer.   *   * 2. Redistributions in binary form must reproduce the above copyright   *    notice, this list of conditions and the following disclaimer in @@ -58,384 +58,422 @@  #endif  #include <assert.h> -/* First you setup M and L parameters and pass the key schedule. - * This is called once per session setup... */ +/* + * First you setup M and L parameters and pass the key schedule. This is + * called once per session setup... + */  void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx, -	unsigned int M,unsigned int L,void *key,block128_f block) +                        unsigned int M, unsigned int L, void *key, +                        block128_f block)  { -	memset(ctx->nonce.c,0,sizeof(ctx->nonce.c)); -	ctx->nonce.c[0] = ((u8)(L-1)&7) | (u8)(((M-2)/2)&7)<<3; -	ctx->blocks = 0; -	ctx->block = block; -	ctx->key = key; +    memset(ctx->nonce.c, 0, sizeof(ctx->nonce.c)); +    ctx->nonce.c[0] = ((u8)(L - 1) & 7) | (u8)(((M - 2) / 2) & 7) << 3; +    ctx->blocks = 0; +    ctx->block = block; +    ctx->key = key;  }  /* !!! Following interfaces are to be called *once* per packet !!! 
*/  /* Then you setup per-message nonce and pass the length of the message */  int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx, -	const unsigned char *nonce,size_t nlen,size_t mlen) +                        const unsigned char *nonce, size_t nlen, size_t mlen)  { -	unsigned int L = ctx->nonce.c[0]&7;	/* the L parameter */ +    unsigned int L = ctx->nonce.c[0] & 7; /* the L parameter */ -	if (nlen<(14-L)) return -1;		/* nonce is too short */ +    if (nlen < (14 - L)) +        return -1;              /* nonce is too short */ -	if (sizeof(mlen)==8 && L>=3) { -		ctx->nonce.c[8]  = (u8)(mlen>>(56%(sizeof(mlen)*8))); -		ctx->nonce.c[9]  = (u8)(mlen>>(48%(sizeof(mlen)*8))); -		ctx->nonce.c[10] = (u8)(mlen>>(40%(sizeof(mlen)*8))); -		ctx->nonce.c[11] = (u8)(mlen>>(32%(sizeof(mlen)*8))); -	} -	else -		ctx->nonce.u[1] = 0; +    if (sizeof(mlen) == 8 && L >= 3) { +        ctx->nonce.c[8] = (u8)(mlen >> (56 % (sizeof(mlen) * 8))); +        ctx->nonce.c[9] = (u8)(mlen >> (48 % (sizeof(mlen) * 8))); +        ctx->nonce.c[10] = (u8)(mlen >> (40 % (sizeof(mlen) * 8))); +        ctx->nonce.c[11] = (u8)(mlen >> (32 % (sizeof(mlen) * 8))); +    } else +        ctx->nonce.u[1] = 0; -	ctx->nonce.c[12] = (u8)(mlen>>24); -	ctx->nonce.c[13] = (u8)(mlen>>16); -	ctx->nonce.c[14] = (u8)(mlen>>8); -	ctx->nonce.c[15] = (u8)mlen; +    ctx->nonce.c[12] = (u8)(mlen >> 24); +    ctx->nonce.c[13] = (u8)(mlen >> 16); +    ctx->nonce.c[14] = (u8)(mlen >> 8); +    ctx->nonce.c[15] = (u8)mlen; -	ctx->nonce.c[0] &= ~0x40;	/* clear Adata flag */ -	memcpy(&ctx->nonce.c[1],nonce,14-L); +    ctx->nonce.c[0] &= ~0x40;   /* clear Adata flag */ +    memcpy(&ctx->nonce.c[1], nonce, 14 - L); -	return 0; +    return 0;  }  /* Then you pass additional authentication data, this is optional */  void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx, -	const unsigned char *aad,size_t alen) -{	unsigned int i; -	block128_f block = ctx->block; - -	if (alen==0) return; - -	ctx->nonce.c[0] |= 0x40;	/* set Adata flag */ -	
(*block)(ctx->nonce.c,ctx->cmac.c,ctx->key), -	ctx->blocks++; - -	if (alen<(0x10000-0x100)) { -		ctx->cmac.c[0] ^= (u8)(alen>>8); -		ctx->cmac.c[1] ^= (u8)alen; -		i=2; -	} -	else if (sizeof(alen)==8 && alen>=(size_t)1<<(32%(sizeof(alen)*8))) { -		ctx->cmac.c[0] ^= 0xFF; -		ctx->cmac.c[1] ^= 0xFF; -		ctx->cmac.c[2] ^= (u8)(alen>>(56%(sizeof(alen)*8))); -		ctx->cmac.c[3] ^= (u8)(alen>>(48%(sizeof(alen)*8))); -		ctx->cmac.c[4] ^= (u8)(alen>>(40%(sizeof(alen)*8))); -		ctx->cmac.c[5] ^= (u8)(alen>>(32%(sizeof(alen)*8))); -		ctx->cmac.c[6] ^= (u8)(alen>>24); -		ctx->cmac.c[7] ^= (u8)(alen>>16); -		ctx->cmac.c[8] ^= (u8)(alen>>8); -		ctx->cmac.c[9] ^= (u8)alen; -		i=10; -	} -	else { -		ctx->cmac.c[0] ^= 0xFF; -		ctx->cmac.c[1] ^= 0xFE; -		ctx->cmac.c[2] ^= (u8)(alen>>24); -		ctx->cmac.c[3] ^= (u8)(alen>>16); -		ctx->cmac.c[4] ^= (u8)(alen>>8); -		ctx->cmac.c[5] ^= (u8)alen; -		i=6; -	} - -	do { -		for(;i<16 && alen;++i,++aad,--alen) -			ctx->cmac.c[i] ^= *aad; -		(*block)(ctx->cmac.c,ctx->cmac.c,ctx->key), -		ctx->blocks++; -		i=0; -	} while (alen); +                       const unsigned char *aad, size_t alen) +{ +    unsigned int i; +    block128_f block = ctx->block; + +    if (alen == 0) +        return; + +    ctx->nonce.c[0] |= 0x40;    /* set Adata flag */ +    (*block) (ctx->nonce.c, ctx->cmac.c, ctx->key), ctx->blocks++; + +    if (alen < (0x10000 - 0x100)) { +        ctx->cmac.c[0] ^= (u8)(alen >> 8); +        ctx->cmac.c[1] ^= (u8)alen; +        i = 2; +    } else if (sizeof(alen) == 8 +               && alen >= (size_t)1 << (32 % (sizeof(alen) * 8))) { +        ctx->cmac.c[0] ^= 0xFF; +        ctx->cmac.c[1] ^= 0xFF; +        ctx->cmac.c[2] ^= (u8)(alen >> (56 % (sizeof(alen) * 8))); +        ctx->cmac.c[3] ^= (u8)(alen >> (48 % (sizeof(alen) * 8))); +        ctx->cmac.c[4] ^= (u8)(alen >> (40 % (sizeof(alen) * 8))); +        ctx->cmac.c[5] ^= (u8)(alen >> (32 % (sizeof(alen) * 8))); +        ctx->cmac.c[6] ^= (u8)(alen >> 24); +        ctx->cmac.c[7] ^= 
(u8)(alen >> 16); +        ctx->cmac.c[8] ^= (u8)(alen >> 8); +        ctx->cmac.c[9] ^= (u8)alen; +        i = 10; +    } else { +        ctx->cmac.c[0] ^= 0xFF; +        ctx->cmac.c[1] ^= 0xFE; +        ctx->cmac.c[2] ^= (u8)(alen >> 24); +        ctx->cmac.c[3] ^= (u8)(alen >> 16); +        ctx->cmac.c[4] ^= (u8)(alen >> 8); +        ctx->cmac.c[5] ^= (u8)alen; +        i = 6; +    } + +    do { +        for (; i < 16 && alen; ++i, ++aad, --alen) +            ctx->cmac.c[i] ^= *aad; +        (*block) (ctx->cmac.c, ctx->cmac.c, ctx->key), ctx->blocks++; +        i = 0; +    } while (alen);  }  /* Finally you encrypt or decrypt the message */ -/* counter part of nonce may not be larger than L*8 bits, - * L is not larger than 8, therefore 64-bit counter... */ -static void ctr64_inc(unsigned char *counter) { -	unsigned int n=8; -	u8  c; - -	counter += 8; -	do { -		--n; -		c = counter[n]; -		++c; -		counter[n] = c; -		if (c) return; -	} while (n); +/* + * counter part of nonce may not be larger than L*8 bits, L is not larger + * than 8, therefore 64-bit counter... 
+ */ +static void ctr64_inc(unsigned char *counter) +{ +    unsigned int n = 8; +    u8 c; + +    counter += 8; +    do { +        --n; +        c = counter[n]; +        ++c; +        counter[n] = c; +        if (c) +            return; +    } while (n);  }  int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx, -	const unsigned char *inp, unsigned char *out, -	size_t len) +                          const unsigned char *inp, unsigned char *out, +                          size_t len)  { -	size_t		n; -	unsigned int	i,L; -	unsigned char	flags0	= ctx->nonce.c[0]; -	block128_f	block	= ctx->block; -	void *		key	= ctx->key; -	union { u64 u[2]; u8 c[16]; } scratch; - -	if (!(flags0&0x40)) -		(*block)(ctx->nonce.c,ctx->cmac.c,key), -		ctx->blocks++; - -	ctx->nonce.c[0] = L = flags0&7; -	for (n=0,i=15-L;i<15;++i) { -		n |= ctx->nonce.c[i]; -		ctx->nonce.c[i]=0; -		n <<= 8; -	} -	n |= ctx->nonce.c[15];	/* reconstructed length */ -	ctx->nonce.c[15]=1; - -	if (n!=len) return -1;	/* length mismatch */ - -	ctx->blocks += ((len+15)>>3)|1; -	if (ctx->blocks > (U64(1)<<61))	return -2; /* too much data */ - -	while (len>=16) { +    size_t n; +    unsigned int i, L; +    unsigned char flags0 = ctx->nonce.c[0]; +    block128_f block = ctx->block; +    void *key = ctx->key; +    union { +        u64 u[2]; +        u8 c[16]; +    } scratch; + +    if (!(flags0 & 0x40)) +        (*block) (ctx->nonce.c, ctx->cmac.c, key), ctx->blocks++; + +    ctx->nonce.c[0] = L = flags0 & 7; +    for (n = 0, i = 15 - L; i < 15; ++i) { +        n |= ctx->nonce.c[i]; +        ctx->nonce.c[i] = 0; +        n <<= 8; +    } +    n |= ctx->nonce.c[15];      /* reconstructed length */ +    ctx->nonce.c[15] = 1; + +    if (n != len) +        return -1;              /* length mismatch */ + +    ctx->blocks += ((len + 15) >> 3) | 1; +    if (ctx->blocks > (U64(1) << 61)) +        return -2;              /* too much data */ + +    while (len >= 16) {  #if defined(STRICT_ALIGNMENT) -		union { u64 u[2]; u8 c[16]; } temp; - -	
	memcpy (temp.c,inp,16); -		ctx->cmac.u[0] ^= temp.u[0]; -		ctx->cmac.u[1] ^= temp.u[1]; +        union { +            u64 u[2]; +            u8 c[16]; +        } temp; + +        memcpy(temp.c, inp, 16); +        ctx->cmac.u[0] ^= temp.u[0]; +        ctx->cmac.u[1] ^= temp.u[1];  #else -		ctx->cmac.u[0] ^= ((u64*)inp)[0]; -		ctx->cmac.u[1] ^= ((u64*)inp)[1]; +        ctx->cmac.u[0] ^= ((u64 *)inp)[0]; +        ctx->cmac.u[1] ^= ((u64 *)inp)[1];  #endif -		(*block)(ctx->cmac.c,ctx->cmac.c,key); -		(*block)(ctx->nonce.c,scratch.c,key); -		ctr64_inc(ctx->nonce.c); +        (*block) (ctx->cmac.c, ctx->cmac.c, key); +        (*block) (ctx->nonce.c, scratch.c, key); +        ctr64_inc(ctx->nonce.c);  #if defined(STRICT_ALIGNMENT) -		temp.u[0] ^= scratch.u[0]; -		temp.u[1] ^= scratch.u[1]; -		memcpy(out,temp.c,16); +        temp.u[0] ^= scratch.u[0]; +        temp.u[1] ^= scratch.u[1]; +        memcpy(out, temp.c, 16);  #else -		((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0]; -		((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1]; +        ((u64 *)out)[0] = scratch.u[0] ^ ((u64 *)inp)[0]; +        ((u64 *)out)[1] = scratch.u[1] ^ ((u64 *)inp)[1];  #endif -		inp += 16; -		out += 16; -		len -= 16; -	} - -	if (len) { -		for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i]; -		(*block)(ctx->cmac.c,ctx->cmac.c,key); -		(*block)(ctx->nonce.c,scratch.c,key); -		for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i]; -	} - -	for (i=15-L;i<16;++i) -		ctx->nonce.c[i]=0; - -	(*block)(ctx->nonce.c,scratch.c,key); -	ctx->cmac.u[0] ^= scratch.u[0]; -	ctx->cmac.u[1] ^= scratch.u[1]; - -	ctx->nonce.c[0] = flags0; - -	return 0; +        inp += 16; +        out += 16; +        len -= 16; +    } + +    if (len) { +        for (i = 0; i < len; ++i) +            ctx->cmac.c[i] ^= inp[i]; +        (*block) (ctx->cmac.c, ctx->cmac.c, key); +        (*block) (ctx->nonce.c, scratch.c, key); +        for (i = 0; i < len; ++i) +            out[i] = scratch.c[i] ^ inp[i]; +    } + +    for (i = 15 - L; i < 16; ++i) + 
       ctx->nonce.c[i] = 0; + +    (*block) (ctx->nonce.c, scratch.c, key); +    ctx->cmac.u[0] ^= scratch.u[0]; +    ctx->cmac.u[1] ^= scratch.u[1]; + +    ctx->nonce.c[0] = flags0; + +    return 0;  }  int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx, -	const unsigned char *inp, unsigned char *out, -	size_t len) +                          const unsigned char *inp, unsigned char *out, +                          size_t len)  { -	size_t		n; -	unsigned int	i,L; -	unsigned char	flags0	= ctx->nonce.c[0]; -	block128_f	block	= ctx->block; -	void *		key	= ctx->key; -	union { u64 u[2]; u8 c[16]; } scratch; - -	if (!(flags0&0x40)) -		(*block)(ctx->nonce.c,ctx->cmac.c,key); - -	ctx->nonce.c[0] = L = flags0&7; -	for (n=0,i=15-L;i<15;++i) { -		n |= ctx->nonce.c[i]; -		ctx->nonce.c[i]=0; -		n <<= 8; -	} -	n |= ctx->nonce.c[15];	/* reconstructed length */ -	ctx->nonce.c[15]=1; - -	if (n!=len) return -1; - -	while (len>=16) { +    size_t n; +    unsigned int i, L; +    unsigned char flags0 = ctx->nonce.c[0]; +    block128_f block = ctx->block; +    void *key = ctx->key; +    union { +        u64 u[2]; +        u8 c[16]; +    } scratch; + +    if (!(flags0 & 0x40)) +        (*block) (ctx->nonce.c, ctx->cmac.c, key); + +    ctx->nonce.c[0] = L = flags0 & 7; +    for (n = 0, i = 15 - L; i < 15; ++i) { +        n |= ctx->nonce.c[i]; +        ctx->nonce.c[i] = 0; +        n <<= 8; +    } +    n |= ctx->nonce.c[15];      /* reconstructed length */ +    ctx->nonce.c[15] = 1; + +    if (n != len) +        return -1; + +    while (len >= 16) {  #if defined(STRICT_ALIGNMENT) -		union { u64 u[2]; u8 c[16]; } temp; +        union { +            u64 u[2]; +            u8 c[16]; +        } temp;  #endif -		(*block)(ctx->nonce.c,scratch.c,key); -		ctr64_inc(ctx->nonce.c); +        (*block) (ctx->nonce.c, scratch.c, key); +        ctr64_inc(ctx->nonce.c);  #if defined(STRICT_ALIGNMENT) -		memcpy (temp.c,inp,16); -		ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]); -		ctx->cmac.u[1] ^= (scratch.u[1] ^= 
temp.u[1]); -		memcpy (out,scratch.c,16); +        memcpy(temp.c, inp, 16); +        ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]); +        ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]); +        memcpy(out, scratch.c, 16);  #else -		ctx->cmac.u[0] ^= (((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0]); -		ctx->cmac.u[1] ^= (((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1]); +        ctx->cmac.u[0] ^= (((u64 *)out)[0] = scratch.u[0] ^ ((u64 *)inp)[0]); +        ctx->cmac.u[1] ^= (((u64 *)out)[1] = scratch.u[1] ^ ((u64 *)inp)[1]);  #endif -		(*block)(ctx->cmac.c,ctx->cmac.c,key); +        (*block) (ctx->cmac.c, ctx->cmac.c, key); -		inp += 16; -		out += 16; -		len -= 16; -	} +        inp += 16; +        out += 16; +        len -= 16; +    } -	if (len) { -		(*block)(ctx->nonce.c,scratch.c,key); -		for (i=0; i<len; ++i) -			ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]); -		(*block)(ctx->cmac.c,ctx->cmac.c,key); -	} +    if (len) { +        (*block) (ctx->nonce.c, scratch.c, key); +        for (i = 0; i < len; ++i) +            ctx->cmac.c[i] ^= (out[i] = scratch.c[i] ^ inp[i]); +        (*block) (ctx->cmac.c, ctx->cmac.c, key); +    } -	for (i=15-L;i<16;++i) -		ctx->nonce.c[i]=0; +    for (i = 15 - L; i < 16; ++i) +        ctx->nonce.c[i] = 0; -	(*block)(ctx->nonce.c,scratch.c,key); -	ctx->cmac.u[0] ^= scratch.u[0]; -	ctx->cmac.u[1] ^= scratch.u[1]; +    (*block) (ctx->nonce.c, scratch.c, key); +    ctx->cmac.u[0] ^= scratch.u[0]; +    ctx->cmac.u[1] ^= scratch.u[1]; -	ctx->nonce.c[0] = flags0; +    ctx->nonce.c[0] = flags0; -	return 0; +    return 0;  } -static void ctr64_add (unsigned char *counter,size_t inc) -{	size_t n=8, val=0; - -	counter += 8; -	do { -		--n; -		val += counter[n] + (inc&0xff); -		counter[n] = (unsigned char)val; -		val >>= 8;	/* carry bit */ -		inc >>= 8; -	} while(n && (inc || val)); +static void ctr64_add(unsigned char *counter, size_t inc) +{ +    size_t n = 8, val = 0; + +    counter += 8; +    do { +        --n; +        val += counter[n] + 
(inc & 0xff); +        counter[n] = (unsigned char)val; +        val >>= 8;              /* carry bit */ +        inc >>= 8; +    } while (n && (inc || val));  }  int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx, -	const unsigned char *inp, unsigned char *out, -	size_t len,ccm128_f stream) +                                const unsigned char *inp, unsigned char *out, +                                size_t len, ccm128_f stream)  { -	size_t		n; -	unsigned int	i,L; -	unsigned char	flags0	= ctx->nonce.c[0]; -	block128_f	block	= ctx->block; -	void *		key	= ctx->key; -	union { u64 u[2]; u8 c[16]; } scratch; - -	if (!(flags0&0x40)) -		(*block)(ctx->nonce.c,ctx->cmac.c,key), -		ctx->blocks++; - -	ctx->nonce.c[0] = L = flags0&7; -	for (n=0,i=15-L;i<15;++i) { -		n |= ctx->nonce.c[i]; -		ctx->nonce.c[i]=0; -		n <<= 8; -	} -	n |= ctx->nonce.c[15];	/* reconstructed length */ -	ctx->nonce.c[15]=1; - -	if (n!=len) return -1;	/* length mismatch */ - -	ctx->blocks += ((len+15)>>3)|1; -	if (ctx->blocks > (U64(1)<<61))	return -2; /* too much data */ - -	if ((n=len/16)) { -		(*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c); -		n   *= 16; -		inp += n; -		out += n; -		len -= n; -		if (len) ctr64_add(ctx->nonce.c,n/16); -	} - -	if (len) { -		for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i]; -		(*block)(ctx->cmac.c,ctx->cmac.c,key); -		(*block)(ctx->nonce.c,scratch.c,key); -		for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i]; -	} - -	for (i=15-L;i<16;++i) -		ctx->nonce.c[i]=0; - -	(*block)(ctx->nonce.c,scratch.c,key); -	ctx->cmac.u[0] ^= scratch.u[0]; -	ctx->cmac.u[1] ^= scratch.u[1]; - -	ctx->nonce.c[0] = flags0; - -	return 0; +    size_t n; +    unsigned int i, L; +    unsigned char flags0 = ctx->nonce.c[0]; +    block128_f block = ctx->block; +    void *key = ctx->key; +    union { +        u64 u[2]; +        u8 c[16]; +    } scratch; + +    if (!(flags0 & 0x40)) +        (*block) (ctx->nonce.c, ctx->cmac.c, key), ctx->blocks++; + +    ctx->nonce.c[0] = L = flags0 & 7; +    for (n 
= 0, i = 15 - L; i < 15; ++i) { +        n |= ctx->nonce.c[i]; +        ctx->nonce.c[i] = 0; +        n <<= 8; +    } +    n |= ctx->nonce.c[15];      /* reconstructed length */ +    ctx->nonce.c[15] = 1; + +    if (n != len) +        return -1;              /* length mismatch */ + +    ctx->blocks += ((len + 15) >> 3) | 1; +    if (ctx->blocks > (U64(1) << 61)) +        return -2;              /* too much data */ + +    if ((n = len / 16)) { +        (*stream) (inp, out, n, key, ctx->nonce.c, ctx->cmac.c); +        n *= 16; +        inp += n; +        out += n; +        len -= n; +        if (len) +            ctr64_add(ctx->nonce.c, n / 16); +    } + +    if (len) { +        for (i = 0; i < len; ++i) +            ctx->cmac.c[i] ^= inp[i]; +        (*block) (ctx->cmac.c, ctx->cmac.c, key); +        (*block) (ctx->nonce.c, scratch.c, key); +        for (i = 0; i < len; ++i) +            out[i] = scratch.c[i] ^ inp[i]; +    } + +    for (i = 15 - L; i < 16; ++i) +        ctx->nonce.c[i] = 0; + +    (*block) (ctx->nonce.c, scratch.c, key); +    ctx->cmac.u[0] ^= scratch.u[0]; +    ctx->cmac.u[1] ^= scratch.u[1]; + +    ctx->nonce.c[0] = flags0; + +    return 0;  }  int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx, -	const unsigned char *inp, unsigned char *out, -	size_t len,ccm128_f stream) +                                const unsigned char *inp, unsigned char *out, +                                size_t len, ccm128_f stream)  { -	size_t		n; -	unsigned int	i,L; -	unsigned char	flags0	= ctx->nonce.c[0]; -	block128_f	block	= ctx->block; -	void *		key	= ctx->key; -	union { u64 u[2]; u8 c[16]; } scratch; - -	if (!(flags0&0x40)) -		(*block)(ctx->nonce.c,ctx->cmac.c,key); - -	ctx->nonce.c[0] = L = flags0&7; -	for (n=0,i=15-L;i<15;++i) { -		n |= ctx->nonce.c[i]; -		ctx->nonce.c[i]=0; -		n <<= 8; -	} -	n |= ctx->nonce.c[15];	/* reconstructed length */ -	ctx->nonce.c[15]=1; - -	if (n!=len) return -1; - -	if ((n=len/16)) { -		
(*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c); -		n   *= 16; -		inp += n; -		out += n; -		len -= n; -		if (len) ctr64_add(ctx->nonce.c,n/16); -	} - -	if (len) { -		(*block)(ctx->nonce.c,scratch.c,key); -		for (i=0; i<len; ++i) -			ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]); -		(*block)(ctx->cmac.c,ctx->cmac.c,key); -	} - -	for (i=15-L;i<16;++i) -		ctx->nonce.c[i]=0; - -	(*block)(ctx->nonce.c,scratch.c,key); -	ctx->cmac.u[0] ^= scratch.u[0]; -	ctx->cmac.u[1] ^= scratch.u[1]; - -	ctx->nonce.c[0] = flags0; - -	return 0; +    size_t n; +    unsigned int i, L; +    unsigned char flags0 = ctx->nonce.c[0]; +    block128_f block = ctx->block; +    void *key = ctx->key; +    union { +        u64 u[2]; +        u8 c[16]; +    } scratch; + +    if (!(flags0 & 0x40)) +        (*block) (ctx->nonce.c, ctx->cmac.c, key); + +    ctx->nonce.c[0] = L = flags0 & 7; +    for (n = 0, i = 15 - L; i < 15; ++i) { +        n |= ctx->nonce.c[i]; +        ctx->nonce.c[i] = 0; +        n <<= 8; +    } +    n |= ctx->nonce.c[15];      /* reconstructed length */ +    ctx->nonce.c[15] = 1; + +    if (n != len) +        return -1; + +    if ((n = len / 16)) { +        (*stream) (inp, out, n, key, ctx->nonce.c, ctx->cmac.c); +        n *= 16; +        inp += n; +        out += n; +        len -= n; +        if (len) +            ctr64_add(ctx->nonce.c, n / 16); +    } + +    if (len) { +        (*block) (ctx->nonce.c, scratch.c, key); +        for (i = 0; i < len; ++i) +            ctx->cmac.c[i] ^= (out[i] = scratch.c[i] ^ inp[i]); +        (*block) (ctx->cmac.c, ctx->cmac.c, key); +    } + +    for (i = 15 - L; i < 16; ++i) +        ctx->nonce.c[i] = 0; + +    (*block) (ctx->nonce.c, scratch.c, key); +    ctx->cmac.u[0] ^= scratch.u[0]; +    ctx->cmac.u[1] ^= scratch.u[1]; + +    ctx->nonce.c[0] = flags0; + +    return 0;  } -size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx,unsigned char *tag,size_t len) -{	unsigned int M = (ctx->nonce.c[0]>>3)&7;	/* the M parameter */ - -	M *= 2; M += 2; 
-	if (len<M)	return 0; -	memcpy(tag,ctx->cmac.c,M); -	return M; +size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len) +{ +    unsigned int M = (ctx->nonce.c[0] >> 3) & 7; /* the M parameter */ + +    M *= 2; +    M += 2; +    if (len < M) +        return 0; +    memcpy(tag, ctx->cmac.c, M); +    return M;  } diff --git a/openssl/crypto/modes/cfb128.c b/openssl/crypto/modes/cfb128.c index 4e6f5d35e..d4ecbd08e 100644 --- a/openssl/crypto/modes/cfb128.c +++ b/openssl/crypto/modes/cfb128.c @@ -6,7 +6,7 @@   * are met:   *   * 1. Redistributions of source code must retain the above copyright - *    notice, this list of conditions and the following disclaimer.  + *    notice, this list of conditions and the following disclaimer.   *   * 2. Redistributions in binary form must reproduce the above copyright   *    notice, this list of conditions and the following disclaimer in @@ -59,14 +59,15 @@  #endif  #include <assert.h> -/* The input and output encrypted as though 128bit cfb mode is being - * used.  The extra state information to record how much of the - * 128bit block we have used is contained in *num; +/* + * The input and output encrypted as though 128bit cfb mode is being used. 
+ * The extra state information to record how much of the 128bit block we have + * used is contained in *num;   */  void CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], int *num, -			int enc, block128_f block) +                           size_t len, const void *key, +                           unsigned char ivec[16], int *num, +                           int enc, block128_f block)  {      unsigned int n;      size_t l = 0; @@ -77,166 +78,177 @@ void CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out,      if (enc) {  #if !defined(OPENSSL_SMALL_FOOTPRINT) -	if (16%sizeof(size_t) == 0) do {	/* always true actually */ -		while (n && len) { -			*(out++) = ivec[n] ^= *(in++); -			--len; -			n = (n+1) % 16; -		} -#if defined(STRICT_ALIGNMENT) -		if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0) -			break; -#endif -		while (len>=16) { -			(*block)(ivec, ivec, key); -			for (; n<16; n+=sizeof(size_t)) { -				*(size_t*)(out+n) = -				*(size_t*)(ivec+n) ^= *(size_t*)(in+n); -			} -			len -= 16; -			out += 16; -			in  += 16; -			n = 0; -		} -		if (len) { -			(*block)(ivec, ivec, key); -			while (len--) { -				out[n] = ivec[n] ^= in[n]; -				++n; -			} -		} -		*num = n; -		return; -	} while (0); -	/* the rest would be commonly eliminated by x86* compiler */ +        if (16 % sizeof(size_t) == 0) { /* always true actually */ +            do { +                while (n && len) { +                    *(out++) = ivec[n] ^= *(in++); +                    --len; +                    n = (n + 1) % 16; +                } +# if defined(STRICT_ALIGNMENT) +                if (((size_t)in | (size_t)out | (size_t)ivec) % +                    sizeof(size_t) != 0) +                    break; +# endif +                while (len >= 16) { +                    (*block) (ivec, ivec, key); +                    for (; n < 16; n += sizeof(size_t)) { +                        *(size_t *)(out + 
n) = +                            *(size_t *)(ivec + n) ^= *(size_t *)(in + n); +                    } +                    len -= 16; +                    out += 16; +                    in += 16; +                    n = 0; +                } +                if (len) { +                    (*block) (ivec, ivec, key); +                    while (len--) { +                        out[n] = ivec[n] ^= in[n]; +                        ++n; +                    } +                } +                *num = n; +                return; +            } while (0); +        } +        /* the rest would be commonly eliminated by x86* compiler */  #endif -	while (l<len) { -		if (n == 0) { -			(*block)(ivec, ivec, key); -		} -		out[l] = ivec[n] ^= in[l]; -		++l; -		n = (n+1) % 16; -	} -	*num = n; +        while (l < len) { +            if (n == 0) { +                (*block) (ivec, ivec, key); +            } +            out[l] = ivec[n] ^= in[l]; +            ++l; +            n = (n + 1) % 16; +        } +        *num = n;      } else {  #if !defined(OPENSSL_SMALL_FOOTPRINT) -	if (16%sizeof(size_t) == 0) do {	/* always true actually */ -		while (n && len) { -			unsigned char c; -			*(out++) = ivec[n] ^ (c = *(in++)); ivec[n] = c; -			--len; -			n = (n+1) % 16; - 		} -#if defined(STRICT_ALIGNMENT) -		if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0) -			break; -#endif -		while (len>=16) { -			(*block)(ivec, ivec, key); -			for (; n<16; n+=sizeof(size_t)) { -				size_t t = *(size_t*)(in+n); -				*(size_t*)(out+n) = *(size_t*)(ivec+n) ^ t; -				*(size_t*)(ivec+n) = t; -			} -			len -= 16; -			out += 16; -			in  += 16; -			n = 0; -		} -		if (len) { -			(*block)(ivec, ivec, key); -			while (len--) { -				unsigned char c; -				out[n] = ivec[n] ^ (c = in[n]); ivec[n] = c; -				++n; -			} - 		} -		*num = n; -		return; -	} while (0); -	/* the rest would be commonly eliminated by x86* compiler */ +        if (16 % sizeof(size_t) == 0) { /* always true actually */ +          
  do { +                while (n && len) { +                    unsigned char c; +                    *(out++) = ivec[n] ^ (c = *(in++)); +                    ivec[n] = c; +                    --len; +                    n = (n + 1) % 16; +                } +# if defined(STRICT_ALIGNMENT) +                if (((size_t)in | (size_t)out | (size_t)ivec) % +                    sizeof(size_t) != 0) +                    break; +# endif +                while (len >= 16) { +                    (*block) (ivec, ivec, key); +                    for (; n < 16; n += sizeof(size_t)) { +                        size_t t = *(size_t *)(in + n); +                        *(size_t *)(out + n) = *(size_t *)(ivec + n) ^ t; +                        *(size_t *)(ivec + n) = t; +                    } +                    len -= 16; +                    out += 16; +                    in += 16; +                    n = 0; +                } +                if (len) { +                    (*block) (ivec, ivec, key); +                    while (len--) { +                        unsigned char c; +                        out[n] = ivec[n] ^ (c = in[n]); +                        ivec[n] = c; +                        ++n; +                    } +                } +                *num = n; +                return; +            } while (0); +        } +        /* the rest would be commonly eliminated by x86* compiler */  #endif -	while (l<len) { -		unsigned char c; -		if (n == 0) { -			(*block)(ivec, ivec, key); -		} -		out[l] = ivec[n] ^ (c = in[l]); ivec[n] = c; -		++l; -		n = (n+1) % 16; -	} -	*num=n; +        while (l < len) { +            unsigned char c; +            if (n == 0) { +                (*block) (ivec, ivec, key); +            } +            out[l] = ivec[n] ^ (c = in[l]); +            ivec[n] = c; +            ++l; +            n = (n + 1) % 16; +        } +        *num = n;      }  } -/* This expects a single block of size nbits for both in and out. 
Note that -   it corrupts any extra bits in the last byte of out */ -static void cfbr_encrypt_block(const unsigned char *in,unsigned char *out, -			    int nbits,const void *key, -			    unsigned char ivec[16],int enc, -			    block128_f block) +/* + * This expects a single block of size nbits for both in and out. Note that + * it corrupts any extra bits in the last byte of out + */ +static void cfbr_encrypt_block(const unsigned char *in, unsigned char *out, +                               int nbits, const void *key, +                               unsigned char ivec[16], int enc, +                               block128_f block)  { -    int n,rem,num; -    unsigned char ovec[16*2 + 1];  /* +1 because we dererefence (but don't use) one byte off the end */ - -    if (nbits<=0 || nbits>128) return; - -	/* fill in the first half of the new IV with the current IV */ -	memcpy(ovec,ivec,16); -	/* construct the new IV */ -	(*block)(ivec,ivec,key); -	num = (nbits+7)/8; -	if (enc)	/* encrypt the input */ -	    for(n=0 ; n < num ; ++n) -		out[n] = (ovec[16+n] = in[n] ^ ivec[n]); -	else		/* decrypt the input */ -	    for(n=0 ; n < num ; ++n) -		out[n] = (ovec[16+n] = in[n]) ^ ivec[n]; -	/* shift ovec left... 
*/ -	rem = nbits%8; -	num = nbits/8; -	if(rem==0) -	    memcpy(ivec,ovec+num,16); -	else -	    for(n=0 ; n < 16 ; ++n) -		ivec[n] = ovec[n+num]<<rem | ovec[n+num+1]>>(8-rem); +    int n, rem, num; +    unsigned char ovec[16 * 2 + 1]; /* +1 because we dererefence (but don't +                                     * use) one byte off the end */ + +    if (nbits <= 0 || nbits > 128) +        return; + +    /* fill in the first half of the new IV with the current IV */ +    memcpy(ovec, ivec, 16); +    /* construct the new IV */ +    (*block) (ivec, ivec, key); +    num = (nbits + 7) / 8; +    if (enc)                    /* encrypt the input */ +        for (n = 0; n < num; ++n) +            out[n] = (ovec[16 + n] = in[n] ^ ivec[n]); +    else                        /* decrypt the input */ +        for (n = 0; n < num; ++n) +            out[n] = (ovec[16 + n] = in[n]) ^ ivec[n]; +    /* shift ovec left... */ +    rem = nbits % 8; +    num = nbits / 8; +    if (rem == 0) +        memcpy(ivec, ovec + num, 16); +    else +        for (n = 0; n < 16; ++n) +            ivec[n] = ovec[n + num] << rem | ovec[n + num + 1] >> (8 - rem);      /* it is not necessary to cleanse ovec, since the IV is not secret */  }  /* N.B. This expects the input to be packed, MS bit first */  void CRYPTO_cfb128_1_encrypt(const unsigned char *in, unsigned char *out, -		 	size_t bits, const void *key, -			unsigned char ivec[16], int *num, -			int enc, block128_f block) +                             size_t bits, const void *key, +                             unsigned char ivec[16], int *num, +                             int enc, block128_f block)  {      size_t n; -    unsigned char c[1],d[1]; +    unsigned char c[1], d[1];      assert(in && out && key && ivec && num);      assert(*num == 0); -    for(n=0 ; n<bits ; ++n) -	{ -	c[0]=(in[n/8]&(1 << (7-n%8))) ? 
0x80 : 0; -	cfbr_encrypt_block(c,d,1,key,ivec,enc,block); -	out[n/8]=(out[n/8]&~(1 << (unsigned int)(7-n%8))) | -		 ((d[0]&0x80) >> (unsigned int)(n%8)); -	} +    for (n = 0; n < bits; ++n) { +        c[0] = (in[n / 8] & (1 << (7 - n % 8))) ? 0x80 : 0; +        cfbr_encrypt_block(c, d, 1, key, ivec, enc, block); +        out[n / 8] = (out[n / 8] & ~(1 << (unsigned int)(7 - n % 8))) | +            ((d[0] & 0x80) >> (unsigned int)(n % 8)); +    }  }  void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out, -			size_t length, const void *key, -			unsigned char ivec[16], int *num, -			int enc, block128_f block) +                             size_t length, const void *key, +                             unsigned char ivec[16], int *num, +                             int enc, block128_f block)  {      size_t n;      assert(in && out && key && ivec && num);      assert(*num == 0); -    for(n=0 ; n<length ; ++n) -	cfbr_encrypt_block(&in[n],&out[n],8,key,ivec,enc,block); +    for (n = 0; n < length; ++n) +        cfbr_encrypt_block(&in[n], &out[n], 8, key, ivec, enc, block);  } - diff --git a/openssl/crypto/modes/ctr128.c b/openssl/crypto/modes/ctr128.c index ee642c586..f3bbcbf72 100644 --- a/openssl/crypto/modes/ctr128.c +++ b/openssl/crypto/modes/ctr128.c @@ -6,7 +6,7 @@   * are met:   *   * 1. Redistributions of source code must retain the above copyright - *    notice, this list of conditions and the following disclaimer.  + *    notice, this list of conditions and the following disclaimer.   *   * 2. Redistributions in binary form must reproduce the above copyright   *    notice, this list of conditions and the following disclaimer in @@ -59,194 +59,212 @@  #endif  #include <assert.h> -/* NOTE: the IV/counter CTR mode is big-endian.  The code itself - * is endian-neutral. */ +/* + * NOTE: the IV/counter CTR mode is big-endian.  The code itself is + * endian-neutral. 
+ */  /* increment counter (128-bit int) by 1 */ -static void ctr128_inc(unsigned char *counter) { -	u32 n=16; -	u8  c; - -	do { -		--n; -		c = counter[n]; -		++c; -		counter[n] = c; -		if (c) return; -	} while (n); +static void ctr128_inc(unsigned char *counter) +{ +    u32 n = 16; +    u8 c; + +    do { +        --n; +        c = counter[n]; +        ++c; +        counter[n] = c; +        if (c) +            return; +    } while (n);  }  #if !defined(OPENSSL_SMALL_FOOTPRINT) -static void ctr128_inc_aligned(unsigned char *counter) { -	size_t *data,c,n; -	const union { long one; char little; } is_endian = {1}; - -	if (is_endian.little) { -		ctr128_inc(counter); -		return; -	} - -	data = (size_t *)counter; -	n = 16/sizeof(size_t); -	do { -		--n; -		c = data[n]; -		++c; -		data[n] = c; -		if (c) return; -	} while (n); +static void ctr128_inc_aligned(unsigned char *counter) +{ +    size_t *data, c, n; +    const union { +        long one; +        char little; +    } is_endian = { +        1 +    }; + +    if (is_endian.little) { +        ctr128_inc(counter); +        return; +    } + +    data = (size_t *)counter; +    n = 16 / sizeof(size_t); +    do { +        --n; +        c = data[n]; +        ++c; +        data[n] = c; +        if (c) +            return; +    } while (n);  }  #endif -/* The input encrypted as though 128bit counter mode is being - * used.  The extra state information to record how much of the - * 128bit block we have used is contained in *num, and the - * encrypted counter is kept in ecount_buf.  Both *num and - * ecount_buf must be initialised with zeros before the first - * call to CRYPTO_ctr128_encrypt(). - * - * This algorithm assumes that the counter is in the x lower bits - * of the IV (ivec), and that the application has full control over - * overflow and the rest of the IV.  This implementation takes NO - * responsability for checking that the counter doesn't overflow - * into the rest of the IV when incremented. 
+/* + * The input encrypted as though 128bit counter mode is being used.  The + * extra state information to record how much of the 128bit block we have + * used is contained in *num, and the encrypted counter is kept in + * ecount_buf.  Both *num and ecount_buf must be initialised with zeros + * before the first call to CRYPTO_ctr128_encrypt(). This algorithm assumes + * that the counter is in the x lower bits of the IV (ivec), and that the + * application has full control over overflow and the rest of the IV.  This + * implementation takes NO responsability for checking that the counter + * doesn't overflow into the rest of the IV when incremented.   */  void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], unsigned char ecount_buf[16], -			unsigned int *num, block128_f block) +                           size_t len, const void *key, +                           unsigned char ivec[16], +                           unsigned char ecount_buf[16], unsigned int *num, +                           block128_f block)  { -	unsigned int n; -	size_t l=0; +    unsigned int n; +    size_t l = 0; -	assert(in && out && key && ecount_buf && num); -	assert(*num < 16); +    assert(in && out && key && ecount_buf && num); +    assert(*num < 16); -	n = *num; +    n = *num;  #if !defined(OPENSSL_SMALL_FOOTPRINT) -	if (16%sizeof(size_t) == 0) do { /* always true actually */ -		while (n && len) { -			*(out++) = *(in++) ^ ecount_buf[n]; -			--len; -			n = (n+1) % 16; -		} - -#if defined(STRICT_ALIGNMENT) -		if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0) -			break; -#endif -		while (len>=16) { -			(*block)(ivec, ecount_buf, key); -			ctr128_inc_aligned(ivec); -			for (; n<16; n+=sizeof(size_t)) -				*(size_t *)(out+n) = -				*(size_t *)(in+n) ^ *(size_t *)(ecount_buf+n); -			len -= 16; -			out += 16; -			in  += 16; -			n = 0; -		} -		if (len) { -			(*block)(ivec, ecount_buf, key); - 			
ctr128_inc_aligned(ivec); -			while (len--) { -				out[n] = in[n] ^ ecount_buf[n]; -				++n; -			} -		} -		*num = n; -		return; -	} while(0); -	/* the rest would be commonly eliminated by x86* compiler */ +    if (16 % sizeof(size_t) == 0) { /* always true actually */ +        do { +            while (n && len) { +                *(out++) = *(in++) ^ ecount_buf[n]; +                --len; +                n = (n + 1) % 16; +            } + +# if defined(STRICT_ALIGNMENT) +            if (((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != +                0) +                break; +# endif +            while (len >= 16) { +                (*block) (ivec, ecount_buf, key); +                ctr128_inc_aligned(ivec); +                for (; n < 16; n += sizeof(size_t)) +                    *(size_t *)(out + n) = +                        *(size_t *)(in + n) ^ *(size_t *)(ecount_buf + n); +                len -= 16; +                out += 16; +                in += 16; +                n = 0; +            } +            if (len) { +                (*block) (ivec, ecount_buf, key); +                ctr128_inc_aligned(ivec); +                while (len--) { +                    out[n] = in[n] ^ ecount_buf[n]; +                    ++n; +                } +            } +            *num = n; +            return; +        } while (0); +    } +    /* the rest would be commonly eliminated by x86* compiler */  #endif -	while (l<len) { -		if (n==0) { -			(*block)(ivec, ecount_buf, key); - 			ctr128_inc(ivec); -		} -		out[l] = in[l] ^ ecount_buf[n]; -		++l; -		n = (n+1) % 16; -	} - -	*num=n; +    while (l < len) { +        if (n == 0) { +            (*block) (ivec, ecount_buf, key); +            ctr128_inc(ivec); +        } +        out[l] = in[l] ^ ecount_buf[n]; +        ++l; +        n = (n + 1) % 16; +    } + +    *num = n;  }  /* increment upper 96 bits of 128-bit counter by 1 */ -static void ctr96_inc(unsigned char *counter) { -	u32 n=12; -	u8  c; - -	do { -		
--n; -		c = counter[n]; -		++c; -		counter[n] = c; -		if (c) return; -	} while (n); +static void ctr96_inc(unsigned char *counter) +{ +    u32 n = 12; +    u8 c; + +    do { +        --n; +        c = counter[n]; +        ++c; +        counter[n] = c; +        if (c) +            return; +    } while (n);  }  void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], unsigned char ecount_buf[16], -			unsigned int *num, ctr128_f func) +                                 size_t len, const void *key, +                                 unsigned char ivec[16], +                                 unsigned char ecount_buf[16], +                                 unsigned int *num, ctr128_f func)  { -	unsigned int n,ctr32; - -	assert(in && out && key && ecount_buf && num); -	assert(*num < 16); - -	n = *num; - -	while (n && len) { -		*(out++) = *(in++) ^ ecount_buf[n]; -		--len; -		n = (n+1) % 16; -	} - -	ctr32 = GETU32(ivec+12); -	while (len>=16) { -		size_t blocks = len/16; -		/* -		 * 1<<28 is just a not-so-small yet not-so-large number... -		 * Below condition is practically never met, but it has to -		 * be checked for code correctness. -		 */ -		if (sizeof(size_t)>sizeof(unsigned int) && blocks>(1U<<28)) -			blocks = (1U<<28); -		/* -		 * As (*func) operates on 32-bit counter, caller -		 * has to handle overflow. 'if' below detects the -		 * overflow, which is then handled by limiting the -		 * amount of blocks to the exact overflow point... -		 */ -		ctr32 += (u32)blocks; -		if (ctr32 < blocks) { -			blocks -= ctr32; -			ctr32   = 0; -		} -		(*func)(in,out,blocks,key,ivec); -		/* (*ctr) does not update ivec, caller does: */ -		PUTU32(ivec+12,ctr32); -		/* ... overflow was detected, propogate carry. 
*/ -		if (ctr32 == 0)	ctr96_inc(ivec); -		blocks *= 16; -		len -= blocks; -		out += blocks; -		in  += blocks; -	} -	if (len) { -		memset(ecount_buf,0,16); -		(*func)(ecount_buf,ecount_buf,1,key,ivec); -		++ctr32; -		PUTU32(ivec+12,ctr32); -		if (ctr32 == 0)	ctr96_inc(ivec); -		while (len--) { -			out[n] = in[n] ^ ecount_buf[n]; -			++n; -		} -	} - -	*num=n; +    unsigned int n, ctr32; + +    assert(in && out && key && ecount_buf && num); +    assert(*num < 16); + +    n = *num; + +    while (n && len) { +        *(out++) = *(in++) ^ ecount_buf[n]; +        --len; +        n = (n + 1) % 16; +    } + +    ctr32 = GETU32(ivec + 12); +    while (len >= 16) { +        size_t blocks = len / 16; +        /* +         * 1<<28 is just a not-so-small yet not-so-large number... +         * Below condition is practically never met, but it has to +         * be checked for code correctness. +         */ +        if (sizeof(size_t) > sizeof(unsigned int) && blocks > (1U << 28)) +            blocks = (1U << 28); +        /* +         * As (*func) operates on 32-bit counter, caller +         * has to handle overflow. 'if' below detects the +         * overflow, which is then handled by limiting the +         * amount of blocks to the exact overflow point... +         */ +        ctr32 += (u32)blocks; +        if (ctr32 < blocks) { +            blocks -= ctr32; +            ctr32 = 0; +        } +        (*func) (in, out, blocks, key, ivec); +        /* (*ctr) does not update ivec, caller does: */ +        PUTU32(ivec + 12, ctr32); +        /* ... overflow was detected, propogate carry. 
*/ +        if (ctr32 == 0) +            ctr96_inc(ivec); +        blocks *= 16; +        len -= blocks; +        out += blocks; +        in += blocks; +    } +    if (len) { +        memset(ecount_buf, 0, 16); +        (*func) (ecount_buf, ecount_buf, 1, key, ivec); +        ++ctr32; +        PUTU32(ivec + 12, ctr32); +        if (ctr32 == 0) +            ctr96_inc(ivec); +        while (len--) { +            out[n] = in[n] ^ ecount_buf[n]; +            ++n; +        } +    } + +    *num = n;  } diff --git a/openssl/crypto/modes/cts128.c b/openssl/crypto/modes/cts128.c index 2d583de6f..137be595a 100644 --- a/openssl/crypto/modes/cts128.c +++ b/openssl/crypto/modes/cts128.c @@ -29,425 +29,516 @@   * compliant with the NIST proposal, both extending CBC mode.   */ -size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], block128_f block) -{	size_t residue, n; +size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, +                                   unsigned char *out, size_t len, +                                   const void *key, unsigned char ivec[16], +                                   block128_f block) +{ +    size_t residue, n; -	assert (in && out && key && ivec); +    assert(in && out && key && ivec); -	if (len <= 16) return 0; +    if (len <= 16) +        return 0; -	if ((residue=len%16) == 0) residue = 16; +    if ((residue = len % 16) == 0) +        residue = 16; -	len -= residue; +    len -= residue; -	CRYPTO_cbc128_encrypt(in,out,len,key,ivec,block); +    CRYPTO_cbc128_encrypt(in, out, len, key, ivec, block); -	in  += len; -	out += len; +    in += len; +    out += len; -	for (n=0; n<residue; ++n) -		ivec[n] ^= in[n]; -	(*block)(ivec,ivec,key); -	memcpy(out,out-16,residue); -	memcpy(out-16,ivec,16);  +    for (n = 0; n < residue; ++n) +        ivec[n] ^= in[n]; +    (*block) (ivec, ivec, key); +    memcpy(out, out - 16, residue); +    memcpy(out - 16, ivec, 16); -	
return len+residue; +    return len + residue;  } -size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], block128_f block) -{	size_t residue, n; +size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, +                                       unsigned char *out, size_t len, +                                       const void *key, +                                       unsigned char ivec[16], +                                       block128_f block) +{ +    size_t residue, n; -	assert (in && out && key && ivec); +    assert(in && out && key && ivec); -	if (len < 16) return 0; +    if (len < 16) +        return 0; -	residue=len%16; +    residue = len % 16; -	len -= residue; +    len -= residue; -	CRYPTO_cbc128_encrypt(in,out,len,key,ivec,block); +    CRYPTO_cbc128_encrypt(in, out, len, key, ivec, block); -	if (residue==0)	return len; +    if (residue == 0) +        return len; -	in  += len; -	out += len; +    in += len; +    out += len; -	for (n=0; n<residue; ++n) -		ivec[n] ^= in[n]; -	(*block)(ivec,ivec,key); -	memcpy(out-16+residue,ivec,16); +    for (n = 0; n < residue; ++n) +        ivec[n] ^= in[n]; +    (*block) (ivec, ivec, key); +    memcpy(out - 16 + residue, ivec, 16); -	return len+residue; +    return len + residue;  }  size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], cbc128_f cbc) -{	size_t residue; -	union { size_t align; unsigned char c[16]; } tmp; +                             size_t len, const void *key, +                             unsigned char ivec[16], cbc128_f cbc) +{ +    size_t residue; +    union { +        size_t align; +        unsigned char c[16]; +    } tmp; -	assert (in && out && key && ivec); +    assert(in && out && key && ivec); -	if (len <= 16) return 0; +    if (len <= 16) +        return 0; -	if ((residue=len%16) == 0) residue = 16; +    if 
((residue = len % 16) == 0) +        residue = 16; -	len -= residue; +    len -= residue; -	(*cbc)(in,out,len,key,ivec,1); +    (*cbc) (in, out, len, key, ivec, 1); -	in  += len; -	out += len; +    in += len; +    out += len;  #if defined(CBC_HANDLES_TRUNCATED_IO) -	memcpy(tmp.c,out-16,16); -	(*cbc)(in,out-16,residue,key,ivec,1); -	memcpy(out,tmp.c,residue); +    memcpy(tmp.c, out - 16, 16); +    (*cbc) (in, out - 16, residue, key, ivec, 1); +    memcpy(out, tmp.c, residue);  #else -	memset(tmp.c,0,sizeof(tmp)); -	memcpy(tmp.c,in,residue); -	memcpy(out,out-16,residue); -	(*cbc)(tmp.c,out-16,16,key,ivec,1); +    memset(tmp.c, 0, sizeof(tmp)); +    memcpy(tmp.c, in, residue); +    memcpy(out, out - 16, residue); +    (*cbc) (tmp.c, out - 16, 16, key, ivec, 1);  #endif -	return len+residue; +    return len + residue;  }  size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], cbc128_f cbc) -{	size_t residue; -	union { size_t align; unsigned char c[16]; } tmp; +                                 size_t len, const void *key, +                                 unsigned char ivec[16], cbc128_f cbc) +{ +    size_t residue; +    union { +        size_t align; +        unsigned char c[16]; +    } tmp; -	assert (in && out && key && ivec); +    assert(in && out && key && ivec); -	if (len < 16) return 0; +    if (len < 16) +        return 0; -	residue=len%16; +    residue = len % 16; -	len -= residue; +    len -= residue; -	(*cbc)(in,out,len,key,ivec,1); +    (*cbc) (in, out, len, key, ivec, 1); -	if (residue==0) return len; +    if (residue == 0) +        return len; -	in  += len; -	out += len; +    in += len; +    out += len;  #if defined(CBC_HANDLES_TRUNCATED_IO) -	(*cbc)(in,out-16+residue,residue,key,ivec,1); +    (*cbc) (in, out - 16 + residue, residue, key, ivec, 1);  #else -	memset(tmp.c,0,sizeof(tmp)); -	memcpy(tmp.c,in,residue); -	(*cbc)(tmp.c,out-16+residue,16,key,ivec,1); +    
memset(tmp.c, 0, sizeof(tmp)); +    memcpy(tmp.c, in, residue); +    (*cbc) (tmp.c, out - 16 + residue, 16, key, ivec, 1);  #endif -	return len+residue; +    return len + residue;  } -size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], block128_f block) -{	size_t residue, n; -	union { size_t align; unsigned char c[32]; } tmp; +size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, +                                   unsigned char *out, size_t len, +                                   const void *key, unsigned char ivec[16], +                                   block128_f block) +{ +    size_t residue, n; +    union { +        size_t align; +        unsigned char c[32]; +    } tmp; -	assert (in && out && key && ivec); +    assert(in && out && key && ivec); -	if (len<=16) return 0; +    if (len <= 16) +        return 0; -	if ((residue=len%16) == 0) residue = 16; +    if ((residue = len % 16) == 0) +        residue = 16; -	len -= 16+residue; +    len -= 16 + residue; -	if (len) { -		CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block); -		in  += len; -		out += len; -	} +    if (len) { +        CRYPTO_cbc128_decrypt(in, out, len, key, ivec, block); +        in += len; +        out += len; +    } -	(*block)(in,tmp.c+16,key); +    (*block) (in, tmp.c + 16, key); -	memcpy(tmp.c,tmp.c+16,16); -	memcpy(tmp.c,in+16,residue); -	(*block)(tmp.c,tmp.c,key); +    memcpy(tmp.c, tmp.c + 16, 16); +    memcpy(tmp.c, in + 16, residue); +    (*block) (tmp.c, tmp.c, key); -	for(n=0; n<16; ++n) { -		unsigned char c = in[n]; -		out[n] = tmp.c[n] ^ ivec[n]; -		ivec[n] = c; -	} -	for(residue+=16; n<residue; ++n) -		out[n] = tmp.c[n] ^ in[n]; +    for (n = 0; n < 16; ++n) { +        unsigned char c = in[n]; +        out[n] = tmp.c[n] ^ ivec[n]; +        ivec[n] = c; +    } +    for (residue += 16; n < residue; ++n) +        out[n] = tmp.c[n] ^ in[n]; -	return 16+len+residue; +    return 16 + len + 
residue;  } -size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], block128_f block) -{	size_t residue, n; -	union { size_t align; unsigned char c[32]; } tmp; +size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, +                                       unsigned char *out, size_t len, +                                       const void *key, +                                       unsigned char ivec[16], +                                       block128_f block) +{ +    size_t residue, n; +    union { +        size_t align; +        unsigned char c[32]; +    } tmp; -	assert (in && out && key && ivec); +    assert(in && out && key && ivec); -	if (len<16) return 0; +    if (len < 16) +        return 0; -	residue=len%16; +    residue = len % 16; -	if (residue==0) { -		CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block); -		return len; -	} +    if (residue == 0) { +        CRYPTO_cbc128_decrypt(in, out, len, key, ivec, block); +        return len; +    } -	len -= 16+residue; +    len -= 16 + residue; -	if (len) { -		CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block); -		in  += len; -		out += len; -	} +    if (len) { +        CRYPTO_cbc128_decrypt(in, out, len, key, ivec, block); +        in += len; +        out += len; +    } -	(*block)(in+residue,tmp.c+16,key); +    (*block) (in + residue, tmp.c + 16, key); -	memcpy(tmp.c,tmp.c+16,16); -	memcpy(tmp.c,in,residue); -	(*block)(tmp.c,tmp.c,key); +    memcpy(tmp.c, tmp.c + 16, 16); +    memcpy(tmp.c, in, residue); +    (*block) (tmp.c, tmp.c, key); -	for(n=0; n<16; ++n) { -		unsigned char c = in[n]; -		out[n] = tmp.c[n] ^ ivec[n]; -		ivec[n] = in[n+residue]; -		tmp.c[n] = c; -	} -	for(residue+=16; n<residue; ++n) -		out[n] = tmp.c[n] ^ tmp.c[n-16]; +    for (n = 0; n < 16; ++n) { +        unsigned char c = in[n]; +        out[n] = tmp.c[n] ^ ivec[n]; +        ivec[n] = in[n + residue]; +        tmp.c[n] = c; +    } +    for 
(residue += 16; n < residue; ++n) +        out[n] = tmp.c[n] ^ tmp.c[n - 16]; -	return 16+len+residue; +    return 16 + len + residue;  }  size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], cbc128_f cbc) -{	size_t residue; -	union { size_t align; unsigned char c[32]; } tmp; +                             size_t len, const void *key, +                             unsigned char ivec[16], cbc128_f cbc) +{ +    size_t residue; +    union { +        size_t align; +        unsigned char c[32]; +    } tmp; -	assert (in && out && key && ivec); +    assert(in && out && key && ivec); -	if (len<=16) return 0; +    if (len <= 16) +        return 0; -	if ((residue=len%16) == 0) residue = 16; +    if ((residue = len % 16) == 0) +        residue = 16; -	len -= 16+residue; +    len -= 16 + residue; -	if (len) { -		(*cbc)(in,out,len,key,ivec,0); -		in  += len; -		out += len; -	} +    if (len) { +        (*cbc) (in, out, len, key, ivec, 0); +        in += len; +        out += len; +    } -	memset(tmp.c,0,sizeof(tmp)); -	/* this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0] */ -	(*cbc)(in,tmp.c,16,key,tmp.c+16,0); +    memset(tmp.c, 0, sizeof(tmp)); +    /* +     * this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0] +     */ +    (*cbc) (in, tmp.c, 16, key, tmp.c + 16, 0); -	memcpy(tmp.c,in+16,residue); +    memcpy(tmp.c, in + 16, residue);  #if defined(CBC_HANDLES_TRUNCATED_IO) -	(*cbc)(tmp.c,out,16+residue,key,ivec,0); +    (*cbc) (tmp.c, out, 16 + residue, key, ivec, 0);  #else -	(*cbc)(tmp.c,tmp.c,32,key,ivec,0); -	memcpy(out,tmp.c,16+residue); +    (*cbc) (tmp.c, tmp.c, 32, key, ivec, 0); +    memcpy(out, tmp.c, 16 + residue);  #endif -	return 16+len+residue; +    return 16 + len + residue;  }  size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], cbc128_f cbc) -{	size_t residue; -	
union { size_t align; unsigned char c[32]; } tmp; +                                 size_t len, const void *key, +                                 unsigned char ivec[16], cbc128_f cbc) +{ +    size_t residue; +    union { +        size_t align; +        unsigned char c[32]; +    } tmp; -	assert (in && out && key && ivec); +    assert(in && out && key && ivec); -	if (len<16) return 0; +    if (len < 16) +        return 0; -	residue=len%16; +    residue = len % 16; -	if (residue==0) { -		(*cbc)(in,out,len,key,ivec,0); -		return len; -	} +    if (residue == 0) { +        (*cbc) (in, out, len, key, ivec, 0); +        return len; +    } -	len -= 16+residue; +    len -= 16 + residue; -	if (len) { -		(*cbc)(in,out,len,key,ivec,0); -		in  += len; -		out += len; -	} +    if (len) { +        (*cbc) (in, out, len, key, ivec, 0); +        in += len; +        out += len; +    } -	memset(tmp.c,0,sizeof(tmp)); -	/* this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0] */ -	(*cbc)(in+residue,tmp.c,16,key,tmp.c+16,0); +    memset(tmp.c, 0, sizeof(tmp)); +    /* +     * this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0] +     */ +    (*cbc) (in + residue, tmp.c, 16, key, tmp.c + 16, 0); -	memcpy(tmp.c,in,residue); +    memcpy(tmp.c, in, residue);  #if defined(CBC_HANDLES_TRUNCATED_IO) -	(*cbc)(tmp.c,out,16+residue,key,ivec,0); +    (*cbc) (tmp.c, out, 16 + residue, key, ivec, 0);  #else -	(*cbc)(tmp.c,tmp.c,32,key,ivec,0); -	memcpy(out,tmp.c,16+residue); +    (*cbc) (tmp.c, tmp.c, 32, key, ivec, 0); +    memcpy(out, tmp.c, 16 + residue);  #endif -	return 16+len+residue; +    return 16 + len + residue;  }  #if defined(SELFTEST) -#include <stdio.h> -#include <openssl/aes.h> +# include <stdio.h> +# include <openssl/aes.h>  /* test vectors from RFC 3962 */  static const unsigned char test_key[16] = "chicken teriyaki";  static const unsigned char test_input[64] = -		"I would like the" " General Gau's C" -		"hicken, please, " "and wonton soup."; -static const 
unsigned char test_iv[16] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; - -static const unsigned char vector_17[17] = -{0xc6,0x35,0x35,0x68,0xf2,0xbf,0x8c,0xb4, 0xd8,0xa5,0x80,0x36,0x2d,0xa7,0xff,0x7f, - 0x97}; -static const unsigned char vector_31[31] = -{0xfc,0x00,0x78,0x3e,0x0e,0xfd,0xb2,0xc1, 0xd4,0x45,0xd4,0xc8,0xef,0xf7,0xed,0x22, - 0x97,0x68,0x72,0x68,0xd6,0xec,0xcc,0xc0, 0xc0,0x7b,0x25,0xe2,0x5e,0xcf,0xe5}; -static const unsigned char vector_32[32] = -{0x39,0x31,0x25,0x23,0xa7,0x86,0x62,0xd5, 0xbe,0x7f,0xcb,0xcc,0x98,0xeb,0xf5,0xa8, - 0x97,0x68,0x72,0x68,0xd6,0xec,0xcc,0xc0, 0xc0,0x7b,0x25,0xe2,0x5e,0xcf,0xe5,0x84}; -static const unsigned char vector_47[47] = -{0x97,0x68,0x72,0x68,0xd6,0xec,0xcc,0xc0, 0xc0,0x7b,0x25,0xe2,0x5e,0xcf,0xe5,0x84, - 0xb3,0xff,0xfd,0x94,0x0c,0x16,0xa1,0x8c, 0x1b,0x55,0x49,0xd2,0xf8,0x38,0x02,0x9e, - 0x39,0x31,0x25,0x23,0xa7,0x86,0x62,0xd5, 0xbe,0x7f,0xcb,0xcc,0x98,0xeb,0xf5}; -static const unsigned char vector_48[48] = -{0x97,0x68,0x72,0x68,0xd6,0xec,0xcc,0xc0, 0xc0,0x7b,0x25,0xe2,0x5e,0xcf,0xe5,0x84, - 0x9d,0xad,0x8b,0xbb,0x96,0xc4,0xcd,0xc0, 0x3b,0xc1,0x03,0xe1,0xa1,0x94,0xbb,0xd8, - 0x39,0x31,0x25,0x23,0xa7,0x86,0x62,0xd5, 0xbe,0x7f,0xcb,0xcc,0x98,0xeb,0xf5,0xa8}; -static const unsigned char vector_64[64] = -{0x97,0x68,0x72,0x68,0xd6,0xec,0xcc,0xc0, 0xc0,0x7b,0x25,0xe2,0x5e,0xcf,0xe5,0x84, - 0x39,0x31,0x25,0x23,0xa7,0x86,0x62,0xd5, 0xbe,0x7f,0xcb,0xcc,0x98,0xeb,0xf5,0xa8, - 0x48,0x07,0xef,0xe8,0x36,0xee,0x89,0xa5, 0x26,0x73,0x0d,0xbc,0x2f,0x7b,0xc8,0x40, - 0x9d,0xad,0x8b,0xbb,0x96,0xc4,0xcd,0xc0, 0x3b,0xc1,0x03,0xe1,0xa1,0x94,0xbb,0xd8}; +    "I would like the" " General Gau's C" +    "hicken, please, " "and wonton soup."; +static const unsigned char test_iv[16] = +    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + +static const unsigned char vector_17[17] = { +    0xc6, 0x35, 0x35, 0x68, 0xf2, 0xbf, 0x8c, 0xb4, +    0xd8, 0xa5, 0x80, 0x36, 0x2d, 0xa7, 0xff, 0x7f, +    0x97 +}; + +static const unsigned char vector_31[31] = { +    
0xfc, 0x00, 0x78, 0x3e, 0x0e, 0xfd, 0xb2, 0xc1, +    0xd4, 0x45, 0xd4, 0xc8, 0xef, 0xf7, 0xed, 0x22, +    0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0, +    0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5 +}; + +static const unsigned char vector_32[32] = { +    0x39, 0x31, 0x25, 0x23, 0xa7, 0x86, 0x62, 0xd5, +    0xbe, 0x7f, 0xcb, 0xcc, 0x98, 0xeb, 0xf5, 0xa8, +    0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0, +    0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5, 0x84 +}; + +static const unsigned char vector_47[47] = { +    0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0, +    0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5, 0x84, +    0xb3, 0xff, 0xfd, 0x94, 0x0c, 0x16, 0xa1, 0x8c, +    0x1b, 0x55, 0x49, 0xd2, 0xf8, 0x38, 0x02, 0x9e, +    0x39, 0x31, 0x25, 0x23, 0xa7, 0x86, 0x62, 0xd5, +    0xbe, 0x7f, 0xcb, 0xcc, 0x98, 0xeb, 0xf5 +}; + +static const unsigned char vector_48[48] = { +    0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0, +    0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5, 0x84, +    0x9d, 0xad, 0x8b, 0xbb, 0x96, 0xc4, 0xcd, 0xc0, +    0x3b, 0xc1, 0x03, 0xe1, 0xa1, 0x94, 0xbb, 0xd8, +    0x39, 0x31, 0x25, 0x23, 0xa7, 0x86, 0x62, 0xd5, +    0xbe, 0x7f, 0xcb, 0xcc, 0x98, 0xeb, 0xf5, 0xa8 +}; + +static const unsigned char vector_64[64] = { +    0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0, +    0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5, 0x84, +    0x39, 0x31, 0x25, 0x23, 0xa7, 0x86, 0x62, 0xd5, +    0xbe, 0x7f, 0xcb, 0xcc, 0x98, 0xeb, 0xf5, 0xa8, +    0x48, 0x07, 0xef, 0xe8, 0x36, 0xee, 0x89, 0xa5, +    0x26, 0x73, 0x0d, 0xbc, 0x2f, 0x7b, 0xc8, 0x40, +    0x9d, 0xad, 0x8b, 0xbb, 0x96, 0xc4, 0xcd, 0xc0, +    0x3b, 0xc1, 0x03, 0xe1, 0xa1, 0x94, 0xbb, 0xd8 +};  static AES_KEY encks, decks; -void test_vector(const unsigned char *vector,size_t len) -{	unsigned char iv[sizeof(test_iv)]; -	unsigned char cleartext[64],ciphertext[64]; -	size_t tail; - -	printf("vector_%d\n",len); fflush(stdout); - -	if ((tail=len%16) == 0) tail = 16; -	tail += 16; - -	/* test block-based encryption */ 
-	memcpy(iv,test_iv,sizeof(test_iv)); -	CRYPTO_cts128_encrypt_block(test_input,ciphertext,len,&encks,iv,(block128_f)AES_encrypt); -	if (memcmp(ciphertext,vector,len)) -		fprintf(stderr,"output_%d mismatch\n",len), exit(1); -	if (memcmp(iv,vector+len-tail,sizeof(iv))) -		fprintf(stderr,"iv_%d mismatch\n",len), exit(1); - -	/* test block-based decryption */ -	memcpy(iv,test_iv,sizeof(test_iv)); -	CRYPTO_cts128_decrypt_block(ciphertext,cleartext,len,&decks,iv,(block128_f)AES_decrypt); -	if (memcmp(cleartext,test_input,len)) -		fprintf(stderr,"input_%d mismatch\n",len), exit(2); -	if (memcmp(iv,vector+len-tail,sizeof(iv))) -		fprintf(stderr,"iv_%d mismatch\n",len), exit(2); - -	/* test streamed encryption */ -	memcpy(iv,test_iv,sizeof(test_iv)); -	CRYPTO_cts128_encrypt(test_input,ciphertext,len,&encks,iv,(cbc128_f)AES_cbc_encrypt); -	if (memcmp(ciphertext,vector,len)) -		fprintf(stderr,"output_%d mismatch\n",len), exit(3); -	if (memcmp(iv,vector+len-tail,sizeof(iv))) -		fprintf(stderr,"iv_%d mismatch\n",len), exit(3); - -	/* test streamed decryption */ -	memcpy(iv,test_iv,sizeof(test_iv)); -	CRYPTO_cts128_decrypt(ciphertext,cleartext,len,&decks,iv,(cbc128_f)AES_cbc_encrypt); -	if (memcmp(cleartext,test_input,len)) -		fprintf(stderr,"input_%d mismatch\n",len), exit(4); -	if (memcmp(iv,vector+len-tail,sizeof(iv))) -		fprintf(stderr,"iv_%d mismatch\n",len), exit(4); +void test_vector(const unsigned char *vector, size_t len) +{ +    unsigned char iv[sizeof(test_iv)]; +    unsigned char cleartext[64], ciphertext[64]; +    size_t tail; + +    printf("vector_%d\n", len); +    fflush(stdout); + +    if ((tail = len % 16) == 0) +        tail = 16; +    tail += 16; + +    /* test block-based encryption */ +    memcpy(iv, test_iv, sizeof(test_iv)); +    CRYPTO_cts128_encrypt_block(test_input, ciphertext, len, &encks, iv, +                                (block128_f) AES_encrypt); +    if (memcmp(ciphertext, vector, len)) +        fprintf(stderr, "output_%d mismatch\n", len), 
exit(1); +    if (memcmp(iv, vector + len - tail, sizeof(iv))) +        fprintf(stderr, "iv_%d mismatch\n", len), exit(1); + +    /* test block-based decryption */ +    memcpy(iv, test_iv, sizeof(test_iv)); +    CRYPTO_cts128_decrypt_block(ciphertext, cleartext, len, &decks, iv, +                                (block128_f) AES_decrypt); +    if (memcmp(cleartext, test_input, len)) +        fprintf(stderr, "input_%d mismatch\n", len), exit(2); +    if (memcmp(iv, vector + len - tail, sizeof(iv))) +        fprintf(stderr, "iv_%d mismatch\n", len), exit(2); + +    /* test streamed encryption */ +    memcpy(iv, test_iv, sizeof(test_iv)); +    CRYPTO_cts128_encrypt(test_input, ciphertext, len, &encks, iv, +                          (cbc128_f) AES_cbc_encrypt); +    if (memcmp(ciphertext, vector, len)) +        fprintf(stderr, "output_%d mismatch\n", len), exit(3); +    if (memcmp(iv, vector + len - tail, sizeof(iv))) +        fprintf(stderr, "iv_%d mismatch\n", len), exit(3); + +    /* test streamed decryption */ +    memcpy(iv, test_iv, sizeof(test_iv)); +    CRYPTO_cts128_decrypt(ciphertext, cleartext, len, &decks, iv, +                          (cbc128_f) AES_cbc_encrypt); +    if (memcmp(cleartext, test_input, len)) +        fprintf(stderr, "input_%d mismatch\n", len), exit(4); +    if (memcmp(iv, vector + len - tail, sizeof(iv))) +        fprintf(stderr, "iv_%d mismatch\n", len), exit(4);  } -void test_nistvector(const unsigned char *vector,size_t len) -{	unsigned char iv[sizeof(test_iv)]; -	unsigned char cleartext[64],ciphertext[64],nistvector[64]; -	size_t tail; - -	printf("nistvector_%d\n",len); fflush(stdout); - -	if ((tail=len%16) == 0) tail = 16; - -	len -= 16 + tail; -	memcpy(nistvector,vector,len); -	/* flip two last blocks */ -	memcpy(nistvector+len,vector+len+16,tail); -	memcpy(nistvector+len+tail,vector+len,16); -	len += 16 + tail; -	tail = 16; - -	/* test block-based encryption */ -	memcpy(iv,test_iv,sizeof(test_iv)); -	
CRYPTO_nistcts128_encrypt_block(test_input,ciphertext,len,&encks,iv,(block128_f)AES_encrypt); -	if (memcmp(ciphertext,nistvector,len)) -		fprintf(stderr,"output_%d mismatch\n",len), exit(1); -	if (memcmp(iv,nistvector+len-tail,sizeof(iv))) -		fprintf(stderr,"iv_%d mismatch\n",len), exit(1); - -	/* test block-based decryption */ -	memcpy(iv,test_iv,sizeof(test_iv)); -	CRYPTO_nistcts128_decrypt_block(ciphertext,cleartext,len,&decks,iv,(block128_f)AES_decrypt); -	if (memcmp(cleartext,test_input,len)) -		fprintf(stderr,"input_%d mismatch\n",len), exit(2); -	if (memcmp(iv,nistvector+len-tail,sizeof(iv))) -		fprintf(stderr,"iv_%d mismatch\n",len), exit(2); - -	/* test streamed encryption */ -	memcpy(iv,test_iv,sizeof(test_iv)); -	CRYPTO_nistcts128_encrypt(test_input,ciphertext,len,&encks,iv,(cbc128_f)AES_cbc_encrypt); -	if (memcmp(ciphertext,nistvector,len)) -		fprintf(stderr,"output_%d mismatch\n",len), exit(3); -	if (memcmp(iv,nistvector+len-tail,sizeof(iv))) -		fprintf(stderr,"iv_%d mismatch\n",len), exit(3); - -	/* test streamed decryption */ -	memcpy(iv,test_iv,sizeof(test_iv)); -	CRYPTO_nistcts128_decrypt(ciphertext,cleartext,len,&decks,iv,(cbc128_f)AES_cbc_encrypt); -	if (memcmp(cleartext,test_input,len)) -		fprintf(stderr,"input_%d mismatch\n",len), exit(4); -	if (memcmp(iv,nistvector+len-tail,sizeof(iv))) -		fprintf(stderr,"iv_%d mismatch\n",len), exit(4); +void test_nistvector(const unsigned char *vector, size_t len) +{ +    unsigned char iv[sizeof(test_iv)]; +    unsigned char cleartext[64], ciphertext[64], nistvector[64]; +    size_t tail; + +    printf("nistvector_%d\n", len); +    fflush(stdout); + +    if ((tail = len % 16) == 0) +        tail = 16; + +    len -= 16 + tail; +    memcpy(nistvector, vector, len); +    /* flip two last blocks */ +    memcpy(nistvector + len, vector + len + 16, tail); +    memcpy(nistvector + len + tail, vector + len, 16); +    len += 16 + tail; +    tail = 16; + +    /* test block-based encryption */ +    memcpy(iv, test_iv, 
sizeof(test_iv)); +    CRYPTO_nistcts128_encrypt_block(test_input, ciphertext, len, &encks, iv, +                                    (block128_f) AES_encrypt); +    if (memcmp(ciphertext, nistvector, len)) +        fprintf(stderr, "output_%d mismatch\n", len), exit(1); +    if (memcmp(iv, nistvector + len - tail, sizeof(iv))) +        fprintf(stderr, "iv_%d mismatch\n", len), exit(1); + +    /* test block-based decryption */ +    memcpy(iv, test_iv, sizeof(test_iv)); +    CRYPTO_nistcts128_decrypt_block(ciphertext, cleartext, len, &decks, iv, +                                    (block128_f) AES_decrypt); +    if (memcmp(cleartext, test_input, len)) +        fprintf(stderr, "input_%d mismatch\n", len), exit(2); +    if (memcmp(iv, nistvector + len - tail, sizeof(iv))) +        fprintf(stderr, "iv_%d mismatch\n", len), exit(2); + +    /* test streamed encryption */ +    memcpy(iv, test_iv, sizeof(test_iv)); +    CRYPTO_nistcts128_encrypt(test_input, ciphertext, len, &encks, iv, +                              (cbc128_f) AES_cbc_encrypt); +    if (memcmp(ciphertext, nistvector, len)) +        fprintf(stderr, "output_%d mismatch\n", len), exit(3); +    if (memcmp(iv, nistvector + len - tail, sizeof(iv))) +        fprintf(stderr, "iv_%d mismatch\n", len), exit(3); + +    /* test streamed decryption */ +    memcpy(iv, test_iv, sizeof(test_iv)); +    CRYPTO_nistcts128_decrypt(ciphertext, cleartext, len, &decks, iv, +                              (cbc128_f) AES_cbc_encrypt); +    if (memcmp(cleartext, test_input, len)) +        fprintf(stderr, "input_%d mismatch\n", len), exit(4); +    if (memcmp(iv, nistvector + len - tail, sizeof(iv))) +        fprintf(stderr, "iv_%d mismatch\n", len), exit(4);  }  int main()  { -	AES_set_encrypt_key(test_key,128,&encks); -	AES_set_decrypt_key(test_key,128,&decks); - -	test_vector(vector_17,sizeof(vector_17)); -	test_vector(vector_31,sizeof(vector_31)); -	test_vector(vector_32,sizeof(vector_32)); -	
test_vector(vector_47,sizeof(vector_47)); -	test_vector(vector_48,sizeof(vector_48)); -	test_vector(vector_64,sizeof(vector_64)); - -	test_nistvector(vector_17,sizeof(vector_17)); -	test_nistvector(vector_31,sizeof(vector_31)); -	test_nistvector(vector_32,sizeof(vector_32)); -	test_nistvector(vector_47,sizeof(vector_47)); -	test_nistvector(vector_48,sizeof(vector_48)); -	test_nistvector(vector_64,sizeof(vector_64)); - -	return 0; +    AES_set_encrypt_key(test_key, 128, &encks); +    AES_set_decrypt_key(test_key, 128, &decks); + +    test_vector(vector_17, sizeof(vector_17)); +    test_vector(vector_31, sizeof(vector_31)); +    test_vector(vector_32, sizeof(vector_32)); +    test_vector(vector_47, sizeof(vector_47)); +    test_vector(vector_48, sizeof(vector_48)); +    test_vector(vector_64, sizeof(vector_64)); + +    test_nistvector(vector_17, sizeof(vector_17)); +    test_nistvector(vector_31, sizeof(vector_31)); +    test_nistvector(vector_32, sizeof(vector_32)); +    test_nistvector(vector_47, sizeof(vector_47)); +    test_nistvector(vector_48, sizeof(vector_48)); +    test_nistvector(vector_64, sizeof(vector_64)); + +    return 0;  }  #endif diff --git a/openssl/crypto/modes/gcm128.c b/openssl/crypto/modes/gcm128.c index e1dc2b0f4..4debf537f 100644 --- a/openssl/crypto/modes/gcm128.c +++ b/openssl/crypto/modes/gcm128.c @@ -6,7 +6,7 @@   * are met:   *   * 1. Redistributions of source code must retain the above copyright - *    notice, this list of conditions and the following disclaimer.  + *    notice, this list of conditions and the following disclaimer.   *   * 2. 
Redistributions in binary form must reproduce the above copyright   *    notice, this list of conditions and the following disclaimer in @@ -62,27 +62,27 @@  #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)  /* redefine, because alignment is ensured */ -#undef	GETU32 -#define	GETU32(p)	BSWAP4(*(const u32 *)(p)) -#undef	PUTU32 -#define	PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v) -#endif - -#define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16)) -#define REDUCE1BIT(V)	do { \ -	if (sizeof(size_t)==8) { \ -		u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \ -		V.lo  = (V.hi<<63)|(V.lo>>1); \ -		V.hi  = (V.hi>>1 )^T; \ -	} \ -	else { \ -		u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \ -		V.lo  = (V.hi<<63)|(V.lo>>1); \ -		V.hi  = (V.hi>>1 )^((u64)T<<32); \ -	} \ +# undef  GETU32 +# define GETU32(p)       BSWAP4(*(const u32 *)(p)) +# undef  PUTU32 +# define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v) +#endif + +#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16)) +#define REDUCE1BIT(V)   do { \ +        if (sizeof(size_t)==8) { \ +                u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \ +                V.lo  = (V.hi<<63)|(V.lo>>1); \ +                V.hi  = (V.hi>>1 )^T; \ +        } \ +        else { \ +                u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \ +                V.lo  = (V.hi<<63)|(V.lo>>1); \ +                V.hi  = (V.hi>>1 )^((u64)T<<32); \ +        } \  } while(0) -/* +/*-   * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should   * never be set to 8. 8 is effectively reserved for testing purposes.   * TABLE_BITS>1 are lookup-table-driven implementations referred to as @@ -116,286 +116,311 @@   *   * Value of 1 is not appropriate for performance reasons.   
*/ -#if	TABLE_BITS==8 +#if     TABLE_BITS==8  static void gcm_init_8bit(u128 Htable[256], u64 H[2])  { -	int  i, j; -	u128 V; - -	Htable[0].hi = 0; -	Htable[0].lo = 0; -	V.hi = H[0]; -	V.lo = H[1]; - -	for (Htable[128]=V, i=64; i>0; i>>=1) { -		REDUCE1BIT(V); -		Htable[i] = V; -	} - -	for (i=2; i<256; i<<=1) { -		u128 *Hi = Htable+i, H0 = *Hi; -		for (j=1; j<i; ++j) { -			Hi[j].hi = H0.hi^Htable[j].hi; -			Hi[j].lo = H0.lo^Htable[j].lo; -		} -	} +    int i, j; +    u128 V; + +    Htable[0].hi = 0; +    Htable[0].lo = 0; +    V.hi = H[0]; +    V.lo = H[1]; + +    for (Htable[128] = V, i = 64; i > 0; i >>= 1) { +        REDUCE1BIT(V); +        Htable[i] = V; +    } + +    for (i = 2; i < 256; i <<= 1) { +        u128 *Hi = Htable + i, H0 = *Hi; +        for (j = 1; j < i; ++j) { +            Hi[j].hi = H0.hi ^ Htable[j].hi; +            Hi[j].lo = H0.lo ^ Htable[j].lo; +        } +    }  }  static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])  { -	u128 Z = { 0, 0}; -	const u8 *xi = (const u8 *)Xi+15; -	size_t rem, n = *xi; -	const union { long one; char little; } is_endian = {1}; -	static const size_t rem_8bit[256] = { -		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246), -		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E), -		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56), -		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E), -		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66), -		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E), -		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076), -		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E), -		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06), -		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E), -		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416), -		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E), -		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626), -		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E), -		PACK(0x2A70), 
PACK(0x2BB2), PACK(0x29F4), PACK(0x2836), -		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E), -		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6), -		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE), -		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6), -		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE), -		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6), -		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE), -		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6), -		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE), -		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86), -		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E), -		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496), -		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E), -		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6), -		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE), -		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6), -		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE), -		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346), -		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E), -		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56), -		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E), -		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66), -		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E), -		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176), -		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E), -		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06), -		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E), -		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516), -		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E), -		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726), -		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E), -		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936), -		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E), -		
PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6), -		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE), -		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6), -		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE), -		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6), -		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE), -		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6), -		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE), -		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86), -		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E), -		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596), -		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E), -		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6), -		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE), -		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6), -		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) }; - -	while (1) { -		Z.hi ^= Htable[n].hi; -		Z.lo ^= Htable[n].lo; - -		if ((u8 *)Xi==xi)	break; - -		n = *(--xi); - -		rem  = (size_t)Z.lo&0xff; -		Z.lo = (Z.hi<<56)|(Z.lo>>8); -		Z.hi = (Z.hi>>8); -		if (sizeof(size_t)==8) -			Z.hi ^= rem_8bit[rem]; -		else -			Z.hi ^= (u64)rem_8bit[rem]<<32; -	} - -	if (is_endian.little) { -#ifdef BSWAP8 -		Xi[0] = BSWAP8(Z.hi); -		Xi[1] = BSWAP8(Z.lo); -#else -		u8 *p = (u8 *)Xi; -		u32 v; -		v = (u32)(Z.hi>>32);	PUTU32(p,v); -		v = (u32)(Z.hi);	PUTU32(p+4,v); -		v = (u32)(Z.lo>>32);	PUTU32(p+8,v); -		v = (u32)(Z.lo);	PUTU32(p+12,v); -#endif -	} -	else { -		Xi[0] = Z.hi; -		Xi[1] = Z.lo; -	} +    u128 Z = { 0, 0 }; +    const u8 *xi = (const u8 *)Xi + 15; +    size_t rem, n = *xi; +    const union { +        long one; +        char little; +    } is_endian = { +        1 +    }; +    static const size_t rem_8bit[256] = { +        PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246), +        PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E), +        PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), 
PACK(0x0C56), +        PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E), +        PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66), +        PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E), +        PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076), +        PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E), +        PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06), +        PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E), +        PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416), +        PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E), +        PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626), +        PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E), +        PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836), +        PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E), +        PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6), +        PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE), +        PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6), +        PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE), +        PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6), +        PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE), +        PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6), +        PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE), +        PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86), +        PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E), +        PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496), +        PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E), +        PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6), +        PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE), +        PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6), +        PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE), +        PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346), +        PACK(0xE608), 
PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E), +        PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56), +        PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E), +        PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66), +        PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E), +        PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176), +        PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E), +        PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06), +        PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E), +        PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516), +        PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E), +        PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726), +        PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E), +        PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936), +        PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E), +        PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6), +        PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE), +        PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6), +        PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE), +        PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6), +        PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE), +        PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6), +        PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE), +        PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86), +        PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E), +        PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596), +        PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E), +        PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6), +        PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE), +        PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6), +        PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) +    }; 
+ +    while (1) { +        Z.hi ^= Htable[n].hi; +        Z.lo ^= Htable[n].lo; + +        if ((u8 *)Xi == xi) +            break; + +        n = *(--xi); + +        rem = (size_t)Z.lo & 0xff; +        Z.lo = (Z.hi << 56) | (Z.lo >> 8); +        Z.hi = (Z.hi >> 8); +        if (sizeof(size_t) == 8) +            Z.hi ^= rem_8bit[rem]; +        else +            Z.hi ^= (u64)rem_8bit[rem] << 32; +    } + +    if (is_endian.little) { +# ifdef BSWAP8 +        Xi[0] = BSWAP8(Z.hi); +        Xi[1] = BSWAP8(Z.lo); +# else +        u8 *p = (u8 *)Xi; +        u32 v; +        v = (u32)(Z.hi >> 32); +        PUTU32(p, v); +        v = (u32)(Z.hi); +        PUTU32(p + 4, v); +        v = (u32)(Z.lo >> 32); +        PUTU32(p + 8, v); +        v = (u32)(Z.lo); +        PUTU32(p + 12, v); +# endif +    } else { +        Xi[0] = Z.hi; +        Xi[1] = Z.lo; +    }  } -#define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable) -#elif	TABLE_BITS==4 +# define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable) + +#elif   TABLE_BITS==4  static void gcm_init_4bit(u128 Htable[16], u64 H[2])  { -	u128 V; -#if defined(OPENSSL_SMALL_FOOTPRINT) -	int  i; -#endif +    u128 V; +# if defined(OPENSSL_SMALL_FOOTPRINT) +    int i; +# endif -	Htable[0].hi = 0; -	Htable[0].lo = 0; -	V.hi = H[0]; -	V.lo = H[1]; - -#if defined(OPENSSL_SMALL_FOOTPRINT) -	for (Htable[8]=V, i=4; i>0; i>>=1) { -		REDUCE1BIT(V); -		Htable[i] = V; -	} - -	for (i=2; i<16; i<<=1) { -		u128 *Hi = Htable+i; -		int   j; -		for (V=*Hi, j=1; j<i; ++j) { -			Hi[j].hi = V.hi^Htable[j].hi; -			Hi[j].lo = V.lo^Htable[j].lo; -		} -	} -#else -	Htable[8] = V; -	REDUCE1BIT(V); -	Htable[4] = V; -	REDUCE1BIT(V); -	Htable[2] = V; -	REDUCE1BIT(V); -	Htable[1] = V; -	Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo; -	V=Htable[4]; -	Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo; -	Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo; -	Htable[7].hi  = V.hi^Htable[3].hi, 
Htable[7].lo  = V.lo^Htable[3].lo; -	V=Htable[8]; -	Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo; -	Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo; -	Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo; -	Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo; -	Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo; -	Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo; -	Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo; -#endif -#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm)) -	/* -	 * ARM assembler expects specific dword order in Htable. -	 */ -	{ -	int j; -	const union { long one; char little; } is_endian = {1}; - -	if (is_endian.little) -		for (j=0;j<16;++j) { -			V = Htable[j]; -			Htable[j].hi = V.lo; -			Htable[j].lo = V.hi; -		} -	else -		for (j=0;j<16;++j) { -			V = Htable[j]; -			Htable[j].hi = V.lo<<32|V.lo>>32; -			Htable[j].lo = V.hi<<32|V.hi>>32; -		} -	} -#endif +    Htable[0].hi = 0; +    Htable[0].lo = 0; +    V.hi = H[0]; +    V.lo = H[1]; + +# if defined(OPENSSL_SMALL_FOOTPRINT) +    for (Htable[8] = V, i = 4; i > 0; i >>= 1) { +        REDUCE1BIT(V); +        Htable[i] = V; +    } + +    for (i = 2; i < 16; i <<= 1) { +        u128 *Hi = Htable + i; +        int j; +        for (V = *Hi, j = 1; j < i; ++j) { +            Hi[j].hi = V.hi ^ Htable[j].hi; +            Hi[j].lo = V.lo ^ Htable[j].lo; +        } +    } +# else +    Htable[8] = V; +    REDUCE1BIT(V); +    Htable[4] = V; +    REDUCE1BIT(V); +    Htable[2] = V; +    REDUCE1BIT(V); +    Htable[1] = V; +    Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo; +    V = Htable[4]; +    Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo; +    Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo; +    Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo; +    V = Htable[8]; +   
 Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo; +    Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo; +    Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo; +    Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo; +    Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo; +    Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo; +    Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo; +# endif +# if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm)) +    /* +     * ARM assembler expects specific dword order in Htable. +     */ +    { +        int j; +        const union { +            long one; +            char little; +        } is_endian = { +            1 +        }; + +        if (is_endian.little) +            for (j = 0; j < 16; ++j) { +                V = Htable[j]; +                Htable[j].hi = V.lo; +                Htable[j].lo = V.hi; +        } else +            for (j = 0; j < 16; ++j) { +                V = Htable[j]; +                Htable[j].hi = V.lo << 32 | V.lo >> 32; +                Htable[j].lo = V.hi << 32 | V.hi >> 32; +            } +    } +# endif  } -#ifndef GHASH_ASM +# ifndef GHASH_ASM  static const size_t rem_4bit[16] = { -	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460), -	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0), -	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560), -	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) }; +    PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460), +    PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0), +    PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560), +    PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) +};  static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])  { -	u128 Z; -	int cnt = 15; -	size_t rem, nlo, nhi; -	const union { long one; char little; } is_endian = {1}; - -	
nlo  = ((const u8 *)Xi)[15]; -	nhi  = nlo>>4; -	nlo &= 0xf; - -	Z.hi = Htable[nlo].hi; -	Z.lo = Htable[nlo].lo; - -	while (1) { -		rem  = (size_t)Z.lo&0xf; -		Z.lo = (Z.hi<<60)|(Z.lo>>4); -		Z.hi = (Z.hi>>4); -		if (sizeof(size_t)==8) -			Z.hi ^= rem_4bit[rem]; -		else -			Z.hi ^= (u64)rem_4bit[rem]<<32; - -		Z.hi ^= Htable[nhi].hi; -		Z.lo ^= Htable[nhi].lo; - -		if (--cnt<0)		break; - -		nlo  = ((const u8 *)Xi)[cnt]; -		nhi  = nlo>>4; -		nlo &= 0xf; - -		rem  = (size_t)Z.lo&0xf; -		Z.lo = (Z.hi<<60)|(Z.lo>>4); -		Z.hi = (Z.hi>>4); -		if (sizeof(size_t)==8) -			Z.hi ^= rem_4bit[rem]; -		else -			Z.hi ^= (u64)rem_4bit[rem]<<32; - -		Z.hi ^= Htable[nlo].hi; -		Z.lo ^= Htable[nlo].lo; -	} - -	if (is_endian.little) { -#ifdef BSWAP8 -		Xi[0] = BSWAP8(Z.hi); -		Xi[1] = BSWAP8(Z.lo); -#else -		u8 *p = (u8 *)Xi; -		u32 v; -		v = (u32)(Z.hi>>32);	PUTU32(p,v); -		v = (u32)(Z.hi);	PUTU32(p+4,v); -		v = (u32)(Z.lo>>32);	PUTU32(p+8,v); -		v = (u32)(Z.lo);	PUTU32(p+12,v); -#endif -	} -	else { -		Xi[0] = Z.hi; -		Xi[1] = Z.lo; -	} +    u128 Z; +    int cnt = 15; +    size_t rem, nlo, nhi; +    const union { +        long one; +        char little; +    } is_endian = { +        1 +    }; + +    nlo = ((const u8 *)Xi)[15]; +    nhi = nlo >> 4; +    nlo &= 0xf; + +    Z.hi = Htable[nlo].hi; +    Z.lo = Htable[nlo].lo; + +    while (1) { +        rem = (size_t)Z.lo & 0xf; +        Z.lo = (Z.hi << 60) | (Z.lo >> 4); +        Z.hi = (Z.hi >> 4); +        if (sizeof(size_t) == 8) +            Z.hi ^= rem_4bit[rem]; +        else +            Z.hi ^= (u64)rem_4bit[rem] << 32; + +        Z.hi ^= Htable[nhi].hi; +        Z.lo ^= Htable[nhi].lo; + +        if (--cnt < 0) +            break; + +        nlo = ((const u8 *)Xi)[cnt]; +        nhi = nlo >> 4; +        nlo &= 0xf; + +        rem = (size_t)Z.lo & 0xf; +        Z.lo = (Z.hi << 60) | (Z.lo >> 4); +        Z.hi = (Z.hi >> 4); +        if (sizeof(size_t) == 8) +            Z.hi ^= rem_4bit[rem]; +        else +            Z.hi ^= 
(u64)rem_4bit[rem] << 32; + +        Z.hi ^= Htable[nlo].hi; +        Z.lo ^= Htable[nlo].lo; +    } + +    if (is_endian.little) { +#  ifdef BSWAP8 +        Xi[0] = BSWAP8(Z.hi); +        Xi[1] = BSWAP8(Z.lo); +#  else +        u8 *p = (u8 *)Xi; +        u32 v; +        v = (u32)(Z.hi >> 32); +        PUTU32(p, v); +        v = (u32)(Z.hi); +        PUTU32(p + 4, v); +        v = (u32)(Z.lo >> 32); +        PUTU32(p + 8, v); +        v = (u32)(Z.lo); +        PUTU32(p + 12, v); +#  endif +    } else { +        Xi[0] = Z.hi; +        Xi[1] = Z.lo; +    }  } -#if !defined(OPENSSL_SMALL_FOOTPRINT) +#  if !defined(OPENSSL_SMALL_FOOTPRINT)  /*   * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for   * details... Compiler-generated code doesn't seem to give any @@ -403,1503 +428,1936 @@ static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])   * mostly as reference and a placeholder for possible future   * non-trivial optimization[s]...   */ -static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16], -				const u8 *inp,size_t len) +static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], +                           const u8 *inp, size_t len)  {      u128 Z;      int cnt;      size_t rem, nlo, nhi; -    const union { long one; char little; } is_endian = {1}; - -#if 1 +    const union { +        long one; +        char little; +    } is_endian = { +        1 +    }; + +#   if 1      do { -	cnt  = 15; -	nlo  = ((const u8 *)Xi)[15]; -	nlo ^= inp[15]; -	nhi  = nlo>>4; -	nlo &= 0xf; - -	Z.hi = Htable[nlo].hi; -	Z.lo = Htable[nlo].lo; - -	while (1) { -		rem  = (size_t)Z.lo&0xf; -		Z.lo = (Z.hi<<60)|(Z.lo>>4); -		Z.hi = (Z.hi>>4); -		if (sizeof(size_t)==8) -			Z.hi ^= rem_4bit[rem]; -		else -			Z.hi ^= (u64)rem_4bit[rem]<<32; - -		Z.hi ^= Htable[nhi].hi; -		Z.lo ^= Htable[nhi].lo; - -		if (--cnt<0)		break; - -		nlo  = ((const u8 *)Xi)[cnt]; -		nlo ^= inp[cnt]; -		nhi  = nlo>>4; -		nlo &= 0xf; - -		rem  = (size_t)Z.lo&0xf; -		Z.lo = (Z.hi<<60)|(Z.lo>>4); -		
Z.hi = (Z.hi>>4); -		if (sizeof(size_t)==8) -			Z.hi ^= rem_4bit[rem]; -		else -			Z.hi ^= (u64)rem_4bit[rem]<<32; - -		Z.hi ^= Htable[nlo].hi; -		Z.lo ^= Htable[nlo].lo; -	} -#else +        cnt = 15; +        nlo = ((const u8 *)Xi)[15]; +        nlo ^= inp[15]; +        nhi = nlo >> 4; +        nlo &= 0xf; + +        Z.hi = Htable[nlo].hi; +        Z.lo = Htable[nlo].lo; + +        while (1) { +            rem = (size_t)Z.lo & 0xf; +            Z.lo = (Z.hi << 60) | (Z.lo >> 4); +            Z.hi = (Z.hi >> 4); +            if (sizeof(size_t) == 8) +                Z.hi ^= rem_4bit[rem]; +            else +                Z.hi ^= (u64)rem_4bit[rem] << 32; + +            Z.hi ^= Htable[nhi].hi; +            Z.lo ^= Htable[nhi].lo; + +            if (--cnt < 0) +                break; + +            nlo = ((const u8 *)Xi)[cnt]; +            nlo ^= inp[cnt]; +            nhi = nlo >> 4; +            nlo &= 0xf; + +            rem = (size_t)Z.lo & 0xf; +            Z.lo = (Z.hi << 60) | (Z.lo >> 4); +            Z.hi = (Z.hi >> 4); +            if (sizeof(size_t) == 8) +                Z.hi ^= rem_4bit[rem]; +            else +                Z.hi ^= (u64)rem_4bit[rem] << 32; + +            Z.hi ^= Htable[nlo].hi; +            Z.lo ^= Htable[nlo].lo; +        } +#   else      /*       * Extra 256+16 bytes per-key plus 512 bytes shared tables       * [should] give ~50% improvement... One could have PACK()-ed       * the rem_8bit even here, but the priority is to minimize       * cache footprint... 
-     */  -    u128 Hshr4[16];	/* Htable shifted right by 4 bits */ -    u8   Hshl4[16];	/* Htable shifted left  by 4 bits */ +     */ +    u128 Hshr4[16];             /* Htable shifted right by 4 bits */ +    u8 Hshl4[16];               /* Htable shifted left by 4 bits */      static const unsigned short rem_8bit[256] = { -	0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E, -	0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E, -	0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E, -	0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E, -	0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E, -	0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E, -	0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E, -	0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E, -	0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE, -	0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE, -	0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE, -	0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE, -	0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E, -	0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E, -	0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE, -	0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE, -	0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E, -	0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E, -	0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E, -	0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E, -	0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E, -	0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E, -	0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E, -	0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E, -	0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE, -	0x9F90, 0x9E52, 
0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE, -	0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE, -	0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE, -	0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E, -	0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E, -	0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE, -	0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE }; +        0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E, +        0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E, +        0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E, +        0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E, +        0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E, +        0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E, +        0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E, +        0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E, +        0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE, +        0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE, +        0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE, +        0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE, +        0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E, +        0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E, +        0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE, +        0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE, +        0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E, +        0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E, +        0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E, +        0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E, +        0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E, +        0xD750, 
0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E, +        0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E, +        0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E, +        0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE, +        0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE, +        0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE, +        0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE, +        0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E, +        0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E, +        0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE, +        0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE +    };      /*       * This pre-processing phase slows down procedure by approximately       * same time as it makes each loop spin faster. In other words       * single block performance is approximately same as straightforward       * "4-bit" implementation, and then it goes only faster...       
*/ -    for (cnt=0; cnt<16; ++cnt) { -	Z.hi = Htable[cnt].hi; -	Z.lo = Htable[cnt].lo; -	Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4); -	Hshr4[cnt].hi = (Z.hi>>4); -	Hshl4[cnt]    = (u8)(Z.lo<<4); +    for (cnt = 0; cnt < 16; ++cnt) { +        Z.hi = Htable[cnt].hi; +        Z.lo = Htable[cnt].lo; +        Hshr4[cnt].lo = (Z.hi << 60) | (Z.lo >> 4); +        Hshr4[cnt].hi = (Z.hi >> 4); +        Hshl4[cnt] = (u8)(Z.lo << 4);      }      do { -	for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) { -		nlo  = ((const u8 *)Xi)[cnt]; -		nlo ^= inp[cnt]; -		nhi  = nlo>>4; -		nlo &= 0xf; +        for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) { +            nlo = ((const u8 *)Xi)[cnt]; +            nlo ^= inp[cnt]; +            nhi = nlo >> 4; +            nlo &= 0xf; -		Z.hi ^= Htable[nlo].hi; -		Z.lo ^= Htable[nlo].lo; +            Z.hi ^= Htable[nlo].hi; +            Z.lo ^= Htable[nlo].lo; -		rem = (size_t)Z.lo&0xff; +            rem = (size_t)Z.lo & 0xff; -		Z.lo = (Z.hi<<56)|(Z.lo>>8); -		Z.hi = (Z.hi>>8); +            Z.lo = (Z.hi << 56) | (Z.lo >> 8); +            Z.hi = (Z.hi >> 8); -		Z.hi ^= Hshr4[nhi].hi; -		Z.lo ^= Hshr4[nhi].lo; -		Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48; -	} +            Z.hi ^= Hshr4[nhi].hi; +            Z.lo ^= Hshr4[nhi].lo; +            Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48; +        } -	nlo  = ((const u8 *)Xi)[0]; -	nlo ^= inp[0]; -	nhi  = nlo>>4; -	nlo &= 0xf; +        nlo = ((const u8 *)Xi)[0]; +        nlo ^= inp[0]; +        nhi = nlo >> 4; +        nlo &= 0xf; -	Z.hi ^= Htable[nlo].hi; -	Z.lo ^= Htable[nlo].lo; +        Z.hi ^= Htable[nlo].hi; +        Z.lo ^= Htable[nlo].lo; -	rem = (size_t)Z.lo&0xf; +        rem = (size_t)Z.lo & 0xf; -	Z.lo = (Z.hi<<60)|(Z.lo>>4); -	Z.hi = (Z.hi>>4); +        Z.lo = (Z.hi << 60) | (Z.lo >> 4); +        Z.hi = (Z.hi >> 4); -	Z.hi ^= Htable[nhi].hi; -	Z.lo ^= Htable[nhi].lo; -	Z.hi ^= ((u64)rem_8bit[rem<<4])<<48; -#endif +        Z.hi ^= Htable[nhi].hi; +        Z.lo ^= Htable[nhi].lo; +        Z.hi ^= 
((u64)rem_8bit[rem << 4]) << 48; +#   endif -	if (is_endian.little) { -#ifdef BSWAP8 -		Xi[0] = BSWAP8(Z.hi); -		Xi[1] = BSWAP8(Z.lo); -#else -		u8 *p = (u8 *)Xi; -		u32 v; -		v = (u32)(Z.hi>>32);	PUTU32(p,v); -		v = (u32)(Z.hi);	PUTU32(p+4,v); -		v = (u32)(Z.lo>>32);	PUTU32(p+8,v); -		v = (u32)(Z.lo);	PUTU32(p+12,v); -#endif -	} -	else { -		Xi[0] = Z.hi; -		Xi[1] = Z.lo; -	} -    } while (inp+=16, len-=16); +        if (is_endian.little) { +#   ifdef BSWAP8 +            Xi[0] = BSWAP8(Z.hi); +            Xi[1] = BSWAP8(Z.lo); +#   else +            u8 *p = (u8 *)Xi; +            u32 v; +            v = (u32)(Z.hi >> 32); +            PUTU32(p, v); +            v = (u32)(Z.hi); +            PUTU32(p + 4, v); +            v = (u32)(Z.lo >> 32); +            PUTU32(p + 8, v); +            v = (u32)(Z.lo); +            PUTU32(p + 12, v); +#   endif +        } else { +            Xi[0] = Z.hi; +            Xi[1] = Z.lo; +        } +    } while (inp += 16, len -= 16);  } -#endif -#else -void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]); -void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); -#endif +#  endif +# else +void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]); +void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp, +                    size_t len); +# endif -#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable) -#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT) -#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len) -/* GHASH_CHUNK is "stride parameter" missioned to mitigate cache - * trashing effect. In other words idea is to hash data while it's - * still in L1 cache after encryption pass... 
*/ -#define GHASH_CHUNK       (3*1024) -#endif +# define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable) +# if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT) +#  define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len) +/* + * GHASH_CHUNK is "stride parameter" missioned to mitigate cache trashing + * effect. In other words idea is to hash data while it's still in L1 cache + * after encryption pass... + */ +#  define GHASH_CHUNK       (3*1024) +# endif -#else	/* TABLE_BITS */ +#else                           /* TABLE_BITS */ -static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2]) +static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])  { -	u128 V,Z = { 0,0 }; -	long X; -	int  i,j; -	const long *xi = (const long *)Xi; -	const union { long one; char little; } is_endian = {1}; - -	V.hi = H[0];	/* H is in host byte order, no byte swapping */ -	V.lo = H[1]; - -	for (j=0; j<16/sizeof(long); ++j) { -		if (is_endian.little) { -			if (sizeof(long)==8) { -#ifdef BSWAP8 -				X = (long)(BSWAP8(xi[j])); -#else -				const u8 *p = (const u8 *)(xi+j); -				X = (long)((u64)GETU32(p)<<32|GETU32(p+4)); -#endif -			} -			else { -				const u8 *p = (const u8 *)(xi+j); -				X = (long)GETU32(p); -			} -		} -		else -			X = xi[j]; - -		for (i=0; i<8*sizeof(long); ++i, X<<=1) { -			u64 M = (u64)(X>>(8*sizeof(long)-1)); -			Z.hi ^= V.hi&M; -			Z.lo ^= V.lo&M; - -			REDUCE1BIT(V); -		} -	} - -	if (is_endian.little) { -#ifdef BSWAP8 -		Xi[0] = BSWAP8(Z.hi); -		Xi[1] = BSWAP8(Z.lo); -#else -		u8 *p = (u8 *)Xi; -		u32 v; -		v = (u32)(Z.hi>>32);	PUTU32(p,v); -		v = (u32)(Z.hi);	PUTU32(p+4,v); -		v = (u32)(Z.lo>>32);	PUTU32(p+8,v); -		v = (u32)(Z.lo);	PUTU32(p+12,v); -#endif -	} -	else { -		Xi[0] = Z.hi; -		Xi[1] = Z.lo; -	} +    u128 V, Z = { 0, 0 }; +    long X; +    int i, j; +    const long *xi = (const long *)Xi; +    const union { +        long one; +        char little; +    } is_endian = { +        1 +    }; + +    V.hi = H[0];                /* H is in host 
byte order, no byte swapping */ +    V.lo = H[1]; + +    for (j = 0; j < 16 / sizeof(long); ++j) { +        if (is_endian.little) { +            if (sizeof(long) == 8) { +# ifdef BSWAP8 +                X = (long)(BSWAP8(xi[j])); +# else +                const u8 *p = (const u8 *)(xi + j); +                X = (long)((u64)GETU32(p) << 32 | GETU32(p + 4)); +# endif +            } else { +                const u8 *p = (const u8 *)(xi + j); +                X = (long)GETU32(p); +            } +        } else +            X = xi[j]; + +        for (i = 0; i < 8 * sizeof(long); ++i, X <<= 1) { +            u64 M = (u64)(X >> (8 * sizeof(long) - 1)); +            Z.hi ^= V.hi & M; +            Z.lo ^= V.lo & M; + +            REDUCE1BIT(V); +        } +    } + +    if (is_endian.little) { +# ifdef BSWAP8 +        Xi[0] = BSWAP8(Z.hi); +        Xi[1] = BSWAP8(Z.lo); +# else +        u8 *p = (u8 *)Xi; +        u32 v; +        v = (u32)(Z.hi >> 32); +        PUTU32(p, v); +        v = (u32)(Z.hi); +        PUTU32(p + 4, v); +        v = (u32)(Z.lo >> 32); +        PUTU32(p + 8, v); +        v = (u32)(Z.lo); +        PUTU32(p + 12, v); +# endif +    } else { +        Xi[0] = Z.hi; +        Xi[1] = Z.lo; +    }  } -#define GCM_MUL(ctx,Xi)	  gcm_gmult_1bit(ctx->Xi.u,ctx->H.u) + +# define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)  #endif -#if	TABLE_BITS==4 && defined(GHASH_ASM) -# if	!defined(I386_ONLY) && \ -	(defined(__i386)	|| defined(__i386__)	|| \ -	 defined(__x86_64)	|| defined(__x86_64__)	|| \ -	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64)) +#if     TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ)) +# if    !defined(I386_ONLY) && \ +        (defined(__i386)        || defined(__i386__)    || \ +         defined(__x86_64)      || defined(__x86_64__)  || \ +         defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))  #  define GHASH_ASM_X86_OR_64  #  define GCM_FUNCREF_4BIT  extern unsigned int OPENSSL_ia32cap_P[2]; 
-void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]); -void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]); -void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); +void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]); +void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]); +void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp, +                     size_t len); -#  if	defined(__i386) || defined(__i386__) || defined(_M_IX86) +#  if defined(__i386) || defined(__i386__) || defined(_M_IX86) +#   define gcm_init_avx   gcm_init_clmul +#   define gcm_gmult_avx  gcm_gmult_clmul +#   define gcm_ghash_avx  gcm_ghash_clmul +#  else +void gcm_init_avx(u128 Htable[16], const u64 Xi[2]); +void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16]); +void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *inp, +                   size_t len); +#  endif + +#  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)  #   define GHASH_ASM_X86 -void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]); -void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); +void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]); +void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp, +                        size_t len); -void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]); -void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); +void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]); +void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp, +                        size_t len);  #  endif -# elif defined(__arm__) || defined(__arm) +# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)  #  include "arm_arch.h" -#  if __ARM_ARCH__>=7 +#  if __ARM_MAX_ARCH__>=7  #   define GHASH_ASM_ARM  #   define GCM_FUNCREF_4BIT -void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]); -void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t 
len); +#   define PMULL_CAPABLE        (OPENSSL_armcap_P & ARMV8_PMULL) +#   if defined(__arm__) || defined(__arm) +#    define NEON_CAPABLE        (OPENSSL_armcap_P & ARMV7_NEON) +#   endif +void gcm_init_neon(u128 Htable[16], const u64 Xi[2]); +void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]); +void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp, +                    size_t len); +void gcm_init_v8(u128 Htable[16], const u64 Xi[2]); +void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]); +void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp, +                  size_t len);  #  endif +# elif defined(__sparc__) || defined(__sparc) +#  include "sparc_arch.h" +#  define GHASH_ASM_SPARC +#  define GCM_FUNCREF_4BIT +extern unsigned int OPENSSL_sparcv9cap_P[]; +void gcm_init_vis3(u128 Htable[16], const u64 Xi[2]); +void gcm_gmult_vis3(u64 Xi[2], const u128 Htable[16]); +void gcm_ghash_vis3(u64 Xi[2], const u128 Htable[16], const u8 *inp, +                    size_t len); +# elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC)) +#  include "ppc_arch.h" +#  define GHASH_ASM_PPC +#  define GCM_FUNCREF_4BIT +void gcm_init_p8(u128 Htable[16], const u64 Xi[2]); +void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]); +void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp, +                  size_t len);  # endif  #endif  #ifdef GCM_FUNCREF_4BIT  # undef  GCM_MUL -# define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable) +# define GCM_MUL(ctx,Xi)        (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)  # ifdef GHASH  #  undef  GHASH -#  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len) +#  define GHASH(ctx,in,len)     (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)  # endif  #endif -void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block) +void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)  { -	const union { long one; char little; } is_endian = 
{1}; +    const union { +        long one; +        char little; +    } is_endian = { +        1 +    }; -	memset(ctx,0,sizeof(*ctx)); -	ctx->block = block; -	ctx->key   = key; +    memset(ctx, 0, sizeof(*ctx)); +    ctx->block = block; +    ctx->key = key; -	(*block)(ctx->H.c,ctx->H.c,key); +    (*block) (ctx->H.c, ctx->H.c, key); -	if (is_endian.little) { -		/* H is stored in host byte order */ +    if (is_endian.little) { +        /* H is stored in host byte order */  #ifdef BSWAP8 -		ctx->H.u[0] = BSWAP8(ctx->H.u[0]); -		ctx->H.u[1] = BSWAP8(ctx->H.u[1]); +        ctx->H.u[0] = BSWAP8(ctx->H.u[0]); +        ctx->H.u[1] = BSWAP8(ctx->H.u[1]);  #else -		u8 *p = ctx->H.c; -		u64 hi,lo; -		hi = (u64)GETU32(p)  <<32|GETU32(p+4); -		lo = (u64)GETU32(p+8)<<32|GETU32(p+12); -		ctx->H.u[0] = hi; -		ctx->H.u[1] = lo; +        u8 *p = ctx->H.c; +        u64 hi, lo; +        hi = (u64)GETU32(p) << 32 | GETU32(p + 4); +        lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12); +        ctx->H.u[0] = hi; +        ctx->H.u[1] = lo;  #endif -	} - -#if	TABLE_BITS==8 -	gcm_init_8bit(ctx->Htable,ctx->H.u); -#elif	TABLE_BITS==4 -# if	defined(GHASH_ASM_X86_OR_64) -#  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2) -	if (OPENSSL_ia32cap_P[0]&(1<<24) &&	/* check FXSR bit */ -	    OPENSSL_ia32cap_P[1]&(1<<1) ) {	/* check PCLMULQDQ bit */ -		gcm_init_clmul(ctx->Htable,ctx->H.u); -		ctx->gmult = gcm_gmult_clmul; -		ctx->ghash = gcm_ghash_clmul; -		return; -	} +    } +#if     TABLE_BITS==8 +    gcm_init_8bit(ctx->Htable, ctx->H.u); +#elif   TABLE_BITS==4 +# if    defined(GHASH_ASM_X86_OR_64) +#  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2) +    if (OPENSSL_ia32cap_P[0] & (1 << 24) && /* check FXSR bit */ +        OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */ +        if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */ +            gcm_init_avx(ctx->Htable, ctx->H.u); +            ctx->gmult = gcm_gmult_avx; +            ctx->ghash = 
gcm_ghash_avx; +        } else { +            gcm_init_clmul(ctx->Htable, ctx->H.u); +            ctx->gmult = gcm_gmult_clmul; +            ctx->ghash = gcm_ghash_clmul; +        } +        return; +    }  #  endif -	gcm_init_4bit(ctx->Htable,ctx->H.u); -#  if	defined(GHASH_ASM_X86)			/* x86 only */ -#   if	defined(OPENSSL_IA32_SSE2) -	if (OPENSSL_ia32cap_P[0]&(1<<25)) {	/* check SSE bit */ +    gcm_init_4bit(ctx->Htable, ctx->H.u); +#  if   defined(GHASH_ASM_X86)  /* x86 only */ +#   if  defined(OPENSSL_IA32_SSE2) +    if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */  #   else -	if (OPENSSL_ia32cap_P[0]&(1<<23)) {	/* check MMX bit */ +    if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */  #   endif -		ctx->gmult = gcm_gmult_4bit_mmx; -		ctx->ghash = gcm_ghash_4bit_mmx; -	} else { -		ctx->gmult = gcm_gmult_4bit_x86; -		ctx->ghash = gcm_ghash_4bit_x86; -	} +        ctx->gmult = gcm_gmult_4bit_mmx; +        ctx->ghash = gcm_ghash_4bit_mmx; +    } else { +        ctx->gmult = gcm_gmult_4bit_x86; +        ctx->ghash = gcm_ghash_4bit_x86; +    }  #  else -	ctx->gmult = gcm_gmult_4bit; -	ctx->ghash = gcm_ghash_4bit; +    ctx->gmult = gcm_gmult_4bit; +    ctx->ghash = gcm_ghash_4bit; +#  endif +# elif  defined(GHASH_ASM_ARM) +#  ifdef PMULL_CAPABLE +    if (PMULL_CAPABLE) { +        gcm_init_v8(ctx->Htable, ctx->H.u); +        ctx->gmult = gcm_gmult_v8; +        ctx->ghash = gcm_ghash_v8; +    } else  #  endif -# elif	defined(GHASH_ASM_ARM) -	if (OPENSSL_armcap_P & ARMV7_NEON) { -		ctx->gmult = gcm_gmult_neon; -		ctx->ghash = gcm_ghash_neon; -	} else { -		gcm_init_4bit(ctx->Htable,ctx->H.u); -		ctx->gmult = gcm_gmult_4bit; -		ctx->ghash = gcm_ghash_4bit; -	} +#  ifdef NEON_CAPABLE +    if (NEON_CAPABLE) { +        gcm_init_neon(ctx->Htable, ctx->H.u); +        ctx->gmult = gcm_gmult_neon; +        ctx->ghash = gcm_ghash_neon; +    } else +#  endif +    { +        gcm_init_4bit(ctx->Htable, ctx->H.u); +        ctx->gmult = gcm_gmult_4bit; +        
ctx->ghash = gcm_ghash_4bit; +    } +# elif  defined(GHASH_ASM_SPARC) +    if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) { +        gcm_init_vis3(ctx->Htable, ctx->H.u); +        ctx->gmult = gcm_gmult_vis3; +        ctx->ghash = gcm_ghash_vis3; +    } else { +        gcm_init_4bit(ctx->Htable, ctx->H.u); +        ctx->gmult = gcm_gmult_4bit; +        ctx->ghash = gcm_ghash_4bit; +    } +# elif  defined(GHASH_ASM_PPC) +    if (OPENSSL_ppccap_P & PPC_CRYPTO207) { +        gcm_init_p8(ctx->Htable, ctx->H.u); +        ctx->gmult = gcm_gmult_p8; +        ctx->ghash = gcm_ghash_p8; +    } else { +        gcm_init_4bit(ctx->Htable, ctx->H.u); +        ctx->gmult = gcm_gmult_4bit; +        ctx->ghash = gcm_ghash_4bit; +    }  # else -	gcm_init_4bit(ctx->Htable,ctx->H.u); +    gcm_init_4bit(ctx->Htable, ctx->H.u);  # endif  #endif  } -void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len) +void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv, +                         size_t len)  { -	const union { long one; char little; } is_endian = {1}; -	unsigned int ctr; +    const union { +        long one; +        char little; +    } is_endian = { +        1 +    }; +    unsigned int ctr;  #ifdef GCM_FUNCREF_4BIT -	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult; -#endif - -	ctx->Yi.u[0]  = 0; -	ctx->Yi.u[1]  = 0; -	ctx->Xi.u[0]  = 0; -	ctx->Xi.u[1]  = 0; -	ctx->len.u[0] = 0;	/* AAD length */ -	ctx->len.u[1] = 0;	/* message length */ -	ctx->ares = 0; -	ctx->mres = 0; - -	if (len==12) { -		memcpy(ctx->Yi.c,iv,12); -		ctx->Yi.c[15]=1; -		ctr=1; -	} -	else { -		size_t i; -		u64 len0 = len; - -		while (len>=16) { -			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i]; -			GCM_MUL(ctx,Yi); -			iv += 16; -			len -= 16; -		} -		if (len) { -			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i]; -			GCM_MUL(ctx,Yi); -		} -		len0 <<= 3; -		if (is_endian.little) { +    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult; +#endif + +  
  ctx->Yi.u[0] = 0; +    ctx->Yi.u[1] = 0; +    ctx->Xi.u[0] = 0; +    ctx->Xi.u[1] = 0; +    ctx->len.u[0] = 0;          /* AAD length */ +    ctx->len.u[1] = 0;          /* message length */ +    ctx->ares = 0; +    ctx->mres = 0; + +    if (len == 12) { +        memcpy(ctx->Yi.c, iv, 12); +        ctx->Yi.c[15] = 1; +        ctr = 1; +    } else { +        size_t i; +        u64 len0 = len; + +        while (len >= 16) { +            for (i = 0; i < 16; ++i) +                ctx->Yi.c[i] ^= iv[i]; +            GCM_MUL(ctx, Yi); +            iv += 16; +            len -= 16; +        } +        if (len) { +            for (i = 0; i < len; ++i) +                ctx->Yi.c[i] ^= iv[i]; +            GCM_MUL(ctx, Yi); +        } +        len0 <<= 3; +        if (is_endian.little) {  #ifdef BSWAP8 -			ctx->Yi.u[1]  ^= BSWAP8(len0); +            ctx->Yi.u[1] ^= BSWAP8(len0);  #else -			ctx->Yi.c[8]  ^= (u8)(len0>>56); -			ctx->Yi.c[9]  ^= (u8)(len0>>48); -			ctx->Yi.c[10] ^= (u8)(len0>>40); -			ctx->Yi.c[11] ^= (u8)(len0>>32); -			ctx->Yi.c[12] ^= (u8)(len0>>24); -			ctx->Yi.c[13] ^= (u8)(len0>>16); -			ctx->Yi.c[14] ^= (u8)(len0>>8); -			ctx->Yi.c[15] ^= (u8)(len0); +            ctx->Yi.c[8] ^= (u8)(len0 >> 56); +            ctx->Yi.c[9] ^= (u8)(len0 >> 48); +            ctx->Yi.c[10] ^= (u8)(len0 >> 40); +            ctx->Yi.c[11] ^= (u8)(len0 >> 32); +            ctx->Yi.c[12] ^= (u8)(len0 >> 24); +            ctx->Yi.c[13] ^= (u8)(len0 >> 16); +            ctx->Yi.c[14] ^= (u8)(len0 >> 8); +            ctx->Yi.c[15] ^= (u8)(len0);  #endif -		} -		else -			ctx->Yi.u[1]  ^= len0; +        } else +            ctx->Yi.u[1] ^= len0; -		GCM_MUL(ctx,Yi); +        GCM_MUL(ctx, Yi); -		if (is_endian.little) +        if (is_endian.little)  #ifdef BSWAP4 -			ctr = BSWAP4(ctx->Yi.d[3]); +            ctr = BSWAP4(ctx->Yi.d[3]);  #else -			ctr = GETU32(ctx->Yi.c+12); +            ctr = GETU32(ctx->Yi.c + 12);  #endif -		else -			ctr = ctx->Yi.d[3]; -	} +        else +            
ctr = ctx->Yi.d[3]; +    } -	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key); -	++ctr; -	if (is_endian.little) +    (*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key); +    ++ctr; +    if (is_endian.little)  #ifdef BSWAP4 -		ctx->Yi.d[3] = BSWAP4(ctr); +        ctx->Yi.d[3] = BSWAP4(ctr);  #else -		PUTU32(ctx->Yi.c+12,ctr); +        PUTU32(ctx->Yi.c + 12, ctr);  #endif -	else -		ctx->Yi.d[3] = ctr; +    else +        ctx->Yi.d[3] = ctr;  } -int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len) +int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad, +                      size_t len)  { -	size_t i; -	unsigned int n; -	u64 alen = ctx->len.u[0]; +    size_t i; +    unsigned int n; +    u64 alen = ctx->len.u[0];  #ifdef GCM_FUNCREF_4BIT -	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult; +    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;  # ifdef GHASH -	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16], -				const u8 *inp,size_t len)	= ctx->ghash; +    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16], +                         const u8 *inp, size_t len) = ctx->ghash;  # endif  #endif -	if (ctx->len.u[1]) return -2; - -	alen += len; -	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len)) -		return -1; -	ctx->len.u[0] = alen; - -	n = ctx->ares; -	if (n) { -		while (n && len) { -			ctx->Xi.c[n] ^= *(aad++); -			--len; -			n = (n+1)%16; -		} -		if (n==0) GCM_MUL(ctx,Xi); -		else { -			ctx->ares = n; -			return 0; -		} -	} - +    if (ctx->len.u[1]) +        return -2; + +    alen += len; +    if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len)) +        return -1; +    ctx->len.u[0] = alen; + +    n = ctx->ares; +    if (n) { +        while (n && len) { +            ctx->Xi.c[n] ^= *(aad++); +            --len; +            n = (n + 1) % 16; +        } +        if (n == 0) +            GCM_MUL(ctx, Xi); +        else { +            ctx->ares = n; +            return 0; +        } +   
 }  #ifdef GHASH -	if ((i = (len&(size_t)-16))) { -		GHASH(ctx,aad,i); -		aad += i; -		len -= i; -	} +    if ((i = (len & (size_t)-16))) { +        GHASH(ctx, aad, i); +        aad += i; +        len -= i; +    }  #else -	while (len>=16) { -		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i]; -		GCM_MUL(ctx,Xi); -		aad += 16; -		len -= 16; -	} +    while (len >= 16) { +        for (i = 0; i < 16; ++i) +            ctx->Xi.c[i] ^= aad[i]; +        GCM_MUL(ctx, Xi); +        aad += 16; +        len -= 16; +    }  #endif -	if (len) { -		n = (unsigned int)len; -		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i]; -	} +    if (len) { +        n = (unsigned int)len; +        for (i = 0; i < len; ++i) +            ctx->Xi.c[i] ^= aad[i]; +    } -	ctx->ares = n; -	return 0; +    ctx->ares = n; +    return 0;  }  int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx, -		const unsigned char *in, unsigned char *out, -		size_t len) +                          const unsigned char *in, unsigned char *out, +                          size_t len)  { -	const union { long one; char little; } is_endian = {1}; -	unsigned int n, ctr; -	size_t i; -	u64        mlen  = ctx->len.u[1]; -	block128_f block = ctx->block; -	void      *key   = ctx->key; +    const union { +        long one; +        char little; +    } is_endian = { +        1 +    }; +    unsigned int n, ctr; +    size_t i; +    u64 mlen = ctx->len.u[1]; +    block128_f block = ctx->block; +    void *key = ctx->key;  #ifdef GCM_FUNCREF_4BIT -	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult; +    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;  # ifdef GHASH -	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16], -				const u8 *inp,size_t len)	= ctx->ghash; +    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16], +                         const u8 *inp, size_t len) = ctx->ghash;  # endif  #endif  #if 0 -	n = (unsigned int)mlen%16; /* alternative to ctx->mres */ +    n = (unsigned int)mlen % 16; /* alternative 
to ctx->mres */  #endif -	mlen += len; -	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len)) -		return -1; -	ctx->len.u[1] = mlen; - -	if (ctx->ares) { -		/* First call to encrypt finalizes GHASH(AAD) */ -		GCM_MUL(ctx,Xi); -		ctx->ares = 0; -	} - -	if (is_endian.little) -#ifdef BSWAP4 -		ctr = BSWAP4(ctx->Yi.d[3]); -#else -		ctr = GETU32(ctx->Yi.c+12); -#endif -	else -		ctr = ctx->Yi.d[3]; +    mlen += len; +    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len)) +        return -1; +    ctx->len.u[1] = mlen; -	n = ctx->mres; -#if !defined(OPENSSL_SMALL_FOOTPRINT) -	if (16%sizeof(size_t) == 0) do {	/* always true actually */ -		if (n) { -			while (n && len) { -				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n]; -				--len; -				n = (n+1)%16; -			} -			if (n==0) GCM_MUL(ctx,Xi); -			else { -				ctx->mres = n; -				return 0; -			} -		} -#if defined(STRICT_ALIGNMENT) -		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0) -			break; -#endif -#if defined(GHASH) && defined(GHASH_CHUNK) -		while (len>=GHASH_CHUNK) { -		    size_t j=GHASH_CHUNK; - -		    while (j) { -		    	size_t *out_t=(size_t *)out; -		    	const size_t *in_t=(const size_t *)in; +    if (ctx->ares) { +        /* First call to encrypt finalizes GHASH(AAD) */ +        GCM_MUL(ctx, Xi); +        ctx->ares = 0; +    } -			(*block)(ctx->Yi.c,ctx->EKi.c,key); -			++ctr; -			if (is_endian.little) +    if (is_endian.little)  #ifdef BSWAP4 -				ctx->Yi.d[3] = BSWAP4(ctr); +        ctr = BSWAP4(ctx->Yi.d[3]);  #else -				PUTU32(ctx->Yi.c+12,ctr); +        ctr = GETU32(ctx->Yi.c + 12);  #endif -			else -				ctx->Yi.d[3] = ctr; -			for (i=0; i<16/sizeof(size_t); ++i) -				out_t[i] = in_t[i] ^ ctx->EKi.t[i]; -			out += 16; -			in  += 16; -			j   -= 16; -		    } -		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK); -		    len -= GHASH_CHUNK; -		} -		if ((i = (len&(size_t)-16))) { -		    size_t j=i; - -		    while (len>=16) { -		    	size_t *out_t=(size_t *)out; -		    	const size_t *in_t=(const 
size_t *)in; - -			(*block)(ctx->Yi.c,ctx->EKi.c,key); -			++ctr; -			if (is_endian.little) -#ifdef BSWAP4 -				ctx->Yi.d[3] = BSWAP4(ctr); -#else -				PUTU32(ctx->Yi.c+12,ctr); -#endif -			else -				ctx->Yi.d[3] = ctr; -			for (i=0; i<16/sizeof(size_t); ++i) -				out_t[i] = in_t[i] ^ ctx->EKi.t[i]; -			out += 16; -			in  += 16; -			len -= 16; -		    } -		    GHASH(ctx,out-j,j); -		} -#else -		while (len>=16) { -		    	size_t *out_t=(size_t *)out; -		    	const size_t *in_t=(const size_t *)in; +    else +        ctr = ctx->Yi.d[3]; -			(*block)(ctx->Yi.c,ctx->EKi.c,key); -			++ctr; -			if (is_endian.little) -#ifdef BSWAP4 -				ctx->Yi.d[3] = BSWAP4(ctr); -#else -				PUTU32(ctx->Yi.c+12,ctr); -#endif -			else -				ctx->Yi.d[3] = ctr; -			for (i=0; i<16/sizeof(size_t); ++i) -				ctx->Xi.t[i] ^= -				out_t[i] = in_t[i]^ctx->EKi.t[i]; -			GCM_MUL(ctx,Xi); -			out += 16; -			in  += 16; -			len -= 16; -		} -#endif -		if (len) { -			(*block)(ctx->Yi.c,ctx->EKi.c,key); -			++ctr; -			if (is_endian.little) -#ifdef BSWAP4 -				ctx->Yi.d[3] = BSWAP4(ctr); -#else -				PUTU32(ctx->Yi.c+12,ctr); -#endif -			else -				ctx->Yi.d[3] = ctr; -			while (len--) { -				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n]; -				++n; -			} -		} - -		ctx->mres = n; -		return 0; -	} while(0); +    n = ctx->mres; +#if !defined(OPENSSL_SMALL_FOOTPRINT) +    if (16 % sizeof(size_t) == 0) { /* always true actually */ +        do { +            if (n) { +                while (n && len) { +                    ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n]; +                    --len; +                    n = (n + 1) % 16; +                } +                if (n == 0) +                    GCM_MUL(ctx, Xi); +                else { +                    ctx->mres = n; +                    return 0; +                } +            } +# if defined(STRICT_ALIGNMENT) +            if (((size_t)in | (size_t)out) % sizeof(size_t) != 0) +                break; +# endif +# if defined(GHASH) && 
defined(GHASH_CHUNK) +            while (len >= GHASH_CHUNK) { +                size_t j = GHASH_CHUNK; + +                while (j) { +                    size_t *out_t = (size_t *)out; +                    const size_t *in_t = (const size_t *)in; + +                    (*block) (ctx->Yi.c, ctx->EKi.c, key); +                    ++ctr; +                    if (is_endian.little) +#  ifdef BSWAP4 +                        ctx->Yi.d[3] = BSWAP4(ctr); +#  else +                        PUTU32(ctx->Yi.c + 12, ctr); +#  endif +                    else +                        ctx->Yi.d[3] = ctr; +                    for (i = 0; i < 16 / sizeof(size_t); ++i) +                        out_t[i] = in_t[i] ^ ctx->EKi.t[i]; +                    out += 16; +                    in += 16; +                    j -= 16; +                } +                GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK); +                len -= GHASH_CHUNK; +            } +            if ((i = (len & (size_t)-16))) { +                size_t j = i; + +                while (len >= 16) { +                    size_t *out_t = (size_t *)out; +                    const size_t *in_t = (const size_t *)in; + +                    (*block) (ctx->Yi.c, ctx->EKi.c, key); +                    ++ctr; +                    if (is_endian.little) +#  ifdef BSWAP4 +                        ctx->Yi.d[3] = BSWAP4(ctr); +#  else +                        PUTU32(ctx->Yi.c + 12, ctr); +#  endif +                    else +                        ctx->Yi.d[3] = ctr; +                    for (i = 0; i < 16 / sizeof(size_t); ++i) +                        out_t[i] = in_t[i] ^ ctx->EKi.t[i]; +                    out += 16; +                    in += 16; +                    len -= 16; +                } +                GHASH(ctx, out - j, j); +            } +# else +            while (len >= 16) { +                size_t *out_t = (size_t *)out; +                const size_t *in_t = (const size_t *)in; + +                (*block) 
(ctx->Yi.c, ctx->EKi.c, key); +                ++ctr; +                if (is_endian.little) +#  ifdef BSWAP4 +                    ctx->Yi.d[3] = BSWAP4(ctr); +#  else +                    PUTU32(ctx->Yi.c + 12, ctr); +#  endif +                else +                    ctx->Yi.d[3] = ctr; +                for (i = 0; i < 16 / sizeof(size_t); ++i) +                    ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i]; +                GCM_MUL(ctx, Xi); +                out += 16; +                in += 16; +                len -= 16; +            } +# endif +            if (len) { +                (*block) (ctx->Yi.c, ctx->EKi.c, key); +                ++ctr; +                if (is_endian.little) +# ifdef BSWAP4 +                    ctx->Yi.d[3] = BSWAP4(ctr); +# else +                    PUTU32(ctx->Yi.c + 12, ctr); +# endif +                else +                    ctx->Yi.d[3] = ctr; +                while (len--) { +                    ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n]; +                    ++n; +                } +            } + +            ctx->mres = n; +            return 0; +        } while (0); +    }  #endif -	for (i=0;i<len;++i) { -		if (n==0) { -			(*block)(ctx->Yi.c,ctx->EKi.c,key); -			++ctr; -			if (is_endian.little) +    for (i = 0; i < len; ++i) { +        if (n == 0) { +            (*block) (ctx->Yi.c, ctx->EKi.c, key); +            ++ctr; +            if (is_endian.little)  #ifdef BSWAP4 -				ctx->Yi.d[3] = BSWAP4(ctr); +                ctx->Yi.d[3] = BSWAP4(ctr);  #else -				PUTU32(ctx->Yi.c+12,ctr); -#endif -			else -				ctx->Yi.d[3] = ctr; -		} -		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n]; -		n = (n+1)%16; -		if (n==0) -			GCM_MUL(ctx,Xi); -	} - -	ctx->mres = n; -	return 0; +                PUTU32(ctx->Yi.c + 12, ctr); +#endif +            else +                ctx->Yi.d[3] = ctr; +        } +        ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n]; +        n = (n + 1) % 16; +        if (n == 0) +            GCM_MUL(ctx, 
Xi); +    } + +    ctx->mres = n; +    return 0;  }  int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx, -		const unsigned char *in, unsigned char *out, -		size_t len) +                          const unsigned char *in, unsigned char *out, +                          size_t len)  { -	const union { long one; char little; } is_endian = {1}; -	unsigned int n, ctr; -	size_t i; -	u64        mlen  = ctx->len.u[1]; -	block128_f block = ctx->block; -	void      *key   = ctx->key; +    const union { +        long one; +        char little; +    } is_endian = { +        1 +    }; +    unsigned int n, ctr; +    size_t i; +    u64 mlen = ctx->len.u[1]; +    block128_f block = ctx->block; +    void *key = ctx->key;  #ifdef GCM_FUNCREF_4BIT -	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult; +    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;  # ifdef GHASH -	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16], -				const u8 *inp,size_t len)	= ctx->ghash; +    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16], +                         const u8 *inp, size_t len) = ctx->ghash;  # endif  #endif -	mlen += len; -	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len)) -		return -1; -	ctx->len.u[1] = mlen; +    mlen += len; +    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len)) +        return -1; +    ctx->len.u[1] = mlen; -	if (ctx->ares) { -		/* First call to decrypt finalizes GHASH(AAD) */ -		GCM_MUL(ctx,Xi); -		ctx->ares = 0; -	} +    if (ctx->ares) { +        /* First call to decrypt finalizes GHASH(AAD) */ +        GCM_MUL(ctx, Xi); +        ctx->ares = 0; +    } -	if (is_endian.little) +    if (is_endian.little)  #ifdef BSWAP4 -		ctr = BSWAP4(ctx->Yi.d[3]); +        ctr = BSWAP4(ctx->Yi.d[3]);  #else -		ctr = GETU32(ctx->Yi.c+12); +        ctr = GETU32(ctx->Yi.c + 12);  #endif -	else -		ctr = ctx->Yi.d[3]; +    else +        ctr = ctx->Yi.d[3]; -	n = ctx->mres; +    n = ctx->mres;  #if 
!defined(OPENSSL_SMALL_FOOTPRINT) -	if (16%sizeof(size_t) == 0) do {	/* always true actually */ -		if (n) { -			while (n && len) { -				u8 c = *(in++); -				*(out++) = c^ctx->EKi.c[n]; -				ctx->Xi.c[n] ^= c; -				--len; -				n = (n+1)%16; -			} -			if (n==0) GCM_MUL (ctx,Xi); -			else { -				ctx->mres = n; -				return 0; -			} -		} -#if defined(STRICT_ALIGNMENT) -		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0) -			break; -#endif -#if defined(GHASH) && defined(GHASH_CHUNK) -		while (len>=GHASH_CHUNK) { -		    size_t j=GHASH_CHUNK; - -		    GHASH(ctx,in,GHASH_CHUNK); -		    while (j) { -		    	size_t *out_t=(size_t *)out; -		    	const size_t *in_t=(const size_t *)in; - -			(*block)(ctx->Yi.c,ctx->EKi.c,key); -			++ctr; -			if (is_endian.little) -#ifdef BSWAP4 -				ctx->Yi.d[3] = BSWAP4(ctr); -#else -				PUTU32(ctx->Yi.c+12,ctr); +    if (16 % sizeof(size_t) == 0) { /* always true actually */ +        do { +            if (n) { +                while (n && len) { +                    u8 c = *(in++); +                    *(out++) = c ^ ctx->EKi.c[n]; +                    ctx->Xi.c[n] ^= c; +                    --len; +                    n = (n + 1) % 16; +                } +                if (n == 0) +                    GCM_MUL(ctx, Xi); +                else { +                    ctx->mres = n; +                    return 0; +                } +            } +# if defined(STRICT_ALIGNMENT) +            if (((size_t)in | (size_t)out) % sizeof(size_t) != 0) +                break; +# endif +# if defined(GHASH) && defined(GHASH_CHUNK) +            while (len >= GHASH_CHUNK) { +                size_t j = GHASH_CHUNK; + +                GHASH(ctx, in, GHASH_CHUNK); +                while (j) { +                    size_t *out_t = (size_t *)out; +                    const size_t *in_t = (const size_t *)in; + +                    (*block) (ctx->Yi.c, ctx->EKi.c, key); +                    ++ctr; +                    if (is_endian.little) +#  ifdef BSWAP4 +          
              ctx->Yi.d[3] = BSWAP4(ctr); +#  else +                        PUTU32(ctx->Yi.c + 12, ctr); +#  endif +                    else +                        ctx->Yi.d[3] = ctr; +                    for (i = 0; i < 16 / sizeof(size_t); ++i) +                        out_t[i] = in_t[i] ^ ctx->EKi.t[i]; +                    out += 16; +                    in += 16; +                    j -= 16; +                } +                len -= GHASH_CHUNK; +            } +            if ((i = (len & (size_t)-16))) { +                GHASH(ctx, in, i); +                while (len >= 16) { +                    size_t *out_t = (size_t *)out; +                    const size_t *in_t = (const size_t *)in; + +                    (*block) (ctx->Yi.c, ctx->EKi.c, key); +                    ++ctr; +                    if (is_endian.little) +#  ifdef BSWAP4 +                        ctx->Yi.d[3] = BSWAP4(ctr); +#  else +                        PUTU32(ctx->Yi.c + 12, ctr); +#  endif +                    else +                        ctx->Yi.d[3] = ctr; +                    for (i = 0; i < 16 / sizeof(size_t); ++i) +                        out_t[i] = in_t[i] ^ ctx->EKi.t[i]; +                    out += 16; +                    in += 16; +                    len -= 16; +                } +            } +# else +            while (len >= 16) { +                size_t *out_t = (size_t *)out; +                const size_t *in_t = (const size_t *)in; + +                (*block) (ctx->Yi.c, ctx->EKi.c, key); +                ++ctr; +                if (is_endian.little) +#  ifdef BSWAP4 +                    ctx->Yi.d[3] = BSWAP4(ctr); +#  else +                    PUTU32(ctx->Yi.c + 12, ctr); +#  endif +                else +                    ctx->Yi.d[3] = ctr; +                for (i = 0; i < 16 / sizeof(size_t); ++i) { +                    size_t c = in[i]; +                    out[i] = c ^ ctx->EKi.t[i]; +                    ctx->Xi.t[i] ^= c; +                } +                
GCM_MUL(ctx, Xi); +                out += 16; +                in += 16; +                len -= 16; +            } +# endif +            if (len) { +                (*block) (ctx->Yi.c, ctx->EKi.c, key); +                ++ctr; +                if (is_endian.little) +# ifdef BSWAP4 +                    ctx->Yi.d[3] = BSWAP4(ctr); +# else +                    PUTU32(ctx->Yi.c + 12, ctr); +# endif +                else +                    ctx->Yi.d[3] = ctr; +                while (len--) { +                    u8 c = in[n]; +                    ctx->Xi.c[n] ^= c; +                    out[n] = c ^ ctx->EKi.c[n]; +                    ++n; +                } +            } + +            ctx->mres = n; +            return 0; +        } while (0); +    }  #endif -			else -				ctx->Yi.d[3] = ctr; -			for (i=0; i<16/sizeof(size_t); ++i) -				out_t[i] = in_t[i]^ctx->EKi.t[i]; -			out += 16; -			in  += 16; -			j   -= 16; -		    } -		    len -= GHASH_CHUNK; -		} -		if ((i = (len&(size_t)-16))) { -		    GHASH(ctx,in,i); -		    while (len>=16) { -		    	size_t *out_t=(size_t *)out; -		    	const size_t *in_t=(const size_t *)in; - -			(*block)(ctx->Yi.c,ctx->EKi.c,key); -			++ctr; -			if (is_endian.little) +    for (i = 0; i < len; ++i) { +        u8 c; +        if (n == 0) { +            (*block) (ctx->Yi.c, ctx->EKi.c, key); +            ++ctr; +            if (is_endian.little)  #ifdef BSWAP4 -				ctx->Yi.d[3] = BSWAP4(ctr); +                ctx->Yi.d[3] = BSWAP4(ctr);  #else -				PUTU32(ctx->Yi.c+12,ctr); -#endif -			else -				ctx->Yi.d[3] = ctr; -			for (i=0; i<16/sizeof(size_t); ++i) -				out_t[i] = in_t[i]^ctx->EKi.t[i]; -			out += 16; -			in  += 16; -			len -= 16; -		    } -		} -#else -		while (len>=16) { -		    	size_t *out_t=(size_t *)out; -		    	const size_t *in_t=(const size_t *)in; +                PUTU32(ctx->Yi.c + 12, ctr); +#endif +            else +                ctx->Yi.d[3] = ctr; +        } +        c = in[i]; +        out[i] = c ^ ctx->EKi.c[n]; +      
  ctx->Xi.c[n] ^= c; +        n = (n + 1) % 16; +        if (n == 0) +            GCM_MUL(ctx, Xi); +    } -			(*block)(ctx->Yi.c,ctx->EKi.c,key); -			++ctr; -			if (is_endian.little) -#ifdef BSWAP4 -				ctx->Yi.d[3] = BSWAP4(ctr); -#else -				PUTU32(ctx->Yi.c+12,ctr); -#endif -			else -				ctx->Yi.d[3] = ctr; -			for (i=0; i<16/sizeof(size_t); ++i) { -				size_t c = in[i]; -				out[i] = c^ctx->EKi.t[i]; -				ctx->Xi.t[i] ^= c; -			} -			GCM_MUL(ctx,Xi); -			out += 16; -			in  += 16; -			len -= 16; -		} -#endif -		if (len) { -			(*block)(ctx->Yi.c,ctx->EKi.c,key); -			++ctr; -			if (is_endian.little) -#ifdef BSWAP4 -				ctx->Yi.d[3] = BSWAP4(ctr); -#else -				PUTU32(ctx->Yi.c+12,ctr); -#endif -			else -				ctx->Yi.d[3] = ctr; -			while (len--) { -				u8 c = in[n]; -				ctx->Xi.c[n] ^= c; -				out[n] = c^ctx->EKi.c[n]; -				++n; -			} -		} - -		ctx->mres = n; -		return 0; -	} while(0); -#endif -	for (i=0;i<len;++i) { -		u8 c; -		if (n==0) { -			(*block)(ctx->Yi.c,ctx->EKi.c,key); -			++ctr; -			if (is_endian.little) -#ifdef BSWAP4 -				ctx->Yi.d[3] = BSWAP4(ctr); -#else -				PUTU32(ctx->Yi.c+12,ctr); -#endif -			else -				ctx->Yi.d[3] = ctr; -		} -		c = in[i]; -		out[i] = c^ctx->EKi.c[n]; -		ctx->Xi.c[n] ^= c; -		n = (n+1)%16; -		if (n==0) -			GCM_MUL(ctx,Xi); -	} - -	ctx->mres = n; -	return 0; +    ctx->mres = n; +    return 0;  }  int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx, -		const unsigned char *in, unsigned char *out, -		size_t len, ctr128_f stream) +                                const unsigned char *in, unsigned char *out, +                                size_t len, ctr128_f stream)  { -	const union { long one; char little; } is_endian = {1}; -	unsigned int n, ctr; -	size_t i; -	u64   mlen = ctx->len.u[1]; -	void *key  = ctx->key; +    const union { +        long one; +        char little; +    } is_endian = { +        1 +    }; +    unsigned int n, ctr; +    size_t i; +    u64 mlen = ctx->len.u[1]; +    void *key = ctx->key;  #ifdef 
GCM_FUNCREF_4BIT -	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult; +    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;  # ifdef GHASH -	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16], -				const u8 *inp,size_t len)	= ctx->ghash; +    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16], +                         const u8 *inp, size_t len) = ctx->ghash;  # endif  #endif -	mlen += len; -	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len)) -		return -1; -	ctx->len.u[1] = mlen; +    mlen += len; +    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len)) +        return -1; +    ctx->len.u[1] = mlen; -	if (ctx->ares) { -		/* First call to encrypt finalizes GHASH(AAD) */ -		GCM_MUL(ctx,Xi); -		ctx->ares = 0; -	} +    if (ctx->ares) { +        /* First call to encrypt finalizes GHASH(AAD) */ +        GCM_MUL(ctx, Xi); +        ctx->ares = 0; +    } -	if (is_endian.little) +    if (is_endian.little)  #ifdef BSWAP4 -		ctr = BSWAP4(ctx->Yi.d[3]); +        ctr = BSWAP4(ctx->Yi.d[3]);  #else -		ctr = GETU32(ctx->Yi.c+12); -#endif -	else -		ctr = ctx->Yi.d[3]; - -	n = ctx->mres; -	if (n) { -		while (n && len) { -			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n]; -			--len; -			n = (n+1)%16; -		} -		if (n==0) GCM_MUL(ctx,Xi); -		else { -			ctx->mres = n; -			return 0; -		} -	} +        ctr = GETU32(ctx->Yi.c + 12); +#endif +    else +        ctr = ctx->Yi.d[3]; + +    n = ctx->mres; +    if (n) { +        while (n && len) { +            ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n]; +            --len; +            n = (n + 1) % 16; +        } +        if (n == 0) +            GCM_MUL(ctx, Xi); +        else { +            ctx->mres = n; +            return 0; +        } +    }  #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT) -	while (len>=GHASH_CHUNK) { -		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c); -		ctr += GHASH_CHUNK/16; -		if (is_endian.little) -#ifdef BSWAP4 -			ctx->Yi.d[3] = 
BSWAP4(ctr); -#else -			PUTU32(ctx->Yi.c+12,ctr); -#endif -		else -			ctx->Yi.d[3] = ctr; -		GHASH(ctx,out,GHASH_CHUNK); -		out += GHASH_CHUNK; -		in  += GHASH_CHUNK; -		len -= GHASH_CHUNK; -	} +    while (len >= GHASH_CHUNK) { +        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c); +        ctr += GHASH_CHUNK / 16; +        if (is_endian.little) +# ifdef BSWAP4 +            ctx->Yi.d[3] = BSWAP4(ctr); +# else +            PUTU32(ctx->Yi.c + 12, ctr); +# endif +        else +            ctx->Yi.d[3] = ctr; +        GHASH(ctx, out, GHASH_CHUNK); +        out += GHASH_CHUNK; +        in += GHASH_CHUNK; +        len -= GHASH_CHUNK; +    }  #endif -	if ((i = (len&(size_t)-16))) { -		size_t j=i/16; +    if ((i = (len & (size_t)-16))) { +        size_t j = i / 16; -		(*stream)(in,out,j,key,ctx->Yi.c); -		ctr += (unsigned int)j; -		if (is_endian.little) +        (*stream) (in, out, j, key, ctx->Yi.c); +        ctr += (unsigned int)j; +        if (is_endian.little)  #ifdef BSWAP4 -			ctx->Yi.d[3] = BSWAP4(ctr); +            ctx->Yi.d[3] = BSWAP4(ctr);  #else -			PUTU32(ctx->Yi.c+12,ctr); +            PUTU32(ctx->Yi.c + 12, ctr);  #endif -		else -			ctx->Yi.d[3] = ctr; -		in  += i; -		len -= i; +        else +            ctx->Yi.d[3] = ctr; +        in += i; +        len -= i;  #if defined(GHASH) -		GHASH(ctx,out,i); -		out += i; +        GHASH(ctx, out, i); +        out += i;  #else -		while (j--) { -			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i]; -			GCM_MUL(ctx,Xi); -			out += 16; -		} +        while (j--) { +            for (i = 0; i < 16; ++i) +                ctx->Xi.c[i] ^= out[i]; +            GCM_MUL(ctx, Xi); +            out += 16; +        }  #endif -	} -	if (len) { -		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key); -		++ctr; -		if (is_endian.little) +    } +    if (len) { +        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key); +        ++ctr; +        if (is_endian.little)  #ifdef BSWAP4 -			ctx->Yi.d[3] = BSWAP4(ctr); +            ctx->Yi.d[3] = BSWAP4(ctr);  
#else -			PUTU32(ctx->Yi.c+12,ctr); -#endif -		else -			ctx->Yi.d[3] = ctr; -		while (len--) { -			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n]; -			++n; -		} -	} - -	ctx->mres = n; -	return 0; +            PUTU32(ctx->Yi.c + 12, ctr); +#endif +        else +            ctx->Yi.d[3] = ctr; +        while (len--) { +            ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n]; +            ++n; +        } +    } + +    ctx->mres = n; +    return 0;  }  int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx, -		const unsigned char *in, unsigned char *out, -		size_t len,ctr128_f stream) +                                const unsigned char *in, unsigned char *out, +                                size_t len, ctr128_f stream)  { -	const union { long one; char little; } is_endian = {1}; -	unsigned int n, ctr; -	size_t i; -	u64   mlen = ctx->len.u[1]; -	void *key  = ctx->key; +    const union { +        long one; +        char little; +    } is_endian = { +        1 +    }; +    unsigned int n, ctr; +    size_t i; +    u64 mlen = ctx->len.u[1]; +    void *key = ctx->key;  #ifdef GCM_FUNCREF_4BIT -	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult; +    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;  # ifdef GHASH -	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16], -				const u8 *inp,size_t len)	= ctx->ghash; +    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16], +                         const u8 *inp, size_t len) = ctx->ghash;  # endif  #endif -	mlen += len; -	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len)) -		return -1; -	ctx->len.u[1] = mlen; +    mlen += len; +    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len)) +        return -1; +    ctx->len.u[1] = mlen; -	if (ctx->ares) { -		/* First call to decrypt finalizes GHASH(AAD) */ -		GCM_MUL(ctx,Xi); -		ctx->ares = 0; -	} +    if (ctx->ares) { +        /* First call to decrypt finalizes GHASH(AAD) */ +        GCM_MUL(ctx, Xi); +        ctx->ares = 
0; +    } -	if (is_endian.little) +    if (is_endian.little)  #ifdef BSWAP4 -		ctr = BSWAP4(ctx->Yi.d[3]); +        ctr = BSWAP4(ctx->Yi.d[3]);  #else -		ctr = GETU32(ctx->Yi.c+12); -#endif -	else -		ctr = ctx->Yi.d[3]; - -	n = ctx->mres; -	if (n) { -		while (n && len) { -			u8 c = *(in++); -			*(out++) = c^ctx->EKi.c[n]; -			ctx->Xi.c[n] ^= c; -			--len; -			n = (n+1)%16; -		} -		if (n==0) GCM_MUL (ctx,Xi); -		else { -			ctx->mres = n; -			return 0; -		} -	} +        ctr = GETU32(ctx->Yi.c + 12); +#endif +    else +        ctr = ctx->Yi.d[3]; + +    n = ctx->mres; +    if (n) { +        while (n && len) { +            u8 c = *(in++); +            *(out++) = c ^ ctx->EKi.c[n]; +            ctx->Xi.c[n] ^= c; +            --len; +            n = (n + 1) % 16; +        } +        if (n == 0) +            GCM_MUL(ctx, Xi); +        else { +            ctx->mres = n; +            return 0; +        } +    }  #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT) -	while (len>=GHASH_CHUNK) { -		GHASH(ctx,in,GHASH_CHUNK); -		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c); -		ctr += GHASH_CHUNK/16; -		if (is_endian.little) -#ifdef BSWAP4 -			ctx->Yi.d[3] = BSWAP4(ctr); -#else -			PUTU32(ctx->Yi.c+12,ctr); -#endif -		else -			ctx->Yi.d[3] = ctr; -		out += GHASH_CHUNK; -		in  += GHASH_CHUNK; -		len -= GHASH_CHUNK; -	} +    while (len >= GHASH_CHUNK) { +        GHASH(ctx, in, GHASH_CHUNK); +        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c); +        ctr += GHASH_CHUNK / 16; +        if (is_endian.little) +# ifdef BSWAP4 +            ctx->Yi.d[3] = BSWAP4(ctr); +# else +            PUTU32(ctx->Yi.c + 12, ctr); +# endif +        else +            ctx->Yi.d[3] = ctr; +        out += GHASH_CHUNK; +        in += GHASH_CHUNK; +        len -= GHASH_CHUNK; +    }  #endif -	if ((i = (len&(size_t)-16))) { -		size_t j=i/16; +    if ((i = (len & (size_t)-16))) { +        size_t j = i / 16;  #if defined(GHASH) -		GHASH(ctx,in,i); +        GHASH(ctx, in, i);  #else -		while 
(j--) { -			size_t k; -			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k]; -			GCM_MUL(ctx,Xi); -			in += 16; -		} -		j   = i/16; -		in -= i; -#endif -		(*stream)(in,out,j,key,ctx->Yi.c); -		ctr += (unsigned int)j; -		if (is_endian.little) +        while (j--) { +            size_t k; +            for (k = 0; k < 16; ++k) +                ctx->Xi.c[k] ^= in[k]; +            GCM_MUL(ctx, Xi); +            in += 16; +        } +        j = i / 16; +        in -= i; +#endif +        (*stream) (in, out, j, key, ctx->Yi.c); +        ctr += (unsigned int)j; +        if (is_endian.little)  #ifdef BSWAP4 -			ctx->Yi.d[3] = BSWAP4(ctr); +            ctx->Yi.d[3] = BSWAP4(ctr);  #else -			PUTU32(ctx->Yi.c+12,ctr); +            PUTU32(ctx->Yi.c + 12, ctr);  #endif -		else -			ctx->Yi.d[3] = ctr; -		out += i; -		in  += i; -		len -= i; -	} -	if (len) { -		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key); -		++ctr; -		if (is_endian.little) +        else +            ctx->Yi.d[3] = ctr; +        out += i; +        in += i; +        len -= i; +    } +    if (len) { +        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key); +        ++ctr; +        if (is_endian.little)  #ifdef BSWAP4 -			ctx->Yi.d[3] = BSWAP4(ctr); +            ctx->Yi.d[3] = BSWAP4(ctr);  #else -			PUTU32(ctx->Yi.c+12,ctr); -#endif -		else -			ctx->Yi.d[3] = ctr; -		while (len--) { -			u8 c = in[n]; -			ctx->Xi.c[n] ^= c; -			out[n] = c^ctx->EKi.c[n]; -			++n; -		} -	} - -	ctx->mres = n; -	return 0; +            PUTU32(ctx->Yi.c + 12, ctr); +#endif +        else +            ctx->Yi.d[3] = ctr; +        while (len--) { +            u8 c = in[n]; +            ctx->Xi.c[n] ^= c; +            out[n] = c ^ ctx->EKi.c[n]; +            ++n; +        } +    } + +    ctx->mres = n; +    return 0;  } -int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag, -			size_t len) +int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag, +                         size_t len)  { -	const union { long one; char little; } 
is_endian = {1}; -	u64 alen = ctx->len.u[0]<<3; -	u64 clen = ctx->len.u[1]<<3; +    const union { +        long one; +        char little; +    } is_endian = { +        1 +    }; +    u64 alen = ctx->len.u[0] << 3; +    u64 clen = ctx->len.u[1] << 3;  #ifdef GCM_FUNCREF_4BIT -	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult; +    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;  #endif -	if (ctx->mres || ctx->ares) -		GCM_MUL(ctx,Xi); +    if (ctx->mres || ctx->ares) +        GCM_MUL(ctx, Xi); -	if (is_endian.little) { +    if (is_endian.little) {  #ifdef BSWAP8 -		alen = BSWAP8(alen); -		clen = BSWAP8(clen); +        alen = BSWAP8(alen); +        clen = BSWAP8(clen);  #else -		u8 *p = ctx->len.c; +        u8 *p = ctx->len.c; -		ctx->len.u[0] = alen; -		ctx->len.u[1] = clen; +        ctx->len.u[0] = alen; +        ctx->len.u[1] = clen; -		alen = (u64)GETU32(p)  <<32|GETU32(p+4); -		clen = (u64)GETU32(p+8)<<32|GETU32(p+12); +        alen = (u64)GETU32(p) << 32 | GETU32(p + 4); +        clen = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);  #endif -	} +    } -	ctx->Xi.u[0] ^= alen; -	ctx->Xi.u[1] ^= clen; -	GCM_MUL(ctx,Xi); +    ctx->Xi.u[0] ^= alen; +    ctx->Xi.u[1] ^= clen; +    GCM_MUL(ctx, Xi); -	ctx->Xi.u[0] ^= ctx->EK0.u[0]; -	ctx->Xi.u[1] ^= ctx->EK0.u[1]; +    ctx->Xi.u[0] ^= ctx->EK0.u[0]; +    ctx->Xi.u[1] ^= ctx->EK0.u[1]; -	if (tag && len<=sizeof(ctx->Xi)) -		return memcmp(ctx->Xi.c,tag,len); -	else -		return -1; +    if (tag && len <= sizeof(ctx->Xi)) +        return memcmp(ctx->Xi.c, tag, len); +    else +        return -1;  }  void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)  { -	CRYPTO_gcm128_finish(ctx, NULL, 0); -	memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c)); +    CRYPTO_gcm128_finish(ctx, NULL, 0); +    memcpy(tag, ctx->Xi.c, +           len <= sizeof(ctx->Xi.c) ? 
len : sizeof(ctx->Xi.c));  }  GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)  { -	GCM128_CONTEXT *ret; +    GCM128_CONTEXT *ret; -	if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT)))) -		CRYPTO_gcm128_init(ret,key,block); +    if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT)))) +        CRYPTO_gcm128_init(ret, key, block); -	return ret; +    return ret;  }  void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)  { -	if (ctx) { -		OPENSSL_cleanse(ctx,sizeof(*ctx)); -		OPENSSL_free(ctx); -	} +    if (ctx) { +        OPENSSL_cleanse(ctx, sizeof(*ctx)); +        OPENSSL_free(ctx); +    }  }  #if defined(SELFTEST) -#include <stdio.h> -#include <openssl/aes.h> +# include <stdio.h> +# include <openssl/aes.h>  /* Test Case 1 */ -static const u8	K1[16], -		*P1=NULL, -		*A1=NULL, -		IV1[12], -		*C1=NULL, -		T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a}; +static const u8 K1[16], *P1 = NULL, *A1 = NULL, IV1[12], *C1 = NULL; +static const u8 T1[] = { +    0x58, 0xe2, 0xfc, 0xce, 0xfa, 0x7e, 0x30, 0x61, +    0x36, 0x7f, 0x1d, 0x57, 0xa4, 0xe7, 0x45, 0x5a +};  /* Test Case 2 */ -#define K2 K1 -#define A2 A1 -#define IV2 IV1 -static const u8	P2[16], -		C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78}, -		T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf}; +# define K2 K1 +# define A2 A1 +# define IV2 IV1 +static const u8 P2[16]; +static const u8 C2[] = { +    0x03, 0x88, 0xda, 0xce, 0x60, 0xb6, 0xa3, 0x92, +    0xf3, 0x28, 0xc2, 0xb9, 0x71, 0xb2, 0xfe, 0x78 +}; + +static const u8 T2[] = { +    0xab, 0x6e, 0x47, 0xd4, 0x2c, 0xec, 0x13, 0xbd, +    0xf5, 0x3a, 0x67, 0xb2, 0x12, 0x57, 0xbd, 0xdf +};  /* Test Case 3 */ -#define A3 A2 -static const u8	K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08}, -		P3[]=  
{0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a, -			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72, -			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25, -			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55}, -		IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88}, -		C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c, -			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e, -			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05, -			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85}, -		T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4}; +# define A3 A2 +static const u8 K3[] = { +    0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, +    0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08 +}; + +static const u8 P3[] = { +    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, +    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a, +    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, +    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72, +    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, +    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25, +    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, +    0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55 +}; + +static const u8 IV3[] = { +    0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, +    0xde, 0xca, 0xf8, 0x88 +}; + +static const u8 C3[] = { +    0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24, +    0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c, +    0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0, +    0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e, +    0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c, +    0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05, +    0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97, +    0x3d, 0x58, 0xe0, 0x91, 
0x47, 0x3f, 0x59, 0x85 +}; + +static const u8 T3[] = { +    0x4d, 0x5c, 0x2a, 0xf3, 0x27, 0xcd, 0x64, 0xa6, +    0x2c, 0xf3, 0x5a, 0xbd, 0x2b, 0xa6, 0xfa, 0xb4 +};  /* Test Case 4 */ -#define K4 K3 -#define IV4 IV3 -static const u8	P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a, -			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72, -			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25, -			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39}, -		A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef, -			0xab,0xad,0xda,0xd2}, -		C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c, -			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e, -			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05, -			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91}, -		T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47}; +# define K4 K3 +# define IV4 IV3 +static const u8 P4[] = { +    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, +    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a, +    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, +    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72, +    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, +    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25, +    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, +    0xba, 0x63, 0x7b, 0x39 +}; + +static const u8 A4[] = { +    0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, +    0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, +    0xab, 0xad, 0xda, 0xd2 +}; + +static const u8 C4[] = { +    0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24, +    0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c, +    0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0, +    0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e, +    0x21, 0xd5, 0x14, 0xb2, 0x54, 
0x66, 0x93, 0x1c, +    0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05, +    0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97, +    0x3d, 0x58, 0xe0, 0x91 +}; + +static const u8 T4[] = { +    0x5b, 0xc9, 0x4f, 0xbc, 0x32, 0x21, 0xa5, 0xdb, +    0x94, 0xfa, 0xe9, 0x5a, 0xe7, 0x12, 0x1a, 0x47 +};  /* Test Case 5 */ -#define K5 K4 -#define P5 P4 -#define A5 A4 -static const u8	IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad}, -		C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55, -			0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23, -			0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42, -			0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98}, -		T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb}; +# define K5 K4 +# define P5 P4 +# define A5 A4 +static const u8 IV5[] = { +    0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad +}; + +static const u8 C5[] = { +    0x61, 0x35, 0x3b, 0x4c, 0x28, 0x06, 0x93, 0x4a, +    0x77, 0x7f, 0xf5, 0x1f, 0xa2, 0x2a, 0x47, 0x55, +    0x69, 0x9b, 0x2a, 0x71, 0x4f, 0xcd, 0xc6, 0xf8, +    0x37, 0x66, 0xe5, 0xf9, 0x7b, 0x6c, 0x74, 0x23, +    0x73, 0x80, 0x69, 0x00, 0xe4, 0x9f, 0x24, 0xb2, +    0x2b, 0x09, 0x75, 0x44, 0xd4, 0x89, 0x6b, 0x42, +    0x49, 0x89, 0xb5, 0xe1, 0xeb, 0xac, 0x0f, 0x07, +    0xc2, 0x3f, 0x45, 0x98 +}; + +static const u8 T5[] = { +    0x36, 0x12, 0xd2, 0xe7, 0x9e, 0x3b, 0x07, 0x85, +    0x56, 0x1b, 0xe1, 0x4a, 0xac, 0xa2, 0xfc, 0xcb +};  /* Test Case 6 */ -#define K6 K5 -#define P6 P5 -#define A6 A5 -static const u8	IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa, -			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28, -			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54, -			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b}, -		C6[]=  
{0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94, -			0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7, -			0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f, -			0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5}, -		T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50}; +# define K6 K5 +# define P6 P5 +# define A6 A5 +static const u8 IV6[] = { +    0x93, 0x13, 0x22, 0x5d, 0xf8, 0x84, 0x06, 0xe5, +    0x55, 0x90, 0x9c, 0x5a, 0xff, 0x52, 0x69, 0xaa, +    0x6a, 0x7a, 0x95, 0x38, 0x53, 0x4f, 0x7d, 0xa1, +    0xe4, 0xc3, 0x03, 0xd2, 0xa3, 0x18, 0xa7, 0x28, +    0xc3, 0xc0, 0xc9, 0x51, 0x56, 0x80, 0x95, 0x39, +    0xfc, 0xf0, 0xe2, 0x42, 0x9a, 0x6b, 0x52, 0x54, +    0x16, 0xae, 0xdb, 0xf5, 0xa0, 0xde, 0x6a, 0x57, +    0xa6, 0x37, 0xb3, 0x9b +}; + +static const u8 C6[] = { +    0x8c, 0xe2, 0x49, 0x98, 0x62, 0x56, 0x15, 0xb6, +    0x03, 0xa0, 0x33, 0xac, 0xa1, 0x3f, 0xb8, 0x94, +    0xbe, 0x91, 0x12, 0xa5, 0xc3, 0xa2, 0x11, 0xa8, +    0xba, 0x26, 0x2a, 0x3c, 0xca, 0x7e, 0x2c, 0xa7, +    0x01, 0xe4, 0xa9, 0xa4, 0xfb, 0xa4, 0x3c, 0x90, +    0xcc, 0xdc, 0xb2, 0x81, 0xd4, 0x8c, 0x7c, 0x6f, +    0xd6, 0x28, 0x75, 0xd2, 0xac, 0xa4, 0x17, 0x03, +    0x4c, 0x34, 0xae, 0xe5 +}; + +static const u8 T6[] = { +    0x61, 0x9c, 0xc5, 0xae, 0xff, 0xfe, 0x0b, 0xfa, +    0x46, 0x2a, 0xf4, 0x3c, 0x16, 0x99, 0xd0, 0x50 +};  /* Test Case 7 */ -static const u8 K7[24], -		*P7=NULL, -		*A7=NULL, -		IV7[12], -		*C7=NULL, -		T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35}; +static const u8 K7[24], *P7 = NULL, *A7 = NULL, IV7[12], *C7 = NULL; +static const u8 T7[] = { +    0xcd, 0x33, 0xb2, 0x8a, 0xc7, 0x73, 0xf7, 0x4b, +    0xa0, 0x0e, 0xd1, 0xf3, 0x12, 0x57, 0x24, 0x35 +};  /* Test Case 8 */ -#define K8 K7 -#define IV8 IV7 -#define A8 A7 -static const u8	P8[16], -		C8[]=  
{0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00}, -		T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb}; +# define K8 K7 +# define IV8 IV7 +# define A8 A7 +static const u8 P8[16]; +static const u8 C8[] = { +    0x98, 0xe7, 0x24, 0x7c, 0x07, 0xf0, 0xfe, 0x41, +    0x1c, 0x26, 0x7e, 0x43, 0x84, 0xb0, 0xf6, 0x00 +}; + +static const u8 T8[] = { +    0x2f, 0xf5, 0x8d, 0x80, 0x03, 0x39, 0x27, 0xab, +    0x8e, 0xf4, 0xd4, 0x58, 0x75, 0x14, 0xf0, 0xfb +};  /* Test Case 9 */ -#define A9 A8 -static const u8	K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08, -			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c}, -		P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a, -			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72, -			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25, -			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55}, -		IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88}, -		C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57, -			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c, -			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47, -			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56}, -		T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14}; +# define A9 A8 +static const u8 K9[] = { +    0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, +    0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08, +    0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c +}; + +static const u8 P9[] = { +    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, +    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a, +    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, +    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 
0x31, 0x8a, 0x72, +    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, +    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25, +    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, +    0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55 +}; + +static const u8 IV9[] = { +    0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, +    0xde, 0xca, 0xf8, 0x88 +}; + +static const u8 C9[] = { +    0x39, 0x80, 0xca, 0x0b, 0x3c, 0x00, 0xe8, 0x41, +    0xeb, 0x06, 0xfa, 0xc4, 0x87, 0x2a, 0x27, 0x57, +    0x85, 0x9e, 0x1c, 0xea, 0xa6, 0xef, 0xd9, 0x84, +    0x62, 0x85, 0x93, 0xb4, 0x0c, 0xa1, 0xe1, 0x9c, +    0x7d, 0x77, 0x3d, 0x00, 0xc1, 0x44, 0xc5, 0x25, +    0xac, 0x61, 0x9d, 0x18, 0xc8, 0x4a, 0x3f, 0x47, +    0x18, 0xe2, 0x44, 0x8b, 0x2f, 0xe3, 0x24, 0xd9, +    0xcc, 0xda, 0x27, 0x10, 0xac, 0xad, 0xe2, 0x56 +}; + +static const u8 T9[] = { +    0x99, 0x24, 0xa7, 0xc8, 0x58, 0x73, 0x36, 0xbf, +    0xb1, 0x18, 0x02, 0x4d, 0xb8, 0x67, 0x4a, 0x14 +};  /* Test Case 10 */ -#define K10 K9 -#define IV10 IV9 -static const u8	P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a, -			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72, -			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25, -			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39}, -		A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef, -			0xab,0xad,0xda,0xd2}, -		C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57, -			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c, -			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47, -			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10}, -		T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c}; +# define K10 K9 +# define IV10 IV9 +static const u8 P10[] = { +    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, +    
0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a, +    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, +    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72, +    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, +    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25, +    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, +    0xba, 0x63, 0x7b, 0x39 +}; + +static const u8 A10[] = { +    0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, +    0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, +    0xab, 0xad, 0xda, 0xd2 +}; + +static const u8 C10[] = { +    0x39, 0x80, 0xca, 0x0b, 0x3c, 0x00, 0xe8, 0x41, +    0xeb, 0x06, 0xfa, 0xc4, 0x87, 0x2a, 0x27, 0x57, +    0x85, 0x9e, 0x1c, 0xea, 0xa6, 0xef, 0xd9, 0x84, +    0x62, 0x85, 0x93, 0xb4, 0x0c, 0xa1, 0xe1, 0x9c, +    0x7d, 0x77, 0x3d, 0x00, 0xc1, 0x44, 0xc5, 0x25, +    0xac, 0x61, 0x9d, 0x18, 0xc8, 0x4a, 0x3f, 0x47, +    0x18, 0xe2, 0x44, 0x8b, 0x2f, 0xe3, 0x24, 0xd9, +    0xcc, 0xda, 0x27, 0x10 +}; + +static const u8 T10[] = { +    0x25, 0x19, 0x49, 0x8e, 0x80, 0xf1, 0x47, 0x8f, +    0x37, 0xba, 0x55, 0xbd, 0x6d, 0x27, 0x61, 0x8c +};  /* Test Case 11 */ -#define K11 K10 -#define P11 P10 -#define A11 A10 -static const u8	IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad}, -		C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8, -			0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57, -			0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9, -			0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7}, -		T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8}; +# define K11 K10 +# define P11 P10 +# define A11 A10 +static const u8 IV11[] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad }; + +static const u8 C11[] = { +    0x0f, 0x10, 0xf5, 0x99, 0xae, 0x14, 0xa1, 0x54, +    0xed, 0x24, 0xb3, 0x6e, 0x25, 0x32, 0x4d, 0xb8, +    0xc5, 0x66, 0x63, 0x2e, 0xf2, 0xbb, 0xb3, 0x4f, +    0x83, 0x47, 0x28, 0x0f, 0xc4, 
0x50, 0x70, 0x57, +    0xfd, 0xdc, 0x29, 0xdf, 0x9a, 0x47, 0x1f, 0x75, +    0xc6, 0x65, 0x41, 0xd4, 0xd4, 0xda, 0xd1, 0xc9, +    0xe9, 0x3a, 0x19, 0xa5, 0x8e, 0x8b, 0x47, 0x3f, +    0xa0, 0xf0, 0x62, 0xf7 +}; + +static const u8 T11[] = { +    0x65, 0xdc, 0xc5, 0x7f, 0xcf, 0x62, 0x3a, 0x24, +    0x09, 0x4f, 0xcc, 0xa4, 0x0d, 0x35, 0x33, 0xf8 +};  /* Test Case 12 */ -#define K12 K11 -#define P12 P11 -#define A12 A11 -static const u8	IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa, -			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28, -			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54, -			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b}, -		C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff, -			0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45, -			0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3, -			0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b}, -		T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9}; +# define K12 K11 +# define P12 P11 +# define A12 A11 +static const u8 IV12[] = { +    0x93, 0x13, 0x22, 0x5d, 0xf8, 0x84, 0x06, 0xe5, +    0x55, 0x90, 0x9c, 0x5a, 0xff, 0x52, 0x69, 0xaa, +    0x6a, 0x7a, 0x95, 0x38, 0x53, 0x4f, 0x7d, 0xa1, +    0xe4, 0xc3, 0x03, 0xd2, 0xa3, 0x18, 0xa7, 0x28, +    0xc3, 0xc0, 0xc9, 0x51, 0x56, 0x80, 0x95, 0x39, +    0xfc, 0xf0, 0xe2, 0x42, 0x9a, 0x6b, 0x52, 0x54, +    0x16, 0xae, 0xdb, 0xf5, 0xa0, 0xde, 0x6a, 0x57, +    0xa6, 0x37, 0xb3, 0x9b +}; + +static const u8 C12[] = { +    0xd2, 0x7e, 0x88, 0x68, 0x1c, 0xe3, 0x24, 0x3c, +    0x48, 0x30, 0x16, 0x5a, 0x8f, 0xdc, 0xf9, 0xff, +    0x1d, 0xe9, 0xa1, 0xd8, 0xe6, 0xb4, 0x47, 0xef, +    0x6e, 0xf7, 0xb7, 0x98, 0x28, 0x66, 0x6e, 0x45, +    0x81, 0xe7, 0x90, 0x12, 0xaf, 0x34, 0xdd, 0xd9, +    0xe2, 0xf0, 0x37, 0x58, 0x9b, 
0x29, 0x2d, 0xb3, +    0xe6, 0x7c, 0x03, 0x67, 0x45, 0xfa, 0x22, 0xe7, +    0xe9, 0xb7, 0x37, 0x3b +}; + +static const u8 T12[] = { +    0xdc, 0xf5, 0x66, 0xff, 0x29, 0x1c, 0x25, 0xbb, +    0xb8, 0x56, 0x8f, 0xc3, 0xd3, 0x76, 0xa6, 0xd9 +};  /* Test Case 13 */ -static const u8	K13[32], -		*P13=NULL, -		*A13=NULL, -		IV13[12], -		*C13=NULL, -		T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b}; +static const u8 K13[32], *P13 = NULL, *A13 = NULL, IV13[12], *C13 = NULL; +static const u8 T13[] = { +    0x53, 0x0f, 0x8a, 0xfb, 0xc7, 0x45, 0x36, 0xb9, +    0xa9, 0x63, 0xb4, 0xf1, 0xc4, 0xcb, 0x73, 0x8b +};  /* Test Case 14 */ -#define K14 K13 -#define A14 A13 -static const u8	P14[16], -		IV14[12], -		C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18}, -		T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19}; +# define K14 K13 +# define A14 A13 +static const u8 P14[16], IV14[12]; +static const u8 C14[] = { +    0xce, 0xa7, 0x40, 0x3d, 0x4d, 0x60, 0x6b, 0x6e, +    0x07, 0x4e, 0xc5, 0xd3, 0xba, 0xf3, 0x9d, 0x18 +}; + +static const u8 T14[] = { +    0xd0, 0xd1, 0xc8, 0xa7, 0x99, 0x99, 0x6b, 0xf0, +    0x26, 0x5b, 0x98, 0xb5, 0xd4, 0x8a, 0xb9, 0x19 +};  /* Test Case 15 */ -#define A15 A14 -static const u8	K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08, -			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08}, -		P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a, -			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72, -			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25, -			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55}, -		IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88}, -		C15[]= 
{0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d, -			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa, -			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38, -			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad}, -		T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c}; +# define A15 A14 +static const u8 K15[] = { +    0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, +    0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08, +    0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, +    0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08 +}; + +static const u8 P15[] = { +    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, +    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a, +    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, +    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72, +    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, +    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25, +    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, +    0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55 +}; + +static const u8 IV15[] = { +    0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, +    0xde, 0xca, 0xf8, 0x88 +}; + +static const u8 C15[] = { +    0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07, +    0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d, +    0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9, +    0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa, +    0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d, +    0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38, +    0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a, +    0xbc, 0xc9, 0xf6, 0x62, 0x89, 0x80, 0x15, 0xad +}; + +static const u8 T15[] = { +    0xb0, 0x94, 0xda, 0xc5, 0xd9, 0x34, 0x71, 0xbd, +    0xec, 0x1a, 0x50, 0x22, 0x70, 0xe3, 0xcc, 0x6c +};  /* Test Case 16 */ -#define K16 K15 -#define IV16 IV15 -static const u8	P16[]= 
{0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a, -			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72, -			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25, -			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39}, -		A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef, -			0xab,0xad,0xda,0xd2}, -		C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d, -			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa, -			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38, -			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62}, -		T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b}; +# define K16 K15 +# define IV16 IV15 +static const u8 P16[] = { +    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, +    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a, +    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, +    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72, +    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, +    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25, +    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, +    0xba, 0x63, 0x7b, 0x39 +}; + +static const u8 A16[] = { +    0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, +    0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, +    0xab, 0xad, 0xda, 0xd2 +}; + +static const u8 C16[] = { +    0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07, +    0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d, +    0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9, +    0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa, +    0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d, +    0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38, +    0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a, +    0xbc, 0xc9, 0xf6, 0x62 +}; + +static const u8 T16[] = { +    0x76, 0xfc, 0x6e, 0xce, 0x0f, 0x4e, 0x17, 
0x68, +    0xcd, 0xdf, 0x88, 0x53, 0xbb, 0x2d, 0x55, 0x1b +};  /* Test Case 17 */ -#define K17 K16 -#define P17 P16 -#define A17 A16 -static const u8	IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad}, -		C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb, -			0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0, -			0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78, -			0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f}, -		T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2}; +# define K17 K16 +# define P17 P16 +# define A17 A16 +static const u8 IV17[] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad }; + +static const u8 C17[] = { +    0xc3, 0x76, 0x2d, 0xf1, 0xca, 0x78, 0x7d, 0x32, +    0xae, 0x47, 0xc1, 0x3b, 0xf1, 0x98, 0x44, 0xcb, +    0xaf, 0x1a, 0xe1, 0x4d, 0x0b, 0x97, 0x6a, 0xfa, +    0xc5, 0x2f, 0xf7, 0xd7, 0x9b, 0xba, 0x9d, 0xe0, +    0xfe, 0xb5, 0x82, 0xd3, 0x39, 0x34, 0xa4, 0xf0, +    0x95, 0x4c, 0xc2, 0x36, 0x3b, 0xc7, 0x3f, 0x78, +    0x62, 0xac, 0x43, 0x0e, 0x64, 0xab, 0xe4, 0x99, +    0xf4, 0x7c, 0x9b, 0x1f +}; + +static const u8 T17[] = { +    0x3a, 0x33, 0x7d, 0xbf, 0x46, 0xa7, 0x92, 0xc4, +    0x5e, 0x45, 0x49, 0x13, 0xfe, 0x2e, 0xa8, 0xf2 +};  /* Test Case 18 */ -#define K18 K17 -#define P18 P17 -#define A18 A17 -static const u8	IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa, -			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28, -			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54, -			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b}, -		C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20, -			0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4, -			0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde, -	
		0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f}, -		T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a}; +# define K18 K17 +# define P18 P17 +# define A18 A17 +static const u8 IV18[] = { +    0x93, 0x13, 0x22, 0x5d, 0xf8, 0x84, 0x06, 0xe5, +    0x55, 0x90, 0x9c, 0x5a, 0xff, 0x52, 0x69, 0xaa, +    0x6a, 0x7a, 0x95, 0x38, 0x53, 0x4f, 0x7d, 0xa1, +    0xe4, 0xc3, 0x03, 0xd2, 0xa3, 0x18, 0xa7, 0x28, +    0xc3, 0xc0, 0xc9, 0x51, 0x56, 0x80, 0x95, 0x39, +    0xfc, 0xf0, 0xe2, 0x42, 0x9a, 0x6b, 0x52, 0x54, +    0x16, 0xae, 0xdb, 0xf5, 0xa0, 0xde, 0x6a, 0x57, +    0xa6, 0x37, 0xb3, 0x9b +}; + +static const u8 C18[] = { +    0x5a, 0x8d, 0xef, 0x2f, 0x0c, 0x9e, 0x53, 0xf1, +    0xf7, 0x5d, 0x78, 0x53, 0x65, 0x9e, 0x2a, 0x20, +    0xee, 0xb2, 0xb2, 0x2a, 0xaf, 0xde, 0x64, 0x19, +    0xa0, 0x58, 0xab, 0x4f, 0x6f, 0x74, 0x6b, 0xf4, +    0x0f, 0xc0, 0xc3, 0xb7, 0x80, 0xf2, 0x44, 0x45, +    0x2d, 0xa3, 0xeb, 0xf1, 0xc5, 0xd8, 0x2c, 0xde, +    0xa2, 0x41, 0x89, 0x97, 0x20, 0x0e, 0xf8, 0x2e, +    0x44, 0xae, 0x7e, 0x3f +}; + +static const u8 T18[] = { +    0xa4, 0x4a, 0x82, 0x66, 0xee, 0x1c, 0x8e, 0xb0, +    0xc8, 0xb5, 0xd4, 0xcf, 0x5a, 0xe9, 0xf1, 0x9a +};  /* Test Case 19 */ -#define K19 K1 -#define P19 P1 -#define IV19 IV1 -#define C19 C1 -static const u8 A19[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a, -			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72, -			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25, -			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55, -			0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d, -			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa, -			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38, -			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad}, 
-		T19[]= {0x5f,0xea,0x79,0x3a,0x2d,0x6f,0x97,0x4d,0x37,0xe6,0x8e,0x0c,0xb8,0xff,0x94,0x92}; +# define K19 K1 +# define P19 P1 +# define IV19 IV1 +# define C19 C1 +static const u8 A19[] = { +    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, +    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a, +    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, +    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72, +    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, +    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25, +    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, +    0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55, +    0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07, +    0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d, +    0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9, +    0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa, +    0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d, +    0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38, +    0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a, +    0xbc, 0xc9, 0xf6, 0x62, 0x89, 0x80, 0x15, 0xad +}; + +static const u8 T19[] = { +    0x5f, 0xea, 0x79, 0x3a, 0x2d, 0x6f, 0x97, 0x4d, +    0x37, 0xe6, 0x8e, 0x0c, 0xb8, 0xff, 0x94, 0x92 +};  /* Test Case 20 */ -#define K20 K1 -#define A20 A1 -static const u8 IV20[64]={0xff,0xff,0xff,0xff},	/* this results in 0xff in counter LSB */ -		P20[288], -		C20[]= {0x56,0xb3,0x37,0x3c,0xa9,0xef,0x6e,0x4a,0x2b,0x64,0xfe,0x1e,0x9a,0x17,0xb6,0x14, -			0x25,0xf1,0x0d,0x47,0xa7,0x5a,0x5f,0xce,0x13,0xef,0xc6,0xbc,0x78,0x4a,0xf2,0x4f, -			0x41,0x41,0xbd,0xd4,0x8c,0xf7,0xc7,0x70,0x88,0x7a,0xfd,0x57,0x3c,0xca,0x54,0x18, -			0xa9,0xae,0xff,0xcd,0x7c,0x5c,0xed,0xdf,0xc6,0xa7,0x83,0x97,0xb9,0xa8,0x5b,0x49, -			0x9d,0xa5,0x58,0x25,0x72,0x67,0xca,0xab,0x2a,0xd0,0xb2,0x3c,0xa4,0x76,0xa5,0x3c, -			0xb1,0x7f,0xb4,0x1c,0x4b,0x8b,0x47,0x5c,0xb4,0xf3,0xf7,0x16,0x50,0x94,0xc2,0x29, -			0xc9,0xe8,0xc4,0xdc,0x0a,0x2a,0x5f,0xf1,0x90,0x3e,0x50,0x15,0x11,0x22,0x13,0x76, -			
0xa1,0xcd,0xb8,0x36,0x4c,0x50,0x61,0xa2,0x0c,0xae,0x74,0xbc,0x4a,0xcd,0x76,0xce, -			0xb0,0xab,0xc9,0xfd,0x32,0x17,0xef,0x9f,0x8c,0x90,0xbe,0x40,0x2d,0xdf,0x6d,0x86, -			0x97,0xf4,0xf8,0x80,0xdf,0xf1,0x5b,0xfb,0x7a,0x6b,0x28,0x24,0x1e,0xc8,0xfe,0x18, -			0x3c,0x2d,0x59,0xe3,0xf9,0xdf,0xff,0x65,0x3c,0x71,0x26,0xf0,0xac,0xb9,0xe6,0x42, -			0x11,0xf4,0x2b,0xae,0x12,0xaf,0x46,0x2b,0x10,0x70,0xbe,0xf1,0xab,0x5e,0x36,0x06, -			0x87,0x2c,0xa1,0x0d,0xee,0x15,0xb3,0x24,0x9b,0x1a,0x1b,0x95,0x8f,0x23,0x13,0x4c, -			0x4b,0xcc,0xb7,0xd0,0x32,0x00,0xbc,0xe4,0x20,0xa2,0xf8,0xeb,0x66,0xdc,0xf3,0x64, -			0x4d,0x14,0x23,0xc1,0xb5,0x69,0x90,0x03,0xc1,0x3e,0xce,0xf4,0xbf,0x38,0xa3,0xb6, -			0x0e,0xed,0xc3,0x40,0x33,0xba,0xc1,0x90,0x27,0x83,0xdc,0x6d,0x89,0xe2,0xe7,0x74, -			0x18,0x8a,0x43,0x9c,0x7e,0xbc,0xc0,0x67,0x2d,0xbd,0xa4,0xdd,0xcf,0xb2,0x79,0x46, -			0x13,0xb0,0xbe,0x41,0x31,0x5e,0xf7,0x78,0x70,0x8a,0x70,0xee,0x7d,0x75,0x16,0x5c}, -		T20[]= {0x8b,0x30,0x7f,0x6b,0x33,0x28,0x6d,0x0a,0xb0,0x26,0xa9,0xed,0x3f,0xe1,0xe8,0x5f}; - -#define TEST_CASE(n)	do {					\ -	u8 out[sizeof(P##n)];					\ -	AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);		\ -	CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);	\ -	CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));		\ -	memset(out,0,sizeof(out));				\ -	if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));	\ -	if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));	\ -	if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||		\ -	    (C##n && memcmp(out,C##n,sizeof(out))))		\ -		ret++, printf ("encrypt test#%d failed.\n",n);	\ -	CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));		\ -	memset(out,0,sizeof(out));				\ -	if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));	\ -	if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));	\ -	if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||		\ -	    (P##n && memcmp(out,P##n,sizeof(out))))		\ -		ret++, printf ("decrypt test#%d failed.\n",n);	\ -	} while(0) +# define K20 K1 +# define A20 A1 +/* this results in 0xff in counter LSB 
*/ +static const u8 IV20[64] = { 0xff, 0xff, 0xff, 0xff }; + +static const u8 P20[288]; +static const u8 C20[] = { +    0x56, 0xb3, 0x37, 0x3c, 0xa9, 0xef, 0x6e, 0x4a, +    0x2b, 0x64, 0xfe, 0x1e, 0x9a, 0x17, 0xb6, 0x14, +    0x25, 0xf1, 0x0d, 0x47, 0xa7, 0x5a, 0x5f, 0xce, +    0x13, 0xef, 0xc6, 0xbc, 0x78, 0x4a, 0xf2, 0x4f, +    0x41, 0x41, 0xbd, 0xd4, 0x8c, 0xf7, 0xc7, 0x70, +    0x88, 0x7a, 0xfd, 0x57, 0x3c, 0xca, 0x54, 0x18, +    0xa9, 0xae, 0xff, 0xcd, 0x7c, 0x5c, 0xed, 0xdf, +    0xc6, 0xa7, 0x83, 0x97, 0xb9, 0xa8, 0x5b, 0x49, +    0x9d, 0xa5, 0x58, 0x25, 0x72, 0x67, 0xca, 0xab, +    0x2a, 0xd0, 0xb2, 0x3c, 0xa4, 0x76, 0xa5, 0x3c, +    0xb1, 0x7f, 0xb4, 0x1c, 0x4b, 0x8b, 0x47, 0x5c, +    0xb4, 0xf3, 0xf7, 0x16, 0x50, 0x94, 0xc2, 0x29, +    0xc9, 0xe8, 0xc4, 0xdc, 0x0a, 0x2a, 0x5f, 0xf1, +    0x90, 0x3e, 0x50, 0x15, 0x11, 0x22, 0x13, 0x76, +    0xa1, 0xcd, 0xb8, 0x36, 0x4c, 0x50, 0x61, 0xa2, +    0x0c, 0xae, 0x74, 0xbc, 0x4a, 0xcd, 0x76, 0xce, +    0xb0, 0xab, 0xc9, 0xfd, 0x32, 0x17, 0xef, 0x9f, +    0x8c, 0x90, 0xbe, 0x40, 0x2d, 0xdf, 0x6d, 0x86, +    0x97, 0xf4, 0xf8, 0x80, 0xdf, 0xf1, 0x5b, 0xfb, +    0x7a, 0x6b, 0x28, 0x24, 0x1e, 0xc8, 0xfe, 0x18, +    0x3c, 0x2d, 0x59, 0xe3, 0xf9, 0xdf, 0xff, 0x65, +    0x3c, 0x71, 0x26, 0xf0, 0xac, 0xb9, 0xe6, 0x42, +    0x11, 0xf4, 0x2b, 0xae, 0x12, 0xaf, 0x46, 0x2b, +    0x10, 0x70, 0xbe, 0xf1, 0xab, 0x5e, 0x36, 0x06, +    0x87, 0x2c, 0xa1, 0x0d, 0xee, 0x15, 0xb3, 0x24, +    0x9b, 0x1a, 0x1b, 0x95, 0x8f, 0x23, 0x13, 0x4c, +    0x4b, 0xcc, 0xb7, 0xd0, 0x32, 0x00, 0xbc, 0xe4, +    0x20, 0xa2, 0xf8, 0xeb, 0x66, 0xdc, 0xf3, 0x64, +    0x4d, 0x14, 0x23, 0xc1, 0xb5, 0x69, 0x90, 0x03, +    0xc1, 0x3e, 0xce, 0xf4, 0xbf, 0x38, 0xa3, 0xb6, +    0x0e, 0xed, 0xc3, 0x40, 0x33, 0xba, 0xc1, 0x90, +    0x27, 0x83, 0xdc, 0x6d, 0x89, 0xe2, 0xe7, 0x74, +    0x18, 0x8a, 0x43, 0x9c, 0x7e, 0xbc, 0xc0, 0x67, +    0x2d, 0xbd, 0xa4, 0xdd, 0xcf, 0xb2, 0x79, 0x46, +    0x13, 0xb0, 0xbe, 0x41, 0x31, 0x5e, 0xf7, 0x78, +    0x70, 0x8a, 0x70, 0xee, 
0x7d, 0x75, 0x16, 0x5c +}; + +static const u8 T20[] = { +    0x8b, 0x30, 0x7f, 0x6b, 0x33, 0x28, 0x6d, 0x0a, +    0xb0, 0x26, 0xa9, 0xed, 0x3f, 0xe1, 0xe8, 0x5f +}; + +# define TEST_CASE(n)    do {                                    \ +        u8 out[sizeof(P##n)];                                   \ +        AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);          \ +        CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);  \ +        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \ +        memset(out,0,sizeof(out));                              \ +        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \ +        if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));     \ +        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \ +            (C##n && memcmp(out,C##n,sizeof(out))))             \ +                ret++, printf ("encrypt test#%d failed.\n",n);  \ +        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \ +        memset(out,0,sizeof(out));                              \ +        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \ +        if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));     \ +        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \ +            (P##n && memcmp(out,P##n,sizeof(out))))             \ +                ret++, printf ("decrypt test#%d failed.\n",n);  \ +        } while(0)  int main()  { -	GCM128_CONTEXT ctx; -	AES_KEY key; -	int ret=0; - -	TEST_CASE(1); -	TEST_CASE(2); -	TEST_CASE(3); -	TEST_CASE(4); -	TEST_CASE(5); -	TEST_CASE(6); -	TEST_CASE(7); -	TEST_CASE(8); -	TEST_CASE(9); -	TEST_CASE(10); -	TEST_CASE(11); -	TEST_CASE(12); -	TEST_CASE(13); -	TEST_CASE(14); -	TEST_CASE(15); -	TEST_CASE(16); -	TEST_CASE(17); -	TEST_CASE(18); -	TEST_CASE(19); -	TEST_CASE(20); - -#ifdef OPENSSL_CPUID_OBJ -	{ -	size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc(); -	union { u64 u; u8 c[1024]; } buf; -	int i; - -	AES_set_encrypt_key(K1,sizeof(K1)*8,&key); -	
CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); -	CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1)); - -	CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf)); -	start = OPENSSL_rdtsc(); -	CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf)); -	gcm_t = OPENSSL_rdtsc() - start; - -	CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf), -			&key,ctx.Yi.c,ctx.EKi.c,&ctx.mres, -			(block128_f)AES_encrypt); -	start = OPENSSL_rdtsc(); -	CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf), -			&key,ctx.Yi.c,ctx.EKi.c,&ctx.mres, -			(block128_f)AES_encrypt); -	ctr_t = OPENSSL_rdtsc() - start; - -	printf("%.2f-%.2f=%.2f\n", -			gcm_t/(double)sizeof(buf), -			ctr_t/(double)sizeof(buf), -			(gcm_t-ctr_t)/(double)sizeof(buf)); -#ifdef GHASH -	{ -	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16], -				const u8 *inp,size_t len)	= ctx.ghash; - -	GHASH((&ctx),buf.c,sizeof(buf)); -	start = OPENSSL_rdtsc(); -	for (i=0;i<100;++i) GHASH((&ctx),buf.c,sizeof(buf)); -	gcm_t = OPENSSL_rdtsc() - start; -	printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i); -	} -#endif -	} -#endif +    GCM128_CONTEXT ctx; +    AES_KEY key; +    int ret = 0; + +    TEST_CASE(1); +    TEST_CASE(2); +    TEST_CASE(3); +    TEST_CASE(4); +    TEST_CASE(5); +    TEST_CASE(6); +    TEST_CASE(7); +    TEST_CASE(8); +    TEST_CASE(9); +    TEST_CASE(10); +    TEST_CASE(11); +    TEST_CASE(12); +    TEST_CASE(13); +    TEST_CASE(14); +    TEST_CASE(15); +    TEST_CASE(16); +    TEST_CASE(17); +    TEST_CASE(18); +    TEST_CASE(19); +    TEST_CASE(20); + +# ifdef OPENSSL_CPUID_OBJ +    { +        size_t start, stop, gcm_t, ctr_t, OPENSSL_rdtsc(); +        union { +            u64 u; +            u8 c[1024]; +        } buf; +        int i; + +        AES_set_encrypt_key(K1, sizeof(K1) * 8, &key); +        CRYPTO_gcm128_init(&ctx, &key, (block128_f) AES_encrypt); +        CRYPTO_gcm128_setiv(&ctx, IV1, sizeof(IV1)); + +        CRYPTO_gcm128_encrypt(&ctx, buf.c, buf.c, sizeof(buf)); +        start = OPENSSL_rdtsc(); +        
CRYPTO_gcm128_encrypt(&ctx, buf.c, buf.c, sizeof(buf)); +        gcm_t = OPENSSL_rdtsc() - start; + +        CRYPTO_ctr128_encrypt(buf.c, buf.c, sizeof(buf), +                              &key, ctx.Yi.c, ctx.EKi.c, &ctx.mres, +                              (block128_f) AES_encrypt); +        start = OPENSSL_rdtsc(); +        CRYPTO_ctr128_encrypt(buf.c, buf.c, sizeof(buf), +                              &key, ctx.Yi.c, ctx.EKi.c, &ctx.mres, +                              (block128_f) AES_encrypt); +        ctr_t = OPENSSL_rdtsc() - start; + +        printf("%.2f-%.2f=%.2f\n", +               gcm_t / (double)sizeof(buf), +               ctr_t / (double)sizeof(buf), +               (gcm_t - ctr_t) / (double)sizeof(buf)); +#  ifdef GHASH +        { +            void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16], +                                 const u8 *inp, size_t len) = ctx.ghash; + +            GHASH((&ctx), buf.c, sizeof(buf)); +            start = OPENSSL_rdtsc(); +            for (i = 0; i < 100; ++i) +                GHASH((&ctx), buf.c, sizeof(buf)); +            gcm_t = OPENSSL_rdtsc() - start; +            printf("%.2f\n", gcm_t / (double)sizeof(buf) / (double)i); +        } +#  endif +    } +# endif -	return ret; +    return ret;  }  #endif diff --git a/openssl/crypto/modes/modes.h b/openssl/crypto/modes/modes.h index 7773c2542..fd488499a 100644 --- a/openssl/crypto/modes/modes.h +++ b/openssl/crypto/modes/modes.h @@ -10,132 +10,154 @@  #ifdef  __cplusplus  extern "C" {  #endif -typedef void (*block128_f)(const unsigned char in[16], -			unsigned char out[16], -			const void *key); +typedef void (*block128_f) (const unsigned char in[16], +                            unsigned char out[16], const void *key); -typedef void (*cbc128_f)(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], int enc); +typedef void (*cbc128_f) (const unsigned char *in, unsigned char *out, +                          size_t 
len, const void *key, +                          unsigned char ivec[16], int enc); -typedef void (*ctr128_f)(const unsigned char *in, unsigned char *out, -			size_t blocks, const void *key, -			const unsigned char ivec[16]); +typedef void (*ctr128_f) (const unsigned char *in, unsigned char *out, +                          size_t blocks, const void *key, +                          const unsigned char ivec[16]); -typedef void (*ccm128_f)(const unsigned char *in, unsigned char *out, -			size_t blocks, const void *key, -			const unsigned char ivec[16],unsigned char cmac[16]); +typedef void (*ccm128_f) (const unsigned char *in, unsigned char *out, +                          size_t blocks, const void *key, +                          const unsigned char ivec[16], +                          unsigned char cmac[16]);  void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], block128_f block); +                           size_t len, const void *key, +                           unsigned char ivec[16], block128_f block);  void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], block128_f block); +                           size_t len, const void *key, +                           unsigned char ivec[16], block128_f block);  void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], unsigned char ecount_buf[16], -			unsigned int *num, block128_f block); +                           size_t len, const void *key, +                           unsigned char ivec[16], +                           unsigned char ecount_buf[16], unsigned int *num, +                           block128_f block);  void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], unsigned char ecount_buf[16], -			
unsigned int *num, ctr128_f ctr); +                                 size_t len, const void *key, +                                 unsigned char ivec[16], +                                 unsigned char ecount_buf[16], +                                 unsigned int *num, ctr128_f ctr);  void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], int *num, -			block128_f block); +                           size_t len, const void *key, +                           unsigned char ivec[16], int *num, +                           block128_f block);  void CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], int *num, -			int enc, block128_f block); +                           size_t len, const void *key, +                           unsigned char ivec[16], int *num, +                           int enc, block128_f block);  void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out, -			size_t length, const void *key, -			unsigned char ivec[16], int *num, -			int enc, block128_f block); +                             size_t length, const void *key, +                             unsigned char ivec[16], int *num, +                             int enc, block128_f block);  void CRYPTO_cfb128_1_encrypt(const unsigned char *in, unsigned char *out, -			size_t bits, const void *key, -			unsigned char ivec[16], int *num, -			int enc, block128_f block); - -size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], block128_f block); +                             size_t bits, const void *key, +                             unsigned char ivec[16], int *num, +                             int enc, block128_f block); + +size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, +                                   unsigned char *out, size_t len, +        
                           const void *key, unsigned char ivec[16], +                                   block128_f block);  size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], cbc128_f cbc); -size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], block128_f block); +                             size_t len, const void *key, +                             unsigned char ivec[16], cbc128_f cbc); +size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, +                                   unsigned char *out, size_t len, +                                   const void *key, unsigned char ivec[16], +                                   block128_f block);  size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], cbc128_f cbc); - -size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], block128_f block); +                             size_t len, const void *key, +                             unsigned char ivec[16], cbc128_f cbc); + +size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, +                                       unsigned char *out, size_t len, +                                       const void *key, +                                       unsigned char ivec[16], +                                       block128_f block);  size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], cbc128_f cbc); -size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], block128_f block); +                                 size_t len, const void *key, +                           
      unsigned char ivec[16], cbc128_f cbc); +size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, +                                       unsigned char *out, size_t len, +                                       const void *key, +                                       unsigned char ivec[16], +                                       block128_f block);  size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], cbc128_f cbc); +                                 size_t len, const void *key, +                                 unsigned char ivec[16], cbc128_f cbc);  typedef struct gcm128_context GCM128_CONTEXT;  GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block); -void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block); +void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block);  void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv, -			size_t len); +                         size_t len);  int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad, -			size_t len); +                      size_t len);  int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx, -			const unsigned char *in, unsigned char *out, -			size_t len); +                          const unsigned char *in, unsigned char *out, +                          size_t len);  int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx, -			const unsigned char *in, unsigned char *out, -			size_t len); +                          const unsigned char *in, unsigned char *out, +                          size_t len);  int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx, -			const unsigned char *in, unsigned char *out, -			size_t len, ctr128_f stream); +                                const unsigned char *in, unsigned char *out, +                                size_t len, ctr128_f stream);  int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx, -			const unsigned char *in, unsigned char *out, 
-			size_t len, ctr128_f stream); -int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag, -			size_t len); +                                const unsigned char *in, unsigned char *out, +                                size_t len, ctr128_f stream); +int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag, +                         size_t len);  void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len);  void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx);  typedef struct ccm128_context CCM128_CONTEXT;  void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx, -	unsigned int M, unsigned int L, void *key,block128_f block); -int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx, -	const unsigned char *nonce, size_t nlen, size_t mlen); -void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx, -	const unsigned char *aad, size_t alen); -int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx, -	const unsigned char *inp, unsigned char *out, size_t len); -int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx, -	const unsigned char *inp, unsigned char *out, size_t len); -int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx, -	const unsigned char *inp, unsigned char *out, size_t len, -	ccm128_f stream); -int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx, -	const unsigned char *inp, unsigned char *out, size_t len, -	ccm128_f stream); +                        unsigned int M, unsigned int L, void *key, +                        block128_f block); +int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx, const unsigned char *nonce, +                        size_t nlen, size_t mlen); +void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx, const unsigned char *aad, +                       size_t alen); +int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx, const unsigned char *inp, +                          unsigned char *out, size_t len); +int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx, const unsigned char *inp, +                          unsigned char *out, size_t len); +int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT 
*ctx, const unsigned char *inp, +                                unsigned char *out, size_t len, +                                ccm128_f stream); +int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx, const unsigned char *inp, +                                unsigned char *out, size_t len, +                                ccm128_f stream);  size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len);  typedef struct xts128_context XTS128_CONTEXT; -int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16], -	const unsigned char *inp, unsigned char *out, size_t len, int enc); +int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, +                          const unsigned char iv[16], +                          const unsigned char *inp, unsigned char *out, +                          size_t len, int enc); + +size_t CRYPTO_128_wrap(void *key, const unsigned char *iv, +                       unsigned char *out, +                       const unsigned char *in, size_t inlen, +                       block128_f block); + +size_t CRYPTO_128_unwrap(void *key, const unsigned char *iv, +                         unsigned char *out, +                         const unsigned char *in, size_t inlen, +                         block128_f block); +  #ifdef  __cplusplus  }  #endif diff --git a/openssl/crypto/modes/modes_lcl.h b/openssl/crypto/modes/modes_lcl.h index 9d83e1284..900f54ca2 100644 --- a/openssl/crypto/modes/modes_lcl.h +++ b/openssl/crypto/modes/modes_lcl.h @@ -7,122 +7,137 @@  #include <openssl/modes.h> -  #if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)  typedef __int64 i64;  typedef unsigned __int64 u64; -#define U64(C) C##UI64 +# define U64(C) C##UI64  #elif defined(__arch64__)  typedef long i64;  typedef unsigned long u64; -#define U64(C) C##UL +# define U64(C) C##UL  #else  typedef long long i64;  typedef unsigned long long u64; -#define U64(C) C##ULL +# define U64(C) C##ULL  #endif  typedef unsigned int u32;  
typedef unsigned char u8;  #define STRICT_ALIGNMENT 1 -#if defined(__i386)	|| defined(__i386__)	|| \ -    defined(__x86_64)	|| defined(__x86_64__)	|| \ -    defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64) || \ -    defined(__s390__)	|| defined(__s390x__) -# undef STRICT_ALIGNMENT +#ifndef PEDANTIC +# if defined(__i386)    || defined(__i386__)    || \ +     defined(__x86_64)  || defined(__x86_64__)  || \ +     defined(_M_IX86)   || defined(_M_AMD64)    || defined(_M_X64) || \ +     defined(__aarch64__)                       || \ +     defined(__s390__)  || defined(__s390x__) +#  undef STRICT_ALIGNMENT +# endif  #endif  #if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) -#if defined(__GNUC__) && __GNUC__>=2 -# if defined(__x86_64) || defined(__x86_64__) -#  define BSWAP8(x) ({	u64 ret=(x);			\ -			asm ("bswapq %0"		\ -			: "+r"(ret));	ret;		}) -#  define BSWAP4(x) ({	u32 ret=(x);			\ -			asm ("bswapl %0"		\ -			: "+r"(ret));	ret;		}) -# elif (defined(__i386) || defined(__i386__)) && !defined(I386_ONLY) -#  define BSWAP8(x) ({	u32 lo=(u64)(x)>>32,hi=(x);	\ -			asm ("bswapl %0; bswapl %1"	\ -			: "+r"(hi),"+r"(lo));		\ -			(u64)hi<<32|lo;			}) -#  define BSWAP4(x) ({	u32 ret=(x);			\ -			asm ("bswapl %0"		\ -			: "+r"(ret));	ret;		}) -# elif (defined(__arm__) || defined(__arm)) && !defined(STRICT_ALIGNMENT) -#  define BSWAP8(x) ({	u32 lo=(u64)(x)>>32,hi=(x);	\ -			asm ("rev %0,%0; rev %1,%1"	\ -			: "+r"(hi),"+r"(lo));		\ -			(u64)hi<<32|lo;			}) -#  define BSWAP4(x) ({	u32 ret;			\ -			asm ("rev %0,%1"		\ -			: "=r"(ret) : "r"((u32)(x)));	\ -			ret;				}) +# if defined(__GNUC__) && __GNUC__>=2 +#  if defined(__x86_64) || defined(__x86_64__) +#   define BSWAP8(x) ({ u64 ret=(x);                    \ +                        asm ("bswapq %0"                \ +                        : "+r"(ret));   ret;            }) +#   define BSWAP4(x) ({ u32 ret=(x);                    \ +                        asm ("bswapl %0"            
    \ +                        : "+r"(ret));   ret;            }) +#  elif (defined(__i386) || defined(__i386__)) && !defined(I386_ONLY) +#   define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x);     \ +                        asm ("bswapl %0; bswapl %1"     \ +                        : "+r"(hi),"+r"(lo));           \ +                        (u64)hi<<32|lo;                 }) +#   define BSWAP4(x) ({ u32 ret=(x);                    \ +                        asm ("bswapl %0"                \ +                        : "+r"(ret));   ret;            }) +#  elif defined(__aarch64__) +#   define BSWAP8(x) ({ u64 ret;                        \ +                        asm ("rev %0,%1"                \ +                        : "=r"(ret) : "r"(x)); ret;     }) +#   define BSWAP4(x) ({ u32 ret;                        \ +                        asm ("rev %w0,%w1"              \ +                        : "=r"(ret) : "r"(x)); ret;     }) +#  elif (defined(__arm__) || defined(__arm)) && !defined(STRICT_ALIGNMENT) +#   define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x);     \ +                        asm ("rev %0,%0; rev %1,%1"     \ +                        : "+r"(hi),"+r"(lo));           \ +                        (u64)hi<<32|lo;                 }) +#   define BSWAP4(x) ({ u32 ret;                        \ +                        asm ("rev %0,%1"                \ +                        : "=r"(ret) : "r"((u32)(x)));   \ +                        ret;                            }) +#  endif +# elif defined(_MSC_VER) +#  if _MSC_VER>=1300 +#   pragma intrinsic(_byteswap_uint64,_byteswap_ulong) +#   define BSWAP8(x)    _byteswap_uint64((u64)(x)) +#   define BSWAP4(x)    _byteswap_ulong((u32)(x)) +#  elif defined(_M_IX86) +__inline u32 _bswap4(u32 val) +{ +_asm mov eax, val _asm bswap eax} +#   define BSWAP4(x)    _bswap4(x) +#  endif  # endif -#elif defined(_MSC_VER) -# if _MSC_VER>=1300 -#  pragma intrinsic(_byteswap_uint64,_byteswap_ulong) -#  define BSWAP8(x)	
_byteswap_uint64((u64)(x)) -#  define BSWAP4(x)	_byteswap_ulong((u32)(x)) -# elif defined(_M_IX86) -   __inline u32 _bswap4(u32 val) { -	_asm mov eax,val -	_asm bswap eax -   } -#  define BSWAP4(x)	_bswap4(x) -# endif -#endif  #endif -  #if defined(BSWAP4) && !defined(STRICT_ALIGNMENT) -#define GETU32(p)	BSWAP4(*(const u32 *)(p)) -#define PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v) +# define GETU32(p)       BSWAP4(*(const u32 *)(p)) +# define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)  #else -#define GETU32(p)	((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3]) -#define PUTU32(p,v)	((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v)) +# define GETU32(p)       ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3]) +# define PUTU32(p,v)     ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))  #endif +/*- GCM definitions */ typedef struct { +    u64 hi, lo; +} u128; -/* GCM definitions */ - -typedef struct { u64 hi,lo; } u128; - -#ifdef	TABLE_BITS -#undef	TABLE_BITS +#ifdef  TABLE_BITS +# undef  TABLE_BITS  #endif  /*   * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should   * never be set to 8 [or 1]. For further information see gcm128.c.   */ -#define	TABLE_BITS 4 +#define TABLE_BITS 4  struct gcm128_context { -	/* Following 6 names follow names in GCM specification */ -	union { u64 u[2]; u32 d[4]; u8 c[16]; size_t t[16/sizeof(size_t)]; } -	  Yi,EKi,EK0,len,Xi,H; -	/* Relative position of Xi, H and pre-computed Htable is used -	 * in some assembler modules, i.e. don't change the order! */ +    /* Following 6 names follow names in GCM specification */ +    union { +        u64 u[2]; +        u32 d[4]; +        u8 c[16]; +        size_t t[16 / sizeof(size_t)]; +    } Yi, EKi, EK0, len, Xi, H; +    /* +     * Relative position of Xi, H and pre-computed Htable is used in some +     * assembler modules, i.e. don't change the order! 
+     */  #if TABLE_BITS==8 -	u128 Htable[256]; +    u128 Htable[256];  #else -	u128 Htable[16]; -	void (*gmult)(u64 Xi[2],const u128 Htable[16]); -	void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); +    u128 Htable[16]; +    void (*gmult) (u64 Xi[2], const u128 Htable[16]); +    void (*ghash) (u64 Xi[2], const u128 Htable[16], const u8 *inp, +                   size_t len);  #endif -	unsigned int mres, ares; -	block128_f block; -	void *key; +    unsigned int mres, ares; +    block128_f block; +    void *key;  };  struct xts128_context { -	void      *key1, *key2; -	block128_f block1,block2; +    void *key1, *key2; +    block128_f block1, block2;  };  struct ccm128_context { -	union { u64 u[2]; u8 c[16]; } nonce, cmac; -	u64 blocks; -	block128_f block; -	void *key; +    union { +        u64 u[2]; +        u8 c[16]; +    } nonce, cmac; +    u64 blocks; +    block128_f block; +    void *key;  }; - diff --git a/openssl/crypto/modes/ofb128.c b/openssl/crypto/modes/ofb128.c index 01c01702c..4dbaccd7a 100644 --- a/openssl/crypto/modes/ofb128.c +++ b/openssl/crypto/modes/ofb128.c @@ -6,7 +6,7 @@   * are met:   *   * 1. Redistributions of source code must retain the above copyright - *    notice, this list of conditions and the following disclaimer.  + *    notice, this list of conditions and the following disclaimer.   *   * 2. Redistributions in binary form must reproduce the above copyright   *    notice, this list of conditions and the following disclaimer in @@ -59,63 +59,66 @@  #endif  #include <assert.h> -/* The input and output encrypted as though 128bit ofb mode is being - * used.  The extra state information to record how much of the - * 128bit block we have used is contained in *num; +/* + * The input and output encrypted as though 128bit ofb mode is being used. 
+ * The extra state information to record how much of the 128bit block we have + * used is contained in *num;   */  void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out, -			size_t len, const void *key, -			unsigned char ivec[16], int *num, -			block128_f block) +                           size_t len, const void *key, +                           unsigned char ivec[16], int *num, block128_f block)  { -	unsigned int n; -	size_t l=0; +    unsigned int n; +    size_t l = 0; -	assert(in && out && key && ivec && num); +    assert(in && out && key && ivec && num); -	n = *num; +    n = *num;  #if !defined(OPENSSL_SMALL_FOOTPRINT) -	if (16%sizeof(size_t) == 0) do { /* always true actually */ -		while (n && len) { -			*(out++) = *(in++) ^ ivec[n]; -			--len; -			n = (n+1) % 16; -		} -#if defined(STRICT_ALIGNMENT) -		if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0) -			break; -#endif -		while (len>=16) { -			(*block)(ivec, ivec, key); -			for (; n<16; n+=sizeof(size_t)) -				*(size_t*)(out+n) = -				*(size_t*)(in+n) ^ *(size_t*)(ivec+n); -			len -= 16; -			out += 16; -			in  += 16; -			n = 0; -		} -		if (len) { -			(*block)(ivec, ivec, key); -			while (len--) { -				out[n] = in[n] ^ ivec[n]; -				++n; -			} -		} -		*num = n; -		return; -	} while(0); -	/* the rest would be commonly eliminated by x86* compiler */ +    if (16 % sizeof(size_t) == 0) { /* always true actually */ +        do { +            while (n && len) { +                *(out++) = *(in++) ^ ivec[n]; +                --len; +                n = (n + 1) % 16; +            } +# if defined(STRICT_ALIGNMENT) +            if (((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != +                0) +                break; +# endif +            while (len >= 16) { +                (*block) (ivec, ivec, key); +                for (; n < 16; n += sizeof(size_t)) +                    *(size_t *)(out + n) = +                        *(size_t *)(in + n) ^ *(size_t *)(ivec + n); +     
           len -= 16; +                out += 16; +                in += 16; +                n = 0; +            } +            if (len) { +                (*block) (ivec, ivec, key); +                while (len--) { +                    out[n] = in[n] ^ ivec[n]; +                    ++n; +                } +            } +            *num = n; +            return; +        } while (0); +    } +    /* the rest would be commonly eliminated by x86* compiler */  #endif -	while (l<len) { -		if (n==0) { -			(*block)(ivec, ivec, key); -		} -		out[l] = in[l] ^ ivec[n]; -		++l; -		n = (n+1) % 16; -	} +    while (l < len) { +        if (n == 0) { +            (*block) (ivec, ivec, key); +        } +        out[l] = in[l] ^ ivec[n]; +        ++l; +        n = (n + 1) % 16; +    } -	*num=n; +    *num = n;  } diff --git a/openssl/crypto/modes/wrap128.c b/openssl/crypto/modes/wrap128.c new file mode 100755 index 000000000..4dcaf0326 --- /dev/null +++ b/openssl/crypto/modes/wrap128.c @@ -0,0 +1,138 @@ +/* crypto/modes/wrap128.c */ +/* + * Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL + * project. + */ +/* ==================================================================== + * Copyright (c) 2013 The OpenSSL Project.  All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + *    notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + *    notice, this list of conditions and the following disclaimer in + *    the documentation and/or other materials provided with the + *    distribution. + * + * 3. 
All advertising materials mentioning features or use of this + *    software must display the following acknowledgment: + *    "This product includes software developed by the OpenSSL Project + *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + *    endorse or promote products derived from this software without + *    prior written permission. For written permission, please contact + *    licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + *    nor may "OpenSSL" appear in their names without prior written + *    permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + *    acknowledgment: + *    "This product includes software developed by the OpenSSL Project + *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * ==================================================================== + */ + +#include "cryptlib.h" +#include <openssl/modes.h> + +static const unsigned char default_iv[] = { +    0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, +}; + +/* + * Input size limit: lower than maximum of standards but far larger than + * anything that will be used in practice. + */ +#define CRYPTO128_WRAP_MAX (1UL << 31) + +size_t CRYPTO_128_wrap(void *key, const unsigned char *iv, +                       unsigned char *out, +                       const unsigned char *in, size_t inlen, +                       block128_f block) +{ +    unsigned char *A, B[16], *R; +    size_t i, j, t; +    if ((inlen & 0x7) || (inlen < 8) || (inlen > CRYPTO128_WRAP_MAX)) +        return 0; +    A = B; +    t = 1; +    memcpy(out + 8, in, inlen); +    if (!iv) +        iv = default_iv; + +    memcpy(A, iv, 8); + +    for (j = 0; j < 6; j++) { +        R = out + 8; +        for (i = 0; i < inlen; i += 8, t++, R += 8) { +            memcpy(B + 8, R, 8); +            block(B, B, key); +            A[7] ^= (unsigned char)(t & 0xff); +            if (t > 0xff) { +                A[6] ^= (unsigned char)((t >> 8) & 0xff); +                A[5] ^= (unsigned char)((t >> 16) & 0xff); +                A[4] ^= (unsigned char)((t >> 24) & 0xff); +            } +            memcpy(R, B + 8, 8); +        } +    } +    memcpy(out, A, 8); +    return inlen + 8; +} + +size_t CRYPTO_128_unwrap(void *key, const unsigned char *iv, +                         unsigned char *out, +                         const unsigned char *in, size_t inlen, +                         block128_f block) +{ +    unsigned char *A, B[16], *R; +    size_t i, j, t; +    inlen -= 8; +    if ((inlen & 0x7) || (inlen < 16) || (inlen > CRYPTO128_WRAP_MAX)) +        return 0; +    A = B; +    t = 6 * (inlen >> 3); +    memcpy(A, in, 8); +    memcpy(out, in + 8, inlen); +    for (j = 0; j < 6; j++) { +        R = out + inlen - 8; +        for (i = 0; i < 
inlen; i += 8, t--, R -= 8) { +            A[7] ^= (unsigned char)(t & 0xff); +            if (t > 0xff) { +                A[6] ^= (unsigned char)((t >> 8) & 0xff); +                A[5] ^= (unsigned char)((t >> 16) & 0xff); +                A[4] ^= (unsigned char)((t >> 24) & 0xff); +            } +            memcpy(B + 8, R, 8); +            block(B, B, key); +            memcpy(R, B + 8, 8); +        } +    } +    if (!iv) +        iv = default_iv; +    if (memcmp(A, iv, 8)) { +        OPENSSL_cleanse(out, inlen); +        return 0; +    } +    return inlen; +} diff --git a/openssl/crypto/modes/xts128.c b/openssl/crypto/modes/xts128.c index 9cf27a25e..8f2af588b 100644 --- a/openssl/crypto/modes/xts128.c +++ b/openssl/crypto/modes/xts128.c @@ -6,7 +6,7 @@   * are met:   *   * 1. Redistributions of source code must retain the above copyright - *    notice, this list of conditions and the following disclaimer.  + *    notice, this list of conditions and the following disclaimer.   *   * 2. 
Redistributions in binary form must reproduce the above copyright   *    notice, this list of conditions and the following disclaimer in @@ -58,130 +58,147 @@  #endif  #include <assert.h> -int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16], -	const unsigned char *inp, unsigned char *out, -	size_t len, int enc) +int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, +                          const unsigned char iv[16], +                          const unsigned char *inp, unsigned char *out, +                          size_t len, int enc)  { -	const union { long one; char little; } is_endian = {1}; -	union { u64 u[2]; u32 d[4]; u8 c[16]; } tweak, scratch; -	unsigned int i; - -	if (len<16) return -1; - -	memcpy(tweak.c, iv, 16); - -	(*ctx->block2)(tweak.c,tweak.c,ctx->key2); - -	if (!enc && (len%16)) len-=16; - -	while (len>=16) { +    const union { +        long one; +        char little; +    } is_endian = { +        1 +    }; +    union { +        u64 u[2]; +        u32 d[4]; +        u8 c[16]; +    } tweak, scratch; +    unsigned int i; + +    if (len < 16) +        return -1; + +    memcpy(tweak.c, iv, 16); + +    (*ctx->block2) (tweak.c, tweak.c, ctx->key2); + +    if (!enc && (len % 16)) +        len -= 16; + +    while (len >= 16) {  #if defined(STRICT_ALIGNMENT) -		memcpy(scratch.c,inp,16); -		scratch.u[0] ^= tweak.u[0]; -		scratch.u[1] ^= tweak.u[1]; +        memcpy(scratch.c, inp, 16); +        scratch.u[0] ^= tweak.u[0]; +        scratch.u[1] ^= tweak.u[1];  #else -		scratch.u[0] = ((u64*)inp)[0]^tweak.u[0]; -		scratch.u[1] = ((u64*)inp)[1]^tweak.u[1]; +        scratch.u[0] = ((u64 *)inp)[0] ^ tweak.u[0]; +        scratch.u[1] = ((u64 *)inp)[1] ^ tweak.u[1];  #endif -		(*ctx->block1)(scratch.c,scratch.c,ctx->key1); +        (*ctx->block1) (scratch.c, scratch.c, ctx->key1);  #if defined(STRICT_ALIGNMENT) -		scratch.u[0] ^= tweak.u[0]; -		scratch.u[1] ^= tweak.u[1]; -		memcpy(out,scratch.c,16); +        scratch.u[0] ^= tweak.u[0]; 
+        scratch.u[1] ^= tweak.u[1]; +        memcpy(out, scratch.c, 16);  #else -		((u64*)out)[0] = scratch.u[0]^=tweak.u[0]; -		((u64*)out)[1] = scratch.u[1]^=tweak.u[1]; +        ((u64 *)out)[0] = scratch.u[0] ^= tweak.u[0]; +        ((u64 *)out)[1] = scratch.u[1] ^= tweak.u[1];  #endif -		inp += 16; -		out += 16; -		len -= 16; - -		if (len==0)	return 0; - -		if (is_endian.little) { -			unsigned int carry,res; -			 -			res = 0x87&(((int)tweak.d[3])>>31); -			carry = (unsigned int)(tweak.u[0]>>63); -			tweak.u[0] = (tweak.u[0]<<1)^res; -			tweak.u[1] = (tweak.u[1]<<1)|carry; -		} -		else { -			size_t c; - -			for (c=0,i=0;i<16;++i) { -				/*+ substitutes for |, because c is 1 bit */  -				c += ((size_t)tweak.c[i])<<1; -				tweak.c[i] = (u8)c; -				c = c>>8; -			} -			tweak.c[0] ^= (u8)(0x87&(0-c)); -		} -	} -	if (enc) { -		for (i=0;i<len;++i) { -			u8 c = inp[i]; -			out[i] = scratch.c[i]; -			scratch.c[i] = c; -		} -		scratch.u[0] ^= tweak.u[0]; -		scratch.u[1] ^= tweak.u[1]; -		(*ctx->block1)(scratch.c,scratch.c,ctx->key1); -		scratch.u[0] ^= tweak.u[0]; -		scratch.u[1] ^= tweak.u[1]; -		memcpy(out-16,scratch.c,16); -	} -	else { -		union { u64 u[2]; u8 c[16]; } tweak1; - -		if (is_endian.little) { -			unsigned int carry,res; - -			res = 0x87&(((int)tweak.d[3])>>31); -			carry = (unsigned int)(tweak.u[0]>>63); -			tweak1.u[0] = (tweak.u[0]<<1)^res; -			tweak1.u[1] = (tweak.u[1]<<1)|carry; -		} -		else { -			size_t c; - -			for (c=0,i=0;i<16;++i) { -				/*+ substitutes for |, because c is 1 bit */  -				c += ((size_t)tweak.c[i])<<1; -				tweak1.c[i] = (u8)c; -				c = c>>8; -			} -			tweak1.c[0] ^= (u8)(0x87&(0-c)); -		} +        inp += 16; +        out += 16; +        len -= 16; + +        if (len == 0) +            return 0; + +        if (is_endian.little) { +            unsigned int carry, res; + +            res = 0x87 & (((int)tweak.d[3]) >> 31); +            carry = (unsigned int)(tweak.u[0] >> 63); +            tweak.u[0] = (tweak.u[0] << 1) ^ res; +       
     tweak.u[1] = (tweak.u[1] << 1) | carry; +        } else { +            size_t c; + +            for (c = 0, i = 0; i < 16; ++i) { +                /* +                 * + substitutes for |, because c is 1 bit +                 */ +                c += ((size_t)tweak.c[i]) << 1; +                tweak.c[i] = (u8)c; +                c = c >> 8; +            } +            tweak.c[0] ^= (u8)(0x87 & (0 - c)); +        } +    } +    if (enc) { +        for (i = 0; i < len; ++i) { +            u8 c = inp[i]; +            out[i] = scratch.c[i]; +            scratch.c[i] = c; +        } +        scratch.u[0] ^= tweak.u[0]; +        scratch.u[1] ^= tweak.u[1]; +        (*ctx->block1) (scratch.c, scratch.c, ctx->key1); +        scratch.u[0] ^= tweak.u[0]; +        scratch.u[1] ^= tweak.u[1]; +        memcpy(out - 16, scratch.c, 16); +    } else { +        union { +            u64 u[2]; +            u8 c[16]; +        } tweak1; + +        if (is_endian.little) { +            unsigned int carry, res; + +            res = 0x87 & (((int)tweak.d[3]) >> 31); +            carry = (unsigned int)(tweak.u[0] >> 63); +            tweak1.u[0] = (tweak.u[0] << 1) ^ res; +            tweak1.u[1] = (tweak.u[1] << 1) | carry; +        } else { +            size_t c; + +            for (c = 0, i = 0; i < 16; ++i) { +                /* +                 * + substitutes for |, because c is 1 bit +                 */ +                c += ((size_t)tweak.c[i]) << 1; +                tweak1.c[i] = (u8)c; +                c = c >> 8; +            } +            tweak1.c[0] ^= (u8)(0x87 & (0 - c)); +        }  #if defined(STRICT_ALIGNMENT) -		memcpy(scratch.c,inp,16); -		scratch.u[0] ^= tweak1.u[0]; -		scratch.u[1] ^= tweak1.u[1]; +        memcpy(scratch.c, inp, 16); +        scratch.u[0] ^= tweak1.u[0]; +        scratch.u[1] ^= tweak1.u[1];  #else -		scratch.u[0] = ((u64*)inp)[0]^tweak1.u[0]; -		scratch.u[1] = ((u64*)inp)[1]^tweak1.u[1]; +        scratch.u[0] = ((u64 *)inp)[0] ^ tweak1.u[0]; 
+        scratch.u[1] = ((u64 *)inp)[1] ^ tweak1.u[1];  #endif -		(*ctx->block1)(scratch.c,scratch.c,ctx->key1); -		scratch.u[0] ^= tweak1.u[0]; -		scratch.u[1] ^= tweak1.u[1]; - -		for (i=0;i<len;++i) { -			u8 c = inp[16+i]; -			out[16+i] = scratch.c[i]; -			scratch.c[i] = c; -		} -		scratch.u[0] ^= tweak.u[0]; -		scratch.u[1] ^= tweak.u[1]; -		(*ctx->block1)(scratch.c,scratch.c,ctx->key1); +        (*ctx->block1) (scratch.c, scratch.c, ctx->key1); +        scratch.u[0] ^= tweak1.u[0]; +        scratch.u[1] ^= tweak1.u[1]; + +        for (i = 0; i < len; ++i) { +            u8 c = inp[16 + i]; +            out[16 + i] = scratch.c[i]; +            scratch.c[i] = c; +        } +        scratch.u[0] ^= tweak.u[0]; +        scratch.u[1] ^= tweak.u[1]; +        (*ctx->block1) (scratch.c, scratch.c, ctx->key1);  #if defined(STRICT_ALIGNMENT) -		scratch.u[0] ^= tweak.u[0]; -		scratch.u[1] ^= tweak.u[1]; -		memcpy (out,scratch.c,16); +        scratch.u[0] ^= tweak.u[0]; +        scratch.u[1] ^= tweak.u[1]; +        memcpy(out, scratch.c, 16);  #else -		((u64*)out)[0] = scratch.u[0]^tweak.u[0]; -		((u64*)out)[1] = scratch.u[1]^tweak.u[1]; +        ((u64 *)out)[0] = scratch.u[0] ^ tweak.u[0]; +        ((u64 *)out)[1] = scratch.u[1] ^ tweak.u[1];  #endif -	} +    } -	return 0; +    return 0;  } | 
