aboutsummaryrefslogtreecommitdiff
path: root/openssl/crypto/modes
diff options
context:
space:
mode:
authormarha <marha@users.sourceforge.net>2015-02-22 21:39:56 +0100
committermarha <marha@users.sourceforge.net>2015-02-22 21:39:56 +0100
commit462f18c7b25fe3e467f837647d07ab0a78aa8d2b (patch)
treefc8013c0a1bac05a1945846c1697e973f4c35013 /openssl/crypto/modes
parent36f711ee12b6dd5184198abed3aa551efb585587 (diff)
downloadvcxsrv-462f18c7b25fe3e467f837647d07ab0a78aa8d2b.tar.gz
vcxsrv-462f18c7b25fe3e467f837647d07ab0a78aa8d2b.tar.bz2
vcxsrv-462f18c7b25fe3e467f837647d07ab0a78aa8d2b.zip
Merged origin/release (checked in because wanted to merge new stuff)
Diffstat (limited to 'openssl/crypto/modes')
-rw-r--r--openssl/crypto/modes/Makefile24
-rwxr-xr-xopenssl/crypto/modes/asm/aesni-gcm-x86_64.pl1057
-rw-r--r--openssl/crypto/modes/asm/ghash-armv4.pl232
-rw-r--r--openssl/crypto/modes/asm/ghash-s390x.pl6
-rw-r--r--openssl/crypto/modes/asm/ghash-sparcv9.pl247
-rw-r--r--openssl/crypto/modes/asm/ghash-x86.pl199
-rw-r--r--openssl/crypto/modes/asm/ghash-x86_64.pl1149
-rwxr-xr-xopenssl/crypto/modes/asm/ghashp8-ppc.pl234
-rwxr-xr-xopenssl/crypto/modes/asm/ghashv8-armx.pl241
-rw-r--r--openssl/crypto/modes/cbc128.c252
-rw-r--r--openssl/crypto/modes/ccm128.c682
-rw-r--r--openssl/crypto/modes/cfb128.c292
-rw-r--r--openssl/crypto/modes/ctr128.c354
-rw-r--r--openssl/crypto/modes/cts128.c707
-rw-r--r--openssl/crypto/modes/gcm128.c3478
-rw-r--r--openssl/crypto/modes/modes.h194
-rw-r--r--openssl/crypto/modes/modes_lcl.h171
-rw-r--r--openssl/crypto/modes/ofb128.c105
-rwxr-xr-xopenssl/crypto/modes/wrap128.c138
-rw-r--r--openssl/crypto/modes/xts128.c243
20 files changed, 6834 insertions, 3171 deletions
diff --git a/openssl/crypto/modes/Makefile b/openssl/crypto/modes/Makefile
index 3d8bafd57..cbcbfad4b 100644
--- a/openssl/crypto/modes/Makefile
+++ b/openssl/crypto/modes/Makefile
@@ -22,9 +22,9 @@ APPS=
LIB=$(TOP)/libcrypto.a
LIBSRC= cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c \
- ccm128.c xts128.c
+ ccm128.c xts128.c wrap128.c
LIBOBJ= cbc128.o ctr128.o cts128.o cfb128.o ofb128.o gcm128.o \
- ccm128.o xts128.o $(MODES_ASM_OBJ)
+ ccm128.o xts128.o wrap128.o $(MODES_ASM_OBJ)
SRC= $(LIBSRC)
@@ -50,20 +50,26 @@ ghash-x86.s: asm/ghash-x86.pl
$(PERL) asm/ghash-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
ghash-x86_64.s: asm/ghash-x86_64.pl
$(PERL) asm/ghash-x86_64.pl $(PERLASM_SCHEME) > $@
+aesni-gcm-x86_64.s: asm/aesni-gcm-x86_64.pl
+ $(PERL) asm/aesni-gcm-x86_64.pl $(PERLASM_SCHEME) > $@
ghash-sparcv9.s: asm/ghash-sparcv9.pl
$(PERL) asm/ghash-sparcv9.pl $@ $(CFLAGS)
ghash-alpha.s: asm/ghash-alpha.pl
- (preproc=/tmp/$$$$.$@; trap "rm $$preproc" INT; \
+ (preproc=$$$$.$@.S; trap "rm $$preproc" INT; \
$(PERL) asm/ghash-alpha.pl > $$preproc && \
- $(CC) -E $$preproc > $@ && rm $$preproc)
-
+ $(CC) -E -P $$preproc > $@ && rm $$preproc)
ghash-parisc.s: asm/ghash-parisc.pl
$(PERL) asm/ghash-parisc.pl $(PERLASM_SCHEME) $@
+ghashv8-armx.S: asm/ghashv8-armx.pl
+ $(PERL) asm/ghashv8-armx.pl $(PERLASM_SCHEME) $@
+ghashp8-ppc.s: asm/ghashp8-ppc.pl
+ $(PERL) asm/ghashp8-ppc.pl $(PERLASM_SCHEME) $@
# GNU make "catch all"
ghash-%.S: asm/ghash-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
ghash-armv4.o: ghash-armv4.S
+ghashv8-armx.o: ghashv8-armx.S
files:
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
@@ -137,6 +143,14 @@ ofb128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
ofb128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
ofb128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
ofb128.o: ../../include/openssl/symhacks.h modes_lcl.h ofb128.c
+wrap128.o: ../../e_os.h ../../include/openssl/bio.h
+wrap128.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+wrap128.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+wrap128.o: ../../include/openssl/lhash.h ../../include/openssl/modes.h
+wrap128.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h
+wrap128.o: ../../include/openssl/ossl_typ.h ../../include/openssl/safestack.h
+wrap128.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
+wrap128.o: ../cryptlib.h wrap128.c
xts128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
xts128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
xts128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
diff --git a/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl b/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl
new file mode 100755
index 000000000..7e4e04ea2
--- /dev/null
+++ b/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl
@@ -0,0 +1,1057 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+#
+# AES-NI-CTR+GHASH stitch.
+#
+# February 2013
+#
+# OpenSSL GCM implementation is organized in such way that its
+# performance is rather close to the sum of its streamed components,
+# in the context parallelized AES-NI CTR and modulo-scheduled
+# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
+# was observed to perform significantly better than the sum of the
+# components on contemporary CPUs, the effort was deemed impossible to
+# justify. This module is based on combination of Intel submissions,
+# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
+# Locktyukhin of Intel Corp. who verified that it reduces shuffles
+# pressure with notable relative improvement, achieving 1.0 cycle per
+# byte processed with 128-bit key on Haswell processor, and 0.74 -
+# on Broadwell. [Mentioned results are raw profiled measurements for
+# favourable packet size, one divisible by 96. Applications using the
+# EVP interface will observe a few percent worse performance.]
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+}
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+if ($avx>1) {{{
+
+($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+
+($Ii,$T1,$T2,$Hkey,
+ $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));
+
+($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));
+
+($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");
+
+$code=<<___;
+.text
+
+.type _aesni_ctr32_ghash_6x,\@abi-omnipotent
+.align 32
+_aesni_ctr32_ghash_6x:
+ vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
+ sub \$6,$len
+ vpxor $Z0,$Z0,$Z0 # $Z0 = 0
+ vmovdqu 0x00-0x80($key),$rndkey
+ vpaddb $T2,$T1,$inout1
+ vpaddb $T2,$inout1,$inout2
+ vpaddb $T2,$inout2,$inout3
+ vpaddb $T2,$inout3,$inout4
+ vpaddb $T2,$inout4,$inout5
+ vpxor $rndkey,$T1,$inout0
+ vmovdqu $Z0,16+8(%rsp) # "$Z3" = 0
+ jmp .Loop6x
+
+.align 32
+.Loop6x:
+ add \$`6<<24`,$counter
+ jc .Lhandle_ctr32 # discard $inout[1-5]?
+ vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
+ vpaddb $T2,$inout5,$T1 # next counter value
+ vpxor $rndkey,$inout1,$inout1
+ vpxor $rndkey,$inout2,$inout2
+
+.Lresume_ctr32:
+ vmovdqu $T1,($ivp) # save next counter value
+ vpclmulqdq \$0x10,$Hkey,$Z3,$Z1
+ vpxor $rndkey,$inout3,$inout3
+ vmovups 0x10-0x80($key),$T2 # borrow $T2 for $rndkey
+ vpclmulqdq \$0x01,$Hkey,$Z3,$Z2
+ xor %r12,%r12
+ cmp $in0,$end0
+
+ vaesenc $T2,$inout0,$inout0
+ vmovdqu 0x30+8(%rsp),$Ii # I[4]
+ vpxor $rndkey,$inout4,$inout4
+ vpclmulqdq \$0x00,$Hkey,$Z3,$T1
+ vaesenc $T2,$inout1,$inout1
+ vpxor $rndkey,$inout5,$inout5
+ setnc %r12b
+ vpclmulqdq \$0x11,$Hkey,$Z3,$Z3
+ vaesenc $T2,$inout2,$inout2
+ vmovdqu 0x10-0x20($Xip),$Hkey # $Hkey^2
+ neg %r12
+ vaesenc $T2,$inout3,$inout3
+ vpxor $Z1,$Z2,$Z2
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Z1
+ vpxor $Z0,$Xi,$Xi # modulo-scheduled
+ vaesenc $T2,$inout4,$inout4
+ vpxor $Z1,$T1,$Z0
+ and \$0x60,%r12
+ vmovups 0x20-0x80($key),$rndkey
+ vpclmulqdq \$0x10,$Hkey,$Ii,$T1
+ vaesenc $T2,$inout5,$inout5
+
+ vpclmulqdq \$0x01,$Hkey,$Ii,$T2
+ lea ($in0,%r12),$in0
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled [vpxor $Z3,$Xi,$Xi]
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Hkey
+ vmovdqu 0x40+8(%rsp),$Ii # I[3]
+ vaesenc $rndkey,$inout1,$inout1
+ movbe 0x58($in0),%r13
+ vaesenc $rndkey,$inout2,$inout2
+ movbe 0x50($in0),%r12
+ vaesenc $rndkey,$inout3,$inout3
+ mov %r13,0x20+8(%rsp)
+ vaesenc $rndkey,$inout4,$inout4
+ mov %r12,0x28+8(%rsp)
+ vmovdqu 0x30-0x20($Xip),$Z1 # borrow $Z1 for $Hkey^3
+ vaesenc $rndkey,$inout5,$inout5
+
+ vmovups 0x30-0x80($key),$rndkey
+ vpxor $T1,$Z2,$Z2
+ vpclmulqdq \$0x00,$Z1,$Ii,$T1
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor $T2,$Z2,$Z2
+ vpclmulqdq \$0x10,$Z1,$Ii,$T2
+ vaesenc $rndkey,$inout1,$inout1
+ vpxor $Hkey,$Z3,$Z3
+ vpclmulqdq \$0x01,$Z1,$Ii,$Hkey
+ vaesenc $rndkey,$inout2,$inout2
+ vpclmulqdq \$0x11,$Z1,$Ii,$Z1
+ vmovdqu 0x50+8(%rsp),$Ii # I[2]
+ vaesenc $rndkey,$inout3,$inout3
+ vaesenc $rndkey,$inout4,$inout4
+ vpxor $T1,$Z0,$Z0
+ vmovdqu 0x40-0x20($Xip),$T1 # borrow $T1 for $Hkey^4
+ vaesenc $rndkey,$inout5,$inout5
+
+ vmovups 0x40-0x80($key),$rndkey
+ vpxor $T2,$Z2,$Z2
+ vpclmulqdq \$0x00,$T1,$Ii,$T2
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor $Hkey,$Z2,$Z2
+ vpclmulqdq \$0x10,$T1,$Ii,$Hkey
+ vaesenc $rndkey,$inout1,$inout1
+ movbe 0x48($in0),%r13
+ vpxor $Z1,$Z3,$Z3
+ vpclmulqdq \$0x01,$T1,$Ii,$Z1
+ vaesenc $rndkey,$inout2,$inout2
+ movbe 0x40($in0),%r12
+ vpclmulqdq \$0x11,$T1,$Ii,$T1
+ vmovdqu 0x60+8(%rsp),$Ii # I[1]
+ vaesenc $rndkey,$inout3,$inout3
+ mov %r13,0x30+8(%rsp)
+ vaesenc $rndkey,$inout4,$inout4
+ mov %r12,0x38+8(%rsp)
+ vpxor $T2,$Z0,$Z0
+ vmovdqu 0x60-0x20($Xip),$T2 # borrow $T2 for $Hkey^5
+ vaesenc $rndkey,$inout5,$inout5
+
+ vmovups 0x50-0x80($key),$rndkey
+ vpxor $Hkey,$Z2,$Z2
+ vpclmulqdq \$0x00,$T2,$Ii,$Hkey
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor $Z1,$Z2,$Z2
+ vpclmulqdq \$0x10,$T2,$Ii,$Z1
+ vaesenc $rndkey,$inout1,$inout1
+ movbe 0x38($in0),%r13
+ vpxor $T1,$Z3,$Z3
+ vpclmulqdq \$0x01,$T2,$Ii,$T1
+ vpxor 0x70+8(%rsp),$Xi,$Xi # accumulate I[0]
+ vaesenc $rndkey,$inout2,$inout2
+ movbe 0x30($in0),%r12
+ vpclmulqdq \$0x11,$T2,$Ii,$T2
+ vaesenc $rndkey,$inout3,$inout3
+ mov %r13,0x40+8(%rsp)
+ vaesenc $rndkey,$inout4,$inout4
+ mov %r12,0x48+8(%rsp)
+ vpxor $Hkey,$Z0,$Z0
+ vmovdqu 0x70-0x20($Xip),$Hkey # $Hkey^6
+ vaesenc $rndkey,$inout5,$inout5
+
+ vmovups 0x60-0x80($key),$rndkey
+ vpxor $Z1,$Z2,$Z2
+ vpclmulqdq \$0x10,$Hkey,$Xi,$Z1
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor $T1,$Z2,$Z2
+ vpclmulqdq \$0x01,$Hkey,$Xi,$T1
+ vaesenc $rndkey,$inout1,$inout1
+ movbe 0x28($in0),%r13
+ vpxor $T2,$Z3,$Z3
+ vpclmulqdq \$0x00,$Hkey,$Xi,$T2
+ vaesenc $rndkey,$inout2,$inout2
+ movbe 0x20($in0),%r12
+ vpclmulqdq \$0x11,$Hkey,$Xi,$Xi
+ vaesenc $rndkey,$inout3,$inout3
+ mov %r13,0x50+8(%rsp)
+ vaesenc $rndkey,$inout4,$inout4
+ mov %r12,0x58+8(%rsp)
+ vpxor $Z1,$Z2,$Z2
+ vaesenc $rndkey,$inout5,$inout5
+ vpxor $T1,$Z2,$Z2
+
+ vmovups 0x70-0x80($key),$rndkey
+ vpslldq \$8,$Z2,$Z1
+ vpxor $T2,$Z0,$Z0
+ vmovdqu 0x10($const),$Hkey # .Lpoly
+
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor $Xi,$Z3,$Z3
+ vaesenc $rndkey,$inout1,$inout1
+ vpxor $Z1,$Z0,$Z0
+ movbe 0x18($in0),%r13
+ vaesenc $rndkey,$inout2,$inout2
+ movbe 0x10($in0),%r12
+ vpalignr \$8,$Z0,$Z0,$Ii # 1st phase
+ vpclmulqdq \$0x10,$Hkey,$Z0,$Z0
+ mov %r13,0x60+8(%rsp)
+ vaesenc $rndkey,$inout3,$inout3
+ mov %r12,0x68+8(%rsp)
+ vaesenc $rndkey,$inout4,$inout4
+ vmovups 0x80-0x80($key),$T1 # borrow $T1 for $rndkey
+ vaesenc $rndkey,$inout5,$inout5
+
+ vaesenc $T1,$inout0,$inout0
+ vmovups 0x90-0x80($key),$rndkey
+ vaesenc $T1,$inout1,$inout1
+ vpsrldq \$8,$Z2,$Z2
+ vaesenc $T1,$inout2,$inout2
+ vpxor $Z2,$Z3,$Z3
+ vaesenc $T1,$inout3,$inout3
+ vpxor $Ii,$Z0,$Z0
+ movbe 0x08($in0),%r13
+ vaesenc $T1,$inout4,$inout4
+ movbe 0x00($in0),%r12
+ vaesenc $T1,$inout5,$inout5
+ vmovups 0xa0-0x80($key),$T1
+ cmp \$11,$rounds
+ jb .Lenc_tail # 128-bit key
+
+ vaesenc $rndkey,$inout0,$inout0
+ vaesenc $rndkey,$inout1,$inout1
+ vaesenc $rndkey,$inout2,$inout2
+ vaesenc $rndkey,$inout3,$inout3
+ vaesenc $rndkey,$inout4,$inout4
+ vaesenc $rndkey,$inout5,$inout5
+
+ vaesenc $T1,$inout0,$inout0
+ vaesenc $T1,$inout1,$inout1
+ vaesenc $T1,$inout2,$inout2
+ vaesenc $T1,$inout3,$inout3
+ vaesenc $T1,$inout4,$inout4
+ vmovups 0xb0-0x80($key),$rndkey
+ vaesenc $T1,$inout5,$inout5
+ vmovups 0xc0-0x80($key),$T1
+ je .Lenc_tail # 192-bit key
+
+ vaesenc $rndkey,$inout0,$inout0
+ vaesenc $rndkey,$inout1,$inout1
+ vaesenc $rndkey,$inout2,$inout2
+ vaesenc $rndkey,$inout3,$inout3
+ vaesenc $rndkey,$inout4,$inout4
+ vaesenc $rndkey,$inout5,$inout5
+
+ vaesenc $T1,$inout0,$inout0
+ vaesenc $T1,$inout1,$inout1
+ vaesenc $T1,$inout2,$inout2
+ vaesenc $T1,$inout3,$inout3
+ vaesenc $T1,$inout4,$inout4
+ vmovups 0xd0-0x80($key),$rndkey
+ vaesenc $T1,$inout5,$inout5
+ vmovups 0xe0-0x80($key),$T1
+ jmp .Lenc_tail # 256-bit key
+
+.align 32
+.Lhandle_ctr32:
+ vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
+ vpshufb $Ii,$T1,$Z2 # byte-swap counter
+ vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb
+ vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb
+ vpaddd $Z1,$Z2,$inout2
+ vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
+ vpaddd $Z1,$inout1,$inout3
+ vpshufb $Ii,$inout1,$inout1
+ vpaddd $Z1,$inout2,$inout4
+ vpshufb $Ii,$inout2,$inout2
+ vpxor $rndkey,$inout1,$inout1
+ vpaddd $Z1,$inout3,$inout5
+ vpshufb $Ii,$inout3,$inout3
+ vpxor $rndkey,$inout2,$inout2
+ vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value
+ vpshufb $Ii,$inout4,$inout4
+ vpshufb $Ii,$inout5,$inout5
+ vpshufb $Ii,$T1,$T1 # next counter value
+ jmp .Lresume_ctr32
+
+.align 32
+.Lenc_tail:
+ vaesenc $rndkey,$inout0,$inout0
+ vmovdqu $Z3,16+8(%rsp) # postpone vpxor $Z3,$Xi,$Xi
+ vpalignr \$8,$Z0,$Z0,$Xi # 2nd phase
+ vaesenc $rndkey,$inout1,$inout1
+ vpclmulqdq \$0x10,$Hkey,$Z0,$Z0
+ vpxor 0x00($inp),$T1,$T2
+ vaesenc $rndkey,$inout2,$inout2
+ vpxor 0x10($inp),$T1,$Ii
+ vaesenc $rndkey,$inout3,$inout3
+ vpxor 0x20($inp),$T1,$Z1
+ vaesenc $rndkey,$inout4,$inout4
+ vpxor 0x30($inp),$T1,$Z2
+ vaesenc $rndkey,$inout5,$inout5
+ vpxor 0x40($inp),$T1,$Z3
+ vpxor 0x50($inp),$T1,$Hkey
+ vmovdqu ($ivp),$T1 # load next counter value
+
+ vaesenclast $T2,$inout0,$inout0
+ vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
+ vaesenclast $Ii,$inout1,$inout1
+ vpaddb $T2,$T1,$Ii
+ mov %r13,0x70+8(%rsp)
+ lea 0x60($inp),$inp
+ vaesenclast $Z1,$inout2,$inout2
+ vpaddb $T2,$Ii,$Z1
+ mov %r12,0x78+8(%rsp)
+ lea 0x60($out),$out
+ vmovdqu 0x00-0x80($key),$rndkey
+ vaesenclast $Z2,$inout3,$inout3
+ vpaddb $T2,$Z1,$Z2
+ vaesenclast $Z3, $inout4,$inout4
+ vpaddb $T2,$Z2,$Z3
+ vaesenclast $Hkey,$inout5,$inout5
+ vpaddb $T2,$Z3,$Hkey
+
+ add \$0x60,$ret
+ sub \$0x6,$len
+ jc .L6x_done
+
+ vmovups $inout0,-0x60($out) # save output
+ vpxor $rndkey,$T1,$inout0
+ vmovups $inout1,-0x50($out)
+ vmovdqa $Ii,$inout1 # 0 latency
+ vmovups $inout2,-0x40($out)
+ vmovdqa $Z1,$inout2 # 0 latency
+ vmovups $inout3,-0x30($out)
+ vmovdqa $Z2,$inout3 # 0 latency
+ vmovups $inout4,-0x20($out)
+ vmovdqa $Z3,$inout4 # 0 latency
+ vmovups $inout5,-0x10($out)
+ vmovdqa $Hkey,$inout5 # 0 latency
+ vmovdqu 0x20+8(%rsp),$Z3 # I[5]
+ jmp .Loop6x
+
+.L6x_done:
+ vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled
+ vpxor $Z0,$Xi,$Xi # modulo-scheduled
+
+ ret
+.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
+___
+######################################################################
+#
+# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
+# const AES_KEY *key, unsigned char iv[16],
+# struct { u128 Xi,H,Htbl[9]; } *Xip);
+$code.=<<___;
+.globl aesni_gcm_decrypt
+.type aesni_gcm_decrypt,\@function,6
+.align 32
+aesni_gcm_decrypt:
+ xor $ret,$ret
+ cmp \$0x60,$len # minimal accepted length
+ jb .Lgcm_dec_abort
+
+ lea (%rsp),%rax # save stack pointer
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,-0xd8(%rax)
+ movaps %xmm7,-0xc8(%rax)
+ movaps %xmm8,-0xb8(%rax)
+ movaps %xmm9,-0xa8(%rax)
+ movaps %xmm10,-0x98(%rax)
+ movaps %xmm11,-0x88(%rax)
+ movaps %xmm12,-0x78(%rax)
+ movaps %xmm13,-0x68(%rax)
+ movaps %xmm14,-0x58(%rax)
+ movaps %xmm15,-0x48(%rax)
+.Lgcm_dec_body:
+___
+$code.=<<___;
+ vzeroupper
+
+ vmovdqu ($ivp),$T1 # input counter value
+ add \$-128,%rsp
+ mov 12($ivp),$counter
+ lea .Lbswap_mask(%rip),$const
+ lea -0x80($key),$in0 # borrow $in0
+ mov \$0xf80,$end0 # borrow $end0
+ vmovdqu ($Xip),$Xi # load Xi
+ and \$-128,%rsp # ensure stack alignment
+ vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
+ lea 0x80($key),$key # size optimization
+ lea 0x20+0x20($Xip),$Xip # size optimization
+ mov 0xf0-0x80($key),$rounds
+ vpshufb $Ii,$Xi,$Xi
+
+ and $end0,$in0
+ and %rsp,$end0
+ sub $in0,$end0
+ jc .Ldec_no_key_aliasing
+ cmp \$768,$end0
+ jnc .Ldec_no_key_aliasing
+ sub $end0,%rsp # avoid aliasing with key
+.Ldec_no_key_aliasing:
+
+ vmovdqu 0x50($inp),$Z3 # I[5]
+ lea ($inp),$in0
+ vmovdqu 0x40($inp),$Z0
+ lea -0xc0($inp,$len),$end0
+ vmovdqu 0x30($inp),$Z1
+ shr \$4,$len
+ xor $ret,$ret
+ vmovdqu 0x20($inp),$Z2
+ vpshufb $Ii,$Z3,$Z3 # passed to _aesni_ctr32_ghash_6x
+ vmovdqu 0x10($inp),$T2
+ vpshufb $Ii,$Z0,$Z0
+ vmovdqu ($inp),$Hkey
+ vpshufb $Ii,$Z1,$Z1
+ vmovdqu $Z0,0x30(%rsp)
+ vpshufb $Ii,$Z2,$Z2
+ vmovdqu $Z1,0x40(%rsp)
+ vpshufb $Ii,$T2,$T2
+ vmovdqu $Z2,0x50(%rsp)
+ vpshufb $Ii,$Hkey,$Hkey
+ vmovdqu $T2,0x60(%rsp)
+ vmovdqu $Hkey,0x70(%rsp)
+
+ call _aesni_ctr32_ghash_6x
+
+ vmovups $inout0,-0x60($out) # save output
+ vmovups $inout1,-0x50($out)
+ vmovups $inout2,-0x40($out)
+ vmovups $inout3,-0x30($out)
+ vmovups $inout4,-0x20($out)
+ vmovups $inout5,-0x10($out)
+
+ vpshufb ($const),$Xi,$Xi # .Lbswap_mask
+ vmovdqu $Xi,-0x40($Xip) # output Xi
+
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xd8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ movaps -0x68(%rax),%xmm13
+ movaps -0x58(%rax),%xmm14
+ movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp # restore %rsp
+.Lgcm_dec_abort:
+ mov $ret,%rax # return value
+ ret
+.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
+___
+
+$code.=<<___;
+.type _aesni_ctr32_6x,\@abi-omnipotent
+.align 32
+_aesni_ctr32_6x:
+ vmovdqu 0x00-0x80($key),$Z0 # borrow $Z0 for $rndkey
+ vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
+ lea -1($rounds),%r13
+ vmovups 0x10-0x80($key),$rndkey
+ lea 0x20-0x80($key),%r12
+ vpxor $Z0,$T1,$inout0
+ add \$`6<<24`,$counter
+ jc .Lhandle_ctr32_2
+ vpaddb $T2,$T1,$inout1
+ vpaddb $T2,$inout1,$inout2
+ vpxor $Z0,$inout1,$inout1
+ vpaddb $T2,$inout2,$inout3
+ vpxor $Z0,$inout2,$inout2
+ vpaddb $T2,$inout3,$inout4
+ vpxor $Z0,$inout3,$inout3
+ vpaddb $T2,$inout4,$inout5
+ vpxor $Z0,$inout4,$inout4
+ vpaddb $T2,$inout5,$T1
+ vpxor $Z0,$inout5,$inout5
+ jmp .Loop_ctr32
+
+.align 16
+.Loop_ctr32:
+ vaesenc $rndkey,$inout0,$inout0
+ vaesenc $rndkey,$inout1,$inout1
+ vaesenc $rndkey,$inout2,$inout2
+ vaesenc $rndkey,$inout3,$inout3
+ vaesenc $rndkey,$inout4,$inout4
+ vaesenc $rndkey,$inout5,$inout5
+ vmovups (%r12),$rndkey
+ lea 0x10(%r12),%r12
+ dec %r13d
+ jnz .Loop_ctr32
+
+ vmovdqu (%r12),$Hkey # last round key
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor 0x00($inp),$Hkey,$Z0
+ vaesenc $rndkey,$inout1,$inout1
+ vpxor 0x10($inp),$Hkey,$Z1
+ vaesenc $rndkey,$inout2,$inout2
+ vpxor 0x20($inp),$Hkey,$Z2
+ vaesenc $rndkey,$inout3,$inout3
+ vpxor 0x30($inp),$Hkey,$Xi
+ vaesenc $rndkey,$inout4,$inout4
+ vpxor 0x40($inp),$Hkey,$T2
+ vaesenc $rndkey,$inout5,$inout5
+ vpxor 0x50($inp),$Hkey,$Hkey
+ lea 0x60($inp),$inp
+
+ vaesenclast $Z0,$inout0,$inout0
+ vaesenclast $Z1,$inout1,$inout1
+ vaesenclast $Z2,$inout2,$inout2
+ vaesenclast $Xi,$inout3,$inout3
+ vaesenclast $T2,$inout4,$inout4
+ vaesenclast $Hkey,$inout5,$inout5
+ vmovups $inout0,0x00($out)
+ vmovups $inout1,0x10($out)
+ vmovups $inout2,0x20($out)
+ vmovups $inout3,0x30($out)
+ vmovups $inout4,0x40($out)
+ vmovups $inout5,0x50($out)
+ lea 0x60($out),$out
+
+ ret
+.align 32
+.Lhandle_ctr32_2:
+ vpshufb $Ii,$T1,$Z2 # byte-swap counter
+ vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb
+ vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb
+ vpaddd $Z1,$Z2,$inout2
+ vpaddd $Z1,$inout1,$inout3
+ vpshufb $Ii,$inout1,$inout1
+ vpaddd $Z1,$inout2,$inout4
+ vpshufb $Ii,$inout2,$inout2
+ vpxor $Z0,$inout1,$inout1
+ vpaddd $Z1,$inout3,$inout5
+ vpshufb $Ii,$inout3,$inout3
+ vpxor $Z0,$inout2,$inout2
+ vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value
+ vpshufb $Ii,$inout4,$inout4
+ vpxor $Z0,$inout3,$inout3
+ vpshufb $Ii,$inout5,$inout5
+ vpxor $Z0,$inout4,$inout4
+ vpshufb $Ii,$T1,$T1 # next counter value
+ vpxor $Z0,$inout5,$inout5
+ jmp .Loop_ctr32
+.size _aesni_ctr32_6x,.-_aesni_ctr32_6x
+
+.globl aesni_gcm_encrypt
+.type aesni_gcm_encrypt,\@function,6
+.align 32
+aesni_gcm_encrypt:
+ xor $ret,$ret
+ cmp \$0x60*3,$len # minimal accepted length
+ jb .Lgcm_enc_abort
+
+ lea (%rsp),%rax # save stack pointer
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,-0xd8(%rax)
+ movaps %xmm7,-0xc8(%rax)
+ movaps %xmm8,-0xb8(%rax)
+ movaps %xmm9,-0xa8(%rax)
+ movaps %xmm10,-0x98(%rax)
+ movaps %xmm11,-0x88(%rax)
+ movaps %xmm12,-0x78(%rax)
+ movaps %xmm13,-0x68(%rax)
+ movaps %xmm14,-0x58(%rax)
+ movaps %xmm15,-0x48(%rax)
+.Lgcm_enc_body:
+___
+$code.=<<___;
+ vzeroupper
+
+ vmovdqu ($ivp),$T1 # input counter value
+ add \$-128,%rsp
+ mov 12($ivp),$counter
+ lea .Lbswap_mask(%rip),$const
+ lea -0x80($key),$in0 # borrow $in0
+ mov \$0xf80,$end0 # borrow $end0
+ lea 0x80($key),$key # size optimization
+ vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
+ and \$-128,%rsp # ensure stack alignment
+ mov 0xf0-0x80($key),$rounds
+
+ and $end0,$in0
+ and %rsp,$end0
+ sub $in0,$end0
+ jc .Lenc_no_key_aliasing
+ cmp \$768,$end0
+ jnc .Lenc_no_key_aliasing
+ sub $end0,%rsp # avoid aliasing with key
+.Lenc_no_key_aliasing:
+
+ lea ($out),$in0
+ lea -0xc0($out,$len),$end0
+ shr \$4,$len
+
+ call _aesni_ctr32_6x
+ vpshufb $Ii,$inout0,$Xi # save bswapped output on stack
+ vpshufb $Ii,$inout1,$T2
+ vmovdqu $Xi,0x70(%rsp)
+ vpshufb $Ii,$inout2,$Z0
+ vmovdqu $T2,0x60(%rsp)
+ vpshufb $Ii,$inout3,$Z1
+ vmovdqu $Z0,0x50(%rsp)
+ vpshufb $Ii,$inout4,$Z2
+ vmovdqu $Z1,0x40(%rsp)
+ vpshufb $Ii,$inout5,$Z3 # passed to _aesni_ctr32_ghash_6x
+ vmovdqu $Z2,0x30(%rsp)
+
+ call _aesni_ctr32_6x
+
+ vmovdqu ($Xip),$Xi # load Xi
+ lea 0x20+0x20($Xip),$Xip # size optimization
+ sub \$12,$len
+ mov \$0x60*2,$ret
+ vpshufb $Ii,$Xi,$Xi
+
+ call _aesni_ctr32_ghash_6x
+ vmovdqu 0x20(%rsp),$Z3 # I[5]
+ vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
+ vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
+ vpunpckhqdq $Z3,$Z3,$T1
+ vmovdqu 0x20-0x20($Xip),$rndkey # borrow $rndkey for $HK
+ vmovups $inout0,-0x60($out) # save output
+ vpshufb $Ii,$inout0,$inout0 # but keep bswapped copy
+ vpxor $Z3,$T1,$T1
+ vmovups $inout1,-0x50($out)
+ vpshufb $Ii,$inout1,$inout1
+ vmovups $inout2,-0x40($out)
+ vpshufb $Ii,$inout2,$inout2
+ vmovups $inout3,-0x30($out)
+ vpshufb $Ii,$inout3,$inout3
+ vmovups $inout4,-0x20($out)
+ vpshufb $Ii,$inout4,$inout4
+ vmovups $inout5,-0x10($out)
+ vpshufb $Ii,$inout5,$inout5
+ vmovdqu $inout0,0x10(%rsp) # free $inout0
+___
+{ my ($HK,$T3)=($rndkey,$inout0);
+
+$code.=<<___;
+ vmovdqu 0x30(%rsp),$Z2 # I[4]
+ vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2
+ vpunpckhqdq $Z2,$Z2,$T2
+ vpclmulqdq \$0x00,$Hkey,$Z3,$Z1
+ vpxor $Z2,$T2,$T2
+ vpclmulqdq \$0x11,$Hkey,$Z3,$Z3
+ vpclmulqdq \$0x00,$HK,$T1,$T1
+
+ vmovdqu 0x40(%rsp),$T3 # I[3]
+ vpclmulqdq \$0x00,$Ii,$Z2,$Z0
+ vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3
+ vpxor $Z1,$Z0,$Z0
+ vpunpckhqdq $T3,$T3,$Z1
+ vpclmulqdq \$0x11,$Ii,$Z2,$Z2
+ vpxor $T3,$Z1,$Z1
+ vpxor $Z3,$Z2,$Z2
+ vpclmulqdq \$0x10,$HK,$T2,$T2
+ vmovdqu 0x50-0x20($Xip),$HK
+ vpxor $T1,$T2,$T2
+
+ vmovdqu 0x50(%rsp),$T1 # I[2]
+ vpclmulqdq \$0x00,$Hkey,$T3,$Z3
+ vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4
+ vpxor $Z0,$Z3,$Z3
+ vpunpckhqdq $T1,$T1,$Z0
+ vpclmulqdq \$0x11,$Hkey,$T3,$T3
+ vpxor $T1,$Z0,$Z0
+ vpxor $Z2,$T3,$T3
+ vpclmulqdq \$0x00,$HK,$Z1,$Z1
+ vpxor $T2,$Z1,$Z1
+
+ vmovdqu 0x60(%rsp),$T2 # I[1]
+ vpclmulqdq \$0x00,$Ii,$T1,$Z2
+ vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5
+ vpxor $Z3,$Z2,$Z2
+ vpunpckhqdq $T2,$T2,$Z3
+ vpclmulqdq \$0x11,$Ii,$T1,$T1
+ vpxor $T2,$Z3,$Z3
+ vpxor $T3,$T1,$T1
+ vpclmulqdq \$0x10,$HK,$Z0,$Z0
+ vmovdqu 0x80-0x20($Xip),$HK
+ vpxor $Z1,$Z0,$Z0
+
+ vpxor 0x70(%rsp),$Xi,$Xi # accumulate I[0]
+ vpclmulqdq \$0x00,$Hkey,$T2,$Z1
+ vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6
+ vpunpckhqdq $Xi,$Xi,$T3
+ vpxor $Z2,$Z1,$Z1
+ vpclmulqdq \$0x11,$Hkey,$T2,$T2
+ vpxor $Xi,$T3,$T3
+ vpxor $T1,$T2,$T2
+ vpclmulqdq \$0x00,$HK,$Z3,$Z3
+ vpxor $Z0,$Z3,$Z0
+
+ vpclmulqdq \$0x00,$Ii,$Xi,$Z2
+ vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
+ vpunpckhqdq $inout5,$inout5,$T1
+ vpclmulqdq \$0x11,$Ii,$Xi,$Xi
+ vpxor $inout5,$T1,$T1
+ vpxor $Z1,$Z2,$Z1
+ vpclmulqdq \$0x10,$HK,$T3,$T3
+ vmovdqu 0x20-0x20($Xip),$HK
+ vpxor $T2,$Xi,$Z3
+ vpxor $Z0,$T3,$Z2
+
+ vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2
+ vpxor $Z1,$Z3,$T3 # aggregated Karatsuba post-processing
+ vpclmulqdq \$0x00,$Hkey,$inout5,$Z0
+ vpxor $T3,$Z2,$Z2
+ vpunpckhqdq $inout4,$inout4,$T2
+ vpclmulqdq \$0x11,$Hkey,$inout5,$inout5
+ vpxor $inout4,$T2,$T2
+ vpslldq \$8,$Z2,$T3
+ vpclmulqdq \$0x00,$HK,$T1,$T1
+ vpxor $T3,$Z1,$Xi
+ vpsrldq \$8,$Z2,$Z2
+ vpxor $Z2,$Z3,$Z3
+
+ vpclmulqdq \$0x00,$Ii,$inout4,$Z1
+ vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3
+ vpxor $Z0,$Z1,$Z1
+ vpunpckhqdq $inout3,$inout3,$T3
+ vpclmulqdq \$0x11,$Ii,$inout4,$inout4
+ vpxor $inout3,$T3,$T3
+ vpxor $inout5,$inout4,$inout4
+ vpalignr \$8,$Xi,$Xi,$inout5 # 1st phase
+ vpclmulqdq \$0x10,$HK,$T2,$T2
+ vmovdqu 0x50-0x20($Xip),$HK
+ vpxor $T1,$T2,$T2
+
+ vpclmulqdq \$0x00,$Hkey,$inout3,$Z0
+ vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4
+ vpxor $Z1,$Z0,$Z0
+ vpunpckhqdq $inout2,$inout2,$T1
+ vpclmulqdq \$0x11,$Hkey,$inout3,$inout3
+ vpxor $inout2,$T1,$T1
+ vpxor $inout4,$inout3,$inout3
+ vxorps 0x10(%rsp),$Z3,$Z3 # accumulate $inout0
+ vpclmulqdq \$0x00,$HK,$T3,$T3
+ vpxor $T2,$T3,$T3
+
+ vpclmulqdq \$0x10,0x10($const),$Xi,$Xi
+ vxorps $inout5,$Xi,$Xi
+
+ vpclmulqdq \$0x00,$Ii,$inout2,$Z1
+ vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5
+ vpxor $Z0,$Z1,$Z1
+ vpunpckhqdq $inout1,$inout1,$T2
+ vpclmulqdq \$0x11,$Ii,$inout2,$inout2
+ vpxor $inout1,$T2,$T2
+ vpalignr \$8,$Xi,$Xi,$inout5 # 2nd phase
+ vpxor $inout3,$inout2,$inout2
+ vpclmulqdq \$0x10,$HK,$T1,$T1
+ vmovdqu 0x80-0x20($Xip),$HK
+ vpxor $T3,$T1,$T1
+
+ vxorps $Z3,$inout5,$inout5
+ vpclmulqdq \$0x10,0x10($const),$Xi,$Xi
+ vxorps $inout5,$Xi,$Xi
+
+ vpclmulqdq \$0x00,$Hkey,$inout1,$Z0
+ vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6
+ vpxor $Z1,$Z0,$Z0
+ vpunpckhqdq $Xi,$Xi,$T3
+ vpclmulqdq \$0x11,$Hkey,$inout1,$inout1
+ vpxor $Xi,$T3,$T3
+ vpxor $inout2,$inout1,$inout1
+ vpclmulqdq \$0x00,$HK,$T2,$T2
+ vpxor $T1,$T2,$T2
+
+ vpclmulqdq \$0x00,$Ii,$Xi,$Z1
+ vpclmulqdq \$0x11,$Ii,$Xi,$Z3
+ vpxor $Z0,$Z1,$Z1
+ vpclmulqdq \$0x10,$HK,$T3,$Z2
+ vpxor $inout1,$Z3,$Z3
+ vpxor $T2,$Z2,$Z2
+
+ vpxor $Z1,$Z3,$Z0 # aggregated Karatsuba post-processing
+ vpxor $Z0,$Z2,$Z2
+ vpslldq \$8,$Z2,$T1
+ vmovdqu 0x10($const),$Hkey # .Lpoly
+ vpsrldq \$8,$Z2,$Z2
+ vpxor $T1,$Z1,$Xi
+ vpxor $Z2,$Z3,$Z3
+
+ vpalignr \$8,$Xi,$Xi,$T2 # 1st phase
+ vpclmulqdq \$0x10,$Hkey,$Xi,$Xi
+ vpxor $T2,$Xi,$Xi
+
+ vpalignr \$8,$Xi,$Xi,$T2 # 2nd phase
+ vpclmulqdq \$0x10,$Hkey,$Xi,$Xi
+ vpxor $Z3,$T2,$T2
+ vpxor $T2,$Xi,$Xi
+___
+}
+$code.=<<___;
+ vpshufb ($const),$Xi,$Xi # .Lbswap_mask
+ vmovdqu $Xi,-0x40($Xip) # output Xi
+
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ movaps -0x68(%rax),%xmm13
+ movaps -0x58(%rax),%xmm14
+ movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp # restore %rsp
+.Lgcm_enc_abort:
+ mov $ret,%rax # return value
+ ret
+.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
+___
+
+$code.=<<___;
+.align 64
+.Lbswap_mask:
+ .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lpoly:
+ .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.Lone_msb:
+ .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Ltwo_lsb:
+ .byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.Lone_lsb:
+ .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.asciz "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___
+.extern __imp_RtlVirtualUnwind
+.type gcm_se_handler,\@abi-omnipotent
+.align 16
+gcm_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 120($context),%rax # pull context->Rax
+
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ mov %r15,240($context)
+ mov %r14,232($context)
+ mov %r13,224($context)
+ mov %r12,216($context)
+ mov %rbp,160($context)
+ mov %rbx,144($context)
+
+ lea -0xd8(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # & context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size gcm_se_handler,.-gcm_se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_aesni_gcm_decrypt
+ .rva .LSEH_end_aesni_gcm_decrypt
+ .rva .LSEH_gcm_dec_info
+
+ .rva .LSEH_begin_aesni_gcm_encrypt
+ .rva .LSEH_end_aesni_gcm_encrypt
+ .rva .LSEH_gcm_enc_info
+.section .xdata
+.align 8
+.LSEH_gcm_dec_info:
+ .byte 9,0,0,0
+ .rva gcm_se_handler
+ .rva .Lgcm_dec_body,.Lgcm_dec_abort
+.LSEH_gcm_enc_info:
+ .byte 9,0,0,0
+ .rva gcm_se_handler
+ .rva .Lgcm_enc_body,.Lgcm_enc_abort
+___
+}
+}}} else {{{
+$code=<<___; # assembler is too old
+.text
+
+.globl aesni_gcm_encrypt
+.type aesni_gcm_encrypt,\@abi-omnipotent
+aesni_gcm_encrypt:
+ xor %eax,%eax
+ ret
+.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
+
+.globl aesni_gcm_decrypt
+.type aesni_gcm_decrypt,\@abi-omnipotent
+aesni_gcm_decrypt:
+ xor %eax,%eax
+ ret
+.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
+___
+}}}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/openssl/crypto/modes/asm/ghash-armv4.pl b/openssl/crypto/modes/asm/ghash-armv4.pl
index d91586ee2..77fbf3446 100644
--- a/openssl/crypto/modes/asm/ghash-armv4.pl
+++ b/openssl/crypto/modes/asm/ghash-armv4.pl
@@ -35,6 +35,20 @@
# Add NEON implementation featuring polynomial multiplication, i.e. no
# lookup tables involved. On Cortex A8 it was measured to process one
# byte in 15 cycles or 55% faster than integer-only code.
+#
+# April 2014
+#
+# Switch to multiplication algorithm suggested in paper referred
+# below and combine it with reduction algorithm from x86 module.
+# Performance improvement over previous version varies from 65% on
+# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
+# processes one byte in 8.45 cycles, A9 - in 10.2, Snapdragon S4 -
+# in 9.33.
+#
+# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
+# Polynomial Multiplication on ARM Processors using the NEON Engine.
+#
+# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
# ====================================================================
# Note about "528B" variant. In ARM case it makes lesser sense to
@@ -303,117 +317,161 @@ $code.=<<___;
.size gcm_gmult_4bit,.-gcm_gmult_4bit
___
{
-my $cnt=$Htbl; # $Htbl is used once in the very beginning
-
-my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
-my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
-
-# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
-# in Zo. Or should I say "top bit", because GHASH is specified in
-# reverse bit order? Otherwise straightforward 128-bt H by one input
-# byte multiplication and modulo-reduction, times 16.
+my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
+my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
+my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));
-sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
-sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
-sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
+sub clmul64x64 {
+my ($r,$a,$b)=@_;
+$code.=<<___;
+ vext.8 $t0#lo, $a, $a, #1 @ A1
+ vmull.p8 $t0, $t0#lo, $b @ F = A1*B
+ vext.8 $r#lo, $b, $b, #1 @ B1
+ vmull.p8 $r, $a, $r#lo @ E = A*B1
+ vext.8 $t1#lo, $a, $a, #2 @ A2
+ vmull.p8 $t1, $t1#lo, $b @ H = A2*B
+ vext.8 $t3#lo, $b, $b, #2 @ B2
+ vmull.p8 $t3, $a, $t3#lo @ G = A*B2
+ vext.8 $t2#lo, $a, $a, #3 @ A3
+ veor $t0, $t0, $r @ L = E + F
+ vmull.p8 $t2, $t2#lo, $b @ J = A3*B
+ vext.8 $r#lo, $b, $b, #3 @ B3
+ veor $t1, $t1, $t3 @ M = G + H
+ vmull.p8 $r, $a, $r#lo @ I = A*B3
+ veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
+ vand $t0#hi, $t0#hi, $k48
+ vext.8 $t3#lo, $b, $b, #4 @ B4
+ veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
+ vand $t1#hi, $t1#hi, $k32
+ vmull.p8 $t3, $a, $t3#lo @ K = A*B4
+ veor $t2, $t2, $r @ N = I + J
+ veor $t0#lo, $t0#lo, $t0#hi
+ veor $t1#lo, $t1#lo, $t1#hi
+ veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
+ vand $t2#hi, $t2#hi, $k16
+ vext.8 $t0, $t0, $t0, #15
+ veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
+ vmov.i64 $t3#hi, #0
+ vext.8 $t1, $t1, $t1, #14
+ veor $t2#lo, $t2#lo, $t2#hi
+ vmull.p8 $r, $a, $b @ D = A*B
+ vext.8 $t3, $t3, $t3, #12
+ vext.8 $t2, $t2, $t2, #13
+ veor $t0, $t0, $t1
+ veor $t2, $t2, $t3
+ veor $r, $r, $t0
+ veor $r, $r, $t2
+___
+}
$code.=<<___;
-#if __ARM_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
.fpu neon
+.global gcm_init_neon
+.type gcm_init_neon,%function
+.align 4
+gcm_init_neon:
+ vld1.64 $IN#hi,[r1,:64]! @ load H
+ vmov.i8 $t0,#0xe1
+ vld1.64 $IN#lo,[r1,:64]
+ vshl.i64 $t0#hi,#57
+ vshr.u64 $t0#lo,#63 @ t0=0xc2....01
+ vdup.8 $t1,$IN#hi[7]
+ vshr.u64 $Hlo,$IN#lo,#63
+ vshr.s8 $t1,#7 @ broadcast carry bit
+ vshl.i64 $IN,$IN,#1
+ vand $t0,$t0,$t1
+ vorr $IN#hi,$Hlo @ H<<<=1
+ veor $IN,$IN,$t0 @ twisted H
+ vstmia r0,{$IN}
+
+ ret @ bx lr
+.size gcm_init_neon,.-gcm_init_neon
+
.global gcm_gmult_neon
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
- sub $Htbl,#16 @ point at H in GCM128_CTX
- vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi
- vmov.i32 $mod,#0xe1 @ our irreducible polynomial
- vld1.64 `&Dlo("$IN")`,[$Xi,:64]!
- vshr.u64 $mod,#32
- vldmia $Htbl,{$Hhi-$Hlo} @ load H
- veor $zero,$zero
+ vld1.64 $IN#hi,[$Xi,:64]! @ load Xi
+ vld1.64 $IN#lo,[$Xi,:64]!
+ vmov.i64 $k48,#0x0000ffffffffffff
+ vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
+ vmov.i64 $k32,#0x00000000ffffffff
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
- veor $Qpost,$Qpost
- veor $R,$R
- mov $cnt,#16
- veor $Z,$Z
+ vmov.i64 $k16,#0x000000000000ffff
+ veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
mov $len,#16
- veor $Zo,$Zo
- vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
- b .Linner_neon
+ b .Lgmult_neon
.size gcm_gmult_neon,.-gcm_gmult_neon
.global gcm_ghash_neon
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
- vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi
- vmov.i32 $mod,#0xe1 @ our irreducible polynomial
- vld1.64 `&Dlo("$Z")`,[$Xi,:64]!
- vshr.u64 $mod,#32
- vldmia $Xi,{$Hhi-$Hlo} @ load H
- veor $zero,$zero
- nop
+ vld1.64 $Xl#hi,[$Xi,:64]! @ load Xi
+ vld1.64 $Xl#lo,[$Xi,:64]!
+ vmov.i64 $k48,#0x0000ffffffffffff
+ vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
+ vmov.i64 $k32,#0x00000000ffffffff
#ifdef __ARMEL__
- vrev64.8 $Z,$Z
+ vrev64.8 $Xl,$Xl
#endif
-.Louter_neon:
- vld1.64 `&Dhi($IN)`,[$inp]! @ load inp
- veor $Qpost,$Qpost
- vld1.64 `&Dlo($IN)`,[$inp]!
- veor $R,$R
- mov $cnt,#16
+ vmov.i64 $k16,#0x000000000000ffff
+ veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
+
+.Loop_neon:
+ vld1.64 $IN#hi,[$inp]! @ load inp
+ vld1.64 $IN#lo,[$inp]!
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
- veor $Zo,$Zo
- veor $IN,$Z @ inp^=Xi
- veor $Z,$Z
- vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
-.Linner_neon:
- subs $cnt,$cnt,#1
- vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i]
- vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i]
- vext.8 $IN,$zero,#1 @ IN>>=8
-
- veor $Z,$Qpost @ modulo-scheduled part
- vshl.i64 `&Dlo("$R")`,#48
- vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
- veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
-
- veor `&Dhi("$Z")`,`&Dlo("$R")`
- vuzp.8 $Qlo,$Qhi
- vsli.8 $Zo,$T,#1 @ compose the "carry" byte
- vext.8 $Z,$zero,#1 @ Z>>=8
-
- vmull.p8 $R,$Zo,$mod @ "carry"·0xe1
- vshr.u8 $Zo,$T,#7 @ save Z's bottom bit
- vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8
- veor $Z,$Qhi
- bne .Linner_neon
-
- veor $Z,$Qpost @ modulo-scheduled artefact
- vshl.i64 `&Dlo("$R")`,#48
- veor `&Dhi("$Z")`,`&Dlo("$R")`
-
- @ finalization, normalize Z:Zo
- vand $Zo,$mod @ suffices to mask the bit
- vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
- vshl.i64 $Z,#1
+ veor $IN,$Xl @ inp^=Xi
+.Lgmult_neon:
+___
+ &clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo
+$code.=<<___;
+ veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing
+___
+ &clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
+ &clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi
+$code.=<<___;
+ veor $Xm,$Xm,$Xl @ Karatsuba post-processing
+ veor $Xm,$Xm,$Xh
+ veor $Xl#hi,$Xl#hi,$Xm#lo
+ veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result
+
+ @ equivalent of reduction_avx from ghash-x86_64.pl
+ vshl.i64 $t1,$Xl,#57 @ 1st phase
+ vshl.i64 $t2,$Xl,#62
+ veor $t2,$t2,$t1 @
+ vshl.i64 $t1,$Xl,#63
+ veor $t2, $t2, $t1 @
+ veor $Xl#hi,$Xl#hi,$t2#lo @
+ veor $Xh#lo,$Xh#lo,$t2#hi
+
+ vshr.u64 $t2,$Xl,#1 @ 2nd phase
+ veor $Xh,$Xh,$Xl
+ veor $Xl,$Xl,$t2 @
+ vshr.u64 $t2,$t2,#6
+ vshr.u64 $Xl,$Xl,#1 @
+ veor $Xl,$Xl,$Xh @
+ veor $Xl,$Xl,$t2 @
+
subs $len,#16
- vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1
- bne .Louter_neon
+ bne .Loop_neon
#ifdef __ARMEL__
- vrev64.8 $Z,$Z
+ vrev64.8 $Xl,$Xl
#endif
sub $Xi,#16
- vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi
- vst1.64 `&Dlo("$Z")`,[$Xi,:64]
+ vst1.64 $Xl#hi,[$Xi,:64]! @ write out Xi
+ vst1.64 $Xl#lo,[$Xi,:64]
- bx lr
+ ret @ bx lr
.size gcm_ghash_neon,.-gcm_ghash_neon
#endif
___
@@ -423,7 +481,13 @@ $code.=<<___;
.align 2
___
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
-print $code;
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
+ s/\bret\b/bx lr/go or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
+
+ print $_,"\n";
+}
close STDOUT; # enforce flush
diff --git a/openssl/crypto/modes/asm/ghash-s390x.pl b/openssl/crypto/modes/asm/ghash-s390x.pl
index 6a40d5d89..39096b423 100644
--- a/openssl/crypto/modes/asm/ghash-s390x.pl
+++ b/openssl/crypto/modes/asm/ghash-s390x.pl
@@ -186,13 +186,13 @@ $code.=<<___;
sllg $rem1,$Zlo,3
xgr $Zlo,$tmp
ngr $rem1,$x78
+ sllg $tmp,$Zhi,60
j .Lghash_inner
.align 16
.Lghash_inner:
srlg $Zlo,$Zlo,4
- sllg $tmp,$Zhi,60
- xg $Zlo,8($nlo,$Htbl)
srlg $Zhi,$Zhi,4
+ xg $Zlo,8($nlo,$Htbl)
llgc $xi,0($cnt,$Xi)
xg $Zhi,0($nlo,$Htbl)
sllg $nlo,$xi,4
@@ -213,9 +213,9 @@ $code.=<<___;
sllg $rem1,$Zlo,3
xgr $Zlo,$tmp
ngr $rem1,$x78
+ sllg $tmp,$Zhi,60
brct $cnt,.Lghash_inner
- sllg $tmp,$Zhi,60
srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
xg $Zlo,8($nlo,$Htbl)
diff --git a/openssl/crypto/modes/asm/ghash-sparcv9.pl b/openssl/crypto/modes/asm/ghash-sparcv9.pl
index 70e7b044a..0365e0f1f 100644
--- a/openssl/crypto/modes/asm/ghash-sparcv9.pl
+++ b/openssl/crypto/modes/asm/ghash-sparcv9.pl
@@ -36,6 +36,15 @@
# references to input data and Z.hi updates to achieve 12 cycles
# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
+#
+# October 2012
+#
+# Add VIS3 lookup-table-free implementation using polynomial
+# multiplication xmulx[hi] and extended addition addxc[cc]
+# instructions. 4.52/7.63x improvement on T3/T4 or in absolute
+# terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark
+# saturates at ~15.5x single-process result on 8-core processor,
+# or ~20.5GBps per 2.85GHz socket.
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
@@ -66,6 +75,10 @@ $Htbl="%i1";
$inp="%i2";
$len="%i3";
+$code.=<<___ if ($bits==64);
+.register %g2,#scratch
+.register %g3,#scratch
+___
$code.=<<___;
.section ".text",#alloc,#execinstr
@@ -321,10 +334,238 @@ gcm_gmult_4bit:
restore
.type gcm_gmult_4bit,#function
.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
-.asciz "GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+{{{
+# Straightforward 128x128-bit multiplication using Karatsuba algorithm
+# followed by pair of 64-bit reductions [with a shortcut in first one,
+# which allowed to break dependency between reductions and remove one
+# multiplication from critical path]. While it might be suboptimal
+# with regard to sheer number of multiplications, other methods [such
+# as aggregate reduction] would require more 64-bit registers, which
+# we don't have in 32-bit application context.
+
+($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));
+
+($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
+ (map("%o$_",(0..5,7)),map("%g$_",(1..5)));
+
+($shl,$shr)=map("%l$_",(0..7));
+
+# For details regarding "twisted H" see ghash-x86.pl.
+$code.=<<___;
+.globl gcm_init_vis3
+.align 32
+gcm_init_vis3:
+ save %sp,-$frame,%sp
+
+ ldx [%i1+0],$Hhi
+ ldx [%i1+8],$Hlo
+ mov 0xE1,$Xhi
+ mov 1,$Xlo
+ sllx $Xhi,57,$Xhi
+ srax $Hhi,63,$C0 ! broadcast carry
+ addcc $Hlo,$Hlo,$Hlo ! H<<=1
+ addxc $Hhi,$Hhi,$Hhi
+ and $C0,$Xlo,$Xlo
+ and $C0,$Xhi,$Xhi
+ xor $Xlo,$Hlo,$Hlo
+ xor $Xhi,$Hhi,$Hhi
+ stx $Hlo,[%i0+8] ! save twisted H
+ stx $Hhi,[%i0+0]
+
+ sethi %hi(0xA0406080),$V
+ sethi %hi(0x20C0E000),%l0
+ or $V,%lo(0xA0406080),$V
+ or %l0,%lo(0x20C0E000),%l0
+ sllx $V,32,$V
+ or %l0,$V,$V ! (0xE0·i)&0xff=0xA040608020C0E000
+ stx $V,[%i0+16]
+
+ ret
+ restore
+.type gcm_init_vis3,#function
+.size gcm_init_vis3,.-gcm_init_vis3
+
+.globl gcm_gmult_vis3
+.align 32
+gcm_gmult_vis3:
+ save %sp,-$frame,%sp
+
+ ldx [$Xip+8],$Xlo ! load Xi
+ ldx [$Xip+0],$Xhi
+ ldx [$Htable+8],$Hlo ! load twisted H
+ ldx [$Htable+0],$Hhi
+
+ mov 0xE1,%l7
+ sllx %l7,57,$xE1 ! 57 is not a typo
+ ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000
+
+ xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing
+ xmulx $Xlo,$Hlo,$C0
+ xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing
+ xmulx $C2,$Hhl,$C1
+ xmulxhi $Xlo,$Hlo,$Xlo
+ xmulxhi $C2,$Hhl,$C2
+ xmulxhi $Xhi,$Hhi,$C3
+ xmulx $Xhi,$Hhi,$Xhi
+
+ sll $C0,3,$sqr
+ srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)]
+ xor $C0,$sqr,$sqr
+ sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
+
+ xor $C0,$C1,$C1 ! Karatsuba post-processing
+ xor $Xlo,$C2,$C2
+ xor $sqr,$Xlo,$Xlo ! real destination is $C1
+ xor $C3,$C2,$C2
+ xor $Xlo,$C1,$C1
+ xor $Xhi,$C2,$C2
+ xor $Xhi,$C1,$C1
+
+ xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56
+ xor $C0,$C2,$C2
+ xmulx $C1,$xE1,$C0
+ xor $C1,$C3,$C3
+ xmulxhi $C1,$xE1,$C1
+
+ xor $Xlo,$C2,$C2
+ xor $C0,$C2,$C2
+ xor $C1,$C3,$C3
+
+ stx $C2,[$Xip+8] ! save Xi
+ stx $C3,[$Xip+0]
+
+ ret
+ restore
+.type gcm_gmult_vis3,#function
+.size gcm_gmult_vis3,.-gcm_gmult_vis3
+
+.globl gcm_ghash_vis3
+.align 32
+gcm_ghash_vis3:
+ save %sp,-$frame,%sp
+
+ ldx [$Xip+8],$C2 ! load Xi
+ ldx [$Xip+0],$C3
+ ldx [$Htable+8],$Hlo ! load twisted H
+ ldx [$Htable+0],$Hhi
+
+ mov 0xE1,%l7
+ sllx %l7,57,$xE1 ! 57 is not a typo
+ ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000
+
+ and $inp,7,$shl
+ andn $inp,7,$inp
+ sll $shl,3,$shl
+ prefetch [$inp+63], 20
+ sub %g0,$shl,$shr
+
+ xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing
+.Loop:
+ ldx [$inp+8],$Xlo
+ brz,pt $shl,1f
+ ldx [$inp+0],$Xhi
+
+ ldx [$inp+16],$C1 ! align data
+ srlx $Xlo,$shr,$C0
+ sllx $Xlo,$shl,$Xlo
+ sllx $Xhi,$shl,$Xhi
+ srlx $C1,$shr,$C1
+ or $C0,$Xhi,$Xhi
+ or $C1,$Xlo,$Xlo
+1:
+ add $inp,16,$inp
+ sub $len,16,$len
+ xor $C2,$Xlo,$Xlo
+ xor $C3,$Xhi,$Xhi
+ prefetch [$inp+63], 20
+
+ xmulx $Xlo,$Hlo,$C0
+ xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing
+ xmulx $C2,$Hhl,$C1
+ xmulxhi $Xlo,$Hlo,$Xlo
+ xmulxhi $C2,$Hhl,$C2
+ xmulxhi $Xhi,$Hhi,$C3
+ xmulx $Xhi,$Hhi,$Xhi
+
+ sll $C0,3,$sqr
+ srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)]
+ xor $C0,$sqr,$sqr
+ sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
+
+ xor $C0,$C1,$C1 ! Karatsuba post-processing
+ xor $Xlo,$C2,$C2
+ xor $sqr,$Xlo,$Xlo ! real destination is $C1
+ xor $C3,$C2,$C2
+ xor $Xlo,$C1,$C1
+ xor $Xhi,$C2,$C2
+ xor $Xhi,$C1,$C1
+
+ xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56
+ xor $C0,$C2,$C2
+ xmulx $C1,$xE1,$C0
+ xor $C1,$C3,$C3
+ xmulxhi $C1,$xE1,$C1
+
+ xor $Xlo,$C2,$C2
+ xor $C0,$C2,$C2
+ brnz,pt $len,.Loop
+ xor $C1,$C3,$C3
+
+ stx $C2,[$Xip+8] ! save Xi
+ stx $C3,[$Xip+0]
+
+ ret
+ restore
+.type gcm_ghash_vis3,#function
+.size gcm_ghash_vis3,.-gcm_ghash_vis3
+___
+}}}
+$code.=<<___;
+.asciz "GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-print $code;
+
+# Purpose of these subroutines is to explicitly encode VIS instructions,
+# so that one can compile the module without having to specify VIS
+# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# Idea is to reserve for option to produce "universal" binary and let
+# programmer detect if current CPU is VIS capable at run-time.
+sub unvis3 {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
+my ($ref,$opf);
+my %visopf = ( "addxc" => 0x011,
+ "addxccc" => 0x013,
+ "xmulx" => 0x115,
+ "xmulxhi" => 0x116 );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+ if ($opf=$visopf{$mnemonic}) {
+ foreach ($rs1,$rs2,$rd) {
+ return $ref if (!/%([goli])([0-9])/);
+ $_=$bias{$1}+$2;
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+ s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
+ &unvis3($1,$2,$3,$4)
+ /ge;
+
+ print $_,"\n";
+}
+
close STDOUT;
diff --git a/openssl/crypto/modes/asm/ghash-x86.pl b/openssl/crypto/modes/asm/ghash-x86.pl
index 83c727e07..23a5527b3 100644
--- a/openssl/crypto/modes/asm/ghash-x86.pl
+++ b/openssl/crypto/modes/asm/ghash-x86.pl
@@ -12,25 +12,27 @@
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
-# code paths: vanilla x86 and vanilla MMX. Former will be executed on
-# 486 and Pentium, latter on all others. MMX GHASH features so called
+# code paths: vanilla x86 and vanilla SSE. Former will be executed on
+# 486 and Pentium, latter on all others. SSE GHASH features so called
# "528B" variant of "4-bit" method utilizing additional 256+16 bytes
# of per-key storage [+512 bytes shared table]. Performance results
# are for streamed GHASH subroutine and are expressed in cycles per
# processed byte, less is better:
#
-# gcc 2.95.3(*) MMX assembler x86 assembler
+# gcc 2.95.3(*) SSE assembler x86 assembler
#
# Pentium 105/111(**) - 50
# PIII 68 /75 12.2 24
# P4 125/125 17.8 84(***)
# Opteron 66 /70 10.1 30
# Core2 54 /67 8.4 18
+# Atom 105/105 16.8 53
+# VIA Nano 69 /71 13.0 27
#
# (*) gcc 3.4.x was observed to generate few percent slower code,
# which is one of reasons why 2.95.3 results were chosen,
# another reason is lack of 3.4.x results for older CPUs;
-# comparison with MMX results is not completely fair, because C
+# comparison with SSE results is not completely fair, because C
# results are for vanilla "256B" implementation, while
# assembler results are for "528B";-)
# (**) second number is result for code compiled with -fPIC flag,
@@ -40,8 +42,8 @@
#
# To summarize, it's >2-5 times faster than gcc-generated code. To
# anchor it to something else SHA1 assembler processes one byte in
-# 11-13 cycles on contemporary x86 cores. As for choice of MMX in
-# particular, see comment at the end of the file...
+# ~7 cycles on contemporary x86 cores. As for choice of MMX/SSE
+# in particular, see comment at the end of the file...
# May 2010
#
@@ -113,6 +115,16 @@
# similar manner resulted in almost 20% degradation on Sandy Bridge,
# where original 64-bit code processes one byte in 1.95 cycles.
+#####################################################################
+# For reference, AMD Bulldozer processes one byte in 1.98 cycles in
+# 32-bit mode and 1.89 in 64-bit.
+
+# February 2013
+#
+# Overhaul: aggregate Karatsuba post-processing, improve ILP in
+# reduction_alg9. Resulting performance is 1.96 cycles per byte on
+# Westmere, 1.95 - on Sandy/Ivy Bridge, 1.76 - on Bulldozer.
+
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
@@ -822,17 +834,18 @@ $len="ebx";
&static_label("bswap");
sub clmul64x64_T2 { # minimal "register" pressure
-my ($Xhi,$Xi,$Hkey)=@_;
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
&movdqa ($Xhi,$Xi); #
&pshufd ($T1,$Xi,0b01001110);
- &pshufd ($T2,$Hkey,0b01001110);
+ &pshufd ($T2,$Hkey,0b01001110) if (!defined($HK));
&pxor ($T1,$Xi); #
- &pxor ($T2,$Hkey);
+ &pxor ($T2,$Hkey) if (!defined($HK));
+ $HK=$T2 if (!defined($HK));
&pclmulqdq ($Xi,$Hkey,0x00); #######
&pclmulqdq ($Xhi,$Hkey,0x11); #######
- &pclmulqdq ($T1,$T2,0x00); #######
+ &pclmulqdq ($T1,$HK,0x00); #######
&xorps ($T1,$Xi); #
&xorps ($T1,$Xhi); #
@@ -879,31 +892,32 @@ if (1) { # Algorithm 9 with <<1 twist.
# below. Algorithm 9 was therefore chosen for
# further optimization...
-sub reduction_alg9 { # 17/13 times faster than Intel version
+sub reduction_alg9 { # 17/11 times faster than Intel version
my ($Xhi,$Xi) = @_;
# 1st phase
- &movdqa ($T1,$Xi); #
+ &movdqa ($T2,$Xi); #
+ &movdqa ($T1,$Xi);
+ &psllq ($Xi,5);
+ &pxor ($T1,$Xi); #
&psllq ($Xi,1);
&pxor ($Xi,$T1); #
- &psllq ($Xi,5); #
- &pxor ($Xi,$T1); #
&psllq ($Xi,57); #
- &movdqa ($T2,$Xi); #
+ &movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
- &psrldq ($T2,8); #
- &pxor ($Xi,$T1);
- &pxor ($Xhi,$T2); #
+ &psrldq ($T1,8); #
+ &pxor ($Xi,$T2);
+ &pxor ($Xhi,$T1); #
# 2nd phase
&movdqa ($T2,$Xi);
+ &psrlq ($Xi,1);
+ &pxor ($Xhi,$T2); #
+ &pxor ($T2,$Xi);
&psrlq ($Xi,5);
&pxor ($Xi,$T2); #
&psrlq ($Xi,1); #
- &pxor ($Xi,$T2); #
- &pxor ($T2,$Xhi);
- &psrlq ($Xi,1); #
- &pxor ($Xi,$T2); #
+ &pxor ($Xi,$Xhi) #
}
&function_begin_B("gcm_init_clmul");
@@ -937,8 +951,14 @@ my ($Xhi,$Xi) = @_;
&clmul64x64_T2 ($Xhi,$Xi,$Hkey);
&reduction_alg9 ($Xhi,$Xi);
+ &pshufd ($T1,$Hkey,0b01001110);
+ &pshufd ($T2,$Xi,0b01001110);
+ &pxor ($T1,$Hkey); # Karatsuba pre-processing
&movdqu (&QWP(0,$Htbl),$Hkey); # save H
+ &pxor ($T2,$Xi); # Karatsuba pre-processing
&movdqu (&QWP(16,$Htbl),$Xi); # save H^2
+ &palignr ($T2,$T1,8); # low part is H.lo^H.hi
+ &movdqu (&QWP(32,$Htbl),$T2); # save Karatsuba "salt"
&ret ();
&function_end_B("gcm_init_clmul");
@@ -956,8 +976,9 @@ my ($Xhi,$Xi) = @_;
&movdqa ($T3,&QWP(0,$const));
&movups ($Hkey,&QWP(0,$Htbl));
&pshufb ($Xi,$T3);
+ &movups ($T2,&QWP(32,$Htbl));
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
&reduction_alg9 ($Xhi,$Xi);
&pshufb ($Xi,$T3);
@@ -994,79 +1015,109 @@ my ($Xhi,$Xi) = @_;
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
&pshufb ($T1,$T3);
&pshufb ($Xn,$T3);
+ &movdqu ($T3,&QWP(32,$Htbl));
&pxor ($Xi,$T1); # Ii+Xi
- &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
+ &pshufd ($T1,$Xn,0b01001110); # H*Ii+1
+ &movdqa ($Xhn,$Xn);
+ &pxor ($T1,$Xn); #
+ &lea ($inp,&DWP(32,$inp)); # i+=2
+
+ &pclmulqdq ($Xn,$Hkey,0x00); #######
+ &pclmulqdq ($Xhn,$Hkey,0x11); #######
+ &pclmulqdq ($T1,$T3,0x00); #######
&movups ($Hkey,&QWP(16,$Htbl)); # load H^2
+ &nop ();
- &lea ($inp,&DWP(32,$inp)); # i+=2
&sub ($len,0x20);
&jbe (&label("even_tail"));
+ &jmp (&label("mod_loop"));
-&set_label("mod_loop");
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
- &movdqu ($T1,&QWP(0,$inp)); # Ii
- &movups ($Hkey,&QWP(0,$Htbl)); # load H
+&set_label("mod_loop",32);
+ &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
+ &movdqa ($Xhi,$Xi);
+ &pxor ($T2,$Xi); #
+ &nop ();
- &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
- &pxor ($Xhi,$Xhn);
+ &pclmulqdq ($Xi,$Hkey,0x00); #######
+ &pclmulqdq ($Xhi,$Hkey,0x11); #######
+ &pclmulqdq ($T2,$T3,0x10); #######
+ &movups ($Hkey,&QWP(0,$Htbl)); # load H
- &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
- &pshufb ($T1,$T3);
- &pshufb ($Xn,$T3);
+ &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
+ &movdqa ($T3,&QWP(0,$const));
+ &xorps ($Xhi,$Xhn);
+ &movdqu ($Xhn,&QWP(0,$inp)); # Ii
+ &pxor ($T1,$Xi); # aggregated Karatsuba post-processing
+ &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
+ &pxor ($T1,$Xhi); #
- &movdqa ($T3,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
- &movdqa ($Xhn,$Xn);
- &pxor ($Xhi,$T1); # "Ii+Xi", consume early
+ &pshufb ($Xhn,$T3);
+ &pxor ($T2,$T1); #
- &movdqa ($T1,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase
+ &movdqa ($T1,$T2); #
+ &psrldq ($T2,8);
+ &pslldq ($T1,8); #
+ &pxor ($Xhi,$T2);
+ &pxor ($Xi,$T1); #
+ &pshufb ($Xn,$T3);
+ &pxor ($Xhi,$Xhn); # "Ii+Xi", consume early
+
+ &movdqa ($Xhn,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
+ &movdqa ($T2,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase
+ &movdqa ($T1,$Xi);
+ &psllq ($Xi,5);
+ &pxor ($T1,$Xi); #
&psllq ($Xi,1);
&pxor ($Xi,$T1); #
- &psllq ($Xi,5); #
- &pxor ($Xi,$T1); #
&pclmulqdq ($Xn,$Hkey,0x00); #######
+ &movups ($T3,&QWP(32,$Htbl));
&psllq ($Xi,57); #
- &movdqa ($T2,$Xi); #
+ &movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
- &psrldq ($T2,8); #
- &pxor ($Xi,$T1);
- &pshufd ($T1,$T3,0b01001110);
+ &psrldq ($T1,8); #
+ &pxor ($Xi,$T2);
+ &pxor ($Xhi,$T1); #
+ &pshufd ($T1,$Xhn,0b01001110);
+ &movdqa ($T2,$Xi); # 2nd phase
+ &psrlq ($Xi,1);
+ &pxor ($T1,$Xhn);
&pxor ($Xhi,$T2); #
- &pxor ($T1,$T3);
- &pshufd ($T3,$Hkey,0b01001110);
- &pxor ($T3,$Hkey); #
-
&pclmulqdq ($Xhn,$Hkey,0x11); #######
- &movdqa ($T2,$Xi); # 2nd phase
+ &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
+ &pxor ($T2,$Xi);
&psrlq ($Xi,5);
&pxor ($Xi,$T2); #
&psrlq ($Xi,1); #
- &pxor ($Xi,$T2); #
- &pxor ($T2,$Xhi);
- &psrlq ($Xi,1); #
- &pxor ($Xi,$T2); #
-
+ &pxor ($Xi,$Xhi) #
&pclmulqdq ($T1,$T3,0x00); #######
- &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
- &xorps ($T1,$Xn); #
- &xorps ($T1,$Xhn); #
-
- &movdqa ($T3,$T1); #
- &psrldq ($T1,8);
- &pslldq ($T3,8); #
- &pxor ($Xhn,$T1);
- &pxor ($Xn,$T3); #
- &movdqa ($T3,&QWP(0,$const));
&lea ($inp,&DWP(32,$inp));
&sub ($len,0x20);
&ja (&label("mod_loop"));
&set_label("even_tail");
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
+ &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
+ &movdqa ($Xhi,$Xi);
+ &pxor ($T2,$Xi); #
- &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
- &pxor ($Xhi,$Xhn);
+ &pclmulqdq ($Xi,$Hkey,0x00); #######
+ &pclmulqdq ($Xhi,$Hkey,0x11); #######
+ &pclmulqdq ($T2,$T3,0x10); #######
+ &movdqa ($T3,&QWP(0,$const));
+
+ &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
+ &xorps ($Xhi,$Xhn);
+ &pxor ($T1,$Xi); # aggregated Karatsuba post-processing
+ &pxor ($T1,$Xhi); #
+
+ &pxor ($T2,$T1); #
+
+ &movdqa ($T1,$T2); #
+ &psrldq ($T2,8);
+ &pslldq ($T1,8); #
+ &pxor ($Xhi,$T2);
+ &pxor ($Xi,$T1); #
&reduction_alg9 ($Xhi,$Xi);
@@ -1273,13 +1324,6 @@ my ($Xhi,$Xi)=@_;
&set_label("bswap",64);
&data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
&data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial
-}} # $sse2
-
-&set_label("rem_4bit",64);
- &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
- &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
- &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
- &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
&set_label("rem_8bit",64);
&data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
&data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
@@ -1313,6 +1357,13 @@ my ($Xhi,$Xi)=@_;
&data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
&data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
&data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
+}} # $sse2
+
+&set_label("rem_4bit",64);
+ &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
+ &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
+ &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
+ &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
}}} # !$x86only
&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
diff --git a/openssl/crypto/modes/asm/ghash-x86_64.pl b/openssl/crypto/modes/asm/ghash-x86_64.pl
index 38d779edb..6e656ca13 100644
--- a/openssl/crypto/modes/asm/ghash-x86_64.pl
+++ b/openssl/crypto/modes/asm/ghash-x86_64.pl
@@ -22,6 +22,8 @@
# P4 28.6 14.0 +100%
# Opteron 19.3 7.7 +150%
# Core2 17.8 8.1(**) +120%
+# Atom 31.6 16.8 +88%
+# VIA Nano 21.8 10.1 +115%
#
# (*) comparison is not completely fair, because C results are
# for vanilla "256B" implementation, while assembler results
@@ -39,6 +41,44 @@
# providing access to a Westmere-based system on behalf of Intel
# Open Source Technology Centre.
+# December 2012
+#
+# Overhaul: aggregate Karatsuba post-processing, improve ILP in
+# reduction_alg9, increase reduction aggregate factor to 4x. As for
+# the latter. ghash-x86.pl discusses that it makes lesser sense to
+# increase aggregate factor. Then why increase here? Critical path
+# consists of 3 independent pclmulqdq instructions, Karatsuba post-
+# processing and reduction. "On top" of this we lay down aggregated
+# multiplication operations, triplets of independent pclmulqdq's. As
+# issue rate for pclmulqdq is limited, it makes lesser sense to
+# aggregate more multiplications than it takes to perform remaining
+# non-multiplication operations. 2x is near-optimal coefficient for
+# contemporary Intel CPUs (therefore modest improvement coefficient),
+# but not for Bulldozer. Latter is because logical SIMD operations
+# are twice as slow in comparison to Intel, so that critical path is
+# longer. A CPU with higher pclmulqdq issue rate would also benefit
+# from higher aggregate factor...
+#
+# Westmere 1.78(+13%)
+# Sandy Bridge 1.80(+8%)
+# Ivy Bridge 1.80(+7%)
+# Haswell 0.55(+93%) (if system doesn't support AVX)
+# Broadwell 0.45(+110%)(if system doesn't support AVX)
+# Bulldozer 1.49(+27%)
+# Silvermont 2.88(+13%)
+
+# March 2013
+#
+# ... 8x aggregate factor AVX code path is using reduction algorithm
+# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
+# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
+# sub-optimally in comparison to above mentioned version. But thanks
+# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
+# it performs in 0.41 cycles per byte on Haswell processor, and in
+# 0.29 on Broadwell.
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -50,9 +90,30 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+}
+
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
+$do4xaggr=1;
+
# common register layout
$nlo="%rax";
$nhi="%rbx";
@@ -160,6 +221,7 @@ ___
$code=<<___;
.text
+.extern OPENSSL_ia32cap_P
.globl gcm_gmult_4bit
.type gcm_gmult_4bit,\@function,2
@@ -352,19 +414,27 @@ ___
($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
sub clmul64x64_T2 { # minimal register pressure
-my ($Xhi,$Xi,$Hkey,$modulo)=@_;
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
-$code.=<<___ if (!defined($modulo));
+if (!defined($HK)) { $HK = $T2;
+$code.=<<___;
movdqa $Xi,$Xhi #
pshufd \$0b01001110,$Xi,$T1
pshufd \$0b01001110,$Hkey,$T2
pxor $Xi,$T1 #
pxor $Hkey,$T2
___
+} else {
+$code.=<<___;
+ movdqa $Xi,$Xhi #
+ pshufd \$0b01001110,$Xi,$T1
+ pxor $Xi,$T1 #
+___
+}
$code.=<<___;
pclmulqdq \$0x00,$Hkey,$Xi #######
pclmulqdq \$0x11,$Hkey,$Xhi #######
- pclmulqdq \$0x00,$T2,$T1 #######
+ pclmulqdq \$0x00,$HK,$T1 #######
pxor $Xi,$T1 #
pxor $Xhi,$T1 #
@@ -376,42 +446,53 @@ $code.=<<___;
___
}
-sub reduction_alg9 { # 17/13 times faster than Intel version
+sub reduction_alg9 { # 17/11 times faster than Intel version
my ($Xhi,$Xi) = @_;
$code.=<<___;
# 1st phase
- movdqa $Xi,$T1 #
+ movdqa $Xi,$T2 #
+ movdqa $Xi,$T1
+ psllq \$5,$Xi
+ pxor $Xi,$T1 #
psllq \$1,$Xi
pxor $T1,$Xi #
- psllq \$5,$Xi #
- pxor $T1,$Xi #
psllq \$57,$Xi #
- movdqa $Xi,$T2 #
+ movdqa $Xi,$T1 #
pslldq \$8,$Xi
- psrldq \$8,$T2 #
- pxor $T1,$Xi
- pxor $T2,$Xhi #
+ psrldq \$8,$T1 #
+ pxor $T2,$Xi
+ pxor $T1,$Xhi #
# 2nd phase
movdqa $Xi,$T2
+ psrlq \$1,$Xi
+ pxor $T2,$Xhi #
+ pxor $Xi,$T2
psrlq \$5,$Xi
pxor $T2,$Xi #
psrlq \$1,$Xi #
- pxor $T2,$Xi #
- pxor $Xhi,$T2
- psrlq \$1,$Xi #
- pxor $T2,$Xi #
+ pxor $Xhi,$Xi #
___
}
{ my ($Htbl,$Xip)=@_4args;
+ my $HK="%xmm6";
$code.=<<___;
.globl gcm_init_clmul
.type gcm_init_clmul,\@abi-omnipotent
.align 16
gcm_init_clmul:
+.L_init_clmul:
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_gcm_init_clmul:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
+ .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
+___
+$code.=<<___;
movdqu ($Xip),$Hkey
pshufd \$0b01001110,$Hkey,$Hkey # dword swap
@@ -430,13 +511,47 @@ gcm_init_clmul:
pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
# calculate H^2
+ pshufd \$0b01001110,$Hkey,$HK
movdqa $Hkey,$Xi
+ pxor $Hkey,$HK
___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK);
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
- movdqu $Hkey,($Htbl) # save H
- movdqu $Xi,16($Htbl) # save H^2
+ pshufd \$0b01001110,$Hkey,$T1
+ pshufd \$0b01001110,$Xi,$T2
+ pxor $Hkey,$T1 # Karatsuba pre-processing
+ movdqu $Hkey,0x00($Htbl) # save H
+ pxor $Xi,$T2 # Karatsuba pre-processing
+ movdqu $Xi,0x10($Htbl) # save H^2
+ palignr \$8,$T1,$T2 # low part is H.lo^H.hi...
+ movdqu $T2,0x20($Htbl) # save Karatsuba "salt"
+___
+if ($do4xaggr) {
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3
+ &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+ movdqa $Xi,$T3
+___
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4
+ &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+ pshufd \$0b01001110,$T3,$T1
+ pshufd \$0b01001110,$Xi,$T2
+ pxor $T3,$T1 # Karatsuba pre-processing
+ movdqu $T3,0x30($Htbl) # save H^3
+ pxor $Xi,$T2 # Karatsuba pre-processing
+ movdqu $Xi,0x40($Htbl) # save H^4
+ palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi...
+ movdqu $T2,0x50($Htbl) # save Karatsuba "salt"
+___
+}
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ lea 0x18(%rsp),%rsp
+.LSEH_end_gcm_init_clmul:
+___
+$code.=<<___;
ret
.size gcm_init_clmul,.-gcm_init_clmul
___
@@ -449,13 +564,38 @@ $code.=<<___;
.type gcm_gmult_clmul,\@abi-omnipotent
.align 16
gcm_gmult_clmul:
+.L_gmult_clmul:
movdqu ($Xip),$Xi
movdqa .Lbswap_mask(%rip),$T3
movdqu ($Htbl),$Hkey
+ movdqu 0x20($Htbl),$T2
pshufb $T3,$Xi
___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
- &reduction_alg9 ($Xhi,$Xi);
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
+$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
+ # experimental alternative. special thing about is that there
+ # no dependency between the two multiplications...
+ mov \$`0xE1<<1`,%eax
+ mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
+ mov \$0x07,%r11d
+ movq %rax,$T1
+ movq %r10,$T2
+ movq %r11,$T3 # borrow $T3
+ pand $Xi,$T3
+ pshufb $T3,$T2 # ($Xi&7)·0xE0
+ movq %rax,$T3
+ pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1)
+ pxor $Xi,$T2
+ pslldq \$15,$T2
+ paddd $T2,$T2 # <<(64+56+1)
+ pxor $T2,$Xi
+ pclmulqdq \$0x01,$T3,$Xi
+ movdqa .Lbswap_mask(%rip),$T3 # reload $T3
+ psrldq \$1,$T1
+ pxor $T1,$Xhi
+ pslldq \$7,$Xi
+ pxor $Xhi,$Xi
+___
$code.=<<___;
pshufb $T3,$Xi
movdqu $Xi,($Xip)
@@ -465,129 +605,327 @@ ___
}
{ my ($Xip,$Htbl,$inp,$len)=@_4args;
- my $Xn="%xmm6";
- my $Xhn="%xmm7";
- my $Hkey2="%xmm8";
- my $T1n="%xmm9";
- my $T2n="%xmm10";
+ my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
+ my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
$code.=<<___;
.globl gcm_ghash_clmul
.type gcm_ghash_clmul,\@abi-omnipotent
-.align 16
+.align 32
gcm_ghash_clmul:
+.L_ghash_clmul:
___
$code.=<<___ if ($win64);
+ lea -0x88(%rsp),%rax
.LSEH_begin_gcm_ghash_clmul:
# I can't trust assembler to use specific encoding:-(
- .byte 0x48,0x83,0xec,0x58 #sub \$0x58,%rsp
- .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
- .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp)
- .byte 0x44,0x0f,0x29,0x44,0x24,0x20 #movaps %xmm8,0x20(%rsp)
- .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 #movaps %xmm9,0x30(%rsp)
- .byte 0x44,0x0f,0x29,0x54,0x24,0x40 #movaps %xmm10,0x40(%rsp)
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
+ .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
+ .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
+ .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
+ .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
+ .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
+ .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
+ .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
+ .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
___
$code.=<<___;
movdqa .Lbswap_mask(%rip),$T3
movdqu ($Xip),$Xi
movdqu ($Htbl),$Hkey
+ movdqu 0x20($Htbl),$HK
pshufb $T3,$Xi
sub \$0x10,$len
jz .Lodd_tail
- movdqu 16($Htbl),$Hkey2
+ movdqu 0x10($Htbl),$Hkey2
+___
+if ($do4xaggr) {
+my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
+
+$code.=<<___;
+ mov OPENSSL_ia32cap_P+4(%rip),%eax
+ cmp \$0x30,$len
+ jb .Lskip4x
+
+ and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE
+ cmp \$`1<<22`,%eax # check for MOVBE without XSAVE
+ je .Lskip4x
+
+ sub \$0x30,$len
+ mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
+ movdqu 0x30($Htbl),$Hkey3
+ movdqu 0x40($Htbl),$Hkey4
+
+ #######
+ # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
+ #
+ movdqu 0x30($inp),$Xln
+ movdqu 0x20($inp),$Xl
+ pshufb $T3,$Xln
+ pshufb $T3,$Xl
+ movdqa $Xln,$Xhn
+ pshufd \$0b01001110,$Xln,$Xmn
+ pxor $Xln,$Xmn
+ pclmulqdq \$0x00,$Hkey,$Xln
+ pclmulqdq \$0x11,$Hkey,$Xhn
+ pclmulqdq \$0x00,$HK,$Xmn
+
+ movdqa $Xl,$Xh
+ pshufd \$0b01001110,$Xl,$Xm
+ pxor $Xl,$Xm
+ pclmulqdq \$0x00,$Hkey2,$Xl
+ pclmulqdq \$0x11,$Hkey2,$Xh
+ pclmulqdq \$0x10,$HK,$Xm
+ xorps $Xl,$Xln
+ xorps $Xh,$Xhn
+ movups 0x50($Htbl),$HK
+ xorps $Xm,$Xmn
+
+ movdqu 0x10($inp),$Xl
+ movdqu 0($inp),$T1
+ pshufb $T3,$Xl
+ pshufb $T3,$T1
+ movdqa $Xl,$Xh
+ pshufd \$0b01001110,$Xl,$Xm
+ pxor $T1,$Xi
+ pxor $Xl,$Xm
+ pclmulqdq \$0x00,$Hkey3,$Xl
+ movdqa $Xi,$Xhi
+ pshufd \$0b01001110,$Xi,$T1
+ pxor $Xi,$T1
+ pclmulqdq \$0x11,$Hkey3,$Xh
+ pclmulqdq \$0x00,$HK,$Xm
+ xorps $Xl,$Xln
+ xorps $Xh,$Xhn
+
+ lea 0x40($inp),$inp
+ sub \$0x40,$len
+ jc .Ltail4x
+
+ jmp .Lmod4_loop
+.align 32
+.Lmod4_loop:
+ pclmulqdq \$0x00,$Hkey4,$Xi
+ xorps $Xm,$Xmn
+ movdqu 0x30($inp),$Xl
+ pshufb $T3,$Xl
+ pclmulqdq \$0x11,$Hkey4,$Xhi
+ xorps $Xln,$Xi
+ movdqu 0x20($inp),$Xln
+ movdqa $Xl,$Xh
+ pclmulqdq \$0x10,$HK,$T1
+ pshufd \$0b01001110,$Xl,$Xm
+ xorps $Xhn,$Xhi
+ pxor $Xl,$Xm
+ pshufb $T3,$Xln
+ movups 0x20($Htbl),$HK
+ xorps $Xmn,$T1
+ pclmulqdq \$0x00,$Hkey,$Xl
+ pshufd \$0b01001110,$Xln,$Xmn
+
+ pxor $Xi,$T1 # aggregated Karatsuba post-processing
+ movdqa $Xln,$Xhn
+ pxor $Xhi,$T1 #
+ pxor $Xln,$Xmn
+ movdqa $T1,$T2 #
+ pclmulqdq \$0x11,$Hkey,$Xh
+ pslldq \$8,$T1
+ psrldq \$8,$T2 #
+ pxor $T1,$Xi
+ movdqa .L7_mask(%rip),$T1
+ pxor $T2,$Xhi #
+ movq %rax,$T2
+
+ pand $Xi,$T1 # 1st phase
+ pshufb $T1,$T2 #
+ pxor $Xi,$T2 #
+ pclmulqdq \$0x00,$HK,$Xm
+ psllq \$57,$T2 #
+ movdqa $T2,$T1 #
+ pslldq \$8,$T2
+ pclmulqdq \$0x00,$Hkey2,$Xln
+ psrldq \$8,$T1 #
+ pxor $T2,$Xi
+ pxor $T1,$Xhi #
+ movdqu 0($inp),$T1
+
+ movdqa $Xi,$T2 # 2nd phase
+ psrlq \$1,$Xi
+ pclmulqdq \$0x11,$Hkey2,$Xhn
+ xorps $Xl,$Xln
+ movdqu 0x10($inp),$Xl
+ pshufb $T3,$Xl
+ pclmulqdq \$0x10,$HK,$Xmn
+ xorps $Xh,$Xhn
+ movups 0x50($Htbl),$HK
+ pshufb $T3,$T1
+ pxor $T2,$Xhi #
+ pxor $Xi,$T2
+ psrlq \$5,$Xi
+
+ movdqa $Xl,$Xh
+ pxor $Xm,$Xmn
+ pshufd \$0b01001110,$Xl,$Xm
+ pxor $T2,$Xi #
+ pxor $T1,$Xhi
+ pxor $Xl,$Xm
+ pclmulqdq \$0x00,$Hkey3,$Xl
+ psrlq \$1,$Xi #
+ pxor $Xhi,$Xi #
+ movdqa $Xi,$Xhi
+ pclmulqdq \$0x11,$Hkey3,$Xh
+ xorps $Xl,$Xln
+ pshufd \$0b01001110,$Xi,$T1
+ pxor $Xi,$T1
+
+ pclmulqdq \$0x00,$HK,$Xm
+ xorps $Xh,$Xhn
+
+ lea 0x40($inp),$inp
+ sub \$0x40,$len
+ jnc .Lmod4_loop
+
+.Ltail4x:
+ pclmulqdq \$0x00,$Hkey4,$Xi
+ pclmulqdq \$0x11,$Hkey4,$Xhi
+ pclmulqdq \$0x10,$HK,$T1
+ xorps $Xm,$Xmn
+ xorps $Xln,$Xi
+ xorps $Xhn,$Xhi
+ pxor $Xi,$Xhi # aggregated Karatsuba post-processing
+ pxor $Xmn,$T1
+
+ pxor $Xhi,$T1 #
+ pxor $Xi,$Xhi
+
+ movdqa $T1,$T2 #
+ psrldq \$8,$T1
+ pslldq \$8,$T2 #
+ pxor $T1,$Xhi
+ pxor $T2,$Xi #
+___
+ &reduction_alg9($Xhi,$Xi);
+$code.=<<___;
+ add \$0x40,$len
+ jz .Ldone
+ movdqu 0x20($Htbl),$HK
+ sub \$0x10,$len
+ jz .Lodd_tail
+.Lskip4x:
+___
+}
+$code.=<<___;
#######
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
# [(H*Ii+1) + (H*Xi+1)] mod P =
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
#
movdqu ($inp),$T1 # Ii
- movdqu 16($inp),$Xn # Ii+1
+ movdqu 16($inp),$Xln # Ii+1
pshufb $T3,$T1
- pshufb $T3,$Xn
+ pshufb $T3,$Xln
pxor $T1,$Xi # Ii+Xi
-___
- &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
-$code.=<<___;
- movdqa $Xi,$Xhi #
- pshufd \$0b01001110,$Xi,$T1
- pshufd \$0b01001110,$Hkey2,$T2
- pxor $Xi,$T1 #
- pxor $Hkey2,$T2
+
+ movdqa $Xln,$Xhn
+ pshufd \$0b01001110,$Xln,$Xmn
+ pxor $Xln,$Xmn
+ pclmulqdq \$0x00,$Hkey,$Xln
+ pclmulqdq \$0x11,$Hkey,$Xhn
+ pclmulqdq \$0x00,$HK,$Xmn
lea 32($inp),$inp # i+=2
+ nop
sub \$0x20,$len
jbe .Leven_tail
+ nop
+ jmp .Lmod_loop
+.align 32
.Lmod_loop:
-___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
-$code.=<<___;
- movdqu ($inp),$T1 # Ii
- pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
- pxor $Xhn,$Xhi
+ movdqa $Xi,$Xhi
+ movdqa $Xmn,$T1
+ pshufd \$0b01001110,$Xi,$Xmn #
+ pxor $Xi,$Xmn #
- movdqu 16($inp),$Xn # Ii+1
- pshufb $T3,$T1
- pshufb $T3,$Xn
+ pclmulqdq \$0x00,$Hkey2,$Xi
+ pclmulqdq \$0x11,$Hkey2,$Xhi
+ pclmulqdq \$0x10,$HK,$Xmn
- movdqa $Xn,$Xhn #
- pshufd \$0b01001110,$Xn,$T1n
- pshufd \$0b01001110,$Hkey,$T2n
- pxor $Xn,$T1n #
- pxor $Hkey,$T2n
- pxor $T1,$Xhi # "Ii+Xi", consume early
+ pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
+ pxor $Xhn,$Xhi
+ movdqu ($inp),$T2 # Ii
+ pxor $Xi,$T1 # aggregated Karatsuba post-processing
+ pshufb $T3,$T2
+ movdqu 16($inp),$Xln # Ii+1
+
+ pxor $Xhi,$T1
+ pxor $T2,$Xhi # "Ii+Xi", consume early
+ pxor $T1,$Xmn
+ pshufb $T3,$Xln
+ movdqa $Xmn,$T1 #
+ psrldq \$8,$T1
+ pslldq \$8,$Xmn #
+ pxor $T1,$Xhi
+ pxor $Xmn,$Xi #
+
+ movdqa $Xln,$Xhn #
- movdqa $Xi,$T1 # 1st phase
+ movdqa $Xi,$T2 # 1st phase
+ movdqa $Xi,$T1
+ psllq \$5,$Xi
+ pxor $Xi,$T1 #
+ pclmulqdq \$0x00,$Hkey,$Xln #######
psllq \$1,$Xi
pxor $T1,$Xi #
- psllq \$5,$Xi #
- pxor $T1,$Xi #
- pclmulqdq \$0x00,$Hkey,$Xn #######
psllq \$57,$Xi #
- movdqa $Xi,$T2 #
+ movdqa $Xi,$T1 #
pslldq \$8,$Xi
- psrldq \$8,$T2 #
- pxor $T1,$Xi
- pxor $T2,$Xhi #
+ psrldq \$8,$T1 #
+ pxor $T2,$Xi
+ pshufd \$0b01001110,$Xhn,$Xmn
+ pxor $T1,$Xhi #
+ pxor $Xhn,$Xmn #
- pclmulqdq \$0x11,$Hkey,$Xhn #######
movdqa $Xi,$T2 # 2nd phase
+ psrlq \$1,$Xi
+ pclmulqdq \$0x11,$Hkey,$Xhn #######
+ pxor $T2,$Xhi #
+ pxor $Xi,$T2
psrlq \$5,$Xi
pxor $T2,$Xi #
+ lea 32($inp),$inp
psrlq \$1,$Xi #
- pxor $T2,$Xi #
- pxor $Xhi,$T2
- psrlq \$1,$Xi #
- pxor $T2,$Xi #
+ pclmulqdq \$0x00,$HK,$Xmn #######
+ pxor $Xhi,$Xi #
- pclmulqdq \$0x00,$T2n,$T1n #######
- movdqa $Xi,$Xhi #
- pshufd \$0b01001110,$Xi,$T1
- pshufd \$0b01001110,$Hkey2,$T2
- pxor $Xi,$T1 #
- pxor $Hkey2,$T2
-
- pxor $Xn,$T1n #
- pxor $Xhn,$T1n #
- movdqa $T1n,$T2n #
- psrldq \$8,$T1n
- pslldq \$8,$T2n #
- pxor $T1n,$Xhn
- pxor $T2n,$Xn #
-
- lea 32($inp),$inp
sub \$0x20,$len
ja .Lmod_loop
.Leven_tail:
-___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
-$code.=<<___;
- pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
+ movdqa $Xi,$Xhi
+ movdqa $Xmn,$T1
+ pshufd \$0b01001110,$Xi,$Xmn #
+ pxor $Xi,$Xmn #
+
+ pclmulqdq \$0x00,$Hkey2,$Xi
+ pclmulqdq \$0x11,$Hkey2,$Xhi
+ pclmulqdq \$0x10,$HK,$Xmn
+
+ pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
pxor $Xhn,$Xhi
+ pxor $Xi,$T1
+ pxor $Xhi,$T1
+ pxor $T1,$Xmn
+ movdqa $Xmn,$T1 #
+ psrldq \$8,$T1
+ pslldq \$8,$Xmn #
+ pxor $T1,$Xhi
+ pxor $Xmn,$Xi #
___
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
@@ -599,7 +937,7 @@ $code.=<<___;
pshufb $T3,$T1
pxor $T1,$Xi # Ii+Xi
___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi)
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
.Ldone:
@@ -612,21 +950,607 @@ $code.=<<___ if ($win64);
movaps 0x20(%rsp),%xmm8
movaps 0x30(%rsp),%xmm9
movaps 0x40(%rsp),%xmm10
- add \$0x58,%rsp
+ movaps 0x50(%rsp),%xmm11
+ movaps 0x60(%rsp),%xmm12
+ movaps 0x70(%rsp),%xmm13
+ movaps 0x80(%rsp),%xmm14
+ movaps 0x90(%rsp),%xmm15
+ lea 0xa8(%rsp),%rsp
+.LSEH_end_gcm_ghash_clmul:
___
$code.=<<___;
ret
-.LSEH_end_gcm_ghash_clmul:
.size gcm_ghash_clmul,.-gcm_ghash_clmul
___
}
+
+$code.=<<___;
+.globl gcm_init_avx
+.type gcm_init_avx,\@abi-omnipotent
+.align 32
+gcm_init_avx:
+___
+if ($avx) {
+my ($Htbl,$Xip)=@_4args;
+my $HK="%xmm6";
+
+$code.=<<___ if ($win64);
+.LSEH_begin_gcm_init_avx:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
+ .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
+___
+$code.=<<___;
+ vzeroupper
+
+ vmovdqu ($Xip),$Hkey
+ vpshufd \$0b01001110,$Hkey,$Hkey # dword swap
+
+ # <<1 twist
+ vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
+ vpsrlq \$63,$Hkey,$T1
+ vpsllq \$1,$Hkey,$Hkey
+ vpxor $T3,$T3,$T3 #
+ vpcmpgtd $T2,$T3,$T3 # broadcast carry bit
+ vpslldq \$8,$T1,$T1
+ vpor $T1,$Hkey,$Hkey # H<<=1
+
+ # magic reduction
+ vpand .L0x1c2_polynomial(%rip),$T3,$T3
+ vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial
+
+ vpunpckhqdq $Hkey,$Hkey,$HK
+ vmovdqa $Hkey,$Xi
+ vpxor $Hkey,$HK,$HK
+ mov \$4,%r10 # up to H^8
+ jmp .Linit_start_avx
+___
+
+sub clmul64x64_avx {
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
+
+if (!defined($HK)) { $HK = $T2;
+$code.=<<___;
+ vpunpckhqdq $Xi,$Xi,$T1
+ vpunpckhqdq $Hkey,$Hkey,$T2
+ vpxor $Xi,$T1,$T1 #
+ vpxor $Hkey,$T2,$T2
+___
+} else {
+$code.=<<___;
+ vpunpckhqdq $Xi,$Xi,$T1
+ vpxor $Xi,$T1,$T1 #
+___
+}
+$code.=<<___;
+ vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi #######
+ vpclmulqdq \$0x00,$Hkey,$Xi,$Xi #######
+ vpclmulqdq \$0x00,$HK,$T1,$T1 #######
+ vpxor $Xi,$Xhi,$T2 #
+ vpxor $T2,$T1,$T1 #
+
+ vpslldq \$8,$T1,$T2 #
+ vpsrldq \$8,$T1,$T1
+ vpxor $T2,$Xi,$Xi #
+ vpxor $T1,$Xhi,$Xhi
+___
+}
+
+sub reduction_avx {
+my ($Xhi,$Xi) = @_;
+
+$code.=<<___;
+ vpsllq \$57,$Xi,$T1 # 1st phase
+ vpsllq \$62,$Xi,$T2
+ vpxor $T1,$T2,$T2 #
+ vpsllq \$63,$Xi,$T1
+ vpxor $T1,$T2,$T2 #
+ vpslldq \$8,$T2,$T1 #
+ vpsrldq \$8,$T2,$T2
+ vpxor $T1,$Xi,$Xi #
+ vpxor $T2,$Xhi,$Xhi
+
+ vpsrlq \$1,$Xi,$T2 # 2nd phase
+ vpxor $Xi,$Xhi,$Xhi
+ vpxor $T2,$Xi,$Xi #
+ vpsrlq \$5,$T2,$T2
+ vpxor $T2,$Xi,$Xi #
+ vpsrlq \$1,$Xi,$Xi #
+ vpxor $Xhi,$Xi,$Xi #
+___
+}
$code.=<<___;
+.align 32
+.Linit_loop_avx:
+ vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi...
+ vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt"
+___
+ &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7
+ &reduction_avx ($Xhi,$Xi);
+$code.=<<___;
+.Linit_start_avx:
+ vmovdqa $Xi,$T3
+___
+ &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8
+ &reduction_avx ($Xhi,$Xi);
+$code.=<<___;
+ vpshufd \$0b01001110,$T3,$T1
+ vpshufd \$0b01001110,$Xi,$T2
+ vpxor $T3,$T1,$T1 # Karatsuba pre-processing
+ vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7
+ vpxor $Xi,$T2,$T2 # Karatsuba pre-processing
+ vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8
+ lea 0x30($Htbl),$Htbl
+ sub \$1,%r10
+ jnz .Linit_loop_avx
+
+ vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped
+ vmovdqu $T3,-0x10($Htbl)
+
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ lea 0x18(%rsp),%rsp
+.LSEH_end_gcm_init_avx:
+___
+$code.=<<___;
+ ret
+.size gcm_init_avx,.-gcm_init_avx
+___
+} else {
+$code.=<<___;
+ jmp .L_init_clmul
+.size gcm_init_avx,.-gcm_init_avx
+___
+}
+
+$code.=<<___;
+.globl gcm_gmult_avx
+.type gcm_gmult_avx,\@abi-omnipotent
+.align 32
+gcm_gmult_avx:
+ jmp .L_gmult_clmul
+.size gcm_gmult_avx,.-gcm_gmult_avx
+___
+
+$code.=<<___;
+.globl gcm_ghash_avx
+.type gcm_ghash_avx,\@abi-omnipotent
+.align 32
+gcm_ghash_avx:
+___
+if ($avx) {
+my ($Xip,$Htbl,$inp,$len)=@_4args;
+my ($Xlo,$Xhi,$Xmi,
+ $Zlo,$Zhi,$Zmi,
+ $Hkey,$HK,$T1,$T2,
+ $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));
+
+$code.=<<___ if ($win64);
+ lea -0x88(%rsp),%rax
+.LSEH_begin_gcm_ghash_avx:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
+ .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
+ .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
+ .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
+ .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
+ .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
+ .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
+ .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
+ .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
+___
+$code.=<<___;
+ vzeroupper
+
+ vmovdqu ($Xip),$Xi # load $Xi
+ lea .L0x1c2_polynomial(%rip),%r10
+ lea 0x40($Htbl),$Htbl # size optimization
+ vmovdqu .Lbswap_mask(%rip),$bswap
+ vpshufb $bswap,$Xi,$Xi
+ cmp \$0x80,$len
+ jb .Lshort_avx
+ sub \$0x80,$len
+
+ vmovdqu 0x70($inp),$Ii # I[7]
+ vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
+ vpshufb $bswap,$Ii,$Ii
+ vmovdqu 0x20-0x40($Htbl),$HK
+
+ vpunpckhqdq $Ii,$Ii,$T2
+ vmovdqu 0x60($inp),$Ij # I[6]
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Ii,$T2,$T2
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
+ vpunpckhqdq $Ij,$Ij,$T1
+ vmovdqu 0x50($inp),$Ii # I[5]
+ vpclmulqdq \$0x00,$HK,$T2,$Xmi
+ vpxor $Ij,$T1,$T1
+
+ vpshufb $bswap,$Ii,$Ii
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
+ vpxor $Ii,$T2,$T2
+ vmovdqu 0x40($inp),$Ij # I[4]
+ vpclmulqdq \$0x10,$HK,$T1,$Zmi
+ vmovdqu 0x50-0x40($Htbl),$HK
+
+ vpshufb $bswap,$Ij,$Ij
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Xhi,$Zhi,$Zhi
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T2,$Xmi
+ vpxor $Ij,$T1,$T1
+
+ vmovdqu 0x30($inp),$Ii # I[3]
+ vpxor $Zlo,$Xlo,$Xlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpxor $Zhi,$Xhi,$Xhi
+ vpshufb $bswap,$Ii,$Ii
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
+ vpxor $Zmi,$Xmi,$Xmi
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpclmulqdq \$0x10,$HK,$T1,$Zmi
+ vmovdqu 0x80-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+
+ vmovdqu 0x20($inp),$Ij # I[2]
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Xhi,$Zhi,$Zhi
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
+ vpxor $Xmi,$Zmi,$Zmi
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpclmulqdq \$0x00,$HK,$T2,$Xmi
+ vpxor $Ij,$T1,$T1
+
+ vmovdqu 0x10($inp),$Ii # I[1]
+ vpxor $Zlo,$Xlo,$Xlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpxor $Zhi,$Xhi,$Xhi
+ vpshufb $bswap,$Ii,$Ii
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
+ vpxor $Zmi,$Xmi,$Xmi
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpclmulqdq \$0x10,$HK,$T1,$Zmi
+ vmovdqu 0xb0-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+
+ vmovdqu ($inp),$Ij # I[0]
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Xhi,$Zhi,$Zhi
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x10,$HK,$T2,$Xmi
+
+ lea 0x80($inp),$inp
+ cmp \$0x80,$len
+ jb .Ltail_avx
+
+ vpxor $Xi,$Ij,$Ij # accumulate $Xi
+ sub \$0x80,$len
+ jmp .Loop8x_avx
+
+.align 32
+.Loop8x_avx:
+ vpunpckhqdq $Ij,$Ij,$T1
+ vmovdqu 0x70($inp),$Ii # I[7]
+ vpxor $Xlo,$Zlo,$Zlo
+ vpxor $Ij,$T1,$T1
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xi
+ vpshufb $bswap,$Ii,$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xo
+ vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Tred
+ vmovdqu 0x20-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+
+ vmovdqu 0x60($inp),$Ij # I[6]
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Zlo,$Xi,$Xi # collect result
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vxorps $Zhi,$Xo,$Xo
+ vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpclmulqdq \$0x00,$HK, $T2,$Xmi
+ vpxor $Zmi,$Tred,$Tred
+ vxorps $Ij,$T1,$T1
+
+ vmovdqu 0x50($inp),$Ii # I[5]
+ vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpxor $Xo,$Tred,$Tred
+ vpslldq \$8,$Tred,$T2
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vpsrldq \$8,$Tred,$Tred
+ vpxor $T2, $Xi, $Xi
+ vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
+ vpshufb $bswap,$Ii,$Ii
+ vxorps $Tred,$Xo, $Xo
+ vpxor $Xhi,$Zhi,$Zhi
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpclmulqdq \$0x10,$HK, $T1,$Zmi
+ vmovdqu 0x50-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+ vpxor $Xmi,$Zmi,$Zmi
+
+ vmovdqu 0x40($inp),$Ij # I[4]
+ vpalignr \$8,$Xi,$Xi,$Tred # 1st phase
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpshufb $bswap,$Ij,$Ij
+ vpxor $Zlo,$Xlo,$Xlo
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Zhi,$Xhi,$Xhi
+ vpclmulqdq \$0x00,$HK, $T2,$Xmi
+ vxorps $Ij,$T1,$T1
+ vpxor $Zmi,$Xmi,$Xmi
+
+ vmovdqu 0x30($inp),$Ii # I[3]
+ vpclmulqdq \$0x10,(%r10),$Xi,$Xi
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpshufb $bswap,$Ii,$Ii
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x10,$HK, $T1,$Zmi
+ vmovdqu 0x80-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+ vpxor $Xmi,$Zmi,$Zmi
+
+ vmovdqu 0x20($inp),$Ij # I[2]
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpshufb $bswap,$Ij,$Ij
+ vpxor $Zlo,$Xlo,$Xlo
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Zhi,$Xhi,$Xhi
+ vpclmulqdq \$0x00,$HK, $T2,$Xmi
+ vpxor $Ij,$T1,$T1
+ vpxor $Zmi,$Xmi,$Xmi
+ vxorps $Tred,$Xi,$Xi
+
+ vmovdqu 0x10($inp),$Ii # I[1]
+ vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpshufb $bswap,$Ii,$Ii
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
+ vpclmulqdq \$0x10,(%r10),$Xi,$Xi
+ vxorps $Xo,$Tred,$Tred
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x10,$HK, $T1,$Zmi
+ vmovdqu 0xb0-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+ vpxor $Xmi,$Zmi,$Zmi
+
+ vmovdqu ($inp),$Ij # I[0]
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
+ vpxor $Tred,$Ij,$Ij
+ vpclmulqdq \$0x10,$HK, $T2,$Xmi
+ vpxor $Xi,$Ij,$Ij # accumulate $Xi
+
+ lea 0x80($inp),$inp
+ sub \$0x80,$len
+ jnc .Loop8x_avx
+
+ add \$0x80,$len
+ jmp .Ltail_no_xor_avx
+
+.align 32
+.Lshort_avx:
+ vmovdqu -0x10($inp,$len),$Ii # very last word
+ lea ($inp,$len),$inp
+ vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
+ vmovdqu 0x20-0x40($Htbl),$HK
+ vpshufb $bswap,$Ii,$Ij
+
+ vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo,
+ vmovdqa $Xhi,$Zhi # $Zhi and
+ vmovdqa $Xmi,$Zmi # $Zmi
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x20($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vpsrldq \$8,$HK,$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x30($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vmovdqu 0x50-0x40($Htbl),$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x40($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vpsrldq \$8,$HK,$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x50($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vmovdqu 0x80-0x40($Htbl),$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x60($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vpsrldq \$8,$HK,$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x70($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vmovq 0xb8-0x40($Htbl),$HK
+ sub \$0x10,$len
+ jmp .Ltail_avx
+
+.align 32
+.Ltail_avx:
+ vpxor $Xi,$Ij,$Ij # accumulate $Xi
+.Ltail_no_xor_avx:
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+
+ vmovdqu (%r10),$Tred
+
+ vpxor $Xlo,$Zlo,$Xi
+ vpxor $Xhi,$Zhi,$Xo
+ vpxor $Xmi,$Zmi,$Zmi
+
+ vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing
+ vpxor $Xo, $Zmi,$Zmi
+ vpslldq \$8, $Zmi,$T2
+ vpsrldq \$8, $Zmi,$Zmi
+ vpxor $T2, $Xi, $Xi
+ vpxor $Zmi,$Xo, $Xo
+
+ vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase
+ vpalignr \$8,$Xi,$Xi,$Xi
+ vpxor $T2,$Xi,$Xi
+
+ vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase
+ vpalignr \$8,$Xi,$Xi,$Xi
+ vpxor $Xo,$Xi,$Xi
+ vpxor $T2,$Xi,$Xi
+
+ cmp \$0,$len
+ jne .Lshort_avx
+
+ vpshufb $bswap,$Xi,$Xi
+ vmovdqu $Xi,($Xip)
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ movaps 0x40(%rsp),%xmm10
+ movaps 0x50(%rsp),%xmm11
+ movaps 0x60(%rsp),%xmm12
+ movaps 0x70(%rsp),%xmm13
+ movaps 0x80(%rsp),%xmm14
+ movaps 0x90(%rsp),%xmm15
+ lea 0xa8(%rsp),%rsp
+.LSEH_end_gcm_ghash_avx:
+___
+$code.=<<___;
+ ret
+.size gcm_ghash_avx,.-gcm_ghash_avx
+___
+} else {
+$code.=<<___;
+ jmp .L_ghash_clmul
+.size gcm_ghash_avx,.-gcm_ghash_avx
+___
+}
+
+$code.=<<___;
.align 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.L7_mask:
+ .long 7,0,7,0
+.L7_mask_poly:
+ .long 7,0,`0xE1<<1`,0
.align 64
.type .Lrem_4bit,\@object
.Lrem_4bit:
@@ -774,10 +1698,24 @@ se_handler:
.rva .LSEH_end_gcm_ghash_4bit
.rva .LSEH_info_gcm_ghash_4bit
+ .rva .LSEH_begin_gcm_init_clmul
+ .rva .LSEH_end_gcm_init_clmul
+ .rva .LSEH_info_gcm_init_clmul
+
.rva .LSEH_begin_gcm_ghash_clmul
.rva .LSEH_end_gcm_ghash_clmul
.rva .LSEH_info_gcm_ghash_clmul
+___
+$code.=<<___ if ($avx);
+ .rva .LSEH_begin_gcm_init_avx
+ .rva .LSEH_end_gcm_init_avx
+ .rva .LSEH_info_gcm_init_clmul
+ .rva .LSEH_begin_gcm_ghash_avx
+ .rva .LSEH_end_gcm_ghash_avx
+ .rva .LSEH_info_gcm_ghash_clmul
+___
+$code.=<<___;
.section .xdata
.align 8
.LSEH_info_gcm_gmult_4bit:
@@ -788,14 +1726,23 @@ se_handler:
.byte 9,0,0,0
.rva se_handler
.rva .Lghash_prologue,.Lghash_epilogue # HandlerData
+.LSEH_info_gcm_init_clmul:
+ .byte 0x01,0x08,0x03,0x00
+ .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
+ .byte 0x04,0x22,0x00,0x00 #sub rsp,0x18
.LSEH_info_gcm_ghash_clmul:
- .byte 0x01,0x1f,0x0b,0x00
- .byte 0x1f,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
- .byte 0x19,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
- .byte 0x13,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
- .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
- .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
- .byte 0x04,0xa2,0x00,0x00 #sub rsp,0x58
+ .byte 0x01,0x33,0x16,0x00
+ .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15
+ .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14
+ .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13
+ .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12
+ .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11
+ .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
+ .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
+ .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
+ .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
+ .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
+ .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
___
}
diff --git a/openssl/crypto/modes/asm/ghashp8-ppc.pl b/openssl/crypto/modes/asm/ghashp8-ppc.pl
new file mode 100755
index 000000000..e76a58c34
--- /dev/null
+++ b/openssl/crypto/modes/asm/ghashp8-ppc.pl
@@ -0,0 +1,234 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# GHASH for for PowerISA v2.07.
+#
+# July 2014
+#
+# Accurate performance measurements are problematic, because it's
+# always virtualized setup with possibly throttled processor.
+# Relative comparison is therefore more informative. This initial
+# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
+# faster than "4-bit" integer-only compiler-generated 64-bit code.
+# "Initial version" means that there is room for futher improvement.
+
+$flavour=shift;
+$output =shift;
+
+if ($flavour =~ /64/) {
+ $SIZE_T=8;
+ $LRSAVE=2*$SIZE_T;
+ $STU="stdu";
+ $POP="ld";
+ $PUSH="std";
+} elsif ($flavour =~ /32/) {
+ $SIZE_T=4;
+ $LRSAVE=$SIZE_T;
+ $STU="stwu";
+ $POP="lwz";
+ $PUSH="stw";
+} else { die "nonsense $flavour"; }
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
+
+my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block
+
+my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
+my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
+my $vrsave="r12";
+
+$code=<<___;
+.machine "any"
+
+.text
+
+.globl .gcm_init_p8
+.align 5
+.gcm_init_p8:
+ lis r0,0xfff0
+ li r8,0x10
+ mfspr $vrsave,256
+ li r9,0x20
+ mtspr 256,r0
+ li r10,0x30
+ lvx_u $H,0,r4 # load H
+
+ vspltisb $xC2,-16 # 0xf0
+ vspltisb $t0,1 # one
+ vaddubm $xC2,$xC2,$xC2 # 0xe0
+ vxor $zero,$zero,$zero
+ vor $xC2,$xC2,$t0 # 0xe1
+ vsldoi $xC2,$xC2,$zero,15 # 0xe1...
+ vsldoi $t1,$zero,$t0,1 # ...1
+ vaddubm $xC2,$xC2,$xC2 # 0xc2...
+ vspltisb $t2,7
+ vor $xC2,$xC2,$t1 # 0xc2....01
+ vspltb $t1,$H,0 # most significant byte
+ vsl $H,$H,$t0 # H<<=1
+ vsrab $t1,$t1,$t2 # broadcast carry bit
+ vand $t1,$t1,$xC2
+ vxor $H,$H,$t1 # twisted H
+
+ vsldoi $H,$H,$H,8 # twist even more ...
+ vsldoi $xC2,$zero,$xC2,8 # 0xc2.0
+ vsldoi $Hl,$zero,$H,8 # ... and split
+ vsldoi $Hh,$H,$zero,8
+
+ stvx_u $xC2,0,r3 # save pre-computed table
+ stvx_u $Hl,r8,r3
+ stvx_u $H, r9,r3
+ stvx_u $Hh,r10,r3
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
+.size .gcm_init_p8,.-.gcm_init_p8
+
+.globl .gcm_gmult_p8
+.align 5
+.gcm_gmult_p8:
+ lis r0,0xfff8
+ li r8,0x10
+ mfspr $vrsave,256
+ li r9,0x20
+ mtspr 256,r0
+ li r10,0x30
+ lvx_u $IN,0,$Xip # load Xi
+
+ lvx_u $Hl,r8,$Htbl # load pre-computed table
+ le?lvsl $lemask,r0,r0
+ lvx_u $H, r9,$Htbl
+ le?vspltisb $t0,0x07
+ lvx_u $Hh,r10,$Htbl
+ le?vxor $lemask,$lemask,$t0
+ lvx_u $xC2,0,$Htbl
+ le?vperm $IN,$IN,$IN,$lemask
+ vxor $zero,$zero,$zero
+
+ vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
+ vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
+ vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
+
+ vpmsumd $t2,$Xl,$xC2 # 1st phase
+
+ vsldoi $t0,$Xm,$zero,8
+ vsldoi $t1,$zero,$Xm,8
+ vxor $Xl,$Xl,$t0
+ vxor $Xh,$Xh,$t1
+
+ vsldoi $Xl,$Xl,$Xl,8
+ vxor $Xl,$Xl,$t2
+
+ vsldoi $t1,$Xl,$Xl,8 # 2nd phase
+ vpmsumd $Xl,$Xl,$xC2
+ vxor $t1,$t1,$Xh
+ vxor $Xl,$Xl,$t1
+
+ le?vperm $Xl,$Xl,$Xl,$lemask
+ stvx_u $Xl,0,$Xip # write out Xi
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
+.size .gcm_gmult_p8,.-.gcm_gmult_p8
+
+.globl .gcm_ghash_p8
+.align 5
+.gcm_ghash_p8:
+ lis r0,0xfff8
+ li r8,0x10
+ mfspr $vrsave,256
+ li r9,0x20
+ mtspr 256,r0
+ li r10,0x30
+ lvx_u $Xl,0,$Xip # load Xi
+
+ lvx_u $Hl,r8,$Htbl # load pre-computed table
+ le?lvsl $lemask,r0,r0
+ lvx_u $H, r9,$Htbl
+ le?vspltisb $t0,0x07
+ lvx_u $Hh,r10,$Htbl
+ le?vxor $lemask,$lemask,$t0
+ lvx_u $xC2,0,$Htbl
+ le?vperm $Xl,$Xl,$Xl,$lemask
+ vxor $zero,$zero,$zero
+
+ lvx_u $IN,0,$inp
+ addi $inp,$inp,16
+ subi $len,$len,16
+ le?vperm $IN,$IN,$IN,$lemask
+ vxor $IN,$IN,$Xl
+ b Loop
+
+.align 5
+Loop:
+ subic $len,$len,16
+ vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
+ subfe. r0,r0,r0 # borrow?-1:0
+ vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
+ and r0,r0,$len
+ vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
+ add $inp,$inp,r0
+
+ vpmsumd $t2,$Xl,$xC2 # 1st phase
+
+ vsldoi $t0,$Xm,$zero,8
+ vsldoi $t1,$zero,$Xm,8
+ vxor $Xl,$Xl,$t0
+ vxor $Xh,$Xh,$t1
+
+ vsldoi $Xl,$Xl,$Xl,8
+ vxor $Xl,$Xl,$t2
+ lvx_u $IN,0,$inp
+ addi $inp,$inp,16
+
+ vsldoi $t1,$Xl,$Xl,8 # 2nd phase
+ vpmsumd $Xl,$Xl,$xC2
+ le?vperm $IN,$IN,$IN,$lemask
+ vxor $t1,$t1,$Xh
+ vxor $IN,$IN,$t1
+ vxor $IN,$IN,$Xl
+ beq Loop # did $len-=16 borrow?
+
+ vxor $Xl,$Xl,$t1
+ le?vperm $Xl,$Xl,$Xl,$lemask
+ stvx_u $Xl,0,$Xip # write out Xi
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
+.size .gcm_ghash_p8,.-.gcm_ghash_p8
+
+.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+foreach (split("\n",$code)) {
+ if ($flavour =~ /le$/o) { # little-endian
+ s/le\?//o or
+ s/be\?/#be#/o;
+ } else {
+ s/le\?/#le#/o or
+ s/be\?//o;
+ }
+ print $_,"\n";
+}
+
+close STDOUT; # enforce flush
diff --git a/openssl/crypto/modes/asm/ghashv8-armx.pl b/openssl/crypto/modes/asm/ghashv8-armx.pl
new file mode 100755
index 000000000..54a1ac4db
--- /dev/null
+++ b/openssl/crypto/modes/asm/ghashv8-armx.pl
@@ -0,0 +1,241 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
+#
+# June 2014
+#
+# Initial version was developed in tight cooperation with Ard
+# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from
+# other assembly modules. Just like aesv8-armx.pl this module
+# supports both AArch32 and AArch64 execution modes.
+#
+# Current performance in cycles per processed byte:
+#
+# PMULL[2] 32-bit NEON(*)
+# Apple A7 1.76 5.62
+# Cortex-A53 1.45 8.39
+# Cortex-A57 2.22 7.61
+#
+# (*) presented for reference/comparison purposes;
+
+$flavour = shift;
+open STDOUT,">".shift;
+
+$Xi="x0"; # argument block
+$Htbl="x1";
+$inp="x2";
+$len="x3";
+
+$inc="x12";
+
+{
+my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
+my ($t0,$t1,$t2,$t3,$H,$Hhl)=map("q$_",(8..14));
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+___
+$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
+$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/);
+
+$code.=<<___;
+.global gcm_init_v8
+.type gcm_init_v8,%function
+.align 4
+gcm_init_v8:
+ vld1.64 {$t1},[x1] @ load H
+ vmov.i8 $t0,#0xe1
+ vext.8 $IN,$t1,$t1,#8
+ vshl.i64 $t0,$t0,#57
+ vshr.u64 $t2,$t0,#63
+ vext.8 $t0,$t2,$t0,#8 @ t0=0xc2....01
+ vdup.32 $t1,${t1}[1]
+ vshr.u64 $t3,$IN,#63
+ vshr.s32 $t1,$t1,#31 @ broadcast carry bit
+ vand $t3,$t3,$t0
+ vshl.i64 $IN,$IN,#1
+ vext.8 $t3,$t3,$t3,#8
+ vand $t0,$t0,$t1
+ vorr $IN,$IN,$t3 @ H<<<=1
+ veor $IN,$IN,$t0 @ twisted H
+ vst1.64 {$IN},[x0]
+
+ ret
+.size gcm_init_v8,.-gcm_init_v8
+
+.global gcm_gmult_v8
+.type gcm_gmult_v8,%function
+.align 4
+gcm_gmult_v8:
+ vld1.64 {$t1},[$Xi] @ load Xi
+ vmov.i8 $t3,#0xe1
+ vld1.64 {$H},[$Htbl] @ load twisted H
+ vshl.u64 $t3,$t3,#57
+#ifndef __ARMEB__
+ vrev64.8 $t1,$t1
+#endif
+ vext.8 $Hhl,$H,$H,#8
+ mov $len,#0
+ vext.8 $IN,$t1,$t1,#8
+ mov $inc,#0
+ veor $Hhl,$Hhl,$H @ Karatsuba pre-processing
+ mov $inp,$Xi
+ b .Lgmult_v8
+.size gcm_gmult_v8,.-gcm_gmult_v8
+
+.global gcm_ghash_v8
+.type gcm_ghash_v8,%function
+.align 4
+gcm_ghash_v8:
+ vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
+ subs $len,$len,#16
+ vmov.i8 $t3,#0xe1
+ mov $inc,#16
+ vld1.64 {$H},[$Htbl] @ load twisted H
+ cclr $inc,eq
+ vext.8 $Xl,$Xl,$Xl,#8
+ vshl.u64 $t3,$t3,#57
+ vld1.64 {$t1},[$inp],$inc @ load [rotated] inp
+ vext.8 $Hhl,$H,$H,#8
+#ifndef __ARMEB__
+ vrev64.8 $Xl,$Xl
+ vrev64.8 $t1,$t1
+#endif
+ veor $Hhl,$Hhl,$H @ Karatsuba pre-processing
+ vext.8 $IN,$t1,$t1,#8
+ b .Loop_v8
+
+.align 4
+.Loop_v8:
+ vext.8 $t2,$Xl,$Xl,#8
+ veor $IN,$IN,$Xl @ inp^=Xi
+ veor $t1,$t1,$t2 @ $t1 is rotated inp^Xi
+
+.Lgmult_v8:
+ vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
+ veor $t1,$t1,$IN @ Karatsuba pre-processing
+ vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
+ subs $len,$len,#16
+ vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+ cclr $inc,eq
+
+ vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
+ veor $t2,$Xl,$Xh
+ veor $Xm,$Xm,$t1
+ vld1.64 {$t1},[$inp],$inc @ load [rotated] inp
+ veor $Xm,$Xm,$t2
+ vpmull.p64 $t2,$Xl,$t3 @ 1st phase
+
+ vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
+ vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
+#ifndef __ARMEB__
+ vrev64.8 $t1,$t1
+#endif
+ veor $Xl,$Xm,$t2
+ vext.8 $IN,$t1,$t1,#8
+
+ vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
+ vpmull.p64 $Xl,$Xl,$t3
+ veor $t2,$t2,$Xh
+ veor $Xl,$Xl,$t2
+ b.hs .Loop_v8
+
+#ifndef __ARMEB__
+ vrev64.8 $Xl,$Xl
+#endif
+ vext.8 $Xl,$Xl,$Xl,#8
+ vst1.64 {$Xl},[$Xi] @ write out Xi
+
+ ret
+.size gcm_ghash_v8,.-gcm_ghash_v8
+___
+}
+$code.=<<___;
+.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+if ($flavour =~ /64/) { ######## 64-bit code
+ sub unvmov {
+ my $arg=shift;
+
+ $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
+ sprintf "ins v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
+ }
+ foreach(split("\n",$code)) {
+ s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
+ s/vmov\.i8/movi/o or # fix up legacy mnemonics
+ s/vmov\s+(.*)/unvmov($1)/geo or
+ s/vext\.8/ext/o or
+ s/vshr\.s/sshr\.s/o or
+ s/vshr/ushr/o or
+ s/^(\s+)v/$1/o or # strip off v prefix
+ s/\bbx\s+lr\b/ret/o;
+
+ s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
+ s/@\s/\/\//o; # old->new style commentary
+
+ # fix up remainig legacy suffixes
+ s/\.[ui]?8(\s)/$1/o;
+ s/\.[uis]?32//o and s/\.16b/\.4s/go;
+ m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument
+ m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments
+ s/\.[uisp]?64//o and s/\.16b/\.2d/go;
+ s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
+
+ print $_,"\n";
+ }
+} else { ######## 32-bit code
+ sub unvdup32 {
+ my $arg=shift;
+
+ $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
+ sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
+ }
+ sub unvpmullp64 {
+ my ($mnemonic,$arg)=@_;
+
+ if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
+ my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
+ |(($2&7)<<17)|(($2&8)<<4)
+ |(($3&7)<<1) |(($3&8)<<2);
+ $word |= 0x00010001 if ($mnemonic =~ "2");
+ # since ARMv7 instructions are always encoded little-endian.
+ # correct solution is to use .inst directive, but older
+ # assemblers don't implement it:-(
+ sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+ $word&0xff,($word>>8)&0xff,
+ ($word>>16)&0xff,($word>>24)&0xff,
+ $mnemonic,$arg;
+ }
+ }
+
+ foreach(split("\n",$code)) {
+ s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
+ s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
+ s/\/\/\s?/@ /o; # new->old style commentary
+
+ # fix up remainig new-style suffixes
+ s/\],#[0-9]+/]!/o;
+
+ s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
+ s/vdup\.32\s+(.*)/unvdup32($1)/geo or
+ s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
+ s/^(\s+)b\./$1b/o or
+ s/^(\s+)ret/$1bx\tlr/o;
+
+ print $_,"\n";
+ }
+}
+
+close STDOUT; # enforce flush
diff --git a/openssl/crypto/modes/cbc128.c b/openssl/crypto/modes/cbc128.c
index 0e54f7547..c13caea53 100644
--- a/openssl/crypto/modes/cbc128.c
+++ b/openssl/crypto/modes/cbc128.c
@@ -6,7 +6,7 @@
* are met:
*
* 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
+ * notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
@@ -59,147 +59,149 @@
#endif
#include <assert.h>
-#ifndef STRICT_ALIGNMENT
-# define STRICT_ALIGNMENT 0
+#if !defined(STRICT_ALIGNMENT) && !defined(PEDANTIC)
+# define STRICT_ALIGNMENT 0
#endif
void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], block128_f block)
+ size_t len, const void *key,
+ unsigned char ivec[16], block128_f block)
{
- size_t n;
- const unsigned char *iv = ivec;
+ size_t n;
+ const unsigned char *iv = ivec;
- assert(in && out && key && ivec);
+ assert(in && out && key && ivec);
#if !defined(OPENSSL_SMALL_FOOTPRINT)
- if (STRICT_ALIGNMENT &&
- ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0) {
- while (len>=16) {
- for(n=0; n<16; ++n)
- out[n] = in[n] ^ iv[n];
- (*block)(out, out, key);
- iv = out;
- len -= 16;
- in += 16;
- out += 16;
- }
- } else {
- while (len>=16) {
- for(n=0; n<16; n+=sizeof(size_t))
- *(size_t*)(out+n) =
- *(size_t*)(in+n) ^ *(size_t*)(iv+n);
- (*block)(out, out, key);
- iv = out;
- len -= 16;
- in += 16;
- out += 16;
- }
- }
+ if (STRICT_ALIGNMENT &&
+ ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
+ while (len >= 16) {
+ for (n = 0; n < 16; ++n)
+ out[n] = in[n] ^ iv[n];
+ (*block) (out, out, key);
+ iv = out;
+ len -= 16;
+ in += 16;
+ out += 16;
+ }
+ } else {
+ while (len >= 16) {
+ for (n = 0; n < 16; n += sizeof(size_t))
+ *(size_t *)(out + n) =
+ *(size_t *)(in + n) ^ *(size_t *)(iv + n);
+ (*block) (out, out, key);
+ iv = out;
+ len -= 16;
+ in += 16;
+ out += 16;
+ }
+ }
#endif
- while (len) {
- for(n=0; n<16 && n<len; ++n)
- out[n] = in[n] ^ iv[n];
- for(; n<16; ++n)
- out[n] = iv[n];
- (*block)(out, out, key);
- iv = out;
- if (len<=16) break;
- len -= 16;
- in += 16;
- out += 16;
- }
- memcpy(ivec,iv,16);
+ while (len) {
+ for (n = 0; n < 16 && n < len; ++n)
+ out[n] = in[n] ^ iv[n];
+ for (; n < 16; ++n)
+ out[n] = iv[n];
+ (*block) (out, out, key);
+ iv = out;
+ if (len <= 16)
+ break;
+ len -= 16;
+ in += 16;
+ out += 16;
+ }
+ memcpy(ivec, iv, 16);
}
void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], block128_f block)
+ size_t len, const void *key,
+ unsigned char ivec[16], block128_f block)
{
- size_t n;
- union { size_t t[16/sizeof(size_t)]; unsigned char c[16]; } tmp;
+ size_t n;
+ union {
+ size_t t[16 / sizeof(size_t)];
+ unsigned char c[16];
+ } tmp;
- assert(in && out && key && ivec);
+ assert(in && out && key && ivec);
#if !defined(OPENSSL_SMALL_FOOTPRINT)
- if (in != out) {
- const unsigned char *iv = ivec;
+ if (in != out) {
+ const unsigned char *iv = ivec;
- if (STRICT_ALIGNMENT &&
- ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0) {
- while (len>=16) {
- (*block)(in, out, key);
- for(n=0; n<16; ++n)
- out[n] ^= iv[n];
- iv = in;
- len -= 16;
- in += 16;
- out += 16;
- }
- }
- else if (16%sizeof(size_t) == 0) { /* always true */
- while (len>=16) {
- size_t *out_t=(size_t *)out, *iv_t=(size_t *)iv;
+ if (STRICT_ALIGNMENT &&
+ ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
+ while (len >= 16) {
+ (*block) (in, out, key);
+ for (n = 0; n < 16; ++n)
+ out[n] ^= iv[n];
+ iv = in;
+ len -= 16;
+ in += 16;
+ out += 16;
+ }
+ } else if (16 % sizeof(size_t) == 0) { /* always true */
+ while (len >= 16) {
+ size_t *out_t = (size_t *)out, *iv_t = (size_t *)iv;
- (*block)(in, out, key);
- for(n=0; n<16/sizeof(size_t); n++)
- out_t[n] ^= iv_t[n];
- iv = in;
- len -= 16;
- in += 16;
- out += 16;
- }
- }
- memcpy(ivec,iv,16);
- } else {
- if (STRICT_ALIGNMENT &&
- ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0) {
- unsigned char c;
- while (len>=16) {
- (*block)(in, tmp.c, key);
- for(n=0; n<16; ++n) {
- c = in[n];
- out[n] = tmp.c[n] ^ ivec[n];
- ivec[n] = c;
- }
- len -= 16;
- in += 16;
- out += 16;
- }
- }
- else if (16%sizeof(size_t) == 0) { /* always true */
- while (len>=16) {
- size_t c, *out_t=(size_t *)out, *ivec_t=(size_t *)ivec;
- const size_t *in_t=(const size_t *)in;
+ (*block) (in, out, key);
+ for (n = 0; n < 16 / sizeof(size_t); n++)
+ out_t[n] ^= iv_t[n];
+ iv = in;
+ len -= 16;
+ in += 16;
+ out += 16;
+ }
+ }
+ memcpy(ivec, iv, 16);
+ } else {
+ if (STRICT_ALIGNMENT &&
+ ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
+ unsigned char c;
+ while (len >= 16) {
+ (*block) (in, tmp.c, key);
+ for (n = 0; n < 16; ++n) {
+ c = in[n];
+ out[n] = tmp.c[n] ^ ivec[n];
+ ivec[n] = c;
+ }
+ len -= 16;
+ in += 16;
+ out += 16;
+ }
+ } else if (16 % sizeof(size_t) == 0) { /* always true */
+ while (len >= 16) {
+ size_t c, *out_t = (size_t *)out, *ivec_t = (size_t *)ivec;
+ const size_t *in_t = (const size_t *)in;
- (*block)(in, tmp.c, key);
- for(n=0; n<16/sizeof(size_t); n++) {
- c = in_t[n];
- out_t[n] = tmp.t[n] ^ ivec_t[n];
- ivec_t[n] = c;
- }
- len -= 16;
- in += 16;
- out += 16;
- }
- }
- }
+ (*block) (in, tmp.c, key);
+ for (n = 0; n < 16 / sizeof(size_t); n++) {
+ c = in_t[n];
+ out_t[n] = tmp.t[n] ^ ivec_t[n];
+ ivec_t[n] = c;
+ }
+ len -= 16;
+ in += 16;
+ out += 16;
+ }
+ }
+ }
#endif
- while (len) {
- unsigned char c;
- (*block)(in, tmp.c, key);
- for(n=0; n<16 && n<len; ++n) {
- c = in[n];
- out[n] = tmp.c[n] ^ ivec[n];
- ivec[n] = c;
- }
- if (len<=16) {
- for (; n<16; ++n)
- ivec[n] = in[n];
- break;
- }
- len -= 16;
- in += 16;
- out += 16;
- }
+ while (len) {
+ unsigned char c;
+ (*block) (in, tmp.c, key);
+ for (n = 0; n < 16 && n < len; ++n) {
+ c = in[n];
+ out[n] = tmp.c[n] ^ ivec[n];
+ ivec[n] = c;
+ }
+ if (len <= 16) {
+ for (; n < 16; ++n)
+ ivec[n] = in[n];
+ break;
+ }
+ len -= 16;
+ in += 16;
+ out += 16;
+ }
}
diff --git a/openssl/crypto/modes/ccm128.c b/openssl/crypto/modes/ccm128.c
index 3ce11d0d9..c1ded0f91 100644
--- a/openssl/crypto/modes/ccm128.c
+++ b/openssl/crypto/modes/ccm128.c
@@ -6,7 +6,7 @@
* are met:
*
* 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
+ * notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
@@ -58,384 +58,422 @@
#endif
#include <assert.h>
-/* First you setup M and L parameters and pass the key schedule.
- * This is called once per session setup... */
+/*
+ * First you setup M and L parameters and pass the key schedule. This is
+ * called once per session setup...
+ */
void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
- unsigned int M,unsigned int L,void *key,block128_f block)
+ unsigned int M, unsigned int L, void *key,
+ block128_f block)
{
- memset(ctx->nonce.c,0,sizeof(ctx->nonce.c));
- ctx->nonce.c[0] = ((u8)(L-1)&7) | (u8)(((M-2)/2)&7)<<3;
- ctx->blocks = 0;
- ctx->block = block;
- ctx->key = key;
+ memset(ctx->nonce.c, 0, sizeof(ctx->nonce.c));
+ ctx->nonce.c[0] = ((u8)(L - 1) & 7) | (u8)(((M - 2) / 2) & 7) << 3;
+ ctx->blocks = 0;
+ ctx->block = block;
+ ctx->key = key;
}
/* !!! Following interfaces are to be called *once* per packet !!! */
/* Then you setup per-message nonce and pass the length of the message */
int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
- const unsigned char *nonce,size_t nlen,size_t mlen)
+ const unsigned char *nonce, size_t nlen, size_t mlen)
{
- unsigned int L = ctx->nonce.c[0]&7; /* the L parameter */
+ unsigned int L = ctx->nonce.c[0] & 7; /* the L parameter */
- if (nlen<(14-L)) return -1; /* nonce is too short */
+ if (nlen < (14 - L))
+ return -1; /* nonce is too short */
- if (sizeof(mlen)==8 && L>=3) {
- ctx->nonce.c[8] = (u8)(mlen>>(56%(sizeof(mlen)*8)));
- ctx->nonce.c[9] = (u8)(mlen>>(48%(sizeof(mlen)*8)));
- ctx->nonce.c[10] = (u8)(mlen>>(40%(sizeof(mlen)*8)));
- ctx->nonce.c[11] = (u8)(mlen>>(32%(sizeof(mlen)*8)));
- }
- else
- ctx->nonce.u[1] = 0;
+ if (sizeof(mlen) == 8 && L >= 3) {
+ ctx->nonce.c[8] = (u8)(mlen >> (56 % (sizeof(mlen) * 8)));
+ ctx->nonce.c[9] = (u8)(mlen >> (48 % (sizeof(mlen) * 8)));
+ ctx->nonce.c[10] = (u8)(mlen >> (40 % (sizeof(mlen) * 8)));
+ ctx->nonce.c[11] = (u8)(mlen >> (32 % (sizeof(mlen) * 8)));
+ } else
+ ctx->nonce.u[1] = 0;
- ctx->nonce.c[12] = (u8)(mlen>>24);
- ctx->nonce.c[13] = (u8)(mlen>>16);
- ctx->nonce.c[14] = (u8)(mlen>>8);
- ctx->nonce.c[15] = (u8)mlen;
+ ctx->nonce.c[12] = (u8)(mlen >> 24);
+ ctx->nonce.c[13] = (u8)(mlen >> 16);
+ ctx->nonce.c[14] = (u8)(mlen >> 8);
+ ctx->nonce.c[15] = (u8)mlen;
- ctx->nonce.c[0] &= ~0x40; /* clear Adata flag */
- memcpy(&ctx->nonce.c[1],nonce,14-L);
+ ctx->nonce.c[0] &= ~0x40; /* clear Adata flag */
+ memcpy(&ctx->nonce.c[1], nonce, 14 - L);
- return 0;
+ return 0;
}
/* Then you pass additional authentication data, this is optional */
void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
- const unsigned char *aad,size_t alen)
-{ unsigned int i;
- block128_f block = ctx->block;
-
- if (alen==0) return;
-
- ctx->nonce.c[0] |= 0x40; /* set Adata flag */
- (*block)(ctx->nonce.c,ctx->cmac.c,ctx->key),
- ctx->blocks++;
-
- if (alen<(0x10000-0x100)) {
- ctx->cmac.c[0] ^= (u8)(alen>>8);
- ctx->cmac.c[1] ^= (u8)alen;
- i=2;
- }
- else if (sizeof(alen)==8 && alen>=(size_t)1<<(32%(sizeof(alen)*8))) {
- ctx->cmac.c[0] ^= 0xFF;
- ctx->cmac.c[1] ^= 0xFF;
- ctx->cmac.c[2] ^= (u8)(alen>>(56%(sizeof(alen)*8)));
- ctx->cmac.c[3] ^= (u8)(alen>>(48%(sizeof(alen)*8)));
- ctx->cmac.c[4] ^= (u8)(alen>>(40%(sizeof(alen)*8)));
- ctx->cmac.c[5] ^= (u8)(alen>>(32%(sizeof(alen)*8)));
- ctx->cmac.c[6] ^= (u8)(alen>>24);
- ctx->cmac.c[7] ^= (u8)(alen>>16);
- ctx->cmac.c[8] ^= (u8)(alen>>8);
- ctx->cmac.c[9] ^= (u8)alen;
- i=10;
- }
- else {
- ctx->cmac.c[0] ^= 0xFF;
- ctx->cmac.c[1] ^= 0xFE;
- ctx->cmac.c[2] ^= (u8)(alen>>24);
- ctx->cmac.c[3] ^= (u8)(alen>>16);
- ctx->cmac.c[4] ^= (u8)(alen>>8);
- ctx->cmac.c[5] ^= (u8)alen;
- i=6;
- }
-
- do {
- for(;i<16 && alen;++i,++aad,--alen)
- ctx->cmac.c[i] ^= *aad;
- (*block)(ctx->cmac.c,ctx->cmac.c,ctx->key),
- ctx->blocks++;
- i=0;
- } while (alen);
+ const unsigned char *aad, size_t alen)
+{
+ unsigned int i;
+ block128_f block = ctx->block;
+
+ if (alen == 0)
+ return;
+
+ ctx->nonce.c[0] |= 0x40; /* set Adata flag */
+ (*block) (ctx->nonce.c, ctx->cmac.c, ctx->key), ctx->blocks++;
+
+ if (alen < (0x10000 - 0x100)) {
+ ctx->cmac.c[0] ^= (u8)(alen >> 8);
+ ctx->cmac.c[1] ^= (u8)alen;
+ i = 2;
+ } else if (sizeof(alen) == 8
+ && alen >= (size_t)1 << (32 % (sizeof(alen) * 8))) {
+ ctx->cmac.c[0] ^= 0xFF;
+ ctx->cmac.c[1] ^= 0xFF;
+ ctx->cmac.c[2] ^= (u8)(alen >> (56 % (sizeof(alen) * 8)));
+ ctx->cmac.c[3] ^= (u8)(alen >> (48 % (sizeof(alen) * 8)));
+ ctx->cmac.c[4] ^= (u8)(alen >> (40 % (sizeof(alen) * 8)));
+ ctx->cmac.c[5] ^= (u8)(alen >> (32 % (sizeof(alen) * 8)));
+ ctx->cmac.c[6] ^= (u8)(alen >> 24);
+ ctx->cmac.c[7] ^= (u8)(alen >> 16);
+ ctx->cmac.c[8] ^= (u8)(alen >> 8);
+ ctx->cmac.c[9] ^= (u8)alen;
+ i = 10;
+ } else {
+ ctx->cmac.c[0] ^= 0xFF;
+ ctx->cmac.c[1] ^= 0xFE;
+ ctx->cmac.c[2] ^= (u8)(alen >> 24);
+ ctx->cmac.c[3] ^= (u8)(alen >> 16);
+ ctx->cmac.c[4] ^= (u8)(alen >> 8);
+ ctx->cmac.c[5] ^= (u8)alen;
+ i = 6;
+ }
+
+ do {
+ for (; i < 16 && alen; ++i, ++aad, --alen)
+ ctx->cmac.c[i] ^= *aad;
+ (*block) (ctx->cmac.c, ctx->cmac.c, ctx->key), ctx->blocks++;
+ i = 0;
+ } while (alen);
}
/* Finally you encrypt or decrypt the message */
-/* counter part of nonce may not be larger than L*8 bits,
- * L is not larger than 8, therefore 64-bit counter... */
-static void ctr64_inc(unsigned char *counter) {
- unsigned int n=8;
- u8 c;
-
- counter += 8;
- do {
- --n;
- c = counter[n];
- ++c;
- counter[n] = c;
- if (c) return;
- } while (n);
+/*
+ * counter part of nonce may not be larger than L*8 bits, L is not larger
+ * than 8, therefore 64-bit counter...
+ */
+static void ctr64_inc(unsigned char *counter)
+{
+ unsigned int n = 8;
+ u8 c;
+
+ counter += 8;
+ do {
+ --n;
+ c = counter[n];
+ ++c;
+ counter[n] = c;
+ if (c)
+ return;
+ } while (n);
}
int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
- const unsigned char *inp, unsigned char *out,
- size_t len)
+ const unsigned char *inp, unsigned char *out,
+ size_t len)
{
- size_t n;
- unsigned int i,L;
- unsigned char flags0 = ctx->nonce.c[0];
- block128_f block = ctx->block;
- void * key = ctx->key;
- union { u64 u[2]; u8 c[16]; } scratch;
-
- if (!(flags0&0x40))
- (*block)(ctx->nonce.c,ctx->cmac.c,key),
- ctx->blocks++;
-
- ctx->nonce.c[0] = L = flags0&7;
- for (n=0,i=15-L;i<15;++i) {
- n |= ctx->nonce.c[i];
- ctx->nonce.c[i]=0;
- n <<= 8;
- }
- n |= ctx->nonce.c[15]; /* reconstructed length */
- ctx->nonce.c[15]=1;
-
- if (n!=len) return -1; /* length mismatch */
-
- ctx->blocks += ((len+15)>>3)|1;
- if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */
-
- while (len>=16) {
+ size_t n;
+ unsigned int i, L;
+ unsigned char flags0 = ctx->nonce.c[0];
+ block128_f block = ctx->block;
+ void *key = ctx->key;
+ union {
+ u64 u[2];
+ u8 c[16];
+ } scratch;
+
+ if (!(flags0 & 0x40))
+ (*block) (ctx->nonce.c, ctx->cmac.c, key), ctx->blocks++;
+
+ ctx->nonce.c[0] = L = flags0 & 7;
+ for (n = 0, i = 15 - L; i < 15; ++i) {
+ n |= ctx->nonce.c[i];
+ ctx->nonce.c[i] = 0;
+ n <<= 8;
+ }
+ n |= ctx->nonce.c[15]; /* reconstructed length */
+ ctx->nonce.c[15] = 1;
+
+ if (n != len)
+ return -1; /* length mismatch */
+
+ ctx->blocks += ((len + 15) >> 3) | 1;
+ if (ctx->blocks > (U64(1) << 61))
+ return -2; /* too much data */
+
+ while (len >= 16) {
#if defined(STRICT_ALIGNMENT)
- union { u64 u[2]; u8 c[16]; } temp;
-
- memcpy (temp.c,inp,16);
- ctx->cmac.u[0] ^= temp.u[0];
- ctx->cmac.u[1] ^= temp.u[1];
+ union {
+ u64 u[2];
+ u8 c[16];
+ } temp;
+
+ memcpy(temp.c, inp, 16);
+ ctx->cmac.u[0] ^= temp.u[0];
+ ctx->cmac.u[1] ^= temp.u[1];
#else
- ctx->cmac.u[0] ^= ((u64*)inp)[0];
- ctx->cmac.u[1] ^= ((u64*)inp)[1];
+ ctx->cmac.u[0] ^= ((u64 *)inp)[0];
+ ctx->cmac.u[1] ^= ((u64 *)inp)[1];
#endif
- (*block)(ctx->cmac.c,ctx->cmac.c,key);
- (*block)(ctx->nonce.c,scratch.c,key);
- ctr64_inc(ctx->nonce.c);
+ (*block) (ctx->cmac.c, ctx->cmac.c, key);
+ (*block) (ctx->nonce.c, scratch.c, key);
+ ctr64_inc(ctx->nonce.c);
#if defined(STRICT_ALIGNMENT)
- temp.u[0] ^= scratch.u[0];
- temp.u[1] ^= scratch.u[1];
- memcpy(out,temp.c,16);
+ temp.u[0] ^= scratch.u[0];
+ temp.u[1] ^= scratch.u[1];
+ memcpy(out, temp.c, 16);
#else
- ((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0];
- ((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1];
+ ((u64 *)out)[0] = scratch.u[0] ^ ((u64 *)inp)[0];
+ ((u64 *)out)[1] = scratch.u[1] ^ ((u64 *)inp)[1];
#endif
- inp += 16;
- out += 16;
- len -= 16;
- }
-
- if (len) {
- for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
- (*block)(ctx->cmac.c,ctx->cmac.c,key);
- (*block)(ctx->nonce.c,scratch.c,key);
- for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i];
- }
-
- for (i=15-L;i<16;++i)
- ctx->nonce.c[i]=0;
-
- (*block)(ctx->nonce.c,scratch.c,key);
- ctx->cmac.u[0] ^= scratch.u[0];
- ctx->cmac.u[1] ^= scratch.u[1];
-
- ctx->nonce.c[0] = flags0;
-
- return 0;
+ inp += 16;
+ out += 16;
+ len -= 16;
+ }
+
+ if (len) {
+ for (i = 0; i < len; ++i)
+ ctx->cmac.c[i] ^= inp[i];
+ (*block) (ctx->cmac.c, ctx->cmac.c, key);
+ (*block) (ctx->nonce.c, scratch.c, key);
+ for (i = 0; i < len; ++i)
+ out[i] = scratch.c[i] ^ inp[i];
+ }
+
+ for (i = 15 - L; i < 16; ++i)
+ ctx->nonce.c[i] = 0;
+
+ (*block) (ctx->nonce.c, scratch.c, key);
+ ctx->cmac.u[0] ^= scratch.u[0];
+ ctx->cmac.u[1] ^= scratch.u[1];
+
+ ctx->nonce.c[0] = flags0;
+
+ return 0;
}
int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
- const unsigned char *inp, unsigned char *out,
- size_t len)
+ const unsigned char *inp, unsigned char *out,
+ size_t len)
{
- size_t n;
- unsigned int i,L;
- unsigned char flags0 = ctx->nonce.c[0];
- block128_f block = ctx->block;
- void * key = ctx->key;
- union { u64 u[2]; u8 c[16]; } scratch;
-
- if (!(flags0&0x40))
- (*block)(ctx->nonce.c,ctx->cmac.c,key);
-
- ctx->nonce.c[0] = L = flags0&7;
- for (n=0,i=15-L;i<15;++i) {
- n |= ctx->nonce.c[i];
- ctx->nonce.c[i]=0;
- n <<= 8;
- }
- n |= ctx->nonce.c[15]; /* reconstructed length */
- ctx->nonce.c[15]=1;
-
- if (n!=len) return -1;
-
- while (len>=16) {
+ size_t n;
+ unsigned int i, L;
+ unsigned char flags0 = ctx->nonce.c[0];
+ block128_f block = ctx->block;
+ void *key = ctx->key;
+ union {
+ u64 u[2];
+ u8 c[16];
+ } scratch;
+
+ if (!(flags0 & 0x40))
+ (*block) (ctx->nonce.c, ctx->cmac.c, key);
+
+ ctx->nonce.c[0] = L = flags0 & 7;
+ for (n = 0, i = 15 - L; i < 15; ++i) {
+ n |= ctx->nonce.c[i];
+ ctx->nonce.c[i] = 0;
+ n <<= 8;
+ }
+ n |= ctx->nonce.c[15]; /* reconstructed length */
+ ctx->nonce.c[15] = 1;
+
+ if (n != len)
+ return -1;
+
+ while (len >= 16) {
#if defined(STRICT_ALIGNMENT)
- union { u64 u[2]; u8 c[16]; } temp;
+ union {
+ u64 u[2];
+ u8 c[16];
+ } temp;
#endif
- (*block)(ctx->nonce.c,scratch.c,key);
- ctr64_inc(ctx->nonce.c);
+ (*block) (ctx->nonce.c, scratch.c, key);
+ ctr64_inc(ctx->nonce.c);
#if defined(STRICT_ALIGNMENT)
- memcpy (temp.c,inp,16);
- ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]);
- ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]);
- memcpy (out,scratch.c,16);
+ memcpy(temp.c, inp, 16);
+ ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]);
+ ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]);
+ memcpy(out, scratch.c, 16);
#else
- ctx->cmac.u[0] ^= (((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0]);
- ctx->cmac.u[1] ^= (((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1]);
+ ctx->cmac.u[0] ^= (((u64 *)out)[0] = scratch.u[0] ^ ((u64 *)inp)[0]);
+ ctx->cmac.u[1] ^= (((u64 *)out)[1] = scratch.u[1] ^ ((u64 *)inp)[1]);
#endif
- (*block)(ctx->cmac.c,ctx->cmac.c,key);
+ (*block) (ctx->cmac.c, ctx->cmac.c, key);
- inp += 16;
- out += 16;
- len -= 16;
- }
+ inp += 16;
+ out += 16;
+ len -= 16;
+ }
- if (len) {
- (*block)(ctx->nonce.c,scratch.c,key);
- for (i=0; i<len; ++i)
- ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);
- (*block)(ctx->cmac.c,ctx->cmac.c,key);
- }
+ if (len) {
+ (*block) (ctx->nonce.c, scratch.c, key);
+ for (i = 0; i < len; ++i)
+ ctx->cmac.c[i] ^= (out[i] = scratch.c[i] ^ inp[i]);
+ (*block) (ctx->cmac.c, ctx->cmac.c, key);
+ }
- for (i=15-L;i<16;++i)
- ctx->nonce.c[i]=0;
+ for (i = 15 - L; i < 16; ++i)
+ ctx->nonce.c[i] = 0;
- (*block)(ctx->nonce.c,scratch.c,key);
- ctx->cmac.u[0] ^= scratch.u[0];
- ctx->cmac.u[1] ^= scratch.u[1];
+ (*block) (ctx->nonce.c, scratch.c, key);
+ ctx->cmac.u[0] ^= scratch.u[0];
+ ctx->cmac.u[1] ^= scratch.u[1];
- ctx->nonce.c[0] = flags0;
+ ctx->nonce.c[0] = flags0;
- return 0;
+ return 0;
}
-static void ctr64_add (unsigned char *counter,size_t inc)
-{ size_t n=8, val=0;
-
- counter += 8;
- do {
- --n;
- val += counter[n] + (inc&0xff);
- counter[n] = (unsigned char)val;
- val >>= 8; /* carry bit */
- inc >>= 8;
- } while(n && (inc || val));
+static void ctr64_add(unsigned char *counter, size_t inc)
+{
+ size_t n = 8, val = 0;
+
+ counter += 8;
+ do {
+ --n;
+ val += counter[n] + (inc & 0xff);
+ counter[n] = (unsigned char)val;
+ val >>= 8; /* carry bit */
+ inc >>= 8;
+ } while (n && (inc || val));
}
int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
- const unsigned char *inp, unsigned char *out,
- size_t len,ccm128_f stream)
+ const unsigned char *inp, unsigned char *out,
+ size_t len, ccm128_f stream)
{
- size_t n;
- unsigned int i,L;
- unsigned char flags0 = ctx->nonce.c[0];
- block128_f block = ctx->block;
- void * key = ctx->key;
- union { u64 u[2]; u8 c[16]; } scratch;
-
- if (!(flags0&0x40))
- (*block)(ctx->nonce.c,ctx->cmac.c,key),
- ctx->blocks++;
-
- ctx->nonce.c[0] = L = flags0&7;
- for (n=0,i=15-L;i<15;++i) {
- n |= ctx->nonce.c[i];
- ctx->nonce.c[i]=0;
- n <<= 8;
- }
- n |= ctx->nonce.c[15]; /* reconstructed length */
- ctx->nonce.c[15]=1;
-
- if (n!=len) return -1; /* length mismatch */
-
- ctx->blocks += ((len+15)>>3)|1;
- if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */
-
- if ((n=len/16)) {
- (*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c);
- n *= 16;
- inp += n;
- out += n;
- len -= n;
- if (len) ctr64_add(ctx->nonce.c,n/16);
- }
-
- if (len) {
- for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
- (*block)(ctx->cmac.c,ctx->cmac.c,key);
- (*block)(ctx->nonce.c,scratch.c,key);
- for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i];
- }
-
- for (i=15-L;i<16;++i)
- ctx->nonce.c[i]=0;
-
- (*block)(ctx->nonce.c,scratch.c,key);
- ctx->cmac.u[0] ^= scratch.u[0];
- ctx->cmac.u[1] ^= scratch.u[1];
-
- ctx->nonce.c[0] = flags0;
-
- return 0;
+ size_t n;
+ unsigned int i, L;
+ unsigned char flags0 = ctx->nonce.c[0];
+ block128_f block = ctx->block;
+ void *key = ctx->key;
+ union {
+ u64 u[2];
+ u8 c[16];
+ } scratch;
+
+ if (!(flags0 & 0x40))
+ (*block) (ctx->nonce.c, ctx->cmac.c, key), ctx->blocks++;
+
+ ctx->nonce.c[0] = L = flags0 & 7;
+ for (n = 0, i = 15 - L; i < 15; ++i) {
+ n |= ctx->nonce.c[i];
+ ctx->nonce.c[i] = 0;
+ n <<= 8;
+ }
+ n |= ctx->nonce.c[15]; /* reconstructed length */
+ ctx->nonce.c[15] = 1;
+
+ if (n != len)
+ return -1; /* length mismatch */
+
+ ctx->blocks += ((len + 15) >> 3) | 1;
+ if (ctx->blocks > (U64(1) << 61))
+ return -2; /* too much data */
+
+ if ((n = len / 16)) {
+ (*stream) (inp, out, n, key, ctx->nonce.c, ctx->cmac.c);
+ n *= 16;
+ inp += n;
+ out += n;
+ len -= n;
+ if (len)
+ ctr64_add(ctx->nonce.c, n / 16);
+ }
+
+ if (len) {
+ for (i = 0; i < len; ++i)
+ ctx->cmac.c[i] ^= inp[i];
+ (*block) (ctx->cmac.c, ctx->cmac.c, key);
+ (*block) (ctx->nonce.c, scratch.c, key);
+ for (i = 0; i < len; ++i)
+ out[i] = scratch.c[i] ^ inp[i];
+ }
+
+ for (i = 15 - L; i < 16; ++i)
+ ctx->nonce.c[i] = 0;
+
+ (*block) (ctx->nonce.c, scratch.c, key);
+ ctx->cmac.u[0] ^= scratch.u[0];
+ ctx->cmac.u[1] ^= scratch.u[1];
+
+ ctx->nonce.c[0] = flags0;
+
+ return 0;
}
int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
- const unsigned char *inp, unsigned char *out,
- size_t len,ccm128_f stream)
+ const unsigned char *inp, unsigned char *out,
+ size_t len, ccm128_f stream)
{
- size_t n;
- unsigned int i,L;
- unsigned char flags0 = ctx->nonce.c[0];
- block128_f block = ctx->block;
- void * key = ctx->key;
- union { u64 u[2]; u8 c[16]; } scratch;
-
- if (!(flags0&0x40))
- (*block)(ctx->nonce.c,ctx->cmac.c,key);
-
- ctx->nonce.c[0] = L = flags0&7;
- for (n=0,i=15-L;i<15;++i) {
- n |= ctx->nonce.c[i];
- ctx->nonce.c[i]=0;
- n <<= 8;
- }
- n |= ctx->nonce.c[15]; /* reconstructed length */
- ctx->nonce.c[15]=1;
-
- if (n!=len) return -1;
-
- if ((n=len/16)) {
- (*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c);
- n *= 16;
- inp += n;
- out += n;
- len -= n;
- if (len) ctr64_add(ctx->nonce.c,n/16);
- }
-
- if (len) {
- (*block)(ctx->nonce.c,scratch.c,key);
- for (i=0; i<len; ++i)
- ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);
- (*block)(ctx->cmac.c,ctx->cmac.c,key);
- }
-
- for (i=15-L;i<16;++i)
- ctx->nonce.c[i]=0;
-
- (*block)(ctx->nonce.c,scratch.c,key);
- ctx->cmac.u[0] ^= scratch.u[0];
- ctx->cmac.u[1] ^= scratch.u[1];
-
- ctx->nonce.c[0] = flags0;
-
- return 0;
+ size_t n;
+ unsigned int i, L;
+ unsigned char flags0 = ctx->nonce.c[0];
+ block128_f block = ctx->block;
+ void *key = ctx->key;
+ union {
+ u64 u[2];
+ u8 c[16];
+ } scratch;
+
+ if (!(flags0 & 0x40))
+ (*block) (ctx->nonce.c, ctx->cmac.c, key);
+
+ ctx->nonce.c[0] = L = flags0 & 7;
+ for (n = 0, i = 15 - L; i < 15; ++i) {
+ n |= ctx->nonce.c[i];
+ ctx->nonce.c[i] = 0;
+ n <<= 8;
+ }
+ n |= ctx->nonce.c[15]; /* reconstructed length */
+ ctx->nonce.c[15] = 1;
+
+ if (n != len)
+ return -1;
+
+ if ((n = len / 16)) {
+ (*stream) (inp, out, n, key, ctx->nonce.c, ctx->cmac.c);
+ n *= 16;
+ inp += n;
+ out += n;
+ len -= n;
+ if (len)
+ ctr64_add(ctx->nonce.c, n / 16);
+ }
+
+ if (len) {
+ (*block) (ctx->nonce.c, scratch.c, key);
+ for (i = 0; i < len; ++i)
+ ctx->cmac.c[i] ^= (out[i] = scratch.c[i] ^ inp[i]);
+ (*block) (ctx->cmac.c, ctx->cmac.c, key);
+ }
+
+ for (i = 15 - L; i < 16; ++i)
+ ctx->nonce.c[i] = 0;
+
+ (*block) (ctx->nonce.c, scratch.c, key);
+ ctx->cmac.u[0] ^= scratch.u[0];
+ ctx->cmac.u[1] ^= scratch.u[1];
+
+ ctx->nonce.c[0] = flags0;
+
+ return 0;
}
-size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx,unsigned char *tag,size_t len)
-{ unsigned int M = (ctx->nonce.c[0]>>3)&7; /* the M parameter */
-
- M *= 2; M += 2;
- if (len<M) return 0;
- memcpy(tag,ctx->cmac.c,M);
- return M;
+size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
+{
+ unsigned int M = (ctx->nonce.c[0] >> 3) & 7; /* the M parameter */
+
+ M *= 2;
+ M += 2;
+ if (len < M)
+ return 0;
+ memcpy(tag, ctx->cmac.c, M);
+ return M;
}
diff --git a/openssl/crypto/modes/cfb128.c b/openssl/crypto/modes/cfb128.c
index 4e6f5d35e..d4ecbd08e 100644
--- a/openssl/crypto/modes/cfb128.c
+++ b/openssl/crypto/modes/cfb128.c
@@ -6,7 +6,7 @@
* are met:
*
* 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
+ * notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
@@ -59,14 +59,15 @@
#endif
#include <assert.h>
-/* The input and output encrypted as though 128bit cfb mode is being
- * used. The extra state information to record how much of the
- * 128bit block we have used is contained in *num;
+/*
+ * The input and output encrypted as though 128bit cfb mode is being used.
+ * The extra state information to record how much of the 128bit block we have
+ * used is contained in *num;
*/
void CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], int *num,
- int enc, block128_f block)
+ size_t len, const void *key,
+ unsigned char ivec[16], int *num,
+ int enc, block128_f block)
{
unsigned int n;
size_t l = 0;
@@ -77,166 +78,177 @@ void CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out,
if (enc) {
#if !defined(OPENSSL_SMALL_FOOTPRINT)
- if (16%sizeof(size_t) == 0) do { /* always true actually */
- while (n && len) {
- *(out++) = ivec[n] ^= *(in++);
- --len;
- n = (n+1) % 16;
- }
-#if defined(STRICT_ALIGNMENT)
- if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0)
- break;
-#endif
- while (len>=16) {
- (*block)(ivec, ivec, key);
- for (; n<16; n+=sizeof(size_t)) {
- *(size_t*)(out+n) =
- *(size_t*)(ivec+n) ^= *(size_t*)(in+n);
- }
- len -= 16;
- out += 16;
- in += 16;
- n = 0;
- }
- if (len) {
- (*block)(ivec, ivec, key);
- while (len--) {
- out[n] = ivec[n] ^= in[n];
- ++n;
- }
- }
- *num = n;
- return;
- } while (0);
- /* the rest would be commonly eliminated by x86* compiler */
+ if (16 % sizeof(size_t) == 0) { /* always true actually */
+ do {
+ while (n && len) {
+ *(out++) = ivec[n] ^= *(in++);
+ --len;
+ n = (n + 1) % 16;
+ }
+# if defined(STRICT_ALIGNMENT)
+ if (((size_t)in | (size_t)out | (size_t)ivec) %
+ sizeof(size_t) != 0)
+ break;
+# endif
+ while (len >= 16) {
+ (*block) (ivec, ivec, key);
+ for (; n < 16; n += sizeof(size_t)) {
+ *(size_t *)(out + n) =
+ *(size_t *)(ivec + n) ^= *(size_t *)(in + n);
+ }
+ len -= 16;
+ out += 16;
+ in += 16;
+ n = 0;
+ }
+ if (len) {
+ (*block) (ivec, ivec, key);
+ while (len--) {
+ out[n] = ivec[n] ^= in[n];
+ ++n;
+ }
+ }
+ *num = n;
+ return;
+ } while (0);
+ }
+ /* the rest would be commonly eliminated by x86* compiler */
#endif
- while (l<len) {
- if (n == 0) {
- (*block)(ivec, ivec, key);
- }
- out[l] = ivec[n] ^= in[l];
- ++l;
- n = (n+1) % 16;
- }
- *num = n;
+ while (l < len) {
+ if (n == 0) {
+ (*block) (ivec, ivec, key);
+ }
+ out[l] = ivec[n] ^= in[l];
+ ++l;
+ n = (n + 1) % 16;
+ }
+ *num = n;
} else {
#if !defined(OPENSSL_SMALL_FOOTPRINT)
- if (16%sizeof(size_t) == 0) do { /* always true actually */
- while (n && len) {
- unsigned char c;
- *(out++) = ivec[n] ^ (c = *(in++)); ivec[n] = c;
- --len;
- n = (n+1) % 16;
- }
-#if defined(STRICT_ALIGNMENT)
- if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0)
- break;
-#endif
- while (len>=16) {
- (*block)(ivec, ivec, key);
- for (; n<16; n+=sizeof(size_t)) {
- size_t t = *(size_t*)(in+n);
- *(size_t*)(out+n) = *(size_t*)(ivec+n) ^ t;
- *(size_t*)(ivec+n) = t;
- }
- len -= 16;
- out += 16;
- in += 16;
- n = 0;
- }
- if (len) {
- (*block)(ivec, ivec, key);
- while (len--) {
- unsigned char c;
- out[n] = ivec[n] ^ (c = in[n]); ivec[n] = c;
- ++n;
- }
- }
- *num = n;
- return;
- } while (0);
- /* the rest would be commonly eliminated by x86* compiler */
+ if (16 % sizeof(size_t) == 0) { /* always true actually */
+ do {
+ while (n && len) {
+ unsigned char c;
+ *(out++) = ivec[n] ^ (c = *(in++));
+ ivec[n] = c;
+ --len;
+ n = (n + 1) % 16;
+ }
+# if defined(STRICT_ALIGNMENT)
+ if (((size_t)in | (size_t)out | (size_t)ivec) %
+ sizeof(size_t) != 0)
+ break;
+# endif
+ while (len >= 16) {
+ (*block) (ivec, ivec, key);
+ for (; n < 16; n += sizeof(size_t)) {
+ size_t t = *(size_t *)(in + n);
+ *(size_t *)(out + n) = *(size_t *)(ivec + n) ^ t;
+ *(size_t *)(ivec + n) = t;
+ }
+ len -= 16;
+ out += 16;
+ in += 16;
+ n = 0;
+ }
+ if (len) {
+ (*block) (ivec, ivec, key);
+ while (len--) {
+ unsigned char c;
+ out[n] = ivec[n] ^ (c = in[n]);
+ ivec[n] = c;
+ ++n;
+ }
+ }
+ *num = n;
+ return;
+ } while (0);
+ }
+ /* the rest would be commonly eliminated by x86* compiler */
#endif
- while (l<len) {
- unsigned char c;
- if (n == 0) {
- (*block)(ivec, ivec, key);
- }
- out[l] = ivec[n] ^ (c = in[l]); ivec[n] = c;
- ++l;
- n = (n+1) % 16;
- }
- *num=n;
+ while (l < len) {
+ unsigned char c;
+ if (n == 0) {
+ (*block) (ivec, ivec, key);
+ }
+ out[l] = ivec[n] ^ (c = in[l]);
+ ivec[n] = c;
+ ++l;
+ n = (n + 1) % 16;
+ }
+ *num = n;
}
}
-/* This expects a single block of size nbits for both in and out. Note that
- it corrupts any extra bits in the last byte of out */
-static void cfbr_encrypt_block(const unsigned char *in,unsigned char *out,
- int nbits,const void *key,
- unsigned char ivec[16],int enc,
- block128_f block)
+/*
+ * This expects a single block of size nbits for both in and out. Note that
+ * it corrupts any extra bits in the last byte of out
+ */
+static void cfbr_encrypt_block(const unsigned char *in, unsigned char *out,
+ int nbits, const void *key,
+ unsigned char ivec[16], int enc,
+ block128_f block)
{
- int n,rem,num;
- unsigned char ovec[16*2 + 1]; /* +1 because we dererefence (but don't use) one byte off the end */
-
- if (nbits<=0 || nbits>128) return;
-
- /* fill in the first half of the new IV with the current IV */
- memcpy(ovec,ivec,16);
- /* construct the new IV */
- (*block)(ivec,ivec,key);
- num = (nbits+7)/8;
- if (enc) /* encrypt the input */
- for(n=0 ; n < num ; ++n)
- out[n] = (ovec[16+n] = in[n] ^ ivec[n]);
- else /* decrypt the input */
- for(n=0 ; n < num ; ++n)
- out[n] = (ovec[16+n] = in[n]) ^ ivec[n];
- /* shift ovec left... */
- rem = nbits%8;
- num = nbits/8;
- if(rem==0)
- memcpy(ivec,ovec+num,16);
- else
- for(n=0 ; n < 16 ; ++n)
- ivec[n] = ovec[n+num]<<rem | ovec[n+num+1]>>(8-rem);
+ int n, rem, num;
+ unsigned char ovec[16 * 2 + 1]; /* +1 because we dererefence (but don't
+ * use) one byte off the end */
+
+ if (nbits <= 0 || nbits > 128)
+ return;
+
+ /* fill in the first half of the new IV with the current IV */
+ memcpy(ovec, ivec, 16);
+ /* construct the new IV */
+ (*block) (ivec, ivec, key);
+ num = (nbits + 7) / 8;
+ if (enc) /* encrypt the input */
+ for (n = 0; n < num; ++n)
+ out[n] = (ovec[16 + n] = in[n] ^ ivec[n]);
+ else /* decrypt the input */
+ for (n = 0; n < num; ++n)
+ out[n] = (ovec[16 + n] = in[n]) ^ ivec[n];
+ /* shift ovec left... */
+ rem = nbits % 8;
+ num = nbits / 8;
+ if (rem == 0)
+ memcpy(ivec, ovec + num, 16);
+ else
+ for (n = 0; n < 16; ++n)
+ ivec[n] = ovec[n + num] << rem | ovec[n + num + 1] >> (8 - rem);
/* it is not necessary to cleanse ovec, since the IV is not secret */
}
/* N.B. This expects the input to be packed, MS bit first */
void CRYPTO_cfb128_1_encrypt(const unsigned char *in, unsigned char *out,
- size_t bits, const void *key,
- unsigned char ivec[16], int *num,
- int enc, block128_f block)
+ size_t bits, const void *key,
+ unsigned char ivec[16], int *num,
+ int enc, block128_f block)
{
size_t n;
- unsigned char c[1],d[1];
+ unsigned char c[1], d[1];
assert(in && out && key && ivec && num);
assert(*num == 0);
- for(n=0 ; n<bits ; ++n)
- {
- c[0]=(in[n/8]&(1 << (7-n%8))) ? 0x80 : 0;
- cfbr_encrypt_block(c,d,1,key,ivec,enc,block);
- out[n/8]=(out[n/8]&~(1 << (unsigned int)(7-n%8))) |
- ((d[0]&0x80) >> (unsigned int)(n%8));
- }
+ for (n = 0; n < bits; ++n) {
+ c[0] = (in[n / 8] & (1 << (7 - n % 8))) ? 0x80 : 0;
+ cfbr_encrypt_block(c, d, 1, key, ivec, enc, block);
+ out[n / 8] = (out[n / 8] & ~(1 << (unsigned int)(7 - n % 8))) |
+ ((d[0] & 0x80) >> (unsigned int)(n % 8));
+ }
}
void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out,
- size_t length, const void *key,
- unsigned char ivec[16], int *num,
- int enc, block128_f block)
+ size_t length, const void *key,
+ unsigned char ivec[16], int *num,
+ int enc, block128_f block)
{
size_t n;
assert(in && out && key && ivec && num);
assert(*num == 0);
- for(n=0 ; n<length ; ++n)
- cfbr_encrypt_block(&in[n],&out[n],8,key,ivec,enc,block);
+ for (n = 0; n < length; ++n)
+ cfbr_encrypt_block(&in[n], &out[n], 8, key, ivec, enc, block);
}
-
diff --git a/openssl/crypto/modes/ctr128.c b/openssl/crypto/modes/ctr128.c
index ee642c586..f3bbcbf72 100644
--- a/openssl/crypto/modes/ctr128.c
+++ b/openssl/crypto/modes/ctr128.c
@@ -6,7 +6,7 @@
* are met:
*
* 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
+ * notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
@@ -59,194 +59,212 @@
#endif
#include <assert.h>
-/* NOTE: the IV/counter CTR mode is big-endian. The code itself
- * is endian-neutral. */
+/*
+ * NOTE: the IV/counter CTR mode is big-endian. The code itself is
+ * endian-neutral.
+ */
/* increment counter (128-bit int) by 1 */
-static void ctr128_inc(unsigned char *counter) {
- u32 n=16;
- u8 c;
-
- do {
- --n;
- c = counter[n];
- ++c;
- counter[n] = c;
- if (c) return;
- } while (n);
+static void ctr128_inc(unsigned char *counter)
+{
+ u32 n = 16;
+ u8 c;
+
+ do {
+ --n;
+ c = counter[n];
+ ++c;
+ counter[n] = c;
+ if (c)
+ return;
+ } while (n);
}
#if !defined(OPENSSL_SMALL_FOOTPRINT)
-static void ctr128_inc_aligned(unsigned char *counter) {
- size_t *data,c,n;
- const union { long one; char little; } is_endian = {1};
-
- if (is_endian.little) {
- ctr128_inc(counter);
- return;
- }
-
- data = (size_t *)counter;
- n = 16/sizeof(size_t);
- do {
- --n;
- c = data[n];
- ++c;
- data[n] = c;
- if (c) return;
- } while (n);
+static void ctr128_inc_aligned(unsigned char *counter)
+{
+ size_t *data, c, n;
+ const union {
+ long one;
+ char little;
+ } is_endian = {
+ 1
+ };
+
+ if (is_endian.little) {
+ ctr128_inc(counter);
+ return;
+ }
+
+ data = (size_t *)counter;
+ n = 16 / sizeof(size_t);
+ do {
+ --n;
+ c = data[n];
+ ++c;
+ data[n] = c;
+ if (c)
+ return;
+ } while (n);
}
#endif
-/* The input encrypted as though 128bit counter mode is being
- * used. The extra state information to record how much of the
- * 128bit block we have used is contained in *num, and the
- * encrypted counter is kept in ecount_buf. Both *num and
- * ecount_buf must be initialised with zeros before the first
- * call to CRYPTO_ctr128_encrypt().
- *
- * This algorithm assumes that the counter is in the x lower bits
- * of the IV (ivec), and that the application has full control over
- * overflow and the rest of the IV. This implementation takes NO
- * responsability for checking that the counter doesn't overflow
- * into the rest of the IV when incremented.
+/*
+ * The input encrypted as though 128bit counter mode is being used. The
+ * extra state information to record how much of the 128bit block we have
+ * used is contained in *num, and the encrypted counter is kept in
+ * ecount_buf. Both *num and ecount_buf must be initialised with zeros
+ * before the first call to CRYPTO_ctr128_encrypt(). This algorithm assumes
+ * that the counter is in the x lower bits of the IV (ivec), and that the
+ * application has full control over overflow and the rest of the IV. This
+ * implementation takes NO responsability for checking that the counter
+ * doesn't overflow into the rest of the IV when incremented.
*/
void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], unsigned char ecount_buf[16],
- unsigned int *num, block128_f block)
+ size_t len, const void *key,
+ unsigned char ivec[16],
+ unsigned char ecount_buf[16], unsigned int *num,
+ block128_f block)
{
- unsigned int n;
- size_t l=0;
+ unsigned int n;
+ size_t l = 0;
- assert(in && out && key && ecount_buf && num);
- assert(*num < 16);
+ assert(in && out && key && ecount_buf && num);
+ assert(*num < 16);
- n = *num;
+ n = *num;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
- if (16%sizeof(size_t) == 0) do { /* always true actually */
- while (n && len) {
- *(out++) = *(in++) ^ ecount_buf[n];
- --len;
- n = (n+1) % 16;
- }
-
-#if defined(STRICT_ALIGNMENT)
- if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0)
- break;
-#endif
- while (len>=16) {
- (*block)(ivec, ecount_buf, key);
- ctr128_inc_aligned(ivec);
- for (; n<16; n+=sizeof(size_t))
- *(size_t *)(out+n) =
- *(size_t *)(in+n) ^ *(size_t *)(ecount_buf+n);
- len -= 16;
- out += 16;
- in += 16;
- n = 0;
- }
- if (len) {
- (*block)(ivec, ecount_buf, key);
- ctr128_inc_aligned(ivec);
- while (len--) {
- out[n] = in[n] ^ ecount_buf[n];
- ++n;
- }
- }
- *num = n;
- return;
- } while(0);
- /* the rest would be commonly eliminated by x86* compiler */
+ if (16 % sizeof(size_t) == 0) { /* always true actually */
+ do {
+ while (n && len) {
+ *(out++) = *(in++) ^ ecount_buf[n];
+ --len;
+ n = (n + 1) % 16;
+ }
+
+# if defined(STRICT_ALIGNMENT)
+ if (((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) !=
+ 0)
+ break;
+# endif
+ while (len >= 16) {
+ (*block) (ivec, ecount_buf, key);
+ ctr128_inc_aligned(ivec);
+ for (; n < 16; n += sizeof(size_t))
+ *(size_t *)(out + n) =
+ *(size_t *)(in + n) ^ *(size_t *)(ecount_buf + n);
+ len -= 16;
+ out += 16;
+ in += 16;
+ n = 0;
+ }
+ if (len) {
+ (*block) (ivec, ecount_buf, key);
+ ctr128_inc_aligned(ivec);
+ while (len--) {
+ out[n] = in[n] ^ ecount_buf[n];
+ ++n;
+ }
+ }
+ *num = n;
+ return;
+ } while (0);
+ }
+ /* the rest would be commonly eliminated by x86* compiler */
#endif
- while (l<len) {
- if (n==0) {
- (*block)(ivec, ecount_buf, key);
- ctr128_inc(ivec);
- }
- out[l] = in[l] ^ ecount_buf[n];
- ++l;
- n = (n+1) % 16;
- }
-
- *num=n;
+ while (l < len) {
+ if (n == 0) {
+ (*block) (ivec, ecount_buf, key);
+ ctr128_inc(ivec);
+ }
+ out[l] = in[l] ^ ecount_buf[n];
+ ++l;
+ n = (n + 1) % 16;
+ }
+
+ *num = n;
}
/* increment upper 96 bits of 128-bit counter by 1 */
-static void ctr96_inc(unsigned char *counter) {
- u32 n=12;
- u8 c;
-
- do {
- --n;
- c = counter[n];
- ++c;
- counter[n] = c;
- if (c) return;
- } while (n);
+static void ctr96_inc(unsigned char *counter)
+{
+ u32 n = 12;
+ u8 c;
+
+ do {
+ --n;
+ c = counter[n];
+ ++c;
+ counter[n] = c;
+ if (c)
+ return;
+ } while (n);
}
void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], unsigned char ecount_buf[16],
- unsigned int *num, ctr128_f func)
+ size_t len, const void *key,
+ unsigned char ivec[16],
+ unsigned char ecount_buf[16],
+ unsigned int *num, ctr128_f func)
{
- unsigned int n,ctr32;
-
- assert(in && out && key && ecount_buf && num);
- assert(*num < 16);
-
- n = *num;
-
- while (n && len) {
- *(out++) = *(in++) ^ ecount_buf[n];
- --len;
- n = (n+1) % 16;
- }
-
- ctr32 = GETU32(ivec+12);
- while (len>=16) {
- size_t blocks = len/16;
- /*
- * 1<<28 is just a not-so-small yet not-so-large number...
- * Below condition is practically never met, but it has to
- * be checked for code correctness.
- */
- if (sizeof(size_t)>sizeof(unsigned int) && blocks>(1U<<28))
- blocks = (1U<<28);
- /*
- * As (*func) operates on 32-bit counter, caller
- * has to handle overflow. 'if' below detects the
- * overflow, which is then handled by limiting the
- * amount of blocks to the exact overflow point...
- */
- ctr32 += (u32)blocks;
- if (ctr32 < blocks) {
- blocks -= ctr32;
- ctr32 = 0;
- }
- (*func)(in,out,blocks,key,ivec);
- /* (*ctr) does not update ivec, caller does: */
- PUTU32(ivec+12,ctr32);
- /* ... overflow was detected, propogate carry. */
- if (ctr32 == 0) ctr96_inc(ivec);
- blocks *= 16;
- len -= blocks;
- out += blocks;
- in += blocks;
- }
- if (len) {
- memset(ecount_buf,0,16);
- (*func)(ecount_buf,ecount_buf,1,key,ivec);
- ++ctr32;
- PUTU32(ivec+12,ctr32);
- if (ctr32 == 0) ctr96_inc(ivec);
- while (len--) {
- out[n] = in[n] ^ ecount_buf[n];
- ++n;
- }
- }
-
- *num=n;
+ unsigned int n, ctr32;
+
+ assert(in && out && key && ecount_buf && num);
+ assert(*num < 16);
+
+ n = *num;
+
+ while (n && len) {
+ *(out++) = *(in++) ^ ecount_buf[n];
+ --len;
+ n = (n + 1) % 16;
+ }
+
+ ctr32 = GETU32(ivec + 12);
+ while (len >= 16) {
+ size_t blocks = len / 16;
+ /*
+ * 1<<28 is just a not-so-small yet not-so-large number...
+ * Below condition is practically never met, but it has to
+ * be checked for code correctness.
+ */
+ if (sizeof(size_t) > sizeof(unsigned int) && blocks > (1U << 28))
+ blocks = (1U << 28);
+ /*
+ * As (*func) operates on 32-bit counter, caller
+ * has to handle overflow. 'if' below detects the
+ * overflow, which is then handled by limiting the
+ * amount of blocks to the exact overflow point...
+ */
+ ctr32 += (u32)blocks;
+ if (ctr32 < blocks) {
+ blocks -= ctr32;
+ ctr32 = 0;
+ }
+ (*func) (in, out, blocks, key, ivec);
+ /* (*ctr) does not update ivec, caller does: */
+ PUTU32(ivec + 12, ctr32);
+ /* ... overflow was detected, propogate carry. */
+ if (ctr32 == 0)
+ ctr96_inc(ivec);
+ blocks *= 16;
+ len -= blocks;
+ out += blocks;
+ in += blocks;
+ }
+ if (len) {
+ memset(ecount_buf, 0, 16);
+ (*func) (ecount_buf, ecount_buf, 1, key, ivec);
+ ++ctr32;
+ PUTU32(ivec + 12, ctr32);
+ if (ctr32 == 0)
+ ctr96_inc(ivec);
+ while (len--) {
+ out[n] = in[n] ^ ecount_buf[n];
+ ++n;
+ }
+ }
+
+ *num = n;
}
diff --git a/openssl/crypto/modes/cts128.c b/openssl/crypto/modes/cts128.c
index 2d583de6f..137be595a 100644
--- a/openssl/crypto/modes/cts128.c
+++ b/openssl/crypto/modes/cts128.c
@@ -29,425 +29,516 @@
* compliant with the NIST proposal, both extending CBC mode.
*/
-size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], block128_f block)
-{ size_t residue, n;
+size_t CRYPTO_cts128_encrypt_block(const unsigned char *in,
+ unsigned char *out, size_t len,
+ const void *key, unsigned char ivec[16],
+ block128_f block)
+{
+ size_t residue, n;
- assert (in && out && key && ivec);
+ assert(in && out && key && ivec);
- if (len <= 16) return 0;
+ if (len <= 16)
+ return 0;
- if ((residue=len%16) == 0) residue = 16;
+ if ((residue = len % 16) == 0)
+ residue = 16;
- len -= residue;
+ len -= residue;
- CRYPTO_cbc128_encrypt(in,out,len,key,ivec,block);
+ CRYPTO_cbc128_encrypt(in, out, len, key, ivec, block);
- in += len;
- out += len;
+ in += len;
+ out += len;
- for (n=0; n<residue; ++n)
- ivec[n] ^= in[n];
- (*block)(ivec,ivec,key);
- memcpy(out,out-16,residue);
- memcpy(out-16,ivec,16);
+ for (n = 0; n < residue; ++n)
+ ivec[n] ^= in[n];
+ (*block) (ivec, ivec, key);
+ memcpy(out, out - 16, residue);
+ memcpy(out - 16, ivec, 16);
- return len+residue;
+ return len + residue;
}
-size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], block128_f block)
-{ size_t residue, n;
+size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in,
+ unsigned char *out, size_t len,
+ const void *key,
+ unsigned char ivec[16],
+ block128_f block)
+{
+ size_t residue, n;
- assert (in && out && key && ivec);
+ assert(in && out && key && ivec);
- if (len < 16) return 0;
+ if (len < 16)
+ return 0;
- residue=len%16;
+ residue = len % 16;
- len -= residue;
+ len -= residue;
- CRYPTO_cbc128_encrypt(in,out,len,key,ivec,block);
+ CRYPTO_cbc128_encrypt(in, out, len, key, ivec, block);
- if (residue==0) return len;
+ if (residue == 0)
+ return len;
- in += len;
- out += len;
+ in += len;
+ out += len;
- for (n=0; n<residue; ++n)
- ivec[n] ^= in[n];
- (*block)(ivec,ivec,key);
- memcpy(out-16+residue,ivec,16);
+ for (n = 0; n < residue; ++n)
+ ivec[n] ^= in[n];
+ (*block) (ivec, ivec, key);
+ memcpy(out - 16 + residue, ivec, 16);
- return len+residue;
+ return len + residue;
}
size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], cbc128_f cbc)
-{ size_t residue;
- union { size_t align; unsigned char c[16]; } tmp;
+ size_t len, const void *key,
+ unsigned char ivec[16], cbc128_f cbc)
+{
+ size_t residue;
+ union {
+ size_t align;
+ unsigned char c[16];
+ } tmp;
- assert (in && out && key && ivec);
+ assert(in && out && key && ivec);
- if (len <= 16) return 0;
+ if (len <= 16)
+ return 0;
- if ((residue=len%16) == 0) residue = 16;
+ if ((residue = len % 16) == 0)
+ residue = 16;
- len -= residue;
+ len -= residue;
- (*cbc)(in,out,len,key,ivec,1);
+ (*cbc) (in, out, len, key, ivec, 1);
- in += len;
- out += len;
+ in += len;
+ out += len;
#if defined(CBC_HANDLES_TRUNCATED_IO)
- memcpy(tmp.c,out-16,16);
- (*cbc)(in,out-16,residue,key,ivec,1);
- memcpy(out,tmp.c,residue);
+ memcpy(tmp.c, out - 16, 16);
+ (*cbc) (in, out - 16, residue, key, ivec, 1);
+ memcpy(out, tmp.c, residue);
#else
- memset(tmp.c,0,sizeof(tmp));
- memcpy(tmp.c,in,residue);
- memcpy(out,out-16,residue);
- (*cbc)(tmp.c,out-16,16,key,ivec,1);
+ memset(tmp.c, 0, sizeof(tmp));
+ memcpy(tmp.c, in, residue);
+ memcpy(out, out - 16, residue);
+ (*cbc) (tmp.c, out - 16, 16, key, ivec, 1);
#endif
- return len+residue;
+ return len + residue;
}
size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], cbc128_f cbc)
-{ size_t residue;
- union { size_t align; unsigned char c[16]; } tmp;
+ size_t len, const void *key,
+ unsigned char ivec[16], cbc128_f cbc)
+{
+ size_t residue;
+ union {
+ size_t align;
+ unsigned char c[16];
+ } tmp;
- assert (in && out && key && ivec);
+ assert(in && out && key && ivec);
- if (len < 16) return 0;
+ if (len < 16)
+ return 0;
- residue=len%16;
+ residue = len % 16;
- len -= residue;
+ len -= residue;
- (*cbc)(in,out,len,key,ivec,1);
+ (*cbc) (in, out, len, key, ivec, 1);
- if (residue==0) return len;
+ if (residue == 0)
+ return len;
- in += len;
- out += len;
+ in += len;
+ out += len;
#if defined(CBC_HANDLES_TRUNCATED_IO)
- (*cbc)(in,out-16+residue,residue,key,ivec,1);
+ (*cbc) (in, out - 16 + residue, residue, key, ivec, 1);
#else
- memset(tmp.c,0,sizeof(tmp));
- memcpy(tmp.c,in,residue);
- (*cbc)(tmp.c,out-16+residue,16,key,ivec,1);
+ memset(tmp.c, 0, sizeof(tmp));
+ memcpy(tmp.c, in, residue);
+ (*cbc) (tmp.c, out - 16 + residue, 16, key, ivec, 1);
#endif
- return len+residue;
+ return len + residue;
}
-size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], block128_f block)
-{ size_t residue, n;
- union { size_t align; unsigned char c[32]; } tmp;
+size_t CRYPTO_cts128_decrypt_block(const unsigned char *in,
+ unsigned char *out, size_t len,
+ const void *key, unsigned char ivec[16],
+ block128_f block)
+{
+ size_t residue, n;
+ union {
+ size_t align;
+ unsigned char c[32];
+ } tmp;
- assert (in && out && key && ivec);
+ assert(in && out && key && ivec);
- if (len<=16) return 0;
+ if (len <= 16)
+ return 0;
- if ((residue=len%16) == 0) residue = 16;
+ if ((residue = len % 16) == 0)
+ residue = 16;
- len -= 16+residue;
+ len -= 16 + residue;
- if (len) {
- CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block);
- in += len;
- out += len;
- }
+ if (len) {
+ CRYPTO_cbc128_decrypt(in, out, len, key, ivec, block);
+ in += len;
+ out += len;
+ }
- (*block)(in,tmp.c+16,key);
+ (*block) (in, tmp.c + 16, key);
- memcpy(tmp.c,tmp.c+16,16);
- memcpy(tmp.c,in+16,residue);
- (*block)(tmp.c,tmp.c,key);
+ memcpy(tmp.c, tmp.c + 16, 16);
+ memcpy(tmp.c, in + 16, residue);
+ (*block) (tmp.c, tmp.c, key);
- for(n=0; n<16; ++n) {
- unsigned char c = in[n];
- out[n] = tmp.c[n] ^ ivec[n];
- ivec[n] = c;
- }
- for(residue+=16; n<residue; ++n)
- out[n] = tmp.c[n] ^ in[n];
+ for (n = 0; n < 16; ++n) {
+ unsigned char c = in[n];
+ out[n] = tmp.c[n] ^ ivec[n];
+ ivec[n] = c;
+ }
+ for (residue += 16; n < residue; ++n)
+ out[n] = tmp.c[n] ^ in[n];
- return 16+len+residue;
+ return 16 + len + residue;
}
-size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], block128_f block)
-{ size_t residue, n;
- union { size_t align; unsigned char c[32]; } tmp;
+size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in,
+ unsigned char *out, size_t len,
+ const void *key,
+ unsigned char ivec[16],
+ block128_f block)
+{
+ size_t residue, n;
+ union {
+ size_t align;
+ unsigned char c[32];
+ } tmp;
- assert (in && out && key && ivec);
+ assert(in && out && key && ivec);
- if (len<16) return 0;
+ if (len < 16)
+ return 0;
- residue=len%16;
+ residue = len % 16;
- if (residue==0) {
- CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block);
- return len;
- }
+ if (residue == 0) {
+ CRYPTO_cbc128_decrypt(in, out, len, key, ivec, block);
+ return len;
+ }
- len -= 16+residue;
+ len -= 16 + residue;
- if (len) {
- CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block);
- in += len;
- out += len;
- }
+ if (len) {
+ CRYPTO_cbc128_decrypt(in, out, len, key, ivec, block);
+ in += len;
+ out += len;
+ }
- (*block)(in+residue,tmp.c+16,key);
+ (*block) (in + residue, tmp.c + 16, key);
- memcpy(tmp.c,tmp.c+16,16);
- memcpy(tmp.c,in,residue);
- (*block)(tmp.c,tmp.c,key);
+ memcpy(tmp.c, tmp.c + 16, 16);
+ memcpy(tmp.c, in, residue);
+ (*block) (tmp.c, tmp.c, key);
- for(n=0; n<16; ++n) {
- unsigned char c = in[n];
- out[n] = tmp.c[n] ^ ivec[n];
- ivec[n] = in[n+residue];
- tmp.c[n] = c;
- }
- for(residue+=16; n<residue; ++n)
- out[n] = tmp.c[n] ^ tmp.c[n-16];
+ for (n = 0; n < 16; ++n) {
+ unsigned char c = in[n];
+ out[n] = tmp.c[n] ^ ivec[n];
+ ivec[n] = in[n + residue];
+ tmp.c[n] = c;
+ }
+ for (residue += 16; n < residue; ++n)
+ out[n] = tmp.c[n] ^ tmp.c[n - 16];
- return 16+len+residue;
+ return 16 + len + residue;
}
size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], cbc128_f cbc)
-{ size_t residue;
- union { size_t align; unsigned char c[32]; } tmp;
+ size_t len, const void *key,
+ unsigned char ivec[16], cbc128_f cbc)
+{
+ size_t residue;
+ union {
+ size_t align;
+ unsigned char c[32];
+ } tmp;
- assert (in && out && key && ivec);
+ assert(in && out && key && ivec);
- if (len<=16) return 0;
+ if (len <= 16)
+ return 0;
- if ((residue=len%16) == 0) residue = 16;
+ if ((residue = len % 16) == 0)
+ residue = 16;
- len -= 16+residue;
+ len -= 16 + residue;
- if (len) {
- (*cbc)(in,out,len,key,ivec,0);
- in += len;
- out += len;
- }
+ if (len) {
+ (*cbc) (in, out, len, key, ivec, 0);
+ in += len;
+ out += len;
+ }
- memset(tmp.c,0,sizeof(tmp));
- /* this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0] */
- (*cbc)(in,tmp.c,16,key,tmp.c+16,0);
+ memset(tmp.c, 0, sizeof(tmp));
+ /*
+ * this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0]
+ */
+ (*cbc) (in, tmp.c, 16, key, tmp.c + 16, 0);
- memcpy(tmp.c,in+16,residue);
+ memcpy(tmp.c, in + 16, residue);
#if defined(CBC_HANDLES_TRUNCATED_IO)
- (*cbc)(tmp.c,out,16+residue,key,ivec,0);
+ (*cbc) (tmp.c, out, 16 + residue, key, ivec, 0);
#else
- (*cbc)(tmp.c,tmp.c,32,key,ivec,0);
- memcpy(out,tmp.c,16+residue);
+ (*cbc) (tmp.c, tmp.c, 32, key, ivec, 0);
+ memcpy(out, tmp.c, 16 + residue);
#endif
- return 16+len+residue;
+ return 16 + len + residue;
}
size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], cbc128_f cbc)
-{ size_t residue;
- union { size_t align; unsigned char c[32]; } tmp;
+ size_t len, const void *key,
+ unsigned char ivec[16], cbc128_f cbc)
+{
+ size_t residue;
+ union {
+ size_t align;
+ unsigned char c[32];
+ } tmp;
- assert (in && out && key && ivec);
+ assert(in && out && key && ivec);
- if (len<16) return 0;
+ if (len < 16)
+ return 0;
- residue=len%16;
+ residue = len % 16;
- if (residue==0) {
- (*cbc)(in,out,len,key,ivec,0);
- return len;
- }
+ if (residue == 0) {
+ (*cbc) (in, out, len, key, ivec, 0);
+ return len;
+ }
- len -= 16+residue;
+ len -= 16 + residue;
- if (len) {
- (*cbc)(in,out,len,key,ivec,0);
- in += len;
- out += len;
- }
+ if (len) {
+ (*cbc) (in, out, len, key, ivec, 0);
+ in += len;
+ out += len;
+ }
- memset(tmp.c,0,sizeof(tmp));
- /* this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0] */
- (*cbc)(in+residue,tmp.c,16,key,tmp.c+16,0);
+ memset(tmp.c, 0, sizeof(tmp));
+ /*
+ * this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0]
+ */
+ (*cbc) (in + residue, tmp.c, 16, key, tmp.c + 16, 0);
- memcpy(tmp.c,in,residue);
+ memcpy(tmp.c, in, residue);
#if defined(CBC_HANDLES_TRUNCATED_IO)
- (*cbc)(tmp.c,out,16+residue,key,ivec,0);
+ (*cbc) (tmp.c, out, 16 + residue, key, ivec, 0);
#else
- (*cbc)(tmp.c,tmp.c,32,key,ivec,0);
- memcpy(out,tmp.c,16+residue);
+ (*cbc) (tmp.c, tmp.c, 32, key, ivec, 0);
+ memcpy(out, tmp.c, 16 + residue);
#endif
- return 16+len+residue;
+ return 16 + len + residue;
}
#if defined(SELFTEST)
-#include <stdio.h>
-#include <openssl/aes.h>
+# include <stdio.h>
+# include <openssl/aes.h>
/* test vectors from RFC 3962 */
static const unsigned char test_key[16] = "chicken teriyaki";
static const unsigned char test_input[64] =
- "I would like the" " General Gau's C"
- "hicken, please, " "and wonton soup.";
-static const unsigned char test_iv[16] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-
-static const unsigned char vector_17[17] =
-{0xc6,0x35,0x35,0x68,0xf2,0xbf,0x8c,0xb4, 0xd8,0xa5,0x80,0x36,0x2d,0xa7,0xff,0x7f,
- 0x97};
-static const unsigned char vector_31[31] =
-{0xfc,0x00,0x78,0x3e,0x0e,0xfd,0xb2,0xc1, 0xd4,0x45,0xd4,0xc8,0xef,0xf7,0xed,0x22,
- 0x97,0x68,0x72,0x68,0xd6,0xec,0xcc,0xc0, 0xc0,0x7b,0x25,0xe2,0x5e,0xcf,0xe5};
-static const unsigned char vector_32[32] =
-{0x39,0x31,0x25,0x23,0xa7,0x86,0x62,0xd5, 0xbe,0x7f,0xcb,0xcc,0x98,0xeb,0xf5,0xa8,
- 0x97,0x68,0x72,0x68,0xd6,0xec,0xcc,0xc0, 0xc0,0x7b,0x25,0xe2,0x5e,0xcf,0xe5,0x84};
-static const unsigned char vector_47[47] =
-{0x97,0x68,0x72,0x68,0xd6,0xec,0xcc,0xc0, 0xc0,0x7b,0x25,0xe2,0x5e,0xcf,0xe5,0x84,
- 0xb3,0xff,0xfd,0x94,0x0c,0x16,0xa1,0x8c, 0x1b,0x55,0x49,0xd2,0xf8,0x38,0x02,0x9e,
- 0x39,0x31,0x25,0x23,0xa7,0x86,0x62,0xd5, 0xbe,0x7f,0xcb,0xcc,0x98,0xeb,0xf5};
-static const unsigned char vector_48[48] =
-{0x97,0x68,0x72,0x68,0xd6,0xec,0xcc,0xc0, 0xc0,0x7b,0x25,0xe2,0x5e,0xcf,0xe5,0x84,
- 0x9d,0xad,0x8b,0xbb,0x96,0xc4,0xcd,0xc0, 0x3b,0xc1,0x03,0xe1,0xa1,0x94,0xbb,0xd8,
- 0x39,0x31,0x25,0x23,0xa7,0x86,0x62,0xd5, 0xbe,0x7f,0xcb,0xcc,0x98,0xeb,0xf5,0xa8};
-static const unsigned char vector_64[64] =
-{0x97,0x68,0x72,0x68,0xd6,0xec,0xcc,0xc0, 0xc0,0x7b,0x25,0xe2,0x5e,0xcf,0xe5,0x84,
- 0x39,0x31,0x25,0x23,0xa7,0x86,0x62,0xd5, 0xbe,0x7f,0xcb,0xcc,0x98,0xeb,0xf5,0xa8,
- 0x48,0x07,0xef,0xe8,0x36,0xee,0x89,0xa5, 0x26,0x73,0x0d,0xbc,0x2f,0x7b,0xc8,0x40,
- 0x9d,0xad,0x8b,0xbb,0x96,0xc4,0xcd,0xc0, 0x3b,0xc1,0x03,0xe1,0xa1,0x94,0xbb,0xd8};
+ "I would like the" " General Gau's C"
+ "hicken, please, " "and wonton soup.";
+static const unsigned char test_iv[16] =
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+static const unsigned char vector_17[17] = {
+ 0xc6, 0x35, 0x35, 0x68, 0xf2, 0xbf, 0x8c, 0xb4,
+ 0xd8, 0xa5, 0x80, 0x36, 0x2d, 0xa7, 0xff, 0x7f,
+ 0x97
+};
+
+static const unsigned char vector_31[31] = {
+ 0xfc, 0x00, 0x78, 0x3e, 0x0e, 0xfd, 0xb2, 0xc1,
+ 0xd4, 0x45, 0xd4, 0xc8, 0xef, 0xf7, 0xed, 0x22,
+ 0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0,
+ 0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5
+};
+
+static const unsigned char vector_32[32] = {
+ 0x39, 0x31, 0x25, 0x23, 0xa7, 0x86, 0x62, 0xd5,
+ 0xbe, 0x7f, 0xcb, 0xcc, 0x98, 0xeb, 0xf5, 0xa8,
+ 0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0,
+ 0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5, 0x84
+};
+
+static const unsigned char vector_47[47] = {
+ 0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0,
+ 0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5, 0x84,
+ 0xb3, 0xff, 0xfd, 0x94, 0x0c, 0x16, 0xa1, 0x8c,
+ 0x1b, 0x55, 0x49, 0xd2, 0xf8, 0x38, 0x02, 0x9e,
+ 0x39, 0x31, 0x25, 0x23, 0xa7, 0x86, 0x62, 0xd5,
+ 0xbe, 0x7f, 0xcb, 0xcc, 0x98, 0xeb, 0xf5
+};
+
+static const unsigned char vector_48[48] = {
+ 0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0,
+ 0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5, 0x84,
+ 0x9d, 0xad, 0x8b, 0xbb, 0x96, 0xc4, 0xcd, 0xc0,
+ 0x3b, 0xc1, 0x03, 0xe1, 0xa1, 0x94, 0xbb, 0xd8,
+ 0x39, 0x31, 0x25, 0x23, 0xa7, 0x86, 0x62, 0xd5,
+ 0xbe, 0x7f, 0xcb, 0xcc, 0x98, 0xeb, 0xf5, 0xa8
+};
+
+static const unsigned char vector_64[64] = {
+ 0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0,
+ 0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5, 0x84,
+ 0x39, 0x31, 0x25, 0x23, 0xa7, 0x86, 0x62, 0xd5,
+ 0xbe, 0x7f, 0xcb, 0xcc, 0x98, 0xeb, 0xf5, 0xa8,
+ 0x48, 0x07, 0xef, 0xe8, 0x36, 0xee, 0x89, 0xa5,
+ 0x26, 0x73, 0x0d, 0xbc, 0x2f, 0x7b, 0xc8, 0x40,
+ 0x9d, 0xad, 0x8b, 0xbb, 0x96, 0xc4, 0xcd, 0xc0,
+ 0x3b, 0xc1, 0x03, 0xe1, 0xa1, 0x94, 0xbb, 0xd8
+};
static AES_KEY encks, decks;
-void test_vector(const unsigned char *vector,size_t len)
-{ unsigned char iv[sizeof(test_iv)];
- unsigned char cleartext[64],ciphertext[64];
- size_t tail;
-
- printf("vector_%d\n",len); fflush(stdout);
-
- if ((tail=len%16) == 0) tail = 16;
- tail += 16;
-
- /* test block-based encryption */
- memcpy(iv,test_iv,sizeof(test_iv));
- CRYPTO_cts128_encrypt_block(test_input,ciphertext,len,&encks,iv,(block128_f)AES_encrypt);
- if (memcmp(ciphertext,vector,len))
- fprintf(stderr,"output_%d mismatch\n",len), exit(1);
- if (memcmp(iv,vector+len-tail,sizeof(iv)))
- fprintf(stderr,"iv_%d mismatch\n",len), exit(1);
-
- /* test block-based decryption */
- memcpy(iv,test_iv,sizeof(test_iv));
- CRYPTO_cts128_decrypt_block(ciphertext,cleartext,len,&decks,iv,(block128_f)AES_decrypt);
- if (memcmp(cleartext,test_input,len))
- fprintf(stderr,"input_%d mismatch\n",len), exit(2);
- if (memcmp(iv,vector+len-tail,sizeof(iv)))
- fprintf(stderr,"iv_%d mismatch\n",len), exit(2);
-
- /* test streamed encryption */
- memcpy(iv,test_iv,sizeof(test_iv));
- CRYPTO_cts128_encrypt(test_input,ciphertext,len,&encks,iv,(cbc128_f)AES_cbc_encrypt);
- if (memcmp(ciphertext,vector,len))
- fprintf(stderr,"output_%d mismatch\n",len), exit(3);
- if (memcmp(iv,vector+len-tail,sizeof(iv)))
- fprintf(stderr,"iv_%d mismatch\n",len), exit(3);
-
- /* test streamed decryption */
- memcpy(iv,test_iv,sizeof(test_iv));
- CRYPTO_cts128_decrypt(ciphertext,cleartext,len,&decks,iv,(cbc128_f)AES_cbc_encrypt);
- if (memcmp(cleartext,test_input,len))
- fprintf(stderr,"input_%d mismatch\n",len), exit(4);
- if (memcmp(iv,vector+len-tail,sizeof(iv)))
- fprintf(stderr,"iv_%d mismatch\n",len), exit(4);
+void test_vector(const unsigned char *vector, size_t len)
+{
+ unsigned char iv[sizeof(test_iv)];
+ unsigned char cleartext[64], ciphertext[64];
+ size_t tail;
+
+ printf("vector_%d\n", len);
+ fflush(stdout);
+
+ if ((tail = len % 16) == 0)
+ tail = 16;
+ tail += 16;
+
+ /* test block-based encryption */
+ memcpy(iv, test_iv, sizeof(test_iv));
+ CRYPTO_cts128_encrypt_block(test_input, ciphertext, len, &encks, iv,
+ (block128_f) AES_encrypt);
+ if (memcmp(ciphertext, vector, len))
+ fprintf(stderr, "output_%d mismatch\n", len), exit(1);
+ if (memcmp(iv, vector + len - tail, sizeof(iv)))
+ fprintf(stderr, "iv_%d mismatch\n", len), exit(1);
+
+ /* test block-based decryption */
+ memcpy(iv, test_iv, sizeof(test_iv));
+ CRYPTO_cts128_decrypt_block(ciphertext, cleartext, len, &decks, iv,
+ (block128_f) AES_decrypt);
+ if (memcmp(cleartext, test_input, len))
+ fprintf(stderr, "input_%d mismatch\n", len), exit(2);
+ if (memcmp(iv, vector + len - tail, sizeof(iv)))
+ fprintf(stderr, "iv_%d mismatch\n", len), exit(2);
+
+ /* test streamed encryption */
+ memcpy(iv, test_iv, sizeof(test_iv));
+ CRYPTO_cts128_encrypt(test_input, ciphertext, len, &encks, iv,
+ (cbc128_f) AES_cbc_encrypt);
+ if (memcmp(ciphertext, vector, len))
+ fprintf(stderr, "output_%d mismatch\n", len), exit(3);
+ if (memcmp(iv, vector + len - tail, sizeof(iv)))
+ fprintf(stderr, "iv_%d mismatch\n", len), exit(3);
+
+ /* test streamed decryption */
+ memcpy(iv, test_iv, sizeof(test_iv));
+ CRYPTO_cts128_decrypt(ciphertext, cleartext, len, &decks, iv,
+ (cbc128_f) AES_cbc_encrypt);
+ if (memcmp(cleartext, test_input, len))
+ fprintf(stderr, "input_%d mismatch\n", len), exit(4);
+ if (memcmp(iv, vector + len - tail, sizeof(iv)))
+ fprintf(stderr, "iv_%d mismatch\n", len), exit(4);
}
-void test_nistvector(const unsigned char *vector,size_t len)
-{ unsigned char iv[sizeof(test_iv)];
- unsigned char cleartext[64],ciphertext[64],nistvector[64];
- size_t tail;
-
- printf("nistvector_%d\n",len); fflush(stdout);
-
- if ((tail=len%16) == 0) tail = 16;
-
- len -= 16 + tail;
- memcpy(nistvector,vector,len);
- /* flip two last blocks */
- memcpy(nistvector+len,vector+len+16,tail);
- memcpy(nistvector+len+tail,vector+len,16);
- len += 16 + tail;
- tail = 16;
-
- /* test block-based encryption */
- memcpy(iv,test_iv,sizeof(test_iv));
- CRYPTO_nistcts128_encrypt_block(test_input,ciphertext,len,&encks,iv,(block128_f)AES_encrypt);
- if (memcmp(ciphertext,nistvector,len))
- fprintf(stderr,"output_%d mismatch\n",len), exit(1);
- if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
- fprintf(stderr,"iv_%d mismatch\n",len), exit(1);
-
- /* test block-based decryption */
- memcpy(iv,test_iv,sizeof(test_iv));
- CRYPTO_nistcts128_decrypt_block(ciphertext,cleartext,len,&decks,iv,(block128_f)AES_decrypt);
- if (memcmp(cleartext,test_input,len))
- fprintf(stderr,"input_%d mismatch\n",len), exit(2);
- if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
- fprintf(stderr,"iv_%d mismatch\n",len), exit(2);
-
- /* test streamed encryption */
- memcpy(iv,test_iv,sizeof(test_iv));
- CRYPTO_nistcts128_encrypt(test_input,ciphertext,len,&encks,iv,(cbc128_f)AES_cbc_encrypt);
- if (memcmp(ciphertext,nistvector,len))
- fprintf(stderr,"output_%d mismatch\n",len), exit(3);
- if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
- fprintf(stderr,"iv_%d mismatch\n",len), exit(3);
-
- /* test streamed decryption */
- memcpy(iv,test_iv,sizeof(test_iv));
- CRYPTO_nistcts128_decrypt(ciphertext,cleartext,len,&decks,iv,(cbc128_f)AES_cbc_encrypt);
- if (memcmp(cleartext,test_input,len))
- fprintf(stderr,"input_%d mismatch\n",len), exit(4);
- if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
- fprintf(stderr,"iv_%d mismatch\n",len), exit(4);
+void test_nistvector(const unsigned char *vector, size_t len)
+{
+ unsigned char iv[sizeof(test_iv)];
+ unsigned char cleartext[64], ciphertext[64], nistvector[64];
+ size_t tail;
+
+ printf("nistvector_%d\n", len);
+ fflush(stdout);
+
+ if ((tail = len % 16) == 0)
+ tail = 16;
+
+ len -= 16 + tail;
+ memcpy(nistvector, vector, len);
+ /* flip two last blocks */
+ memcpy(nistvector + len, vector + len + 16, tail);
+ memcpy(nistvector + len + tail, vector + len, 16);
+ len += 16 + tail;
+ tail = 16;
+
+ /* test block-based encryption */
+ memcpy(iv, test_iv, sizeof(test_iv));
+ CRYPTO_nistcts128_encrypt_block(test_input, ciphertext, len, &encks, iv,
+ (block128_f) AES_encrypt);
+ if (memcmp(ciphertext, nistvector, len))
+ fprintf(stderr, "output_%d mismatch\n", len), exit(1);
+ if (memcmp(iv, nistvector + len - tail, sizeof(iv)))
+ fprintf(stderr, "iv_%d mismatch\n", len), exit(1);
+
+ /* test block-based decryption */
+ memcpy(iv, test_iv, sizeof(test_iv));
+ CRYPTO_nistcts128_decrypt_block(ciphertext, cleartext, len, &decks, iv,
+ (block128_f) AES_decrypt);
+ if (memcmp(cleartext, test_input, len))
+ fprintf(stderr, "input_%d mismatch\n", len), exit(2);
+ if (memcmp(iv, nistvector + len - tail, sizeof(iv)))
+ fprintf(stderr, "iv_%d mismatch\n", len), exit(2);
+
+ /* test streamed encryption */
+ memcpy(iv, test_iv, sizeof(test_iv));
+ CRYPTO_nistcts128_encrypt(test_input, ciphertext, len, &encks, iv,
+ (cbc128_f) AES_cbc_encrypt);
+ if (memcmp(ciphertext, nistvector, len))
+ fprintf(stderr, "output_%d mismatch\n", len), exit(3);
+ if (memcmp(iv, nistvector + len - tail, sizeof(iv)))
+ fprintf(stderr, "iv_%d mismatch\n", len), exit(3);
+
+ /* test streamed decryption */
+ memcpy(iv, test_iv, sizeof(test_iv));
+ CRYPTO_nistcts128_decrypt(ciphertext, cleartext, len, &decks, iv,
+ (cbc128_f) AES_cbc_encrypt);
+ if (memcmp(cleartext, test_input, len))
+ fprintf(stderr, "input_%d mismatch\n", len), exit(4);
+ if (memcmp(iv, nistvector + len - tail, sizeof(iv)))
+ fprintf(stderr, "iv_%d mismatch\n", len), exit(4);
}
int main()
{
- AES_set_encrypt_key(test_key,128,&encks);
- AES_set_decrypt_key(test_key,128,&decks);
-
- test_vector(vector_17,sizeof(vector_17));
- test_vector(vector_31,sizeof(vector_31));
- test_vector(vector_32,sizeof(vector_32));
- test_vector(vector_47,sizeof(vector_47));
- test_vector(vector_48,sizeof(vector_48));
- test_vector(vector_64,sizeof(vector_64));
-
- test_nistvector(vector_17,sizeof(vector_17));
- test_nistvector(vector_31,sizeof(vector_31));
- test_nistvector(vector_32,sizeof(vector_32));
- test_nistvector(vector_47,sizeof(vector_47));
- test_nistvector(vector_48,sizeof(vector_48));
- test_nistvector(vector_64,sizeof(vector_64));
-
- return 0;
+ AES_set_encrypt_key(test_key, 128, &encks);
+ AES_set_decrypt_key(test_key, 128, &decks);
+
+ test_vector(vector_17, sizeof(vector_17));
+ test_vector(vector_31, sizeof(vector_31));
+ test_vector(vector_32, sizeof(vector_32));
+ test_vector(vector_47, sizeof(vector_47));
+ test_vector(vector_48, sizeof(vector_48));
+ test_vector(vector_64, sizeof(vector_64));
+
+ test_nistvector(vector_17, sizeof(vector_17));
+ test_nistvector(vector_31, sizeof(vector_31));
+ test_nistvector(vector_32, sizeof(vector_32));
+ test_nistvector(vector_47, sizeof(vector_47));
+ test_nistvector(vector_48, sizeof(vector_48));
+ test_nistvector(vector_64, sizeof(vector_64));
+
+ return 0;
}
#endif
diff --git a/openssl/crypto/modes/gcm128.c b/openssl/crypto/modes/gcm128.c
index e1dc2b0f4..4debf537f 100644
--- a/openssl/crypto/modes/gcm128.c
+++ b/openssl/crypto/modes/gcm128.c
@@ -6,7 +6,7 @@
* are met:
*
* 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
+ * notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
@@ -62,27 +62,27 @@
#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
/* redefine, because alignment is ensured */
-#undef GETU32
-#define GETU32(p) BSWAP4(*(const u32 *)(p))
-#undef PUTU32
-#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
-#endif
-
-#define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
-#define REDUCE1BIT(V) do { \
- if (sizeof(size_t)==8) { \
- u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
- V.lo = (V.hi<<63)|(V.lo>>1); \
- V.hi = (V.hi>>1 )^T; \
- } \
- else { \
- u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
- V.lo = (V.hi<<63)|(V.lo>>1); \
- V.hi = (V.hi>>1 )^((u64)T<<32); \
- } \
+# undef GETU32
+# define GETU32(p) BSWAP4(*(const u32 *)(p))
+# undef PUTU32
+# define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
+#endif
+
+#define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
+#define REDUCE1BIT(V) do { \
+ if (sizeof(size_t)==8) { \
+ u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
+ V.lo = (V.hi<<63)|(V.lo>>1); \
+ V.hi = (V.hi>>1 )^T; \
+ } \
+ else { \
+ u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
+ V.lo = (V.hi<<63)|(V.lo>>1); \
+ V.hi = (V.hi>>1 )^((u64)T<<32); \
+ } \
} while(0)
-/*
+/*-
* Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
* never be set to 8. 8 is effectively reserved for testing purposes.
* TABLE_BITS>1 are lookup-table-driven implementations referred to as
@@ -116,286 +116,311 @@
*
* Value of 1 is not appropriate for performance reasons.
*/
-#if TABLE_BITS==8
+#if TABLE_BITS==8
static void gcm_init_8bit(u128 Htable[256], u64 H[2])
{
- int i, j;
- u128 V;
-
- Htable[0].hi = 0;
- Htable[0].lo = 0;
- V.hi = H[0];
- V.lo = H[1];
-
- for (Htable[128]=V, i=64; i>0; i>>=1) {
- REDUCE1BIT(V);
- Htable[i] = V;
- }
-
- for (i=2; i<256; i<<=1) {
- u128 *Hi = Htable+i, H0 = *Hi;
- for (j=1; j<i; ++j) {
- Hi[j].hi = H0.hi^Htable[j].hi;
- Hi[j].lo = H0.lo^Htable[j].lo;
- }
- }
+ int i, j;
+ u128 V;
+
+ Htable[0].hi = 0;
+ Htable[0].lo = 0;
+ V.hi = H[0];
+ V.lo = H[1];
+
+ for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
+ REDUCE1BIT(V);
+ Htable[i] = V;
+ }
+
+ for (i = 2; i < 256; i <<= 1) {
+ u128 *Hi = Htable + i, H0 = *Hi;
+ for (j = 1; j < i; ++j) {
+ Hi[j].hi = H0.hi ^ Htable[j].hi;
+ Hi[j].lo = H0.lo ^ Htable[j].lo;
+ }
+ }
}
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
- u128 Z = { 0, 0};
- const u8 *xi = (const u8 *)Xi+15;
- size_t rem, n = *xi;
- const union { long one; char little; } is_endian = {1};
- static const size_t rem_8bit[256] = {
- PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
- PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
- PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
- PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
- PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
- PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
- PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
- PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
- PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
- PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
- PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
- PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
- PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
- PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
- PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
- PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
- PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
- PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
- PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
- PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
- PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
- PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
- PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
- PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
- PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
- PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
- PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
- PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
- PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
- PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
- PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
- PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
- PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
- PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
- PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
- PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
- PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
- PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
- PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
- PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
- PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
- PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
- PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
- PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
- PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
- PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
- PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
- PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
- PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
- PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
- PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
- PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
- PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
- PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
- PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
- PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
- PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
- PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
- PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
- PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
- PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
- PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
- PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
- PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
-
- while (1) {
- Z.hi ^= Htable[n].hi;
- Z.lo ^= Htable[n].lo;
-
- if ((u8 *)Xi==xi) break;
-
- n = *(--xi);
-
- rem = (size_t)Z.lo&0xff;
- Z.lo = (Z.hi<<56)|(Z.lo>>8);
- Z.hi = (Z.hi>>8);
- if (sizeof(size_t)==8)
- Z.hi ^= rem_8bit[rem];
- else
- Z.hi ^= (u64)rem_8bit[rem]<<32;
- }
-
- if (is_endian.little) {
-#ifdef BSWAP8
- Xi[0] = BSWAP8(Z.hi);
- Xi[1] = BSWAP8(Z.lo);
-#else
- u8 *p = (u8 *)Xi;
- u32 v;
- v = (u32)(Z.hi>>32); PUTU32(p,v);
- v = (u32)(Z.hi); PUTU32(p+4,v);
- v = (u32)(Z.lo>>32); PUTU32(p+8,v);
- v = (u32)(Z.lo); PUTU32(p+12,v);
-#endif
- }
- else {
- Xi[0] = Z.hi;
- Xi[1] = Z.lo;
- }
+ u128 Z = { 0, 0 };
+ const u8 *xi = (const u8 *)Xi + 15;
+ size_t rem, n = *xi;
+ const union {
+ long one;
+ char little;
+ } is_endian = {
+ 1
+ };
+ static const size_t rem_8bit[256] = {
+ PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
+ PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
+ PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
+ PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
+ PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
+ PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
+ PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
+ PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
+ PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
+ PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
+ PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
+ PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
+ PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
+ PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
+ PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
+ PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
+ PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
+ PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
+ PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
+ PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
+ PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
+ PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
+ PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
+ PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
+ PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
+ PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
+ PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
+ PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
+ PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
+ PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
+ PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
+ PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
+ PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
+ PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
+ PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
+ PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
+ PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
+ PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
+ PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
+ PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
+ PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
+ PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
+ PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
+ PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
+ PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
+ PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
+ PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
+ PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
+ PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
+ PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
+ PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
+ PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
+ PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
+ PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
+ PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
+ PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
+ PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
+ PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
+ PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
+ PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
+ PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
+ PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
+ PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
+ PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE)
+ };
+
+ while (1) {
+ Z.hi ^= Htable[n].hi;
+ Z.lo ^= Htable[n].lo;
+
+ if ((u8 *)Xi == xi)
+ break;
+
+ n = *(--xi);
+
+ rem = (size_t)Z.lo & 0xff;
+ Z.lo = (Z.hi << 56) | (Z.lo >> 8);
+ Z.hi = (Z.hi >> 8);
+ if (sizeof(size_t) == 8)
+ Z.hi ^= rem_8bit[rem];
+ else
+ Z.hi ^= (u64)rem_8bit[rem] << 32;
+ }
+
+ if (is_endian.little) {
+# ifdef BSWAP8
+ Xi[0] = BSWAP8(Z.hi);
+ Xi[1] = BSWAP8(Z.lo);
+# else
+ u8 *p = (u8 *)Xi;
+ u32 v;
+ v = (u32)(Z.hi >> 32);
+ PUTU32(p, v);
+ v = (u32)(Z.hi);
+ PUTU32(p + 4, v);
+ v = (u32)(Z.lo >> 32);
+ PUTU32(p + 8, v);
+ v = (u32)(Z.lo);
+ PUTU32(p + 12, v);
+# endif
+ } else {
+ Xi[0] = Z.hi;
+ Xi[1] = Z.lo;
+ }
}
-#define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
-#elif TABLE_BITS==4
+# define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
+
+#elif TABLE_BITS==4
static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
- u128 V;
-#if defined(OPENSSL_SMALL_FOOTPRINT)
- int i;
-#endif
+ u128 V;
+# if defined(OPENSSL_SMALL_FOOTPRINT)
+ int i;
+# endif
- Htable[0].hi = 0;
- Htable[0].lo = 0;
- V.hi = H[0];
- V.lo = H[1];
-
-#if defined(OPENSSL_SMALL_FOOTPRINT)
- for (Htable[8]=V, i=4; i>0; i>>=1) {
- REDUCE1BIT(V);
- Htable[i] = V;
- }
-
- for (i=2; i<16; i<<=1) {
- u128 *Hi = Htable+i;
- int j;
- for (V=*Hi, j=1; j<i; ++j) {
- Hi[j].hi = V.hi^Htable[j].hi;
- Hi[j].lo = V.lo^Htable[j].lo;
- }
- }
-#else
- Htable[8] = V;
- REDUCE1BIT(V);
- Htable[4] = V;
- REDUCE1BIT(V);
- Htable[2] = V;
- REDUCE1BIT(V);
- Htable[1] = V;
- Htable[3].hi = V.hi^Htable[2].hi, Htable[3].lo = V.lo^Htable[2].lo;
- V=Htable[4];
- Htable[5].hi = V.hi^Htable[1].hi, Htable[5].lo = V.lo^Htable[1].lo;
- Htable[6].hi = V.hi^Htable[2].hi, Htable[6].lo = V.lo^Htable[2].lo;
- Htable[7].hi = V.hi^Htable[3].hi, Htable[7].lo = V.lo^Htable[3].lo;
- V=Htable[8];
- Htable[9].hi = V.hi^Htable[1].hi, Htable[9].lo = V.lo^Htable[1].lo;
- Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
- Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
- Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
- Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
- Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
- Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
-#endif
-#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
- /*
- * ARM assembler expects specific dword order in Htable.
- */
- {
- int j;
- const union { long one; char little; } is_endian = {1};
-
- if (is_endian.little)
- for (j=0;j<16;++j) {
- V = Htable[j];
- Htable[j].hi = V.lo;
- Htable[j].lo = V.hi;
- }
- else
- for (j=0;j<16;++j) {
- V = Htable[j];
- Htable[j].hi = V.lo<<32|V.lo>>32;
- Htable[j].lo = V.hi<<32|V.hi>>32;
- }
- }
-#endif
+ Htable[0].hi = 0;
+ Htable[0].lo = 0;
+ V.hi = H[0];
+ V.lo = H[1];
+
+# if defined(OPENSSL_SMALL_FOOTPRINT)
+ for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
+ REDUCE1BIT(V);
+ Htable[i] = V;
+ }
+
+ for (i = 2; i < 16; i <<= 1) {
+ u128 *Hi = Htable + i;
+ int j;
+ for (V = *Hi, j = 1; j < i; ++j) {
+ Hi[j].hi = V.hi ^ Htable[j].hi;
+ Hi[j].lo = V.lo ^ Htable[j].lo;
+ }
+ }
+# else
+ Htable[8] = V;
+ REDUCE1BIT(V);
+ Htable[4] = V;
+ REDUCE1BIT(V);
+ Htable[2] = V;
+ REDUCE1BIT(V);
+ Htable[1] = V;
+ Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
+ V = Htable[4];
+ Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
+ Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
+ Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
+ V = Htable[8];
+ Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
+ Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
+ Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
+ Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
+ Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
+ Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
+ Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
+# endif
+# if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
+ /*
+ * ARM assembler expects specific dword order in Htable.
+ */
+ {
+ int j;
+ const union {
+ long one;
+ char little;
+ } is_endian = {
+ 1
+ };
+
+ if (is_endian.little)
+ for (j = 0; j < 16; ++j) {
+ V = Htable[j];
+ Htable[j].hi = V.lo;
+ Htable[j].lo = V.hi;
+ } else
+ for (j = 0; j < 16; ++j) {
+ V = Htable[j];
+ Htable[j].hi = V.lo << 32 | V.lo >> 32;
+ Htable[j].lo = V.hi << 32 | V.hi >> 32;
+ }
+ }
+# endif
}
-#ifndef GHASH_ASM
+# ifndef GHASH_ASM
static const size_t rem_4bit[16] = {
- PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
- PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
- PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
- PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
+ PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
+ PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
+ PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
+ PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)
+};
static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
- u128 Z;
- int cnt = 15;
- size_t rem, nlo, nhi;
- const union { long one; char little; } is_endian = {1};
-
- nlo = ((const u8 *)Xi)[15];
- nhi = nlo>>4;
- nlo &= 0xf;
-
- Z.hi = Htable[nlo].hi;
- Z.lo = Htable[nlo].lo;
-
- while (1) {
- rem = (size_t)Z.lo&0xf;
- Z.lo = (Z.hi<<60)|(Z.lo>>4);
- Z.hi = (Z.hi>>4);
- if (sizeof(size_t)==8)
- Z.hi ^= rem_4bit[rem];
- else
- Z.hi ^= (u64)rem_4bit[rem]<<32;
-
- Z.hi ^= Htable[nhi].hi;
- Z.lo ^= Htable[nhi].lo;
-
- if (--cnt<0) break;
-
- nlo = ((const u8 *)Xi)[cnt];
- nhi = nlo>>4;
- nlo &= 0xf;
-
- rem = (size_t)Z.lo&0xf;
- Z.lo = (Z.hi<<60)|(Z.lo>>4);
- Z.hi = (Z.hi>>4);
- if (sizeof(size_t)==8)
- Z.hi ^= rem_4bit[rem];
- else
- Z.hi ^= (u64)rem_4bit[rem]<<32;
-
- Z.hi ^= Htable[nlo].hi;
- Z.lo ^= Htable[nlo].lo;
- }
-
- if (is_endian.little) {
-#ifdef BSWAP8
- Xi[0] = BSWAP8(Z.hi);
- Xi[1] = BSWAP8(Z.lo);
-#else
- u8 *p = (u8 *)Xi;
- u32 v;
- v = (u32)(Z.hi>>32); PUTU32(p,v);
- v = (u32)(Z.hi); PUTU32(p+4,v);
- v = (u32)(Z.lo>>32); PUTU32(p+8,v);
- v = (u32)(Z.lo); PUTU32(p+12,v);
-#endif
- }
- else {
- Xi[0] = Z.hi;
- Xi[1] = Z.lo;
- }
+ u128 Z;
+ int cnt = 15;
+ size_t rem, nlo, nhi;
+ const union {
+ long one;
+ char little;
+ } is_endian = {
+ 1
+ };
+
+ nlo = ((const u8 *)Xi)[15];
+ nhi = nlo >> 4;
+ nlo &= 0xf;
+
+ Z.hi = Htable[nlo].hi;
+ Z.lo = Htable[nlo].lo;
+
+ while (1) {
+ rem = (size_t)Z.lo & 0xf;
+ Z.lo = (Z.hi << 60) | (Z.lo >> 4);
+ Z.hi = (Z.hi >> 4);
+ if (sizeof(size_t) == 8)
+ Z.hi ^= rem_4bit[rem];
+ else
+ Z.hi ^= (u64)rem_4bit[rem] << 32;
+
+ Z.hi ^= Htable[nhi].hi;
+ Z.lo ^= Htable[nhi].lo;
+
+ if (--cnt < 0)
+ break;
+
+ nlo = ((const u8 *)Xi)[cnt];
+ nhi = nlo >> 4;
+ nlo &= 0xf;
+
+ rem = (size_t)Z.lo & 0xf;
+ Z.lo = (Z.hi << 60) | (Z.lo >> 4);
+ Z.hi = (Z.hi >> 4);
+ if (sizeof(size_t) == 8)
+ Z.hi ^= rem_4bit[rem];
+ else
+ Z.hi ^= (u64)rem_4bit[rem] << 32;
+
+ Z.hi ^= Htable[nlo].hi;
+ Z.lo ^= Htable[nlo].lo;
+ }
+
+ if (is_endian.little) {
+# ifdef BSWAP8
+ Xi[0] = BSWAP8(Z.hi);
+ Xi[1] = BSWAP8(Z.lo);
+# else
+ u8 *p = (u8 *)Xi;
+ u32 v;
+ v = (u32)(Z.hi >> 32);
+ PUTU32(p, v);
+ v = (u32)(Z.hi);
+ PUTU32(p + 4, v);
+ v = (u32)(Z.lo >> 32);
+ PUTU32(p + 8, v);
+ v = (u32)(Z.lo);
+ PUTU32(p + 12, v);
+# endif
+ } else {
+ Xi[0] = Z.hi;
+ Xi[1] = Z.lo;
+ }
}
-#if !defined(OPENSSL_SMALL_FOOTPRINT)
+# if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
* Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
* details... Compiler-generated code doesn't seem to give any
@@ -403,1503 +428,1936 @@ static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
* mostly as reference and a placeholder for possible future
* non-trivial optimization[s]...
*/
-static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
- const u8 *inp,size_t len)
+static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
+ const u8 *inp, size_t len)
{
u128 Z;
int cnt;
size_t rem, nlo, nhi;
- const union { long one; char little; } is_endian = {1};
-
-#if 1
+ const union {
+ long one;
+ char little;
+ } is_endian = {
+ 1
+ };
+
+# if 1
do {
- cnt = 15;
- nlo = ((const u8 *)Xi)[15];
- nlo ^= inp[15];
- nhi = nlo>>4;
- nlo &= 0xf;
-
- Z.hi = Htable[nlo].hi;
- Z.lo = Htable[nlo].lo;
-
- while (1) {
- rem = (size_t)Z.lo&0xf;
- Z.lo = (Z.hi<<60)|(Z.lo>>4);
- Z.hi = (Z.hi>>4);
- if (sizeof(size_t)==8)
- Z.hi ^= rem_4bit[rem];
- else
- Z.hi ^= (u64)rem_4bit[rem]<<32;
-
- Z.hi ^= Htable[nhi].hi;
- Z.lo ^= Htable[nhi].lo;
-
- if (--cnt<0) break;
-
- nlo = ((const u8 *)Xi)[cnt];
- nlo ^= inp[cnt];
- nhi = nlo>>4;
- nlo &= 0xf;
-
- rem = (size_t)Z.lo&0xf;
- Z.lo = (Z.hi<<60)|(Z.lo>>4);
- Z.hi = (Z.hi>>4);
- if (sizeof(size_t)==8)
- Z.hi ^= rem_4bit[rem];
- else
- Z.hi ^= (u64)rem_4bit[rem]<<32;
-
- Z.hi ^= Htable[nlo].hi;
- Z.lo ^= Htable[nlo].lo;
- }
-#else
+ cnt = 15;
+ nlo = ((const u8 *)Xi)[15];
+ nlo ^= inp[15];
+ nhi = nlo >> 4;
+ nlo &= 0xf;
+
+ Z.hi = Htable[nlo].hi;
+ Z.lo = Htable[nlo].lo;
+
+ while (1) {
+ rem = (size_t)Z.lo & 0xf;
+ Z.lo = (Z.hi << 60) | (Z.lo >> 4);
+ Z.hi = (Z.hi >> 4);
+ if (sizeof(size_t) == 8)
+ Z.hi ^= rem_4bit[rem];
+ else
+ Z.hi ^= (u64)rem_4bit[rem] << 32;
+
+ Z.hi ^= Htable[nhi].hi;
+ Z.lo ^= Htable[nhi].lo;
+
+ if (--cnt < 0)
+ break;
+
+ nlo = ((const u8 *)Xi)[cnt];
+ nlo ^= inp[cnt];
+ nhi = nlo >> 4;
+ nlo &= 0xf;
+
+ rem = (size_t)Z.lo & 0xf;
+ Z.lo = (Z.hi << 60) | (Z.lo >> 4);
+ Z.hi = (Z.hi >> 4);
+ if (sizeof(size_t) == 8)
+ Z.hi ^= rem_4bit[rem];
+ else
+ Z.hi ^= (u64)rem_4bit[rem] << 32;
+
+ Z.hi ^= Htable[nlo].hi;
+ Z.lo ^= Htable[nlo].lo;
+ }
+# else
/*
* Extra 256+16 bytes per-key plus 512 bytes shared tables
* [should] give ~50% improvement... One could have PACK()-ed
* the rem_8bit even here, but the priority is to minimize
* cache footprint...
- */
- u128 Hshr4[16]; /* Htable shifted right by 4 bits */
- u8 Hshl4[16]; /* Htable shifted left by 4 bits */
+ */
+ u128 Hshr4[16]; /* Htable shifted right by 4 bits */
+ u8 Hshl4[16]; /* Htable shifted left by 4 bits */
static const unsigned short rem_8bit[256] = {
- 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
- 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
- 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
- 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
- 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
- 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
- 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
- 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
- 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
- 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
- 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
- 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
- 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
- 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
- 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
- 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
- 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
- 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
- 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
- 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
- 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
- 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
- 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
- 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
- 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
- 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
- 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
- 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
- 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
- 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
- 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
- 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
+ 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
+ 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
+ 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
+ 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
+ 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
+ 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
+ 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
+ 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
+ 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
+ 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
+ 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
+ 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
+ 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
+ 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
+ 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
+ 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
+ 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
+ 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
+ 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
+ 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
+ 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
+ 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
+ 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
+ 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
+ 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
+ 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
+ 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
+ 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
+ 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
+ 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
+ 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
+ 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE
+ };
/*
* This pre-processing phase slows down procedure by approximately
* same time as it makes each loop spin faster. In other words
* single block performance is approximately same as straightforward
* "4-bit" implementation, and then it goes only faster...
*/
- for (cnt=0; cnt<16; ++cnt) {
- Z.hi = Htable[cnt].hi;
- Z.lo = Htable[cnt].lo;
- Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
- Hshr4[cnt].hi = (Z.hi>>4);
- Hshl4[cnt] = (u8)(Z.lo<<4);
+ for (cnt = 0; cnt < 16; ++cnt) {
+ Z.hi = Htable[cnt].hi;
+ Z.lo = Htable[cnt].lo;
+ Hshr4[cnt].lo = (Z.hi << 60) | (Z.lo >> 4);
+ Hshr4[cnt].hi = (Z.hi >> 4);
+ Hshl4[cnt] = (u8)(Z.lo << 4);
}
do {
- for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
- nlo = ((const u8 *)Xi)[cnt];
- nlo ^= inp[cnt];
- nhi = nlo>>4;
- nlo &= 0xf;
+ for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
+ nlo = ((const u8 *)Xi)[cnt];
+ nlo ^= inp[cnt];
+ nhi = nlo >> 4;
+ nlo &= 0xf;
- Z.hi ^= Htable[nlo].hi;
- Z.lo ^= Htable[nlo].lo;
+ Z.hi ^= Htable[nlo].hi;
+ Z.lo ^= Htable[nlo].lo;
- rem = (size_t)Z.lo&0xff;
+ rem = (size_t)Z.lo & 0xff;
- Z.lo = (Z.hi<<56)|(Z.lo>>8);
- Z.hi = (Z.hi>>8);
+ Z.lo = (Z.hi << 56) | (Z.lo >> 8);
+ Z.hi = (Z.hi >> 8);
- Z.hi ^= Hshr4[nhi].hi;
- Z.lo ^= Hshr4[nhi].lo;
- Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
- }
+ Z.hi ^= Hshr4[nhi].hi;
+ Z.lo ^= Hshr4[nhi].lo;
+ Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
+ }
- nlo = ((const u8 *)Xi)[0];
- nlo ^= inp[0];
- nhi = nlo>>4;
- nlo &= 0xf;
+ nlo = ((const u8 *)Xi)[0];
+ nlo ^= inp[0];
+ nhi = nlo >> 4;
+ nlo &= 0xf;
- Z.hi ^= Htable[nlo].hi;
- Z.lo ^= Htable[nlo].lo;
+ Z.hi ^= Htable[nlo].hi;
+ Z.lo ^= Htable[nlo].lo;
- rem = (size_t)Z.lo&0xf;
+ rem = (size_t)Z.lo & 0xf;
- Z.lo = (Z.hi<<60)|(Z.lo>>4);
- Z.hi = (Z.hi>>4);
+ Z.lo = (Z.hi << 60) | (Z.lo >> 4);
+ Z.hi = (Z.hi >> 4);
- Z.hi ^= Htable[nhi].hi;
- Z.lo ^= Htable[nhi].lo;
- Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
-#endif
+ Z.hi ^= Htable[nhi].hi;
+ Z.lo ^= Htable[nhi].lo;
+ Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
+# endif
- if (is_endian.little) {
-#ifdef BSWAP8
- Xi[0] = BSWAP8(Z.hi);
- Xi[1] = BSWAP8(Z.lo);
-#else
- u8 *p = (u8 *)Xi;
- u32 v;
- v = (u32)(Z.hi>>32); PUTU32(p,v);
- v = (u32)(Z.hi); PUTU32(p+4,v);
- v = (u32)(Z.lo>>32); PUTU32(p+8,v);
- v = (u32)(Z.lo); PUTU32(p+12,v);
-#endif
- }
- else {
- Xi[0] = Z.hi;
- Xi[1] = Z.lo;
- }
- } while (inp+=16, len-=16);
+ if (is_endian.little) {
+# ifdef BSWAP8
+ Xi[0] = BSWAP8(Z.hi);
+ Xi[1] = BSWAP8(Z.lo);
+# else
+ u8 *p = (u8 *)Xi;
+ u32 v;
+ v = (u32)(Z.hi >> 32);
+ PUTU32(p, v);
+ v = (u32)(Z.hi);
+ PUTU32(p + 4, v);
+ v = (u32)(Z.lo >> 32);
+ PUTU32(p + 8, v);
+ v = (u32)(Z.lo);
+ PUTU32(p + 12, v);
+# endif
+ } else {
+ Xi[0] = Z.hi;
+ Xi[1] = Z.lo;
+ }
+ } while (inp += 16, len -= 16);
}
-#endif
-#else
-void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
-void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
-#endif
+# endif
+# else
+void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+ size_t len);
+# endif
-#define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
-#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
-#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
-/* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
- * trashing effect. In other words idea is to hash data while it's
- * still in L1 cache after encryption pass... */
-#define GHASH_CHUNK (3*1024)
-#endif
+# define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
+# if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
+# define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
+/*
+ * GHASH_CHUNK is "stride parameter" missioned to mitigate cache trashing
+ * effect. In other words idea is to hash data while it's still in L1 cache
+ * after encryption pass...
+ */
+# define GHASH_CHUNK (3*1024)
+# endif
-#else /* TABLE_BITS */
+#else /* TABLE_BITS */
-static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
+static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
{
- u128 V,Z = { 0,0 };
- long X;
- int i,j;
- const long *xi = (const long *)Xi;
- const union { long one; char little; } is_endian = {1};
-
- V.hi = H[0]; /* H is in host byte order, no byte swapping */
- V.lo = H[1];
-
- for (j=0; j<16/sizeof(long); ++j) {
- if (is_endian.little) {
- if (sizeof(long)==8) {
-#ifdef BSWAP8
- X = (long)(BSWAP8(xi[j]));
-#else
- const u8 *p = (const u8 *)(xi+j);
- X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
-#endif
- }
- else {
- const u8 *p = (const u8 *)(xi+j);
- X = (long)GETU32(p);
- }
- }
- else
- X = xi[j];
-
- for (i=0; i<8*sizeof(long); ++i, X<<=1) {
- u64 M = (u64)(X>>(8*sizeof(long)-1));
- Z.hi ^= V.hi&M;
- Z.lo ^= V.lo&M;
-
- REDUCE1BIT(V);
- }
- }
-
- if (is_endian.little) {
-#ifdef BSWAP8
- Xi[0] = BSWAP8(Z.hi);
- Xi[1] = BSWAP8(Z.lo);
-#else
- u8 *p = (u8 *)Xi;
- u32 v;
- v = (u32)(Z.hi>>32); PUTU32(p,v);
- v = (u32)(Z.hi); PUTU32(p+4,v);
- v = (u32)(Z.lo>>32); PUTU32(p+8,v);
- v = (u32)(Z.lo); PUTU32(p+12,v);
-#endif
- }
- else {
- Xi[0] = Z.hi;
- Xi[1] = Z.lo;
- }
+ u128 V, Z = { 0, 0 };
+ long X;
+ int i, j;
+ const long *xi = (const long *)Xi;
+ const union {
+ long one;
+ char little;
+ } is_endian = {
+ 1
+ };
+
+ V.hi = H[0]; /* H is in host byte order, no byte swapping */
+ V.lo = H[1];
+
+ for (j = 0; j < 16 / sizeof(long); ++j) {
+ if (is_endian.little) {
+ if (sizeof(long) == 8) {
+# ifdef BSWAP8
+ X = (long)(BSWAP8(xi[j]));
+# else
+ const u8 *p = (const u8 *)(xi + j);
+ X = (long)((u64)GETU32(p) << 32 | GETU32(p + 4));
+# endif
+ } else {
+ const u8 *p = (const u8 *)(xi + j);
+ X = (long)GETU32(p);
+ }
+ } else
+ X = xi[j];
+
+ for (i = 0; i < 8 * sizeof(long); ++i, X <<= 1) {
+ u64 M = (u64)(X >> (8 * sizeof(long) - 1));
+ Z.hi ^= V.hi & M;
+ Z.lo ^= V.lo & M;
+
+ REDUCE1BIT(V);
+ }
+ }
+
+ if (is_endian.little) {
+# ifdef BSWAP8
+ Xi[0] = BSWAP8(Z.hi);
+ Xi[1] = BSWAP8(Z.lo);
+# else
+ u8 *p = (u8 *)Xi;
+ u32 v;
+ v = (u32)(Z.hi >> 32);
+ PUTU32(p, v);
+ v = (u32)(Z.hi);
+ PUTU32(p + 4, v);
+ v = (u32)(Z.lo >> 32);
+ PUTU32(p + 8, v);
+ v = (u32)(Z.lo);
+ PUTU32(p + 12, v);
+# endif
+ } else {
+ Xi[0] = Z.hi;
+ Xi[1] = Z.lo;
+ }
}
-#define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
+
+# define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
#endif
-#if TABLE_BITS==4 && defined(GHASH_ASM)
-# if !defined(I386_ONLY) && \
- (defined(__i386) || defined(__i386__) || \
- defined(__x86_64) || defined(__x86_64__) || \
- defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
+#if TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
+# if !defined(I386_ONLY) && \
+ (defined(__i386) || defined(__i386__) || \
+ defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
# define GHASH_ASM_X86_OR_64
# define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_ia32cap_P[2];
-void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
-void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
-void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
+void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+ size_t len);
-# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
+# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
+# define gcm_init_avx gcm_init_clmul
+# define gcm_gmult_avx gcm_gmult_clmul
+# define gcm_ghash_avx gcm_ghash_clmul
+# else
+void gcm_init_avx(u128 Htable[16], const u64 Xi[2]);
+void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+ size_t len);
+# endif
+
+# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
# define GHASH_ASM_X86
-void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
-void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+ size_t len);
-void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
-void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+ size_t len);
# endif
-# elif defined(__arm__) || defined(__arm)
+# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
# include "arm_arch.h"
-# if __ARM_ARCH__>=7
+# if __ARM_MAX_ARCH__>=7
# define GHASH_ASM_ARM
# define GCM_FUNCREF_4BIT
-void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
-void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+# define PMULL_CAPABLE (OPENSSL_armcap_P & ARMV8_PMULL)
+# if defined(__arm__) || defined(__arm)
+# define NEON_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON)
+# endif
+void gcm_init_neon(u128 Htable[16], const u64 Xi[2]);
+void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+ size_t len);
+void gcm_init_v8(u128 Htable[16], const u64 Xi[2]);
+void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+ size_t len);
# endif
+# elif defined(__sparc__) || defined(__sparc)
+# include "sparc_arch.h"
+# define GHASH_ASM_SPARC
+# define GCM_FUNCREF_4BIT
+extern unsigned int OPENSSL_sparcv9cap_P[];
+void gcm_init_vis3(u128 Htable[16], const u64 Xi[2]);
+void gcm_gmult_vis3(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_vis3(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+ size_t len);
+# elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
+# include "ppc_arch.h"
+# define GHASH_ASM_PPC
+# define GCM_FUNCREF_4BIT
+void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
+void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+ size_t len);
# endif
#endif
#ifdef GCM_FUNCREF_4BIT
# undef GCM_MUL
-# define GCM_MUL(ctx,Xi) (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
+# define GCM_MUL(ctx,Xi) (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# ifdef GHASH
# undef GHASH
-# define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
+# define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
# endif
#endif
-void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
+void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
{
- const union { long one; char little; } is_endian = {1};
+ const union {
+ long one;
+ char little;
+ } is_endian = {
+ 1
+ };
- memset(ctx,0,sizeof(*ctx));
- ctx->block = block;
- ctx->key = key;
+ memset(ctx, 0, sizeof(*ctx));
+ ctx->block = block;
+ ctx->key = key;
- (*block)(ctx->H.c,ctx->H.c,key);
+ (*block) (ctx->H.c, ctx->H.c, key);
- if (is_endian.little) {
- /* H is stored in host byte order */
+ if (is_endian.little) {
+ /* H is stored in host byte order */
#ifdef BSWAP8
- ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
- ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
+ ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
+ ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
- u8 *p = ctx->H.c;
- u64 hi,lo;
- hi = (u64)GETU32(p) <<32|GETU32(p+4);
- lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
- ctx->H.u[0] = hi;
- ctx->H.u[1] = lo;
+ u8 *p = ctx->H.c;
+ u64 hi, lo;
+ hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
+ lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
+ ctx->H.u[0] = hi;
+ ctx->H.u[1] = lo;
#endif
- }
-
-#if TABLE_BITS==8
- gcm_init_8bit(ctx->Htable,ctx->H.u);
-#elif TABLE_BITS==4
-# if defined(GHASH_ASM_X86_OR_64)
-# if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
- if (OPENSSL_ia32cap_P[0]&(1<<24) && /* check FXSR bit */
- OPENSSL_ia32cap_P[1]&(1<<1) ) { /* check PCLMULQDQ bit */
- gcm_init_clmul(ctx->Htable,ctx->H.u);
- ctx->gmult = gcm_gmult_clmul;
- ctx->ghash = gcm_ghash_clmul;
- return;
- }
+ }
+#if TABLE_BITS==8
+ gcm_init_8bit(ctx->Htable, ctx->H.u);
+#elif TABLE_BITS==4
+# if defined(GHASH_ASM_X86_OR_64)
+# if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
+ if (OPENSSL_ia32cap_P[0] & (1 << 24) && /* check FXSR bit */
+ OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */
+ if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
+ gcm_init_avx(ctx->Htable, ctx->H.u);
+ ctx->gmult = gcm_gmult_avx;
+ ctx->ghash = gcm_ghash_avx;
+ } else {
+ gcm_init_clmul(ctx->Htable, ctx->H.u);
+ ctx->gmult = gcm_gmult_clmul;
+ ctx->ghash = gcm_ghash_clmul;
+ }
+ return;
+ }
# endif
- gcm_init_4bit(ctx->Htable,ctx->H.u);
-# if defined(GHASH_ASM_X86) /* x86 only */
-# if defined(OPENSSL_IA32_SSE2)
- if (OPENSSL_ia32cap_P[0]&(1<<25)) { /* check SSE bit */
+ gcm_init_4bit(ctx->Htable, ctx->H.u);
+# if defined(GHASH_ASM_X86) /* x86 only */
+# if defined(OPENSSL_IA32_SSE2)
+ if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
# else
- if (OPENSSL_ia32cap_P[0]&(1<<23)) { /* check MMX bit */
+ if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */
# endif
- ctx->gmult = gcm_gmult_4bit_mmx;
- ctx->ghash = gcm_ghash_4bit_mmx;
- } else {
- ctx->gmult = gcm_gmult_4bit_x86;
- ctx->ghash = gcm_ghash_4bit_x86;
- }
+ ctx->gmult = gcm_gmult_4bit_mmx;
+ ctx->ghash = gcm_ghash_4bit_mmx;
+ } else {
+ ctx->gmult = gcm_gmult_4bit_x86;
+ ctx->ghash = gcm_ghash_4bit_x86;
+ }
# else
- ctx->gmult = gcm_gmult_4bit;
- ctx->ghash = gcm_ghash_4bit;
+ ctx->gmult = gcm_gmult_4bit;
+ ctx->ghash = gcm_ghash_4bit;
+# endif
+# elif defined(GHASH_ASM_ARM)
+# ifdef PMULL_CAPABLE
+ if (PMULL_CAPABLE) {
+ gcm_init_v8(ctx->Htable, ctx->H.u);
+ ctx->gmult = gcm_gmult_v8;
+ ctx->ghash = gcm_ghash_v8;
+ } else
# endif
-# elif defined(GHASH_ASM_ARM)
- if (OPENSSL_armcap_P & ARMV7_NEON) {
- ctx->gmult = gcm_gmult_neon;
- ctx->ghash = gcm_ghash_neon;
- } else {
- gcm_init_4bit(ctx->Htable,ctx->H.u);
- ctx->gmult = gcm_gmult_4bit;
- ctx->ghash = gcm_ghash_4bit;
- }
+# ifdef NEON_CAPABLE
+ if (NEON_CAPABLE) {
+ gcm_init_neon(ctx->Htable, ctx->H.u);
+ ctx->gmult = gcm_gmult_neon;
+ ctx->ghash = gcm_ghash_neon;
+ } else
+# endif
+ {
+ gcm_init_4bit(ctx->Htable, ctx->H.u);
+ ctx->gmult = gcm_gmult_4bit;
+ ctx->ghash = gcm_ghash_4bit;
+ }
+# elif defined(GHASH_ASM_SPARC)
+ if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
+ gcm_init_vis3(ctx->Htable, ctx->H.u);
+ ctx->gmult = gcm_gmult_vis3;
+ ctx->ghash = gcm_ghash_vis3;
+ } else {
+ gcm_init_4bit(ctx->Htable, ctx->H.u);
+ ctx->gmult = gcm_gmult_4bit;
+ ctx->ghash = gcm_ghash_4bit;
+ }
+# elif defined(GHASH_ASM_PPC)
+ if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
+ gcm_init_p8(ctx->Htable, ctx->H.u);
+ ctx->gmult = gcm_gmult_p8;
+ ctx->ghash = gcm_ghash_p8;
+ } else {
+ gcm_init_4bit(ctx->Htable, ctx->H.u);
+ ctx->gmult = gcm_gmult_4bit;
+ ctx->ghash = gcm_ghash_4bit;
+ }
# else
- gcm_init_4bit(ctx->Htable,ctx->H.u);
+ gcm_init_4bit(ctx->Htable, ctx->H.u);
# endif
#endif
}
-void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
+void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
+ size_t len)
{
- const union { long one; char little; } is_endian = {1};
- unsigned int ctr;
+ const union {
+ long one;
+ char little;
+ } is_endian = {
+ 1
+ };
+ unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
- void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
-#endif
-
- ctx->Yi.u[0] = 0;
- ctx->Yi.u[1] = 0;
- ctx->Xi.u[0] = 0;
- ctx->Xi.u[1] = 0;
- ctx->len.u[0] = 0; /* AAD length */
- ctx->len.u[1] = 0; /* message length */
- ctx->ares = 0;
- ctx->mres = 0;
-
- if (len==12) {
- memcpy(ctx->Yi.c,iv,12);
- ctx->Yi.c[15]=1;
- ctr=1;
- }
- else {
- size_t i;
- u64 len0 = len;
-
- while (len>=16) {
- for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
- GCM_MUL(ctx,Yi);
- iv += 16;
- len -= 16;
- }
- if (len) {
- for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
- GCM_MUL(ctx,Yi);
- }
- len0 <<= 3;
- if (is_endian.little) {
+ void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
+#endif
+
+ ctx->Yi.u[0] = 0;
+ ctx->Yi.u[1] = 0;
+ ctx->Xi.u[0] = 0;
+ ctx->Xi.u[1] = 0;
+ ctx->len.u[0] = 0; /* AAD length */
+ ctx->len.u[1] = 0; /* message length */
+ ctx->ares = 0;
+ ctx->mres = 0;
+
+ if (len == 12) {
+ memcpy(ctx->Yi.c, iv, 12);
+ ctx->Yi.c[15] = 1;
+ ctr = 1;
+ } else {
+ size_t i;
+ u64 len0 = len;
+
+ while (len >= 16) {
+ for (i = 0; i < 16; ++i)
+ ctx->Yi.c[i] ^= iv[i];
+ GCM_MUL(ctx, Yi);
+ iv += 16;
+ len -= 16;
+ }
+ if (len) {
+ for (i = 0; i < len; ++i)
+ ctx->Yi.c[i] ^= iv[i];
+ GCM_MUL(ctx, Yi);
+ }
+ len0 <<= 3;
+ if (is_endian.little) {
#ifdef BSWAP8
- ctx->Yi.u[1] ^= BSWAP8(len0);
+ ctx->Yi.u[1] ^= BSWAP8(len0);
#else
- ctx->Yi.c[8] ^= (u8)(len0>>56);
- ctx->Yi.c[9] ^= (u8)(len0>>48);
- ctx->Yi.c[10] ^= (u8)(len0>>40);
- ctx->Yi.c[11] ^= (u8)(len0>>32);
- ctx->Yi.c[12] ^= (u8)(len0>>24);
- ctx->Yi.c[13] ^= (u8)(len0>>16);
- ctx->Yi.c[14] ^= (u8)(len0>>8);
- ctx->Yi.c[15] ^= (u8)(len0);
+ ctx->Yi.c[8] ^= (u8)(len0 >> 56);
+ ctx->Yi.c[9] ^= (u8)(len0 >> 48);
+ ctx->Yi.c[10] ^= (u8)(len0 >> 40);
+ ctx->Yi.c[11] ^= (u8)(len0 >> 32);
+ ctx->Yi.c[12] ^= (u8)(len0 >> 24);
+ ctx->Yi.c[13] ^= (u8)(len0 >> 16);
+ ctx->Yi.c[14] ^= (u8)(len0 >> 8);
+ ctx->Yi.c[15] ^= (u8)(len0);
#endif
- }
- else
- ctx->Yi.u[1] ^= len0;
+ } else
+ ctx->Yi.u[1] ^= len0;
- GCM_MUL(ctx,Yi);
+ GCM_MUL(ctx, Yi);
- if (is_endian.little)
+ if (is_endian.little)
#ifdef BSWAP4
- ctr = BSWAP4(ctx->Yi.d[3]);
+ ctr = BSWAP4(ctx->Yi.d[3]);
#else
- ctr = GETU32(ctx->Yi.c+12);
+ ctr = GETU32(ctx->Yi.c + 12);
#endif
- else
- ctr = ctx->Yi.d[3];
- }
+ else
+ ctr = ctx->Yi.d[3];
+ }
- (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
- ++ctr;
- if (is_endian.little)
+ (*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key);
+ ++ctr;
+ if (is_endian.little)
#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
+ ctx->Yi.d[3] = BSWAP4(ctr);
#else
- PUTU32(ctx->Yi.c+12,ctr);
+ PUTU32(ctx->Yi.c + 12, ctr);
#endif
- else
- ctx->Yi.d[3] = ctr;
+ else
+ ctx->Yi.d[3] = ctr;
}
-int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
+int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
+ size_t len)
{
- size_t i;
- unsigned int n;
- u64 alen = ctx->len.u[0];
+ size_t i;
+ unsigned int n;
+ u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
- void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
+ void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
- void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
- const u8 *inp,size_t len) = ctx->ghash;
+ void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
+ const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif
- if (ctx->len.u[1]) return -2;
-
- alen += len;
- if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
- return -1;
- ctx->len.u[0] = alen;
-
- n = ctx->ares;
- if (n) {
- while (n && len) {
- ctx->Xi.c[n] ^= *(aad++);
- --len;
- n = (n+1)%16;
- }
- if (n==0) GCM_MUL(ctx,Xi);
- else {
- ctx->ares = n;
- return 0;
- }
- }
-
+ if (ctx->len.u[1])
+ return -2;
+
+ alen += len;
+ if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
+ return -1;
+ ctx->len.u[0] = alen;
+
+ n = ctx->ares;
+ if (n) {
+ while (n && len) {
+ ctx->Xi.c[n] ^= *(aad++);
+ --len;
+ n = (n + 1) % 16;
+ }
+ if (n == 0)
+ GCM_MUL(ctx, Xi);
+ else {
+ ctx->ares = n;
+ return 0;
+ }
+ }
#ifdef GHASH
- if ((i = (len&(size_t)-16))) {
- GHASH(ctx,aad,i);
- aad += i;
- len -= i;
- }
+ if ((i = (len & (size_t)-16))) {
+ GHASH(ctx, aad, i);
+ aad += i;
+ len -= i;
+ }
#else
- while (len>=16) {
- for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
- GCM_MUL(ctx,Xi);
- aad += 16;
- len -= 16;
- }
+ while (len >= 16) {
+ for (i = 0; i < 16; ++i)
+ ctx->Xi.c[i] ^= aad[i];
+ GCM_MUL(ctx, Xi);
+ aad += 16;
+ len -= 16;
+ }
#endif
- if (len) {
- n = (unsigned int)len;
- for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
- }
+ if (len) {
+ n = (unsigned int)len;
+ for (i = 0; i < len; ++i)
+ ctx->Xi.c[i] ^= aad[i];
+ }
- ctx->ares = n;
- return 0;
+ ctx->ares = n;
+ return 0;
}
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
- const unsigned char *in, unsigned char *out,
- size_t len)
+ const unsigned char *in, unsigned char *out,
+ size_t len)
{
- const union { long one; char little; } is_endian = {1};
- unsigned int n, ctr;
- size_t i;
- u64 mlen = ctx->len.u[1];
- block128_f block = ctx->block;
- void *key = ctx->key;
+ const union {
+ long one;
+ char little;
+ } is_endian = {
+ 1
+ };
+ unsigned int n, ctr;
+ size_t i;
+ u64 mlen = ctx->len.u[1];
+ block128_f block = ctx->block;
+ void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
- void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
+ void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
- void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
- const u8 *inp,size_t len) = ctx->ghash;
+ void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
+ const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif
#if 0
- n = (unsigned int)mlen%16; /* alternative to ctx->mres */
+ n = (unsigned int)mlen % 16; /* alternative to ctx->mres */
#endif
- mlen += len;
- if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
- return -1;
- ctx->len.u[1] = mlen;
-
- if (ctx->ares) {
- /* First call to encrypt finalizes GHASH(AAD) */
- GCM_MUL(ctx,Xi);
- ctx->ares = 0;
- }
-
- if (is_endian.little)
-#ifdef BSWAP4
- ctr = BSWAP4(ctx->Yi.d[3]);
-#else
- ctr = GETU32(ctx->Yi.c+12);
-#endif
- else
- ctr = ctx->Yi.d[3];
+ mlen += len;
+ if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
+ return -1;
+ ctx->len.u[1] = mlen;
- n = ctx->mres;
-#if !defined(OPENSSL_SMALL_FOOTPRINT)
- if (16%sizeof(size_t) == 0) do { /* always true actually */
- if (n) {
- while (n && len) {
- ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
- --len;
- n = (n+1)%16;
- }
- if (n==0) GCM_MUL(ctx,Xi);
- else {
- ctx->mres = n;
- return 0;
- }
- }
-#if defined(STRICT_ALIGNMENT)
- if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
- break;
-#endif
-#if defined(GHASH) && defined(GHASH_CHUNK)
- while (len>=GHASH_CHUNK) {
- size_t j=GHASH_CHUNK;
-
- while (j) {
- size_t *out_t=(size_t *)out;
- const size_t *in_t=(const size_t *)in;
+ if (ctx->ares) {
+ /* First call to encrypt finalizes GHASH(AAD) */
+ GCM_MUL(ctx, Xi);
+ ctx->ares = 0;
+ }
- (*block)(ctx->Yi.c,ctx->EKi.c,key);
- ++ctr;
- if (is_endian.little)
+ if (is_endian.little)
#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
+ ctr = BSWAP4(ctx->Yi.d[3]);
#else
- PUTU32(ctx->Yi.c+12,ctr);
+ ctr = GETU32(ctx->Yi.c + 12);
#endif
- else
- ctx->Yi.d[3] = ctr;
- for (i=0; i<16/sizeof(size_t); ++i)
- out_t[i] = in_t[i] ^ ctx->EKi.t[i];
- out += 16;
- in += 16;
- j -= 16;
- }
- GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
- len -= GHASH_CHUNK;
- }
- if ((i = (len&(size_t)-16))) {
- size_t j=i;
-
- while (len>=16) {
- size_t *out_t=(size_t *)out;
- const size_t *in_t=(const size_t *)in;
-
- (*block)(ctx->Yi.c,ctx->EKi.c,key);
- ++ctr;
- if (is_endian.little)
-#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
-#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- for (i=0; i<16/sizeof(size_t); ++i)
- out_t[i] = in_t[i] ^ ctx->EKi.t[i];
- out += 16;
- in += 16;
- len -= 16;
- }
- GHASH(ctx,out-j,j);
- }
-#else
- while (len>=16) {
- size_t *out_t=(size_t *)out;
- const size_t *in_t=(const size_t *)in;
+ else
+ ctr = ctx->Yi.d[3];
- (*block)(ctx->Yi.c,ctx->EKi.c,key);
- ++ctr;
- if (is_endian.little)
-#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
-#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- for (i=0; i<16/sizeof(size_t); ++i)
- ctx->Xi.t[i] ^=
- out_t[i] = in_t[i]^ctx->EKi.t[i];
- GCM_MUL(ctx,Xi);
- out += 16;
- in += 16;
- len -= 16;
- }
-#endif
- if (len) {
- (*block)(ctx->Yi.c,ctx->EKi.c,key);
- ++ctr;
- if (is_endian.little)
-#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
-#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- while (len--) {
- ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
- ++n;
- }
- }
-
- ctx->mres = n;
- return 0;
- } while(0);
+ n = ctx->mres;
+#if !defined(OPENSSL_SMALL_FOOTPRINT)
+ if (16 % sizeof(size_t) == 0) { /* always true actually */
+ do {
+ if (n) {
+ while (n && len) {
+ ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
+ --len;
+ n = (n + 1) % 16;
+ }
+ if (n == 0)
+ GCM_MUL(ctx, Xi);
+ else {
+ ctx->mres = n;
+ return 0;
+ }
+ }
+# if defined(STRICT_ALIGNMENT)
+ if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
+ break;
+# endif
+# if defined(GHASH) && defined(GHASH_CHUNK)
+ while (len >= GHASH_CHUNK) {
+ size_t j = GHASH_CHUNK;
+
+ while (j) {
+ size_t *out_t = (size_t *)out;
+ const size_t *in_t = (const size_t *)in;
+
+ (*block) (ctx->Yi.c, ctx->EKi.c, key);
+ ++ctr;
+ if (is_endian.little)
+# ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+ PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+ else
+ ctx->Yi.d[3] = ctr;
+ for (i = 0; i < 16 / sizeof(size_t); ++i)
+ out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+ out += 16;
+ in += 16;
+ j -= 16;
+ }
+ GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
+ len -= GHASH_CHUNK;
+ }
+ if ((i = (len & (size_t)-16))) {
+ size_t j = i;
+
+ while (len >= 16) {
+ size_t *out_t = (size_t *)out;
+ const size_t *in_t = (const size_t *)in;
+
+ (*block) (ctx->Yi.c, ctx->EKi.c, key);
+ ++ctr;
+ if (is_endian.little)
+# ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+ PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+ else
+ ctx->Yi.d[3] = ctr;
+ for (i = 0; i < 16 / sizeof(size_t); ++i)
+ out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+ out += 16;
+ in += 16;
+ len -= 16;
+ }
+ GHASH(ctx, out - j, j);
+ }
+# else
+ while (len >= 16) {
+ size_t *out_t = (size_t *)out;
+ const size_t *in_t = (const size_t *)in;
+
+ (*block) (ctx->Yi.c, ctx->EKi.c, key);
+ ++ctr;
+ if (is_endian.little)
+# ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+ PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+ else
+ ctx->Yi.d[3] = ctr;
+ for (i = 0; i < 16 / sizeof(size_t); ++i)
+ ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+ GCM_MUL(ctx, Xi);
+ out += 16;
+ in += 16;
+ len -= 16;
+ }
+# endif
+ if (len) {
+ (*block) (ctx->Yi.c, ctx->EKi.c, key);
+ ++ctr;
+ if (is_endian.little)
+# ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+ PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+ else
+ ctx->Yi.d[3] = ctr;
+ while (len--) {
+ ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
+ ++n;
+ }
+ }
+
+ ctx->mres = n;
+ return 0;
+ } while (0);
+ }
#endif
- for (i=0;i<len;++i) {
- if (n==0) {
- (*block)(ctx->Yi.c,ctx->EKi.c,key);
- ++ctr;
- if (is_endian.little)
+ for (i = 0; i < len; ++i) {
+ if (n == 0) {
+ (*block) (ctx->Yi.c, ctx->EKi.c, key);
+ ++ctr;
+ if (is_endian.little)
#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
+ ctx->Yi.d[3] = BSWAP4(ctr);
#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- }
- ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
- n = (n+1)%16;
- if (n==0)
- GCM_MUL(ctx,Xi);
- }
-
- ctx->mres = n;
- return 0;
+ PUTU32(ctx->Yi.c + 12, ctr);
+#endif
+ else
+ ctx->Yi.d[3] = ctr;
+ }
+ ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
+ n = (n + 1) % 16;
+ if (n == 0)
+ GCM_MUL(ctx, Xi);
+ }
+
+ ctx->mres = n;
+ return 0;
}
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
- const unsigned char *in, unsigned char *out,
- size_t len)
+ const unsigned char *in, unsigned char *out,
+ size_t len)
{
- const union { long one; char little; } is_endian = {1};
- unsigned int n, ctr;
- size_t i;
- u64 mlen = ctx->len.u[1];
- block128_f block = ctx->block;
- void *key = ctx->key;
+ const union {
+ long one;
+ char little;
+ } is_endian = {
+ 1
+ };
+ unsigned int n, ctr;
+ size_t i;
+ u64 mlen = ctx->len.u[1];
+ block128_f block = ctx->block;
+ void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
- void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
+ void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
- void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
- const u8 *inp,size_t len) = ctx->ghash;
+ void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
+ const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif
- mlen += len;
- if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
- return -1;
- ctx->len.u[1] = mlen;
+ mlen += len;
+ if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
+ return -1;
+ ctx->len.u[1] = mlen;
- if (ctx->ares) {
- /* First call to decrypt finalizes GHASH(AAD) */
- GCM_MUL(ctx,Xi);
- ctx->ares = 0;
- }
+ if (ctx->ares) {
+ /* First call to decrypt finalizes GHASH(AAD) */
+ GCM_MUL(ctx, Xi);
+ ctx->ares = 0;
+ }
- if (is_endian.little)
+ if (is_endian.little)
#ifdef BSWAP4
- ctr = BSWAP4(ctx->Yi.d[3]);
+ ctr = BSWAP4(ctx->Yi.d[3]);
#else
- ctr = GETU32(ctx->Yi.c+12);
+ ctr = GETU32(ctx->Yi.c + 12);
#endif
- else
- ctr = ctx->Yi.d[3];
+ else
+ ctr = ctx->Yi.d[3];
- n = ctx->mres;
+ n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
- if (16%sizeof(size_t) == 0) do { /* always true actually */
- if (n) {
- while (n && len) {
- u8 c = *(in++);
- *(out++) = c^ctx->EKi.c[n];
- ctx->Xi.c[n] ^= c;
- --len;
- n = (n+1)%16;
- }
- if (n==0) GCM_MUL (ctx,Xi);
- else {
- ctx->mres = n;
- return 0;
- }
- }
-#if defined(STRICT_ALIGNMENT)
- if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
- break;
-#endif
-#if defined(GHASH) && defined(GHASH_CHUNK)
- while (len>=GHASH_CHUNK) {
- size_t j=GHASH_CHUNK;
-
- GHASH(ctx,in,GHASH_CHUNK);
- while (j) {
- size_t *out_t=(size_t *)out;
- const size_t *in_t=(const size_t *)in;
-
- (*block)(ctx->Yi.c,ctx->EKi.c,key);
- ++ctr;
- if (is_endian.little)
-#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
-#else
- PUTU32(ctx->Yi.c+12,ctr);
+ if (16 % sizeof(size_t) == 0) { /* always true actually */
+ do {
+ if (n) {
+ while (n && len) {
+ u8 c = *(in++);
+ *(out++) = c ^ ctx->EKi.c[n];
+ ctx->Xi.c[n] ^= c;
+ --len;
+ n = (n + 1) % 16;
+ }
+ if (n == 0)
+ GCM_MUL(ctx, Xi);
+ else {
+ ctx->mres = n;
+ return 0;
+ }
+ }
+# if defined(STRICT_ALIGNMENT)
+ if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
+ break;
+# endif
+# if defined(GHASH) && defined(GHASH_CHUNK)
+ while (len >= GHASH_CHUNK) {
+ size_t j = GHASH_CHUNK;
+
+ GHASH(ctx, in, GHASH_CHUNK);
+ while (j) {
+ size_t *out_t = (size_t *)out;
+ const size_t *in_t = (const size_t *)in;
+
+ (*block) (ctx->Yi.c, ctx->EKi.c, key);
+ ++ctr;
+ if (is_endian.little)
+# ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+ PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+ else
+ ctx->Yi.d[3] = ctr;
+ for (i = 0; i < 16 / sizeof(size_t); ++i)
+ out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+ out += 16;
+ in += 16;
+ j -= 16;
+ }
+ len -= GHASH_CHUNK;
+ }
+ if ((i = (len & (size_t)-16))) {
+ GHASH(ctx, in, i);
+ while (len >= 16) {
+ size_t *out_t = (size_t *)out;
+ const size_t *in_t = (const size_t *)in;
+
+ (*block) (ctx->Yi.c, ctx->EKi.c, key);
+ ++ctr;
+ if (is_endian.little)
+# ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+ PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+ else
+ ctx->Yi.d[3] = ctr;
+ for (i = 0; i < 16 / sizeof(size_t); ++i)
+ out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+ out += 16;
+ in += 16;
+ len -= 16;
+ }
+ }
+# else
+ while (len >= 16) {
+ size_t *out_t = (size_t *)out;
+ const size_t *in_t = (const size_t *)in;
+
+ (*block) (ctx->Yi.c, ctx->EKi.c, key);
+ ++ctr;
+ if (is_endian.little)
+# ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+ PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+ else
+ ctx->Yi.d[3] = ctr;
+ for (i = 0; i < 16 / sizeof(size_t); ++i) {
+ size_t c = in[i];
+ out[i] = c ^ ctx->EKi.t[i];
+ ctx->Xi.t[i] ^= c;
+ }
+ GCM_MUL(ctx, Xi);
+ out += 16;
+ in += 16;
+ len -= 16;
+ }
+# endif
+ if (len) {
+ (*block) (ctx->Yi.c, ctx->EKi.c, key);
+ ++ctr;
+ if (is_endian.little)
+# ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+ PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+ else
+ ctx->Yi.d[3] = ctr;
+ while (len--) {
+ u8 c = in[n];
+ ctx->Xi.c[n] ^= c;
+ out[n] = c ^ ctx->EKi.c[n];
+ ++n;
+ }
+ }
+
+ ctx->mres = n;
+ return 0;
+ } while (0);
+ }
#endif
- else
- ctx->Yi.d[3] = ctr;
- for (i=0; i<16/sizeof(size_t); ++i)
- out_t[i] = in_t[i]^ctx->EKi.t[i];
- out += 16;
- in += 16;
- j -= 16;
- }
- len -= GHASH_CHUNK;
- }
- if ((i = (len&(size_t)-16))) {
- GHASH(ctx,in,i);
- while (len>=16) {
- size_t *out_t=(size_t *)out;
- const size_t *in_t=(const size_t *)in;
-
- (*block)(ctx->Yi.c,ctx->EKi.c,key);
- ++ctr;
- if (is_endian.little)
+ for (i = 0; i < len; ++i) {
+ u8 c;
+ if (n == 0) {
+ (*block) (ctx->Yi.c, ctx->EKi.c, key);
+ ++ctr;
+ if (is_endian.little)
#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
+ ctx->Yi.d[3] = BSWAP4(ctr);
#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- for (i=0; i<16/sizeof(size_t); ++i)
- out_t[i] = in_t[i]^ctx->EKi.t[i];
- out += 16;
- in += 16;
- len -= 16;
- }
- }
-#else
- while (len>=16) {
- size_t *out_t=(size_t *)out;
- const size_t *in_t=(const size_t *)in;
+ PUTU32(ctx->Yi.c + 12, ctr);
+#endif
+ else
+ ctx->Yi.d[3] = ctr;
+ }
+ c = in[i];
+ out[i] = c ^ ctx->EKi.c[n];
+ ctx->Xi.c[n] ^= c;
+ n = (n + 1) % 16;
+ if (n == 0)
+ GCM_MUL(ctx, Xi);
+ }
- (*block)(ctx->Yi.c,ctx->EKi.c,key);
- ++ctr;
- if (is_endian.little)
-#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
-#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- for (i=0; i<16/sizeof(size_t); ++i) {
- size_t c = in[i];
- out[i] = c^ctx->EKi.t[i];
- ctx->Xi.t[i] ^= c;
- }
- GCM_MUL(ctx,Xi);
- out += 16;
- in += 16;
- len -= 16;
- }
-#endif
- if (len) {
- (*block)(ctx->Yi.c,ctx->EKi.c,key);
- ++ctr;
- if (is_endian.little)
-#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
-#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- while (len--) {
- u8 c = in[n];
- ctx->Xi.c[n] ^= c;
- out[n] = c^ctx->EKi.c[n];
- ++n;
- }
- }
-
- ctx->mres = n;
- return 0;
- } while(0);
-#endif
- for (i=0;i<len;++i) {
- u8 c;
- if (n==0) {
- (*block)(ctx->Yi.c,ctx->EKi.c,key);
- ++ctr;
- if (is_endian.little)
-#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
-#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- }
- c = in[i];
- out[i] = c^ctx->EKi.c[n];
- ctx->Xi.c[n] ^= c;
- n = (n+1)%16;
- if (n==0)
- GCM_MUL(ctx,Xi);
- }
-
- ctx->mres = n;
- return 0;
+ ctx->mres = n;
+ return 0;
}
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
- const unsigned char *in, unsigned char *out,
- size_t len, ctr128_f stream)
+ const unsigned char *in, unsigned char *out,
+ size_t len, ctr128_f stream)
{
- const union { long one; char little; } is_endian = {1};
- unsigned int n, ctr;
- size_t i;
- u64 mlen = ctx->len.u[1];
- void *key = ctx->key;
+ const union {
+ long one;
+ char little;
+ } is_endian = {
+ 1
+ };
+ unsigned int n, ctr;
+ size_t i;
+ u64 mlen = ctx->len.u[1];
+ void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
- void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
+ void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
- void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
- const u8 *inp,size_t len) = ctx->ghash;
+ void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
+ const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif
- mlen += len;
- if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
- return -1;
- ctx->len.u[1] = mlen;
+ mlen += len;
+ if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
+ return -1;
+ ctx->len.u[1] = mlen;
- if (ctx->ares) {
- /* First call to encrypt finalizes GHASH(AAD) */
- GCM_MUL(ctx,Xi);
- ctx->ares = 0;
- }
+ if (ctx->ares) {
+ /* First call to encrypt finalizes GHASH(AAD) */
+ GCM_MUL(ctx, Xi);
+ ctx->ares = 0;
+ }
- if (is_endian.little)
+ if (is_endian.little)
#ifdef BSWAP4
- ctr = BSWAP4(ctx->Yi.d[3]);
+ ctr = BSWAP4(ctx->Yi.d[3]);
#else
- ctr = GETU32(ctx->Yi.c+12);
-#endif
- else
- ctr = ctx->Yi.d[3];
-
- n = ctx->mres;
- if (n) {
- while (n && len) {
- ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
- --len;
- n = (n+1)%16;
- }
- if (n==0) GCM_MUL(ctx,Xi);
- else {
- ctx->mres = n;
- return 0;
- }
- }
+ ctr = GETU32(ctx->Yi.c + 12);
+#endif
+ else
+ ctr = ctx->Yi.d[3];
+
+ n = ctx->mres;
+ if (n) {
+ while (n && len) {
+ ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
+ --len;
+ n = (n + 1) % 16;
+ }
+ if (n == 0)
+ GCM_MUL(ctx, Xi);
+ else {
+ ctx->mres = n;
+ return 0;
+ }
+ }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
- while (len>=GHASH_CHUNK) {
- (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
- ctr += GHASH_CHUNK/16;
- if (is_endian.little)
-#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
-#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- GHASH(ctx,out,GHASH_CHUNK);
- out += GHASH_CHUNK;
- in += GHASH_CHUNK;
- len -= GHASH_CHUNK;
- }
+ while (len >= GHASH_CHUNK) {
+ (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
+ ctr += GHASH_CHUNK / 16;
+ if (is_endian.little)
+# ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+ PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+ else
+ ctx->Yi.d[3] = ctr;
+ GHASH(ctx, out, GHASH_CHUNK);
+ out += GHASH_CHUNK;
+ in += GHASH_CHUNK;
+ len -= GHASH_CHUNK;
+ }
#endif
- if ((i = (len&(size_t)-16))) {
- size_t j=i/16;
+ if ((i = (len & (size_t)-16))) {
+ size_t j = i / 16;
- (*stream)(in,out,j,key,ctx->Yi.c);
- ctr += (unsigned int)j;
- if (is_endian.little)
+ (*stream) (in, out, j, key, ctx->Yi.c);
+ ctr += (unsigned int)j;
+ if (is_endian.little)
#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
+ ctx->Yi.d[3] = BSWAP4(ctr);
#else
- PUTU32(ctx->Yi.c+12,ctr);
+ PUTU32(ctx->Yi.c + 12, ctr);
#endif
- else
- ctx->Yi.d[3] = ctr;
- in += i;
- len -= i;
+ else
+ ctx->Yi.d[3] = ctr;
+ in += i;
+ len -= i;
#if defined(GHASH)
- GHASH(ctx,out,i);
- out += i;
+ GHASH(ctx, out, i);
+ out += i;
#else
- while (j--) {
- for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
- GCM_MUL(ctx,Xi);
- out += 16;
- }
+ while (j--) {
+ for (i = 0; i < 16; ++i)
+ ctx->Xi.c[i] ^= out[i];
+ GCM_MUL(ctx, Xi);
+ out += 16;
+ }
#endif
- }
- if (len) {
- (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
- ++ctr;
- if (is_endian.little)
+ }
+ if (len) {
+ (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
+ ++ctr;
+ if (is_endian.little)
#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
+ ctx->Yi.d[3] = BSWAP4(ctr);
#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- while (len--) {
- ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
- ++n;
- }
- }
-
- ctx->mres = n;
- return 0;
+ PUTU32(ctx->Yi.c + 12, ctr);
+#endif
+ else
+ ctx->Yi.d[3] = ctr;
+ while (len--) {
+ ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
+ ++n;
+ }
+ }
+
+ ctx->mres = n;
+ return 0;
}
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
- const unsigned char *in, unsigned char *out,
- size_t len,ctr128_f stream)
+ const unsigned char *in, unsigned char *out,
+ size_t len, ctr128_f stream)
{
- const union { long one; char little; } is_endian = {1};
- unsigned int n, ctr;
- size_t i;
- u64 mlen = ctx->len.u[1];
- void *key = ctx->key;
+ const union {
+ long one;
+ char little;
+ } is_endian = {
+ 1
+ };
+ unsigned int n, ctr;
+ size_t i;
+ u64 mlen = ctx->len.u[1];
+ void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
- void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
+ void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
- void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
- const u8 *inp,size_t len) = ctx->ghash;
+ void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
+ const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif
- mlen += len;
- if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
- return -1;
- ctx->len.u[1] = mlen;
+ mlen += len;
+ if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
+ return -1;
+ ctx->len.u[1] = mlen;
- if (ctx->ares) {
- /* First call to decrypt finalizes GHASH(AAD) */
- GCM_MUL(ctx,Xi);
- ctx->ares = 0;
- }
+ if (ctx->ares) {
+ /* First call to decrypt finalizes GHASH(AAD) */
+ GCM_MUL(ctx, Xi);
+ ctx->ares = 0;
+ }
- if (is_endian.little)
+ if (is_endian.little)
#ifdef BSWAP4
- ctr = BSWAP4(ctx->Yi.d[3]);
+ ctr = BSWAP4(ctx->Yi.d[3]);
#else
- ctr = GETU32(ctx->Yi.c+12);
-#endif
- else
- ctr = ctx->Yi.d[3];
-
- n = ctx->mres;
- if (n) {
- while (n && len) {
- u8 c = *(in++);
- *(out++) = c^ctx->EKi.c[n];
- ctx->Xi.c[n] ^= c;
- --len;
- n = (n+1)%16;
- }
- if (n==0) GCM_MUL (ctx,Xi);
- else {
- ctx->mres = n;
- return 0;
- }
- }
+ ctr = GETU32(ctx->Yi.c + 12);
+#endif
+ else
+ ctr = ctx->Yi.d[3];
+
+ n = ctx->mres;
+ if (n) {
+ while (n && len) {
+ u8 c = *(in++);
+ *(out++) = c ^ ctx->EKi.c[n];
+ ctx->Xi.c[n] ^= c;
+ --len;
+ n = (n + 1) % 16;
+ }
+ if (n == 0)
+ GCM_MUL(ctx, Xi);
+ else {
+ ctx->mres = n;
+ return 0;
+ }
+ }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
- while (len>=GHASH_CHUNK) {
- GHASH(ctx,in,GHASH_CHUNK);
- (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
- ctr += GHASH_CHUNK/16;
- if (is_endian.little)
-#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
-#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- out += GHASH_CHUNK;
- in += GHASH_CHUNK;
- len -= GHASH_CHUNK;
- }
+ while (len >= GHASH_CHUNK) {
+ GHASH(ctx, in, GHASH_CHUNK);
+ (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
+ ctr += GHASH_CHUNK / 16;
+ if (is_endian.little)
+# ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+ PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+ else
+ ctx->Yi.d[3] = ctr;
+ out += GHASH_CHUNK;
+ in += GHASH_CHUNK;
+ len -= GHASH_CHUNK;
+ }
#endif
- if ((i = (len&(size_t)-16))) {
- size_t j=i/16;
+ if ((i = (len & (size_t)-16))) {
+ size_t j = i / 16;
#if defined(GHASH)
- GHASH(ctx,in,i);
+ GHASH(ctx, in, i);
#else
- while (j--) {
- size_t k;
- for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
- GCM_MUL(ctx,Xi);
- in += 16;
- }
- j = i/16;
- in -= i;
-#endif
- (*stream)(in,out,j,key,ctx->Yi.c);
- ctr += (unsigned int)j;
- if (is_endian.little)
+ while (j--) {
+ size_t k;
+ for (k = 0; k < 16; ++k)
+ ctx->Xi.c[k] ^= in[k];
+ GCM_MUL(ctx, Xi);
+ in += 16;
+ }
+ j = i / 16;
+ in -= i;
+#endif
+ (*stream) (in, out, j, key, ctx->Yi.c);
+ ctr += (unsigned int)j;
+ if (is_endian.little)
#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
+ ctx->Yi.d[3] = BSWAP4(ctr);
#else
- PUTU32(ctx->Yi.c+12,ctr);
+ PUTU32(ctx->Yi.c + 12, ctr);
#endif
- else
- ctx->Yi.d[3] = ctr;
- out += i;
- in += i;
- len -= i;
- }
- if (len) {
- (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
- ++ctr;
- if (is_endian.little)
+ else
+ ctx->Yi.d[3] = ctr;
+ out += i;
+ in += i;
+ len -= i;
+ }
+ if (len) {
+ (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
+ ++ctr;
+ if (is_endian.little)
#ifdef BSWAP4
- ctx->Yi.d[3] = BSWAP4(ctr);
+ ctx->Yi.d[3] = BSWAP4(ctr);
#else
- PUTU32(ctx->Yi.c+12,ctr);
-#endif
- else
- ctx->Yi.d[3] = ctr;
- while (len--) {
- u8 c = in[n];
- ctx->Xi.c[n] ^= c;
- out[n] = c^ctx->EKi.c[n];
- ++n;
- }
- }
-
- ctx->mres = n;
- return 0;
+ PUTU32(ctx->Yi.c + 12, ctr);
+#endif
+ else
+ ctx->Yi.d[3] = ctr;
+ while (len--) {
+ u8 c = in[n];
+ ctx->Xi.c[n] ^= c;
+ out[n] = c ^ ctx->EKi.c[n];
+ ++n;
+ }
+ }
+
+ ctx->mres = n;
+ return 0;
}
-int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
- size_t len)
+int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
+ size_t len)
{
- const union { long one; char little; } is_endian = {1};
- u64 alen = ctx->len.u[0]<<3;
- u64 clen = ctx->len.u[1]<<3;
+ const union {
+ long one;
+ char little;
+ } is_endian = {
+ 1
+ };
+ u64 alen = ctx->len.u[0] << 3;
+ u64 clen = ctx->len.u[1] << 3;
#ifdef GCM_FUNCREF_4BIT
- void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
+ void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#endif
- if (ctx->mres || ctx->ares)
- GCM_MUL(ctx,Xi);
+ if (ctx->mres || ctx->ares)
+ GCM_MUL(ctx, Xi);
- if (is_endian.little) {
+ if (is_endian.little) {
#ifdef BSWAP8
- alen = BSWAP8(alen);
- clen = BSWAP8(clen);
+ alen = BSWAP8(alen);
+ clen = BSWAP8(clen);
#else
- u8 *p = ctx->len.c;
+ u8 *p = ctx->len.c;
- ctx->len.u[0] = alen;
- ctx->len.u[1] = clen;
+ ctx->len.u[0] = alen;
+ ctx->len.u[1] = clen;
- alen = (u64)GETU32(p) <<32|GETU32(p+4);
- clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
+ alen = (u64)GETU32(p) << 32 | GETU32(p + 4);
+ clen = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
#endif
- }
+ }
- ctx->Xi.u[0] ^= alen;
- ctx->Xi.u[1] ^= clen;
- GCM_MUL(ctx,Xi);
+ ctx->Xi.u[0] ^= alen;
+ ctx->Xi.u[1] ^= clen;
+ GCM_MUL(ctx, Xi);
- ctx->Xi.u[0] ^= ctx->EK0.u[0];
- ctx->Xi.u[1] ^= ctx->EK0.u[1];
+ ctx->Xi.u[0] ^= ctx->EK0.u[0];
+ ctx->Xi.u[1] ^= ctx->EK0.u[1];
- if (tag && len<=sizeof(ctx->Xi))
- return memcmp(ctx->Xi.c,tag,len);
- else
- return -1;
+ if (tag && len <= sizeof(ctx->Xi))
+ return memcmp(ctx->Xi.c, tag, len);
+ else
+ return -1;
}
void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
- CRYPTO_gcm128_finish(ctx, NULL, 0);
- memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
+ CRYPTO_gcm128_finish(ctx, NULL, 0);
+ memcpy(tag, ctx->Xi.c,
+ len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
}
GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
{
- GCM128_CONTEXT *ret;
+ GCM128_CONTEXT *ret;
- if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
- CRYPTO_gcm128_init(ret,key,block);
+ if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
+ CRYPTO_gcm128_init(ret, key, block);
- return ret;
+ return ret;
}
void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
{
- if (ctx) {
- OPENSSL_cleanse(ctx,sizeof(*ctx));
- OPENSSL_free(ctx);
- }
+ if (ctx) {
+ OPENSSL_cleanse(ctx, sizeof(*ctx));
+ OPENSSL_free(ctx);
+ }
}
#if defined(SELFTEST)
-#include <stdio.h>
-#include <openssl/aes.h>
+# include <stdio.h>
+# include <openssl/aes.h>
/* Test Case 1 */
-static const u8 K1[16],
- *P1=NULL,
- *A1=NULL,
- IV1[12],
- *C1=NULL,
- T1[]= {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
+static const u8 K1[16], *P1 = NULL, *A1 = NULL, IV1[12], *C1 = NULL;
+static const u8 T1[] = {
+ 0x58, 0xe2, 0xfc, 0xce, 0xfa, 0x7e, 0x30, 0x61,
+ 0x36, 0x7f, 0x1d, 0x57, 0xa4, 0xe7, 0x45, 0x5a
+};
/* Test Case 2 */
-#define K2 K1
-#define A2 A1
-#define IV2 IV1
-static const u8 P2[16],
- C2[]= {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
- T2[]= {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
+# define K2 K1
+# define A2 A1
+# define IV2 IV1
+static const u8 P2[16];
+static const u8 C2[] = {
+ 0x03, 0x88, 0xda, 0xce, 0x60, 0xb6, 0xa3, 0x92,
+ 0xf3, 0x28, 0xc2, 0xb9, 0x71, 0xb2, 0xfe, 0x78
+};
+
+static const u8 T2[] = {
+ 0xab, 0x6e, 0x47, 0xd4, 0x2c, 0xec, 0x13, 0xbd,
+ 0xf5, 0x3a, 0x67, 0xb2, 0x12, 0x57, 0xbd, 0xdf
+};
/* Test Case 3 */
-#define A3 A2
-static const u8 K3[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
- P3[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
- 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
- 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
- 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
- IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
- C3[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
- 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
- 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
- 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
- T3[]= {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
+# define A3 A2
+static const u8 K3[] = {
+ 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c,
+ 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08
+};
+
+static const u8 P3[] = {
+ 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
+ 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+ 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
+ 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+ 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
+ 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+ 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
+ 0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55
+};
+
+static const u8 IV3[] = {
+ 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad,
+ 0xde, 0xca, 0xf8, 0x88
+};
+
+static const u8 C3[] = {
+ 0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24,
+ 0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c,
+ 0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0,
+ 0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e,
+ 0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c,
+ 0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05,
+ 0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97,
+ 0x3d, 0x58, 0xe0, 0x91, 0x47, 0x3f, 0x59, 0x85
+};
+
+static const u8 T3[] = {
+ 0x4d, 0x5c, 0x2a, 0xf3, 0x27, 0xcd, 0x64, 0xa6,
+ 0x2c, 0xf3, 0x5a, 0xbd, 0x2b, 0xa6, 0xfa, 0xb4
+};
/* Test Case 4 */
-#define K4 K3
-#define IV4 IV3
-static const u8 P4[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
- 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
- 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
- 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
- A4[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
- 0xab,0xad,0xda,0xd2},
- C4[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
- 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
- 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
- 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
- T4[]= {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
+# define K4 K3
+# define IV4 IV3
+static const u8 P4[] = {
+ 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
+ 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+ 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
+ 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+ 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
+ 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+ 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
+ 0xba, 0x63, 0x7b, 0x39
+};
+
+static const u8 A4[] = {
+ 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+ 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+ 0xab, 0xad, 0xda, 0xd2
+};
+
+static const u8 C4[] = {
+ 0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24,
+ 0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c,
+ 0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0,
+ 0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e,
+ 0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c,
+ 0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05,
+ 0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97,
+ 0x3d, 0x58, 0xe0, 0x91
+};
+
+static const u8 T4[] = {
+ 0x5b, 0xc9, 0x4f, 0xbc, 0x32, 0x21, 0xa5, 0xdb,
+ 0x94, 0xfa, 0xe9, 0x5a, 0xe7, 0x12, 0x1a, 0x47
+};
/* Test Case 5 */
-#define K5 K4
-#define P5 P4
-#define A5 A4
-static const u8 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
- C5[]= {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
- 0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
- 0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
- 0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
- T5[]= {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
+# define K5 K4
+# define P5 P4
+# define A5 A4
+static const u8 IV5[] = {
+ 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad
+};
+
+static const u8 C5[] = {
+ 0x61, 0x35, 0x3b, 0x4c, 0x28, 0x06, 0x93, 0x4a,
+ 0x77, 0x7f, 0xf5, 0x1f, 0xa2, 0x2a, 0x47, 0x55,
+ 0x69, 0x9b, 0x2a, 0x71, 0x4f, 0xcd, 0xc6, 0xf8,
+ 0x37, 0x66, 0xe5, 0xf9, 0x7b, 0x6c, 0x74, 0x23,
+ 0x73, 0x80, 0x69, 0x00, 0xe4, 0x9f, 0x24, 0xb2,
+ 0x2b, 0x09, 0x75, 0x44, 0xd4, 0x89, 0x6b, 0x42,
+ 0x49, 0x89, 0xb5, 0xe1, 0xeb, 0xac, 0x0f, 0x07,
+ 0xc2, 0x3f, 0x45, 0x98
+};
+
+static const u8 T5[] = {
+ 0x36, 0x12, 0xd2, 0xe7, 0x9e, 0x3b, 0x07, 0x85,
+ 0x56, 0x1b, 0xe1, 0x4a, 0xac, 0xa2, 0xfc, 0xcb
+};
/* Test Case 6 */
-#define K6 K5
-#define P6 P5
-#define A6 A5
-static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
- 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
- 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
- 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
- C6[]= {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
- 0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
- 0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
- 0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
- T6[]= {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
+# define K6 K5
+# define P6 P5
+# define A6 A5
+static const u8 IV6[] = {
+ 0x93, 0x13, 0x22, 0x5d, 0xf8, 0x84, 0x06, 0xe5,
+ 0x55, 0x90, 0x9c, 0x5a, 0xff, 0x52, 0x69, 0xaa,
+ 0x6a, 0x7a, 0x95, 0x38, 0x53, 0x4f, 0x7d, 0xa1,
+ 0xe4, 0xc3, 0x03, 0xd2, 0xa3, 0x18, 0xa7, 0x28,
+ 0xc3, 0xc0, 0xc9, 0x51, 0x56, 0x80, 0x95, 0x39,
+ 0xfc, 0xf0, 0xe2, 0x42, 0x9a, 0x6b, 0x52, 0x54,
+ 0x16, 0xae, 0xdb, 0xf5, 0xa0, 0xde, 0x6a, 0x57,
+ 0xa6, 0x37, 0xb3, 0x9b
+};
+
+static const u8 C6[] = {
+ 0x8c, 0xe2, 0x49, 0x98, 0x62, 0x56, 0x15, 0xb6,
+ 0x03, 0xa0, 0x33, 0xac, 0xa1, 0x3f, 0xb8, 0x94,
+ 0xbe, 0x91, 0x12, 0xa5, 0xc3, 0xa2, 0x11, 0xa8,
+ 0xba, 0x26, 0x2a, 0x3c, 0xca, 0x7e, 0x2c, 0xa7,
+ 0x01, 0xe4, 0xa9, 0xa4, 0xfb, 0xa4, 0x3c, 0x90,
+ 0xcc, 0xdc, 0xb2, 0x81, 0xd4, 0x8c, 0x7c, 0x6f,
+ 0xd6, 0x28, 0x75, 0xd2, 0xac, 0xa4, 0x17, 0x03,
+ 0x4c, 0x34, 0xae, 0xe5
+};
+
+static const u8 T6[] = {
+ 0x61, 0x9c, 0xc5, 0xae, 0xff, 0xfe, 0x0b, 0xfa,
+ 0x46, 0x2a, 0xf4, 0x3c, 0x16, 0x99, 0xd0, 0x50
+};
/* Test Case 7 */
-static const u8 K7[24],
- *P7=NULL,
- *A7=NULL,
- IV7[12],
- *C7=NULL,
- T7[]= {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
+static const u8 K7[24], *P7 = NULL, *A7 = NULL, IV7[12], *C7 = NULL;
+static const u8 T7[] = {
+ 0xcd, 0x33, 0xb2, 0x8a, 0xc7, 0x73, 0xf7, 0x4b,
+ 0xa0, 0x0e, 0xd1, 0xf3, 0x12, 0x57, 0x24, 0x35
+};
/* Test Case 8 */
-#define K8 K7
-#define IV8 IV7
-#define A8 A7
-static const u8 P8[16],
- C8[]= {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
- T8[]= {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
+# define K8 K7
+# define IV8 IV7
+# define A8 A7
+static const u8 P8[16];
+static const u8 C8[] = {
+ 0x98, 0xe7, 0x24, 0x7c, 0x07, 0xf0, 0xfe, 0x41,
+ 0x1c, 0x26, 0x7e, 0x43, 0x84, 0xb0, 0xf6, 0x00
+};
+
+static const u8 T8[] = {
+ 0x2f, 0xf5, 0x8d, 0x80, 0x03, 0x39, 0x27, 0xab,
+ 0x8e, 0xf4, 0xd4, 0x58, 0x75, 0x14, 0xf0, 0xfb
+};
/* Test Case 9 */
-#define A9 A8
-static const u8 K9[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
- 0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
- P9[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
- 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
- 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
- 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
- IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
- C9[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
- 0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
- 0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
- 0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
- T9[]= {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
+# define A9 A8
+static const u8 K9[] = {
+ 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c,
+ 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08,
+ 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c
+};
+
+static const u8 P9[] = {
+ 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
+ 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+ 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
+ 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+ 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
+ 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+ 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
+ 0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55
+};
+
+static const u8 IV9[] = {
+ 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad,
+ 0xde, 0xca, 0xf8, 0x88
+};
+
+static const u8 C9[] = {
+ 0x39, 0x80, 0xca, 0x0b, 0x3c, 0x00, 0xe8, 0x41,
+ 0xeb, 0x06, 0xfa, 0xc4, 0x87, 0x2a, 0x27, 0x57,
+ 0x85, 0x9e, 0x1c, 0xea, 0xa6, 0xef, 0xd9, 0x84,
+ 0x62, 0x85, 0x93, 0xb4, 0x0c, 0xa1, 0xe1, 0x9c,
+ 0x7d, 0x77, 0x3d, 0x00, 0xc1, 0x44, 0xc5, 0x25,
+ 0xac, 0x61, 0x9d, 0x18, 0xc8, 0x4a, 0x3f, 0x47,
+ 0x18, 0xe2, 0x44, 0x8b, 0x2f, 0xe3, 0x24, 0xd9,
+ 0xcc, 0xda, 0x27, 0x10, 0xac, 0xad, 0xe2, 0x56
+};
+
+static const u8 T9[] = {
+ 0x99, 0x24, 0xa7, 0xc8, 0x58, 0x73, 0x36, 0xbf,
+ 0xb1, 0x18, 0x02, 0x4d, 0xb8, 0x67, 0x4a, 0x14
+};
/* Test Case 10 */
-#define K10 K9
-#define IV10 IV9
-static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
- 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
- 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
- 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
- A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
- 0xab,0xad,0xda,0xd2},
- C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
- 0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
- 0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
- 0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
- T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
+# define K10 K9
+# define IV10 IV9
+static const u8 P10[] = {
+ 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
+ 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+ 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
+ 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+ 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
+ 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+ 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
+ 0xba, 0x63, 0x7b, 0x39
+};
+
+static const u8 A10[] = {
+ 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+ 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+ 0xab, 0xad, 0xda, 0xd2
+};
+
+static const u8 C10[] = {
+ 0x39, 0x80, 0xca, 0x0b, 0x3c, 0x00, 0xe8, 0x41,
+ 0xeb, 0x06, 0xfa, 0xc4, 0x87, 0x2a, 0x27, 0x57,
+ 0x85, 0x9e, 0x1c, 0xea, 0xa6, 0xef, 0xd9, 0x84,
+ 0x62, 0x85, 0x93, 0xb4, 0x0c, 0xa1, 0xe1, 0x9c,
+ 0x7d, 0x77, 0x3d, 0x00, 0xc1, 0x44, 0xc5, 0x25,
+ 0xac, 0x61, 0x9d, 0x18, 0xc8, 0x4a, 0x3f, 0x47,
+ 0x18, 0xe2, 0x44, 0x8b, 0x2f, 0xe3, 0x24, 0xd9,
+ 0xcc, 0xda, 0x27, 0x10
+};
+
+static const u8 T10[] = {
+ 0x25, 0x19, 0x49, 0x8e, 0x80, 0xf1, 0x47, 0x8f,
+ 0x37, 0xba, 0x55, 0xbd, 0x6d, 0x27, 0x61, 0x8c
+};
/* Test Case 11 */
-#define K11 K10
-#define P11 P10
-#define A11 A10
-static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
- C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
- 0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
- 0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
- 0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
- T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
+# define K11 K10
+# define P11 P10
+# define A11 A10
+static const u8 IV11[] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad };
+
+static const u8 C11[] = {
+ 0x0f, 0x10, 0xf5, 0x99, 0xae, 0x14, 0xa1, 0x54,
+ 0xed, 0x24, 0xb3, 0x6e, 0x25, 0x32, 0x4d, 0xb8,
+ 0xc5, 0x66, 0x63, 0x2e, 0xf2, 0xbb, 0xb3, 0x4f,
+ 0x83, 0x47, 0x28, 0x0f, 0xc4, 0x50, 0x70, 0x57,
+ 0xfd, 0xdc, 0x29, 0xdf, 0x9a, 0x47, 0x1f, 0x75,
+ 0xc6, 0x65, 0x41, 0xd4, 0xd4, 0xda, 0xd1, 0xc9,
+ 0xe9, 0x3a, 0x19, 0xa5, 0x8e, 0x8b, 0x47, 0x3f,
+ 0xa0, 0xf0, 0x62, 0xf7
+};
+
+static const u8 T11[] = {
+ 0x65, 0xdc, 0xc5, 0x7f, 0xcf, 0x62, 0x3a, 0x24,
+ 0x09, 0x4f, 0xcc, 0xa4, 0x0d, 0x35, 0x33, 0xf8
+};
/* Test Case 12 */
-#define K12 K11
-#define P12 P11
-#define A12 A11
-static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
- 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
- 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
- 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
- C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
- 0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
- 0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
- 0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
- T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
+# define K12 K11
+# define P12 P11
+# define A12 A11
+static const u8 IV12[] = {
+ 0x93, 0x13, 0x22, 0x5d, 0xf8, 0x84, 0x06, 0xe5,
+ 0x55, 0x90, 0x9c, 0x5a, 0xff, 0x52, 0x69, 0xaa,
+ 0x6a, 0x7a, 0x95, 0x38, 0x53, 0x4f, 0x7d, 0xa1,
+ 0xe4, 0xc3, 0x03, 0xd2, 0xa3, 0x18, 0xa7, 0x28,
+ 0xc3, 0xc0, 0xc9, 0x51, 0x56, 0x80, 0x95, 0x39,
+ 0xfc, 0xf0, 0xe2, 0x42, 0x9a, 0x6b, 0x52, 0x54,
+ 0x16, 0xae, 0xdb, 0xf5, 0xa0, 0xde, 0x6a, 0x57,
+ 0xa6, 0x37, 0xb3, 0x9b
+};
+
+static const u8 C12[] = {
+ 0xd2, 0x7e, 0x88, 0x68, 0x1c, 0xe3, 0x24, 0x3c,
+ 0x48, 0x30, 0x16, 0x5a, 0x8f, 0xdc, 0xf9, 0xff,
+ 0x1d, 0xe9, 0xa1, 0xd8, 0xe6, 0xb4, 0x47, 0xef,
+ 0x6e, 0xf7, 0xb7, 0x98, 0x28, 0x66, 0x6e, 0x45,
+ 0x81, 0xe7, 0x90, 0x12, 0xaf, 0x34, 0xdd, 0xd9,
+ 0xe2, 0xf0, 0x37, 0x58, 0x9b, 0x29, 0x2d, 0xb3,
+ 0xe6, 0x7c, 0x03, 0x67, 0x45, 0xfa, 0x22, 0xe7,
+ 0xe9, 0xb7, 0x37, 0x3b
+};
+
+static const u8 T12[] = {
+ 0xdc, 0xf5, 0x66, 0xff, 0x29, 0x1c, 0x25, 0xbb,
+ 0xb8, 0x56, 0x8f, 0xc3, 0xd3, 0x76, 0xa6, 0xd9
+};
/* Test Case 13 */
-static const u8 K13[32],
- *P13=NULL,
- *A13=NULL,
- IV13[12],
- *C13=NULL,
- T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
+static const u8 K13[32], *P13 = NULL, *A13 = NULL, IV13[12], *C13 = NULL;
+static const u8 T13[] = {
+ 0x53, 0x0f, 0x8a, 0xfb, 0xc7, 0x45, 0x36, 0xb9,
+ 0xa9, 0x63, 0xb4, 0xf1, 0xc4, 0xcb, 0x73, 0x8b
+};
/* Test Case 14 */
-#define K14 K13
-#define A14 A13
-static const u8 P14[16],
- IV14[12],
- C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
- T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
+# define K14 K13
+# define A14 A13
+static const u8 P14[16], IV14[12];
+static const u8 C14[] = {
+ 0xce, 0xa7, 0x40, 0x3d, 0x4d, 0x60, 0x6b, 0x6e,
+ 0x07, 0x4e, 0xc5, 0xd3, 0xba, 0xf3, 0x9d, 0x18
+};
+
+static const u8 T14[] = {
+ 0xd0, 0xd1, 0xc8, 0xa7, 0x99, 0x99, 0x6b, 0xf0,
+ 0x26, 0x5b, 0x98, 0xb5, 0xd4, 0x8a, 0xb9, 0x19
+};
/* Test Case 15 */
-#define A15 A14
-static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
- 0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
- P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
- 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
- 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
- 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
- IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
- C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
- 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
- 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
- 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
- T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
+# define A15 A14
+static const u8 K15[] = {
+ 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c,
+ 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08,
+ 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c,
+ 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08
+};
+
+static const u8 P15[] = {
+ 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
+ 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+ 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
+ 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+ 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
+ 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+ 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
+ 0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55
+};
+
+static const u8 IV15[] = {
+ 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad,
+ 0xde, 0xca, 0xf8, 0x88
+};
+
+static const u8 C15[] = {
+ 0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07,
+ 0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d,
+ 0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9,
+ 0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa,
+ 0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d,
+ 0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38,
+ 0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a,
+ 0xbc, 0xc9, 0xf6, 0x62, 0x89, 0x80, 0x15, 0xad
+};
+
+static const u8 T15[] = {
+ 0xb0, 0x94, 0xda, 0xc5, 0xd9, 0x34, 0x71, 0xbd,
+ 0xec, 0x1a, 0x50, 0x22, 0x70, 0xe3, 0xcc, 0x6c
+};
/* Test Case 16 */
-#define K16 K15
-#define IV16 IV15
-static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
- 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
- 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
- 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
- A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
- 0xab,0xad,0xda,0xd2},
- C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
- 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
- 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
- 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
- T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
+# define K16 K15
+# define IV16 IV15
+static const u8 P16[] = {
+ 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
+ 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+ 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
+ 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+ 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
+ 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+ 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
+ 0xba, 0x63, 0x7b, 0x39
+};
+
+static const u8 A16[] = {
+ 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+ 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+ 0xab, 0xad, 0xda, 0xd2
+};
+
+static const u8 C16[] = {
+ 0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07,
+ 0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d,
+ 0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9,
+ 0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa,
+ 0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d,
+ 0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38,
+ 0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a,
+ 0xbc, 0xc9, 0xf6, 0x62
+};
+
+static const u8 T16[] = {
+ 0x76, 0xfc, 0x6e, 0xce, 0x0f, 0x4e, 0x17, 0x68,
+ 0xcd, 0xdf, 0x88, 0x53, 0xbb, 0x2d, 0x55, 0x1b
+};
/* Test Case 17 */
-#define K17 K16
-#define P17 P16
-#define A17 A16
-static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
- C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
- 0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
- 0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
- 0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
- T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
+# define K17 K16
+# define P17 P16
+# define A17 A16
+static const u8 IV17[] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad };
+
+static const u8 C17[] = {
+ 0xc3, 0x76, 0x2d, 0xf1, 0xca, 0x78, 0x7d, 0x32,
+ 0xae, 0x47, 0xc1, 0x3b, 0xf1, 0x98, 0x44, 0xcb,
+ 0xaf, 0x1a, 0xe1, 0x4d, 0x0b, 0x97, 0x6a, 0xfa,
+ 0xc5, 0x2f, 0xf7, 0xd7, 0x9b, 0xba, 0x9d, 0xe0,
+ 0xfe, 0xb5, 0x82, 0xd3, 0x39, 0x34, 0xa4, 0xf0,
+ 0x95, 0x4c, 0xc2, 0x36, 0x3b, 0xc7, 0x3f, 0x78,
+ 0x62, 0xac, 0x43, 0x0e, 0x64, 0xab, 0xe4, 0x99,
+ 0xf4, 0x7c, 0x9b, 0x1f
+};
+
+static const u8 T17[] = {
+ 0x3a, 0x33, 0x7d, 0xbf, 0x46, 0xa7, 0x92, 0xc4,
+ 0x5e, 0x45, 0x49, 0x13, 0xfe, 0x2e, 0xa8, 0xf2
+};
/* Test Case 18 */
-#define K18 K17
-#define P18 P17
-#define A18 A17
-static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
- 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
- 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
- 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
- C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
- 0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
- 0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
- 0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
- T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
+# define K18 K17
+# define P18 P17
+# define A18 A17
+static const u8 IV18[] = {
+ 0x93, 0x13, 0x22, 0x5d, 0xf8, 0x84, 0x06, 0xe5,
+ 0x55, 0x90, 0x9c, 0x5a, 0xff, 0x52, 0x69, 0xaa,
+ 0x6a, 0x7a, 0x95, 0x38, 0x53, 0x4f, 0x7d, 0xa1,
+ 0xe4, 0xc3, 0x03, 0xd2, 0xa3, 0x18, 0xa7, 0x28,
+ 0xc3, 0xc0, 0xc9, 0x51, 0x56, 0x80, 0x95, 0x39,
+ 0xfc, 0xf0, 0xe2, 0x42, 0x9a, 0x6b, 0x52, 0x54,
+ 0x16, 0xae, 0xdb, 0xf5, 0xa0, 0xde, 0x6a, 0x57,
+ 0xa6, 0x37, 0xb3, 0x9b
+};
+
+static const u8 C18[] = {
+ 0x5a, 0x8d, 0xef, 0x2f, 0x0c, 0x9e, 0x53, 0xf1,
+ 0xf7, 0x5d, 0x78, 0x53, 0x65, 0x9e, 0x2a, 0x20,
+ 0xee, 0xb2, 0xb2, 0x2a, 0xaf, 0xde, 0x64, 0x19,
+ 0xa0, 0x58, 0xab, 0x4f, 0x6f, 0x74, 0x6b, 0xf4,
+ 0x0f, 0xc0, 0xc3, 0xb7, 0x80, 0xf2, 0x44, 0x45,
+ 0x2d, 0xa3, 0xeb, 0xf1, 0xc5, 0xd8, 0x2c, 0xde,
+ 0xa2, 0x41, 0x89, 0x97, 0x20, 0x0e, 0xf8, 0x2e,
+ 0x44, 0xae, 0x7e, 0x3f
+};
+
+static const u8 T18[] = {
+ 0xa4, 0x4a, 0x82, 0x66, 0xee, 0x1c, 0x8e, 0xb0,
+ 0xc8, 0xb5, 0xd4, 0xcf, 0x5a, 0xe9, 0xf1, 0x9a
+};
/* Test Case 19 */
-#define K19 K1
-#define P19 P1
-#define IV19 IV1
-#define C19 C1
-static const u8 A19[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
- 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
- 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
- 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55,
- 0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
- 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
- 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
- 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
- T19[]= {0x5f,0xea,0x79,0x3a,0x2d,0x6f,0x97,0x4d,0x37,0xe6,0x8e,0x0c,0xb8,0xff,0x94,0x92};
+# define K19 K1
+# define P19 P1
+# define IV19 IV1
+# define C19 C1
+static const u8 A19[] = {
+ 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
+ 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+ 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
+ 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+ 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
+ 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+ 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
+ 0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55,
+ 0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07,
+ 0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d,
+ 0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9,
+ 0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa,
+ 0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d,
+ 0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38,
+ 0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a,
+ 0xbc, 0xc9, 0xf6, 0x62, 0x89, 0x80, 0x15, 0xad
+};
+
+static const u8 T19[] = {
+ 0x5f, 0xea, 0x79, 0x3a, 0x2d, 0x6f, 0x97, 0x4d,
+ 0x37, 0xe6, 0x8e, 0x0c, 0xb8, 0xff, 0x94, 0x92
+};
/* Test Case 20 */
-#define K20 K1
-#define A20 A1
-static const u8 IV20[64]={0xff,0xff,0xff,0xff}, /* this results in 0xff in counter LSB */
- P20[288],
- C20[]= {0x56,0xb3,0x37,0x3c,0xa9,0xef,0x6e,0x4a,0x2b,0x64,0xfe,0x1e,0x9a,0x17,0xb6,0x14,
- 0x25,0xf1,0x0d,0x47,0xa7,0x5a,0x5f,0xce,0x13,0xef,0xc6,0xbc,0x78,0x4a,0xf2,0x4f,
- 0x41,0x41,0xbd,0xd4,0x8c,0xf7,0xc7,0x70,0x88,0x7a,0xfd,0x57,0x3c,0xca,0x54,0x18,
- 0xa9,0xae,0xff,0xcd,0x7c,0x5c,0xed,0xdf,0xc6,0xa7,0x83,0x97,0xb9,0xa8,0x5b,0x49,
- 0x9d,0xa5,0x58,0x25,0x72,0x67,0xca,0xab,0x2a,0xd0,0xb2,0x3c,0xa4,0x76,0xa5,0x3c,
- 0xb1,0x7f,0xb4,0x1c,0x4b,0x8b,0x47,0x5c,0xb4,0xf3,0xf7,0x16,0x50,0x94,0xc2,0x29,
- 0xc9,0xe8,0xc4,0xdc,0x0a,0x2a,0x5f,0xf1,0x90,0x3e,0x50,0x15,0x11,0x22,0x13,0x76,
- 0xa1,0xcd,0xb8,0x36,0x4c,0x50,0x61,0xa2,0x0c,0xae,0x74,0xbc,0x4a,0xcd,0x76,0xce,
- 0xb0,0xab,0xc9,0xfd,0x32,0x17,0xef,0x9f,0x8c,0x90,0xbe,0x40,0x2d,0xdf,0x6d,0x86,
- 0x97,0xf4,0xf8,0x80,0xdf,0xf1,0x5b,0xfb,0x7a,0x6b,0x28,0x24,0x1e,0xc8,0xfe,0x18,
- 0x3c,0x2d,0x59,0xe3,0xf9,0xdf,0xff,0x65,0x3c,0x71,0x26,0xf0,0xac,0xb9,0xe6,0x42,
- 0x11,0xf4,0x2b,0xae,0x12,0xaf,0x46,0x2b,0x10,0x70,0xbe,0xf1,0xab,0x5e,0x36,0x06,
- 0x87,0x2c,0xa1,0x0d,0xee,0x15,0xb3,0x24,0x9b,0x1a,0x1b,0x95,0x8f,0x23,0x13,0x4c,
- 0x4b,0xcc,0xb7,0xd0,0x32,0x00,0xbc,0xe4,0x20,0xa2,0xf8,0xeb,0x66,0xdc,0xf3,0x64,
- 0x4d,0x14,0x23,0xc1,0xb5,0x69,0x90,0x03,0xc1,0x3e,0xce,0xf4,0xbf,0x38,0xa3,0xb6,
- 0x0e,0xed,0xc3,0x40,0x33,0xba,0xc1,0x90,0x27,0x83,0xdc,0x6d,0x89,0xe2,0xe7,0x74,
- 0x18,0x8a,0x43,0x9c,0x7e,0xbc,0xc0,0x67,0x2d,0xbd,0xa4,0xdd,0xcf,0xb2,0x79,0x46,
- 0x13,0xb0,0xbe,0x41,0x31,0x5e,0xf7,0x78,0x70,0x8a,0x70,0xee,0x7d,0x75,0x16,0x5c},
- T20[]= {0x8b,0x30,0x7f,0x6b,0x33,0x28,0x6d,0x0a,0xb0,0x26,0xa9,0xed,0x3f,0xe1,0xe8,0x5f};
-
-#define TEST_CASE(n) do { \
- u8 out[sizeof(P##n)]; \
- AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key); \
- CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); \
- CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
- memset(out,0,sizeof(out)); \
- if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
- if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out)); \
- if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \
- (C##n && memcmp(out,C##n,sizeof(out)))) \
- ret++, printf ("encrypt test#%d failed.\n",n); \
- CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
- memset(out,0,sizeof(out)); \
- if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
- if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out)); \
- if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \
- (P##n && memcmp(out,P##n,sizeof(out)))) \
- ret++, printf ("decrypt test#%d failed.\n",n); \
- } while(0)
+# define K20 K1
+# define A20 A1
+/* this results in 0xff in counter LSB */
+static const u8 IV20[64] = { 0xff, 0xff, 0xff, 0xff };
+
+static const u8 P20[288];
+static const u8 C20[] = {
+ 0x56, 0xb3, 0x37, 0x3c, 0xa9, 0xef, 0x6e, 0x4a,
+ 0x2b, 0x64, 0xfe, 0x1e, 0x9a, 0x17, 0xb6, 0x14,
+ 0x25, 0xf1, 0x0d, 0x47, 0xa7, 0x5a, 0x5f, 0xce,
+ 0x13, 0xef, 0xc6, 0xbc, 0x78, 0x4a, 0xf2, 0x4f,
+ 0x41, 0x41, 0xbd, 0xd4, 0x8c, 0xf7, 0xc7, 0x70,
+ 0x88, 0x7a, 0xfd, 0x57, 0x3c, 0xca, 0x54, 0x18,
+ 0xa9, 0xae, 0xff, 0xcd, 0x7c, 0x5c, 0xed, 0xdf,
+ 0xc6, 0xa7, 0x83, 0x97, 0xb9, 0xa8, 0x5b, 0x49,
+ 0x9d, 0xa5, 0x58, 0x25, 0x72, 0x67, 0xca, 0xab,
+ 0x2a, 0xd0, 0xb2, 0x3c, 0xa4, 0x76, 0xa5, 0x3c,
+ 0xb1, 0x7f, 0xb4, 0x1c, 0x4b, 0x8b, 0x47, 0x5c,
+ 0xb4, 0xf3, 0xf7, 0x16, 0x50, 0x94, 0xc2, 0x29,
+ 0xc9, 0xe8, 0xc4, 0xdc, 0x0a, 0x2a, 0x5f, 0xf1,
+ 0x90, 0x3e, 0x50, 0x15, 0x11, 0x22, 0x13, 0x76,
+ 0xa1, 0xcd, 0xb8, 0x36, 0x4c, 0x50, 0x61, 0xa2,
+ 0x0c, 0xae, 0x74, 0xbc, 0x4a, 0xcd, 0x76, 0xce,
+ 0xb0, 0xab, 0xc9, 0xfd, 0x32, 0x17, 0xef, 0x9f,
+ 0x8c, 0x90, 0xbe, 0x40, 0x2d, 0xdf, 0x6d, 0x86,
+ 0x97, 0xf4, 0xf8, 0x80, 0xdf, 0xf1, 0x5b, 0xfb,
+ 0x7a, 0x6b, 0x28, 0x24, 0x1e, 0xc8, 0xfe, 0x18,
+ 0x3c, 0x2d, 0x59, 0xe3, 0xf9, 0xdf, 0xff, 0x65,
+ 0x3c, 0x71, 0x26, 0xf0, 0xac, 0xb9, 0xe6, 0x42,
+ 0x11, 0xf4, 0x2b, 0xae, 0x12, 0xaf, 0x46, 0x2b,
+ 0x10, 0x70, 0xbe, 0xf1, 0xab, 0x5e, 0x36, 0x06,
+ 0x87, 0x2c, 0xa1, 0x0d, 0xee, 0x15, 0xb3, 0x24,
+ 0x9b, 0x1a, 0x1b, 0x95, 0x8f, 0x23, 0x13, 0x4c,
+ 0x4b, 0xcc, 0xb7, 0xd0, 0x32, 0x00, 0xbc, 0xe4,
+ 0x20, 0xa2, 0xf8, 0xeb, 0x66, 0xdc, 0xf3, 0x64,
+ 0x4d, 0x14, 0x23, 0xc1, 0xb5, 0x69, 0x90, 0x03,
+ 0xc1, 0x3e, 0xce, 0xf4, 0xbf, 0x38, 0xa3, 0xb6,
+ 0x0e, 0xed, 0xc3, 0x40, 0x33, 0xba, 0xc1, 0x90,
+ 0x27, 0x83, 0xdc, 0x6d, 0x89, 0xe2, 0xe7, 0x74,
+ 0x18, 0x8a, 0x43, 0x9c, 0x7e, 0xbc, 0xc0, 0x67,
+ 0x2d, 0xbd, 0xa4, 0xdd, 0xcf, 0xb2, 0x79, 0x46,
+ 0x13, 0xb0, 0xbe, 0x41, 0x31, 0x5e, 0xf7, 0x78,
+ 0x70, 0x8a, 0x70, 0xee, 0x7d, 0x75, 0x16, 0x5c
+};
+
+static const u8 T20[] = {
+ 0x8b, 0x30, 0x7f, 0x6b, 0x33, 0x28, 0x6d, 0x0a,
+ 0xb0, 0x26, 0xa9, 0xed, 0x3f, 0xe1, 0xe8, 0x5f
+};
+
+# define TEST_CASE(n) do { \
+ u8 out[sizeof(P##n)]; \
+ AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key); \
+ CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); \
+ CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
+ memset(out,0,sizeof(out)); \
+ if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
+ if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out)); \
+ if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \
+ (C##n && memcmp(out,C##n,sizeof(out)))) \
+ ret++, printf ("encrypt test#%d failed.\n",n); \
+ CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
+ memset(out,0,sizeof(out)); \
+ if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
+ if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out)); \
+ if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \
+ (P##n && memcmp(out,P##n,sizeof(out)))) \
+ ret++, printf ("decrypt test#%d failed.\n",n); \
+ } while(0)
int main()
{
- GCM128_CONTEXT ctx;
- AES_KEY key;
- int ret=0;
-
- TEST_CASE(1);
- TEST_CASE(2);
- TEST_CASE(3);
- TEST_CASE(4);
- TEST_CASE(5);
- TEST_CASE(6);
- TEST_CASE(7);
- TEST_CASE(8);
- TEST_CASE(9);
- TEST_CASE(10);
- TEST_CASE(11);
- TEST_CASE(12);
- TEST_CASE(13);
- TEST_CASE(14);
- TEST_CASE(15);
- TEST_CASE(16);
- TEST_CASE(17);
- TEST_CASE(18);
- TEST_CASE(19);
- TEST_CASE(20);
-
-#ifdef OPENSSL_CPUID_OBJ
- {
- size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
- union { u64 u; u8 c[1024]; } buf;
- int i;
-
- AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
- CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
- CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
-
- CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
- start = OPENSSL_rdtsc();
- CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
- gcm_t = OPENSSL_rdtsc() - start;
-
- CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
- &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
- (block128_f)AES_encrypt);
- start = OPENSSL_rdtsc();
- CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
- &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
- (block128_f)AES_encrypt);
- ctr_t = OPENSSL_rdtsc() - start;
-
- printf("%.2f-%.2f=%.2f\n",
- gcm_t/(double)sizeof(buf),
- ctr_t/(double)sizeof(buf),
- (gcm_t-ctr_t)/(double)sizeof(buf));
-#ifdef GHASH
- {
- void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
- const u8 *inp,size_t len) = ctx.ghash;
-
- GHASH((&ctx),buf.c,sizeof(buf));
- start = OPENSSL_rdtsc();
- for (i=0;i<100;++i) GHASH((&ctx),buf.c,sizeof(buf));
- gcm_t = OPENSSL_rdtsc() - start;
- printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
- }
-#endif
- }
-#endif
+ GCM128_CONTEXT ctx;
+ AES_KEY key;
+ int ret = 0;
+
+ TEST_CASE(1);
+ TEST_CASE(2);
+ TEST_CASE(3);
+ TEST_CASE(4);
+ TEST_CASE(5);
+ TEST_CASE(6);
+ TEST_CASE(7);
+ TEST_CASE(8);
+ TEST_CASE(9);
+ TEST_CASE(10);
+ TEST_CASE(11);
+ TEST_CASE(12);
+ TEST_CASE(13);
+ TEST_CASE(14);
+ TEST_CASE(15);
+ TEST_CASE(16);
+ TEST_CASE(17);
+ TEST_CASE(18);
+ TEST_CASE(19);
+ TEST_CASE(20);
+
+# ifdef OPENSSL_CPUID_OBJ
+ {
+ size_t start, stop, gcm_t, ctr_t, OPENSSL_rdtsc();
+ union {
+ u64 u;
+ u8 c[1024];
+ } buf;
+ int i;
+
+ AES_set_encrypt_key(K1, sizeof(K1) * 8, &key);
+ CRYPTO_gcm128_init(&ctx, &key, (block128_f) AES_encrypt);
+ CRYPTO_gcm128_setiv(&ctx, IV1, sizeof(IV1));
+
+ CRYPTO_gcm128_encrypt(&ctx, buf.c, buf.c, sizeof(buf));
+ start = OPENSSL_rdtsc();
+ CRYPTO_gcm128_encrypt(&ctx, buf.c, buf.c, sizeof(buf));
+ gcm_t = OPENSSL_rdtsc() - start;
+
+ CRYPTO_ctr128_encrypt(buf.c, buf.c, sizeof(buf),
+ &key, ctx.Yi.c, ctx.EKi.c, &ctx.mres,
+ (block128_f) AES_encrypt);
+ start = OPENSSL_rdtsc();
+ CRYPTO_ctr128_encrypt(buf.c, buf.c, sizeof(buf),
+ &key, ctx.Yi.c, ctx.EKi.c, &ctx.mres,
+ (block128_f) AES_encrypt);
+ ctr_t = OPENSSL_rdtsc() - start;
+
+ printf("%.2f-%.2f=%.2f\n",
+ gcm_t / (double)sizeof(buf),
+ ctr_t / (double)sizeof(buf),
+ (gcm_t - ctr_t) / (double)sizeof(buf));
+# ifdef GHASH
+ {
+ void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
+ const u8 *inp, size_t len) = ctx.ghash;
+
+ GHASH((&ctx), buf.c, sizeof(buf));
+ start = OPENSSL_rdtsc();
+ for (i = 0; i < 100; ++i)
+ GHASH((&ctx), buf.c, sizeof(buf));
+ gcm_t = OPENSSL_rdtsc() - start;
+ printf("%.2f\n", gcm_t / (double)sizeof(buf) / (double)i);
+ }
+# endif
+ }
+# endif
- return ret;
+ return ret;
}
#endif
diff --git a/openssl/crypto/modes/modes.h b/openssl/crypto/modes/modes.h
index 7773c2542..fd488499a 100644
--- a/openssl/crypto/modes/modes.h
+++ b/openssl/crypto/modes/modes.h
@@ -10,132 +10,154 @@
#ifdef __cplusplus
extern "C" {
#endif
-typedef void (*block128_f)(const unsigned char in[16],
- unsigned char out[16],
- const void *key);
+typedef void (*block128_f) (const unsigned char in[16],
+ unsigned char out[16], const void *key);
-typedef void (*cbc128_f)(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], int enc);
+typedef void (*cbc128_f) (const unsigned char *in, unsigned char *out,
+ size_t len, const void *key,
+ unsigned char ivec[16], int enc);
-typedef void (*ctr128_f)(const unsigned char *in, unsigned char *out,
- size_t blocks, const void *key,
- const unsigned char ivec[16]);
+typedef void (*ctr128_f) (const unsigned char *in, unsigned char *out,
+ size_t blocks, const void *key,
+ const unsigned char ivec[16]);
-typedef void (*ccm128_f)(const unsigned char *in, unsigned char *out,
- size_t blocks, const void *key,
- const unsigned char ivec[16],unsigned char cmac[16]);
+typedef void (*ccm128_f) (const unsigned char *in, unsigned char *out,
+ size_t blocks, const void *key,
+ const unsigned char ivec[16],
+ unsigned char cmac[16]);
void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], block128_f block);
+ size_t len, const void *key,
+ unsigned char ivec[16], block128_f block);
void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], block128_f block);
+ size_t len, const void *key,
+ unsigned char ivec[16], block128_f block);
void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], unsigned char ecount_buf[16],
- unsigned int *num, block128_f block);
+ size_t len, const void *key,
+ unsigned char ivec[16],
+ unsigned char ecount_buf[16], unsigned int *num,
+ block128_f block);
void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], unsigned char ecount_buf[16],
- unsigned int *num, ctr128_f ctr);
+ size_t len, const void *key,
+ unsigned char ivec[16],
+ unsigned char ecount_buf[16],
+ unsigned int *num, ctr128_f ctr);
void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], int *num,
- block128_f block);
+ size_t len, const void *key,
+ unsigned char ivec[16], int *num,
+ block128_f block);
void CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], int *num,
- int enc, block128_f block);
+ size_t len, const void *key,
+ unsigned char ivec[16], int *num,
+ int enc, block128_f block);
void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out,
- size_t length, const void *key,
- unsigned char ivec[16], int *num,
- int enc, block128_f block);
+ size_t length, const void *key,
+ unsigned char ivec[16], int *num,
+ int enc, block128_f block);
void CRYPTO_cfb128_1_encrypt(const unsigned char *in, unsigned char *out,
- size_t bits, const void *key,
- unsigned char ivec[16], int *num,
- int enc, block128_f block);
-
-size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], block128_f block);
+ size_t bits, const void *key,
+ unsigned char ivec[16], int *num,
+ int enc, block128_f block);
+
+size_t CRYPTO_cts128_encrypt_block(const unsigned char *in,
+ unsigned char *out, size_t len,
+ const void *key, unsigned char ivec[16],
+ block128_f block);
size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], cbc128_f cbc);
-size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], block128_f block);
+ size_t len, const void *key,
+ unsigned char ivec[16], cbc128_f cbc);
+size_t CRYPTO_cts128_decrypt_block(const unsigned char *in,
+ unsigned char *out, size_t len,
+ const void *key, unsigned char ivec[16],
+ block128_f block);
size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], cbc128_f cbc);
-
-size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], block128_f block);
+ size_t len, const void *key,
+ unsigned char ivec[16], cbc128_f cbc);
+
+size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in,
+ unsigned char *out, size_t len,
+ const void *key,
+ unsigned char ivec[16],
+ block128_f block);
size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], cbc128_f cbc);
-size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], block128_f block);
+ size_t len, const void *key,
+ unsigned char ivec[16], cbc128_f cbc);
+size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in,
+ unsigned char *out, size_t len,
+ const void *key,
+ unsigned char ivec[16],
+ block128_f block);
size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], cbc128_f cbc);
+ size_t len, const void *key,
+ unsigned char ivec[16], cbc128_f cbc);
typedef struct gcm128_context GCM128_CONTEXT;
GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block);
-void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block);
+void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block);
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
- size_t len);
+ size_t len);
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
- size_t len);
+ size_t len);
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
- const unsigned char *in, unsigned char *out,
- size_t len);
+ const unsigned char *in, unsigned char *out,
+ size_t len);
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
- const unsigned char *in, unsigned char *out,
- size_t len);
+ const unsigned char *in, unsigned char *out,
+ size_t len);
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
- const unsigned char *in, unsigned char *out,
- size_t len, ctr128_f stream);
+ const unsigned char *in, unsigned char *out,
+ size_t len, ctr128_f stream);
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
- const unsigned char *in, unsigned char *out,
- size_t len, ctr128_f stream);
-int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
- size_t len);
+ const unsigned char *in, unsigned char *out,
+ size_t len, ctr128_f stream);
+int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
+ size_t len);
void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len);
void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx);
typedef struct ccm128_context CCM128_CONTEXT;
void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
- unsigned int M, unsigned int L, void *key,block128_f block);
-int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
- const unsigned char *nonce, size_t nlen, size_t mlen);
-void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
- const unsigned char *aad, size_t alen);
-int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
- const unsigned char *inp, unsigned char *out, size_t len);
-int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
- const unsigned char *inp, unsigned char *out, size_t len);
-int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
- const unsigned char *inp, unsigned char *out, size_t len,
- ccm128_f stream);
-int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
- const unsigned char *inp, unsigned char *out, size_t len,
- ccm128_f stream);
+ unsigned int M, unsigned int L, void *key,
+ block128_f block);
+int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx, const unsigned char *nonce,
+ size_t nlen, size_t mlen);
+void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx, const unsigned char *aad,
+ size_t alen);
+int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx, const unsigned char *inp,
+ unsigned char *out, size_t len);
+int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx, const unsigned char *inp,
+ unsigned char *out, size_t len);
+int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx, const unsigned char *inp,
+ unsigned char *out, size_t len,
+ ccm128_f stream);
+int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx, const unsigned char *inp,
+ unsigned char *out, size_t len,
+ ccm128_f stream);
size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len);
typedef struct xts128_context XTS128_CONTEXT;
-int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
- const unsigned char *inp, unsigned char *out, size_t len, int enc);
+int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx,
+ const unsigned char iv[16],
+ const unsigned char *inp, unsigned char *out,
+ size_t len, int enc);
+
+size_t CRYPTO_128_wrap(void *key, const unsigned char *iv,
+ unsigned char *out,
+ const unsigned char *in, size_t inlen,
+ block128_f block);
+
+size_t CRYPTO_128_unwrap(void *key, const unsigned char *iv,
+ unsigned char *out,
+ const unsigned char *in, size_t inlen,
+ block128_f block);
+
#ifdef __cplusplus
}
#endif
diff --git a/openssl/crypto/modes/modes_lcl.h b/openssl/crypto/modes/modes_lcl.h
index 9d83e1284..900f54ca2 100644
--- a/openssl/crypto/modes/modes_lcl.h
+++ b/openssl/crypto/modes/modes_lcl.h
@@ -7,122 +7,137 @@
#include <openssl/modes.h>
-
#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
typedef __int64 i64;
typedef unsigned __int64 u64;
-#define U64(C) C##UI64
+# define U64(C) C##UI64
#elif defined(__arch64__)
typedef long i64;
typedef unsigned long u64;
-#define U64(C) C##UL
+# define U64(C) C##UL
#else
typedef long long i64;
typedef unsigned long long u64;
-#define U64(C) C##ULL
+# define U64(C) C##ULL
#endif
typedef unsigned int u32;
typedef unsigned char u8;
#define STRICT_ALIGNMENT 1
-#if defined(__i386) || defined(__i386__) || \
- defined(__x86_64) || defined(__x86_64__) || \
- defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
- defined(__s390__) || defined(__s390x__)
-# undef STRICT_ALIGNMENT
+#ifndef PEDANTIC
+# if defined(__i386) || defined(__i386__) || \
+ defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
+ defined(__aarch64__) || \
+ defined(__s390__) || defined(__s390x__)
+# undef STRICT_ALIGNMENT
+# endif
#endif
#if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
-#if defined(__GNUC__) && __GNUC__>=2
-# if defined(__x86_64) || defined(__x86_64__)
-# define BSWAP8(x) ({ u64 ret=(x); \
- asm ("bswapq %0" \
- : "+r"(ret)); ret; })
-# define BSWAP4(x) ({ u32 ret=(x); \
- asm ("bswapl %0" \
- : "+r"(ret)); ret; })
-# elif (defined(__i386) || defined(__i386__)) && !defined(I386_ONLY)
-# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \
- asm ("bswapl %0; bswapl %1" \
- : "+r"(hi),"+r"(lo)); \
- (u64)hi<<32|lo; })
-# define BSWAP4(x) ({ u32 ret=(x); \
- asm ("bswapl %0" \
- : "+r"(ret)); ret; })
-# elif (defined(__arm__) || defined(__arm)) && !defined(STRICT_ALIGNMENT)
-# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \
- asm ("rev %0,%0; rev %1,%1" \
- : "+r"(hi),"+r"(lo)); \
- (u64)hi<<32|lo; })
-# define BSWAP4(x) ({ u32 ret; \
- asm ("rev %0,%1" \
- : "=r"(ret) : "r"((u32)(x))); \
- ret; })
+# if defined(__GNUC__) && __GNUC__>=2
+# if defined(__x86_64) || defined(__x86_64__)
+# define BSWAP8(x) ({ u64 ret=(x); \
+ asm ("bswapq %0" \
+ : "+r"(ret)); ret; })
+# define BSWAP4(x) ({ u32 ret=(x); \
+ asm ("bswapl %0" \
+ : "+r"(ret)); ret; })
+# elif (defined(__i386) || defined(__i386__)) && !defined(I386_ONLY)
+# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \
+ asm ("bswapl %0; bswapl %1" \
+ : "+r"(hi),"+r"(lo)); \
+ (u64)hi<<32|lo; })
+# define BSWAP4(x) ({ u32 ret=(x); \
+ asm ("bswapl %0" \
+ : "+r"(ret)); ret; })
+# elif defined(__aarch64__)
+# define BSWAP8(x) ({ u64 ret; \
+ asm ("rev %0,%1" \
+ : "=r"(ret) : "r"(x)); ret; })
+# define BSWAP4(x) ({ u32 ret; \
+ asm ("rev %w0,%w1" \
+ : "=r"(ret) : "r"(x)); ret; })
+# elif (defined(__arm__) || defined(__arm)) && !defined(STRICT_ALIGNMENT)
+# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \
+ asm ("rev %0,%0; rev %1,%1" \
+ : "+r"(hi),"+r"(lo)); \
+ (u64)hi<<32|lo; })
+# define BSWAP4(x) ({ u32 ret; \
+ asm ("rev %0,%1" \
+ : "=r"(ret) : "r"((u32)(x))); \
+ ret; })
+# endif
+# elif defined(_MSC_VER)
+# if _MSC_VER>=1300
+# pragma intrinsic(_byteswap_uint64,_byteswap_ulong)
+# define BSWAP8(x) _byteswap_uint64((u64)(x))
+# define BSWAP4(x) _byteswap_ulong((u32)(x))
+# elif defined(_M_IX86)
+__inline u32 _bswap4(u32 val)
+{
+_asm mov eax, val _asm bswap eax}
+# define BSWAP4(x) _bswap4(x)
+# endif
# endif
-#elif defined(_MSC_VER)
-# if _MSC_VER>=1300
-# pragma intrinsic(_byteswap_uint64,_byteswap_ulong)
-# define BSWAP8(x) _byteswap_uint64((u64)(x))
-# define BSWAP4(x) _byteswap_ulong((u32)(x))
-# elif defined(_M_IX86)
- __inline u32 _bswap4(u32 val) {
- _asm mov eax,val
- _asm bswap eax
- }
-# define BSWAP4(x) _bswap4(x)
-# endif
-#endif
#endif
-
#if defined(BSWAP4) && !defined(STRICT_ALIGNMENT)
-#define GETU32(p) BSWAP4(*(const u32 *)(p))
-#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
+# define GETU32(p) BSWAP4(*(const u32 *)(p))
+# define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
#else
-#define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
-#define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))
+# define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
+# define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))
#endif
+/*- GCM definitions */ typedef struct {
+ u64 hi, lo;
+} u128;
-/* GCM definitions */
-
-typedef struct { u64 hi,lo; } u128;
-
-#ifdef TABLE_BITS
-#undef TABLE_BITS
+#ifdef TABLE_BITS
+# undef TABLE_BITS
#endif
/*
* Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
* never be set to 8 [or 1]. For further information see gcm128.c.
*/
-#define TABLE_BITS 4
+#define TABLE_BITS 4
struct gcm128_context {
- /* Following 6 names follow names in GCM specification */
- union { u64 u[2]; u32 d[4]; u8 c[16]; size_t t[16/sizeof(size_t)]; }
- Yi,EKi,EK0,len,Xi,H;
- /* Relative position of Xi, H and pre-computed Htable is used
- * in some assembler modules, i.e. don't change the order! */
+ /* Following 6 names follow names in GCM specification */
+ union {
+ u64 u[2];
+ u32 d[4];
+ u8 c[16];
+ size_t t[16 / sizeof(size_t)];
+ } Yi, EKi, EK0, len, Xi, H;
+ /*
+ * Relative position of Xi, H and pre-computed Htable is used in some
+ * assembler modules, i.e. don't change the order!
+ */
#if TABLE_BITS==8
- u128 Htable[256];
+ u128 Htable[256];
#else
- u128 Htable[16];
- void (*gmult)(u64 Xi[2],const u128 Htable[16]);
- void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+ u128 Htable[16];
+ void (*gmult) (u64 Xi[2], const u128 Htable[16]);
+ void (*ghash) (u64 Xi[2], const u128 Htable[16], const u8 *inp,
+ size_t len);
#endif
- unsigned int mres, ares;
- block128_f block;
- void *key;
+ unsigned int mres, ares;
+ block128_f block;
+ void *key;
};
struct xts128_context {
- void *key1, *key2;
- block128_f block1,block2;
+ void *key1, *key2;
+ block128_f block1, block2;
};
struct ccm128_context {
- union { u64 u[2]; u8 c[16]; } nonce, cmac;
- u64 blocks;
- block128_f block;
- void *key;
+ union {
+ u64 u[2];
+ u8 c[16];
+ } nonce, cmac;
+ u64 blocks;
+ block128_f block;
+ void *key;
};
-
diff --git a/openssl/crypto/modes/ofb128.c b/openssl/crypto/modes/ofb128.c
index 01c01702c..4dbaccd7a 100644
--- a/openssl/crypto/modes/ofb128.c
+++ b/openssl/crypto/modes/ofb128.c
@@ -6,7 +6,7 @@
* are met:
*
* 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
+ * notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
@@ -59,63 +59,66 @@
#endif
#include <assert.h>
-/* The input and output encrypted as though 128bit ofb mode is being
- * used. The extra state information to record how much of the
- * 128bit block we have used is contained in *num;
+/*
+ * The input and output encrypted as though 128bit ofb mode is being used.
+ * The extra state information to record how much of the 128bit block we have
+ * used is contained in *num;
*/
void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out,
- size_t len, const void *key,
- unsigned char ivec[16], int *num,
- block128_f block)
+ size_t len, const void *key,
+ unsigned char ivec[16], int *num, block128_f block)
{
- unsigned int n;
- size_t l=0;
+ unsigned int n;
+ size_t l = 0;
- assert(in && out && key && ivec && num);
+ assert(in && out && key && ivec && num);
- n = *num;
+ n = *num;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
- if (16%sizeof(size_t) == 0) do { /* always true actually */
- while (n && len) {
- *(out++) = *(in++) ^ ivec[n];
- --len;
- n = (n+1) % 16;
- }
-#if defined(STRICT_ALIGNMENT)
- if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0)
- break;
-#endif
- while (len>=16) {
- (*block)(ivec, ivec, key);
- for (; n<16; n+=sizeof(size_t))
- *(size_t*)(out+n) =
- *(size_t*)(in+n) ^ *(size_t*)(ivec+n);
- len -= 16;
- out += 16;
- in += 16;
- n = 0;
- }
- if (len) {
- (*block)(ivec, ivec, key);
- while (len--) {
- out[n] = in[n] ^ ivec[n];
- ++n;
- }
- }
- *num = n;
- return;
- } while(0);
- /* the rest would be commonly eliminated by x86* compiler */
+ if (16 % sizeof(size_t) == 0) { /* always true actually */
+ do {
+ while (n && len) {
+ *(out++) = *(in++) ^ ivec[n];
+ --len;
+ n = (n + 1) % 16;
+ }
+# if defined(STRICT_ALIGNMENT)
+ if (((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) !=
+ 0)
+ break;
+# endif
+ while (len >= 16) {
+ (*block) (ivec, ivec, key);
+ for (; n < 16; n += sizeof(size_t))
+ *(size_t *)(out + n) =
+ *(size_t *)(in + n) ^ *(size_t *)(ivec + n);
+ len -= 16;
+ out += 16;
+ in += 16;
+ n = 0;
+ }
+ if (len) {
+ (*block) (ivec, ivec, key);
+ while (len--) {
+ out[n] = in[n] ^ ivec[n];
+ ++n;
+ }
+ }
+ *num = n;
+ return;
+ } while (0);
+ }
+ /* the rest would be commonly eliminated by x86* compiler */
#endif
- while (l<len) {
- if (n==0) {
- (*block)(ivec, ivec, key);
- }
- out[l] = in[l] ^ ivec[n];
- ++l;
- n = (n+1) % 16;
- }
+ while (l < len) {
+ if (n == 0) {
+ (*block) (ivec, ivec, key);
+ }
+ out[l] = in[l] ^ ivec[n];
+ ++l;
+ n = (n + 1) % 16;
+ }
- *num=n;
+ *num = n;
}
diff --git a/openssl/crypto/modes/wrap128.c b/openssl/crypto/modes/wrap128.c
new file mode 100755
index 000000000..4dcaf0326
--- /dev/null
+++ b/openssl/crypto/modes/wrap128.c
@@ -0,0 +1,138 @@
+/* crypto/modes/wrap128.c */
+/*
+ * Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2013 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include "cryptlib.h"
+#include <openssl/modes.h>
+
+static const unsigned char default_iv[] = {
+ 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6,
+};
+
+/*
+ * Input size limit: lower than maximum of standards but far larger than
+ * anything that will be used in practice.
+ */
+#define CRYPTO128_WRAP_MAX (1UL << 31)
+
+size_t CRYPTO_128_wrap(void *key, const unsigned char *iv,
+ unsigned char *out,
+ const unsigned char *in, size_t inlen,
+ block128_f block)
+{
+ unsigned char *A, B[16], *R;
+ size_t i, j, t;
+ if ((inlen & 0x7) || (inlen < 8) || (inlen > CRYPTO128_WRAP_MAX))
+ return 0;
+ A = B;
+ t = 1;
+ memcpy(out + 8, in, inlen);
+ if (!iv)
+ iv = default_iv;
+
+ memcpy(A, iv, 8);
+
+ for (j = 0; j < 6; j++) {
+ R = out + 8;
+ for (i = 0; i < inlen; i += 8, t++, R += 8) {
+ memcpy(B + 8, R, 8);
+ block(B, B, key);
+ A[7] ^= (unsigned char)(t & 0xff);
+ if (t > 0xff) {
+ A[6] ^= (unsigned char)((t >> 8) & 0xff);
+ A[5] ^= (unsigned char)((t >> 16) & 0xff);
+ A[4] ^= (unsigned char)((t >> 24) & 0xff);
+ }
+ memcpy(R, B + 8, 8);
+ }
+ }
+ memcpy(out, A, 8);
+ return inlen + 8;
+}
+
+size_t CRYPTO_128_unwrap(void *key, const unsigned char *iv,
+ unsigned char *out,
+ const unsigned char *in, size_t inlen,
+ block128_f block)
+{
+ unsigned char *A, B[16], *R;
+ size_t i, j, t;
+ inlen -= 8;
+ if ((inlen & 0x7) || (inlen < 16) || (inlen > CRYPTO128_WRAP_MAX))
+ return 0;
+ A = B;
+ t = 6 * (inlen >> 3);
+ memcpy(A, in, 8);
+ memcpy(out, in + 8, inlen);
+ for (j = 0; j < 6; j++) {
+ R = out + inlen - 8;
+ for (i = 0; i < inlen; i += 8, t--, R -= 8) {
+ A[7] ^= (unsigned char)(t & 0xff);
+ if (t > 0xff) {
+ A[6] ^= (unsigned char)((t >> 8) & 0xff);
+ A[5] ^= (unsigned char)((t >> 16) & 0xff);
+ A[4] ^= (unsigned char)((t >> 24) & 0xff);
+ }
+ memcpy(B + 8, R, 8);
+ block(B, B, key);
+ memcpy(R, B + 8, 8);
+ }
+ }
+ if (!iv)
+ iv = default_iv;
+ if (memcmp(A, iv, 8)) {
+ OPENSSL_cleanse(out, inlen);
+ return 0;
+ }
+ return inlen;
+}
diff --git a/openssl/crypto/modes/xts128.c b/openssl/crypto/modes/xts128.c
index 9cf27a25e..8f2af588b 100644
--- a/openssl/crypto/modes/xts128.c
+++ b/openssl/crypto/modes/xts128.c
@@ -6,7 +6,7 @@
* are met:
*
* 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
+ * notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
@@ -58,130 +58,147 @@
#endif
#include <assert.h>
-int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
- const unsigned char *inp, unsigned char *out,
- size_t len, int enc)
+int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx,
+ const unsigned char iv[16],
+ const unsigned char *inp, unsigned char *out,
+ size_t len, int enc)
{
- const union { long one; char little; } is_endian = {1};
- union { u64 u[2]; u32 d[4]; u8 c[16]; } tweak, scratch;
- unsigned int i;
-
- if (len<16) return -1;
-
- memcpy(tweak.c, iv, 16);
-
- (*ctx->block2)(tweak.c,tweak.c,ctx->key2);
-
- if (!enc && (len%16)) len-=16;
-
- while (len>=16) {
+ const union {
+ long one;
+ char little;
+ } is_endian = {
+ 1
+ };
+ union {
+ u64 u[2];
+ u32 d[4];
+ u8 c[16];
+ } tweak, scratch;
+ unsigned int i;
+
+ if (len < 16)
+ return -1;
+
+ memcpy(tweak.c, iv, 16);
+
+ (*ctx->block2) (tweak.c, tweak.c, ctx->key2);
+
+ if (!enc && (len % 16))
+ len -= 16;
+
+ while (len >= 16) {
#if defined(STRICT_ALIGNMENT)
- memcpy(scratch.c,inp,16);
- scratch.u[0] ^= tweak.u[0];
- scratch.u[1] ^= tweak.u[1];
+ memcpy(scratch.c, inp, 16);
+ scratch.u[0] ^= tweak.u[0];
+ scratch.u[1] ^= tweak.u[1];
#else
- scratch.u[0] = ((u64*)inp)[0]^tweak.u[0];
- scratch.u[1] = ((u64*)inp)[1]^tweak.u[1];
+ scratch.u[0] = ((u64 *)inp)[0] ^ tweak.u[0];
+ scratch.u[1] = ((u64 *)inp)[1] ^ tweak.u[1];
#endif
- (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
+ (*ctx->block1) (scratch.c, scratch.c, ctx->key1);
#if defined(STRICT_ALIGNMENT)
- scratch.u[0] ^= tweak.u[0];
- scratch.u[1] ^= tweak.u[1];
- memcpy(out,scratch.c,16);
+ scratch.u[0] ^= tweak.u[0];
+ scratch.u[1] ^= tweak.u[1];
+ memcpy(out, scratch.c, 16);
#else
- ((u64*)out)[0] = scratch.u[0]^=tweak.u[0];
- ((u64*)out)[1] = scratch.u[1]^=tweak.u[1];
+ ((u64 *)out)[0] = scratch.u[0] ^= tweak.u[0];
+ ((u64 *)out)[1] = scratch.u[1] ^= tweak.u[1];
#endif
- inp += 16;
- out += 16;
- len -= 16;
-
- if (len==0) return 0;
-
- if (is_endian.little) {
- unsigned int carry,res;
-
- res = 0x87&(((int)tweak.d[3])>>31);
- carry = (unsigned int)(tweak.u[0]>>63);
- tweak.u[0] = (tweak.u[0]<<1)^res;
- tweak.u[1] = (tweak.u[1]<<1)|carry;
- }
- else {
- size_t c;
-
- for (c=0,i=0;i<16;++i) {
- /*+ substitutes for |, because c is 1 bit */
- c += ((size_t)tweak.c[i])<<1;
- tweak.c[i] = (u8)c;
- c = c>>8;
- }
- tweak.c[0] ^= (u8)(0x87&(0-c));
- }
- }
- if (enc) {
- for (i=0;i<len;++i) {
- u8 c = inp[i];
- out[i] = scratch.c[i];
- scratch.c[i] = c;
- }
- scratch.u[0] ^= tweak.u[0];
- scratch.u[1] ^= tweak.u[1];
- (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
- scratch.u[0] ^= tweak.u[0];
- scratch.u[1] ^= tweak.u[1];
- memcpy(out-16,scratch.c,16);
- }
- else {
- union { u64 u[2]; u8 c[16]; } tweak1;
-
- if (is_endian.little) {
- unsigned int carry,res;
-
- res = 0x87&(((int)tweak.d[3])>>31);
- carry = (unsigned int)(tweak.u[0]>>63);
- tweak1.u[0] = (tweak.u[0]<<1)^res;
- tweak1.u[1] = (tweak.u[1]<<1)|carry;
- }
- else {
- size_t c;
-
- for (c=0,i=0;i<16;++i) {
- /*+ substitutes for |, because c is 1 bit */
- c += ((size_t)tweak.c[i])<<1;
- tweak1.c[i] = (u8)c;
- c = c>>8;
- }
- tweak1.c[0] ^= (u8)(0x87&(0-c));
- }
+ inp += 16;
+ out += 16;
+ len -= 16;
+
+ if (len == 0)
+ return 0;
+
+ if (is_endian.little) {
+ unsigned int carry, res;
+
+ res = 0x87 & (((int)tweak.d[3]) >> 31);
+ carry = (unsigned int)(tweak.u[0] >> 63);
+ tweak.u[0] = (tweak.u[0] << 1) ^ res;
+ tweak.u[1] = (tweak.u[1] << 1) | carry;
+ } else {
+ size_t c;
+
+ for (c = 0, i = 0; i < 16; ++i) {
+ /*
+ * + substitutes for |, because c is 1 bit
+ */
+ c += ((size_t)tweak.c[i]) << 1;
+ tweak.c[i] = (u8)c;
+ c = c >> 8;
+ }
+ tweak.c[0] ^= (u8)(0x87 & (0 - c));
+ }
+ }
+ if (enc) {
+ for (i = 0; i < len; ++i) {
+ u8 c = inp[i];
+ out[i] = scratch.c[i];
+ scratch.c[i] = c;
+ }
+ scratch.u[0] ^= tweak.u[0];
+ scratch.u[1] ^= tweak.u[1];
+ (*ctx->block1) (scratch.c, scratch.c, ctx->key1);
+ scratch.u[0] ^= tweak.u[0];
+ scratch.u[1] ^= tweak.u[1];
+ memcpy(out - 16, scratch.c, 16);
+ } else {
+ union {
+ u64 u[2];
+ u8 c[16];
+ } tweak1;
+
+ if (is_endian.little) {
+ unsigned int carry, res;
+
+ res = 0x87 & (((int)tweak.d[3]) >> 31);
+ carry = (unsigned int)(tweak.u[0] >> 63);
+ tweak1.u[0] = (tweak.u[0] << 1) ^ res;
+ tweak1.u[1] = (tweak.u[1] << 1) | carry;
+ } else {
+ size_t c;
+
+ for (c = 0, i = 0; i < 16; ++i) {
+ /*
+ * + substitutes for |, because c is 1 bit
+ */
+ c += ((size_t)tweak.c[i]) << 1;
+ tweak1.c[i] = (u8)c;
+ c = c >> 8;
+ }
+ tweak1.c[0] ^= (u8)(0x87 & (0 - c));
+ }
#if defined(STRICT_ALIGNMENT)
- memcpy(scratch.c,inp,16);
- scratch.u[0] ^= tweak1.u[0];
- scratch.u[1] ^= tweak1.u[1];
+ memcpy(scratch.c, inp, 16);
+ scratch.u[0] ^= tweak1.u[0];
+ scratch.u[1] ^= tweak1.u[1];
#else
- scratch.u[0] = ((u64*)inp)[0]^tweak1.u[0];
- scratch.u[1] = ((u64*)inp)[1]^tweak1.u[1];
+ scratch.u[0] = ((u64 *)inp)[0] ^ tweak1.u[0];
+ scratch.u[1] = ((u64 *)inp)[1] ^ tweak1.u[1];
#endif
- (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
- scratch.u[0] ^= tweak1.u[0];
- scratch.u[1] ^= tweak1.u[1];
-
- for (i=0;i<len;++i) {
- u8 c = inp[16+i];
- out[16+i] = scratch.c[i];
- scratch.c[i] = c;
- }
- scratch.u[0] ^= tweak.u[0];
- scratch.u[1] ^= tweak.u[1];
- (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
+ (*ctx->block1) (scratch.c, scratch.c, ctx->key1);
+ scratch.u[0] ^= tweak1.u[0];
+ scratch.u[1] ^= tweak1.u[1];
+
+ for (i = 0; i < len; ++i) {
+ u8 c = inp[16 + i];
+ out[16 + i] = scratch.c[i];
+ scratch.c[i] = c;
+ }
+ scratch.u[0] ^= tweak.u[0];
+ scratch.u[1] ^= tweak.u[1];
+ (*ctx->block1) (scratch.c, scratch.c, ctx->key1);
#if defined(STRICT_ALIGNMENT)
- scratch.u[0] ^= tweak.u[0];
- scratch.u[1] ^= tweak.u[1];
- memcpy (out,scratch.c,16);
+ scratch.u[0] ^= tweak.u[0];
+ scratch.u[1] ^= tweak.u[1];
+ memcpy(out, scratch.c, 16);
#else
- ((u64*)out)[0] = scratch.u[0]^tweak.u[0];
- ((u64*)out)[1] = scratch.u[1]^tweak.u[1];
+ ((u64 *)out)[0] = scratch.u[0] ^ tweak.u[0];
+ ((u64 *)out)[1] = scratch.u[1] ^ tweak.u[1];
#endif
- }
+ }
- return 0;
+ return 0;
}