1 files changed, 429 insertions, 0 deletions
diff --git a/openssl/crypto/modes/asm/ghash-armv4.pl b/openssl/crypto/modes/asm/ghash-armv4.pl
new file mode 100644
index 000000000..d91586ee2
--- /dev/null
+++ b/openssl/crypto/modes/asm/ghash-armv4.pl
@@ -0,0 +1,429 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# April 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+32 bytes shared table]. There is no
+# experimental performance data available yet. The only approximation
+# that can be made at this point is based on code size. Inner loop is
+# 32 instructions long and on single-issue core should execute in <40
+# cycles. Having verified that gcc 3.4 didn't unroll corresponding
+# loop, this assembler loop body was found to be ~3x smaller than
+# compiler-generated one...
+#
+# July 2010
+#
+# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
+# Cortex A8 core and ~25 cycles per processed byte (which was observed
+# to be ~3 times faster than gcc-generated code:-)
+#
+# February 2011
+#
+# Profiler-assisted and platform-specific optimization resulted in 7%
+# improvement on Cortex A8 core and ~23.5 cycles per byte.
+#
+# March 2011
+#
+# Add NEON implementation featuring polynomial multiplication, i.e. no
+# lookup tables involved. On Cortex A8 it was measured to process one
+# byte in 15 cycles or 55% faster than integer-only code.
+
+# ====================================================================
+# Note about "528B" variant. In ARM case it makes lesser sense to
+# implement it for following reasons:
+#
+# - performance improvement won't be anywhere near 50%, because 128-
+#   bit shift operation is neatly fused with 128-bit xor here, and
+#   "538B" variant would eliminate only 4-5 instructions out of 32
+#   in the inner loop (meaning that estimated improvement is ~15%);
+# - ARM-based systems are often embedded ones and extra memory
+#   consumption might be unappreciated (for so little improvement);
+#
+# Byte order [in]dependence. =========================================
+#
+# Caller is expected to maintain specific *dword* order in Htable,
+# namely with *least* significant dword of 128-bit value at *lower*
+# address. This differs completely from C code and has everything to
+# do with ldm instruction and order in which dwords are "consumed" by
+# algorithm. *Byte* order within these dwords in turn is whatever
+# *native* byte order on current platform. See gcm128.c for working
+# example...
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$Xi="r0";	# argument block
+$Htbl="r1";
+$inp="r2";
+$len="r3";
+
+$Zll="r4";	# variables
+$Zlh="r5";
+$Zhl="r6";
+$Zhh="r7";
+$Tll="r8";
+$Tlh="r9";
+$Thl="r10";
+$Thh="r11";
+$nlo="r12";
+################# r13 is stack pointer
+$nhi="r14";
+################# r15 is program counter
+
+$rem_4bit=$inp;	# used in gcm_gmult_4bit
+$cnt=$len;
+
+sub Zsmash() {
+  my $i=12;
+  my @args=@_;
+  for ($Zll,$Zlh,$Zhl,$Zhh) {
+    $code.=<<___;
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	$_,$_
+	str	$_,[$Xi,#$i]
+#elif defined(__ARMEB__)
+	str	$_,[$Xi,#$i]
+#else
+	mov	$Tlh,$_,lsr#8
+	strb	$_,[$Xi,#$i+3]
+	mov	$Thl,$_,lsr#16
+	strb	$Tlh,[$Xi,#$i+2]
+	mov	$Thh,$_,lsr#24
+	strb	$Thl,[$Xi,#$i+1]
+	strb	$Thh,[$Xi,#$i]
+#endif
+___
+    $code.="\t".shift(@args)."\n";
+    $i-=4;
+  }
+}
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+.code	32
+
+.type	rem_4bit,%object
+.align	5
+rem_4bit:
+.short	0x0000,0x1C20,0x3840,0x2460
+.short	0x7080,0x6CA0,0x48C0,0x54E0
+.short	0xE100,0xFD20,0xD940,0xC560
+.short	0x9180,0x8DA0,0xA9C0,0xB5E0
+.size	rem_4bit,.-rem_4bit
+
+.type	rem_4bit_get,%function
+rem_4bit_get:
+	sub	$rem_4bit,pc,#8
+	sub	$rem_4bit,$rem_4bit,#32	@ &rem_4bit
+	b	.Lrem_4bit_got
+	nop
+.size	rem_4bit_get,.-rem_4bit_get
+
+.global	gcm_ghash_4bit
+.type	gcm_ghash_4bit,%function
+gcm_ghash_4bit:
+	sub	r12,pc,#8
+	add	$len,$inp,$len		@ $len to point at the end
+	stmdb	sp!,{r3-r11,lr}		@ save $len/end too
+	sub	r12,r12,#48		@ &rem_4bit
+
+	ldmia	r12,{r4-r11}		@ copy rem_4bit ...
+	stmdb	sp!,{r4-r11}		@ ... to stack
+
+	ldrb	$nlo,[$inp,#15]
+	ldrb	$nhi,[$Xi,#15]
+.Louter:
+	eor	$nlo,$nlo,$nhi
+	and	$nhi,$nlo,#0xf0
+	and	$nlo,$nlo,#0x0f
+	mov	$cnt,#14
+
+	add	$Zhh,$Htbl,$nlo,lsl#4
+	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
+	add	$Thh,$Htbl,$nhi
+	ldrb	$nlo,[$inp,#14]
+
+	and	$nhi,$Zll,#0xf		@ rem
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
+	add	$nhi,$nhi,$nhi
+	eor	$Zll,$Tll,$Zll,lsr#4
+	ldrh	$Tll,[sp,$nhi]		@ rem_4bit[rem]
+	eor	$Zll,$Zll,$Zlh,lsl#28
+	ldrb	$nhi,[$Xi,#14]
+	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	eor	$Zlh,$Zlh,$Zhl,lsl#28
+	eor	$Zhl,$Thl,$Zhl,lsr#4
+	eor	$Zhl,$Zhl,$Zhh,lsl#28
+	eor	$Zhh,$Thh,$Zhh,lsr#4
+	eor	$nlo,$nlo,$nhi
+	and	$nhi,$nlo,#0xf0
+	and	$nlo,$nlo,#0x0f
+	eor	$Zhh,$Zhh,$Tll,lsl#16
+
+.Linner:
+	add	$Thh,$Htbl,$nlo,lsl#4
+	and	$nlo,$Zll,#0xf		@ rem
+	subs	$cnt,$cnt,#1
+	add	$nlo,$nlo,$nlo
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
+	eor	$Zll,$Tll,$Zll,lsr#4
+	eor	$Zll,$Zll,$Zlh,lsl#28
+	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	eor	$Zlh,$Zlh,$Zhl,lsl#28
+	ldrh	$Tll,[sp,$nlo]		@ rem_4bit[rem]
+	eor	$Zhl,$Thl,$Zhl,lsr#4
+	ldrplb	$nlo,[$inp,$cnt]
+	eor	$Zhl,$Zhl,$Zhh,lsl#28
+	eor	$Zhh,$Thh,$Zhh,lsr#4
+
+	add	$Thh,$Htbl,$nhi
+	and	$nhi,$Zll,#0xf		@ rem
+	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
+	add	$nhi,$nhi,$nhi
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
+	eor	$Zll,$Tll,$Zll,lsr#4
+	ldrplb	$Tll,[$Xi,$cnt]
+	eor	$Zll,$Zll,$Zlh,lsl#28
+	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	ldrh	$Tlh,[sp,$nhi]
+	eor	$Zlh,$Zlh,$Zhl,lsl#28
+	eor	$Zhl,$Thl,$Zhl,lsr#4
+	eor	$Zhl,$Zhl,$Zhh,lsl#28
+	eorpl	$nlo,$nlo,$Tll
+	eor	$Zhh,$Thh,$Zhh,lsr#4
+	andpl	$nhi,$nlo,#0xf0
+	andpl	$nlo,$nlo,#0x0f
+	eor	$Zhh,$Zhh,$Tlh,lsl#16	@ ^= rem_4bit[rem]
+	bpl	.Linner
+
+	ldr	$len,[sp,#32]		@ re-load $len/end
+	add	$inp,$inp,#16
+	mov	$nhi,$Zll
+___
+	&Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
+$code.=<<___;
+	bne	.Louter
+
+	add	sp,sp,#36
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r11,pc}
+#else
+	ldmia	sp!,{r4-r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	gcm_ghash_4bit,.-gcm_ghash_4bit
+
+.global	gcm_gmult_4bit
+.type	gcm_gmult_4bit,%function
+gcm_gmult_4bit:
+	stmdb	sp!,{r4-r11,lr}
+	ldrb	$nlo,[$Xi,#15]
+	b	rem_4bit_get
+.Lrem_4bit_got:
+	and	$nhi,$nlo,#0xf0
+	and	$nlo,$nlo,#0x0f
+	mov	$cnt,#14
+
+	add	$Zhh,$Htbl,$nlo,lsl#4
+	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
+	ldrb	$nlo,[$Xi,#14]
+
+	add	$Thh,$Htbl,$nhi
+	and	$nhi,$Zll,#0xf		@ rem
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
+	add	$nhi,$nhi,$nhi
+	eor	$Zll,$Tll,$Zll,lsr#4
+	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
+	eor	$Zll,$Zll,$Zlh,lsl#28
+	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	eor	$Zlh,$Zlh,$Zhl,lsl#28
+	eor	$Zhl,$Thl,$Zhl,lsr#4
+	eor	$Zhl,$Zhl,$Zhh,lsl#28
+	eor	$Zhh,$Thh,$Zhh,lsr#4
+	and	$nhi,$nlo,#0xf0
+	eor	$Zhh,$Zhh,$Tll,lsl#16
+	and	$nlo,$nlo,#0x0f
+
+.Loop:
+	add	$Thh,$Htbl,$nlo,lsl#4
+	and	$nlo,$Zll,#0xf		@ rem
+	subs	$cnt,$cnt,#1
+	add	$nlo,$nlo,$nlo
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
+	eor	$Zll,$Tll,$Zll,lsr#4
+	eor	$Zll,$Zll,$Zlh,lsl#28
+	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	eor	$Zlh,$Zlh,$Zhl,lsl#28
+	ldrh	$Tll,[$rem_4bit,$nlo]	@ rem_4bit[rem]
+	eor	$Zhl,$Thl,$Zhl,lsr#4
+	ldrplb	$nlo,[$Xi,$cnt]
+	eor	$Zhl,$Zhl,$Zhh,lsl#28
+	eor	$Zhh,$Thh,$Zhh,lsr#4
+
+	add	$Thh,$Htbl,$nhi
+	and	$nhi,$Zll,#0xf		@ rem
+	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
+	add	$nhi,$nhi,$nhi
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
+	eor	$Zll,$Tll,$Zll,lsr#4
+	eor	$Zll,$Zll,$Zlh,lsl#28
+	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
+	eor	$Zlh,$Zlh,$Zhl,lsl#28
+	eor	$Zhl,$Thl,$Zhl,lsr#4
+	eor	$Zhl,$Zhl,$Zhh,lsl#28
+	eor	$Zhh,$Thh,$Zhh,lsr#4
+	andpl	$nhi,$nlo,#0xf0
+	andpl	$nlo,$nlo,#0x0f
+	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
+	bpl	.Loop
+___
+	&Zsmash();
+$code.=<<___;
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r11,pc}
+#else
+	ldmia	sp!,{r4-r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	gcm_gmult_4bit,.-gcm_gmult_4bit
+___
+{
+my $cnt=$Htbl;	# $Htbl is used once in the very beginning
+
+my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
+my ($Qhi, $Qlo, $Z,  $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
+
+# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
+# in Zo. Or should I say "top bit", because GHASH is specified in
+# reverse bit order? Otherwise straightforward 128-bt H by one input
+# byte multiplication and modulo-reduction, times 16.
+
+sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
+sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
+sub Q()     { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
+
+$code.=<<___;
+#if __ARM_ARCH__>=7
+.fpu	neon
+
+.global	gcm_gmult_neon
+.type	gcm_gmult_neon,%function
+.align	4
+gcm_gmult_neon:
+	sub		$Htbl,#16		@ point at H in GCM128_CTX
+	vld1.64		`&Dhi("$IN")`,[$Xi,:64]!@ load Xi
+	vmov.i32	$mod,#0xe1		@ our irreducible polynomial
+	vld1.64		`&Dlo("$IN")`,[$Xi,:64]!
+	vshr.u64	$mod,#32
+	vldmia		$Htbl,{$Hhi-$Hlo}	@ load H
+	veor		$zero,$zero
+#ifdef __ARMEL__
+	vrev64.8	$IN,$IN
+#endif
+	veor		$Qpost,$Qpost
+	veor		$R,$R
+	mov		$cnt,#16
+	veor		$Z,$Z
+	mov		$len,#16
+	veor		$Zo,$Zo
+	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
+	b		.Linner_neon
+.size	gcm_gmult_neon,.-gcm_gmult_neon
+
+.global	gcm_ghash_neon
+.type	gcm_ghash_neon,%function
+.align	4
+gcm_ghash_neon:
+	vld1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ load Xi
+	vmov.i32	$mod,#0xe1		@ our irreducible polynomial
+	vld1.64		`&Dlo("$Z")`,[$Xi,:64]!
+	vshr.u64	$mod,#32
+	vldmia		$Xi,{$Hhi-$Hlo}		@ load H
+	veor		$zero,$zero
+	nop
+#ifdef __ARMEL__
+	vrev64.8	$Z,$Z
+#endif
+.Louter_neon:
+	vld1.64		`&Dhi($IN)`,[$inp]!	@ load inp
+	veor		$Qpost,$Qpost
+	vld1.64		`&Dlo($IN)`,[$inp]!
+	veor		$R,$R
+	mov		$cnt,#16
+#ifdef __ARMEL__
+	vrev64.8	$IN,$IN
+#endif
+	veor		$Zo,$Zo
+	veor		$IN,$Z			@ inp^=Xi
+	veor		$Z,$Z
+	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
+.Linner_neon:
+	subs		$cnt,$cnt,#1
+	vmull.p8	$Qlo,$Hlo,$xi		@ H.lo�Xi[i]
+	vmull.p8	$Qhi,$Hhi,$xi		@ H.hi�Xi[i]
+	vext.8		$IN,$zero,#1		@ IN>>=8
+
+	veor		$Z,$Qpost		@ modulo-scheduled part
+	vshl.i64	`&Dlo("$R")`,#48
+	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
+	veor		$T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
+
+	veor		`&Dhi("$Z")`,`&Dlo("$R")`
+	vuzp.8		$Qlo,$Qhi
+	vsli.8		$Zo,$T,#1		@ compose the "carry" byte
+	vext.8		$Z,$zero,#1		@ Z>>=8
+
+	vmull.p8	$R,$Zo,$mod		@ "carry"�0xe1
+	vshr.u8		$Zo,$T,#7		@ save Z's bottom bit
+	vext.8		$Qpost,$Qlo,$zero,#1	@ Qlo>>=8
+	veor		$Z,$Qhi
+	bne		.Linner_neon
+
+	veor		$Z,$Qpost		@ modulo-scheduled artefact
+	vshl.i64	`&Dlo("$R")`,#48
+	veor		`&Dhi("$Z")`,`&Dlo("$R")`
+
+	@ finalization, normalize Z:Zo
+	vand		$Zo,$mod		@ suffices to mask the bit
+	vshr.u64	`&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
+	vshl.i64	$Z,#1
+	subs		$len,#16
+	vorr		$Z,`&Q("$Zo")`		@ Z=Z:Zo<<1
+	bne		.Louter_neon
+
+#ifdef __ARMEL__
+	vrev64.8	$Z,$Z
+#endif
+	sub		$Xi,#16	
+	vst1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ write out Xi
+	vst1.64		`&Dlo("$Z")`,[$Xi,:64]
+
+	bx	lr
+.size	gcm_ghash_neon,.-gcm_ghash_neon
+#endif
+___
+}
+$code.=<<___;
+.asciz  "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align  2
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
+print $code;
+close STDOUT; # enforce flush