3 files changed, 72 insertions, 14 deletions
diff --git a/openssl/crypto/sha/asm/sha1-armv8.pl b/openssl/crypto/sha/asm/sha1-armv8.pl
index deb1238d3..c04432a54 100755
--- a/openssl/crypto/sha/asm/sha1-armv8.pl
+++ b/openssl/crypto/sha/asm/sha1-armv8.pl
@@ -14,10 +14,14 @@
 #
 #		hardware-assisted	software(*)
 # Apple A7	2.31			4.13 (+14%)
-# Cortex-A53	2.19			8.73 (+108%)
+# Cortex-A53	2.24			8.03 (+97%)
 # Cortex-A57	2.35			7.88 (+74%)
+# Denver	2.13			3.97 (+0%)(**)
+# X-Gene				8.80 (+200%)
 #
 # (*)	Software results are presented mostly for reference purposes.
+# (**)	Keep in mind that Denver relies on binary translation, which
+#	optimizes compiler output at run-time.
 
 $flavour = shift;
 open STDOUT,">".shift;
diff --git a/openssl/crypto/sha/asm/sha256-armv4.pl b/openssl/crypto/sha/asm/sha256-armv4.pl
index f14c9c3cb..4fee74d83 100644
--- a/openssl/crypto/sha/asm/sha256-armv4.pl
+++ b/openssl/crypto/sha/asm/sha256-armv4.pl
@@ -5,6 +5,8 @@
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Permission to use under GPL terms is granted.
 # ====================================================================
 
 # SHA256 block procedure for ARMv4. May 2007.
@@ -151,10 +153,24 @@ ___
 }
 
 $code=<<___;
-#include "arm_arch.h"
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
 
 .text
+#if __ARM_ARCH__<7
 .code	32
+#else
+.syntax unified
+# ifdef __thumb2__
+.thumb
+# else
+.code   32
+# endif
+#endif
 
 .type	K256,%object
 .align	5
@@ -177,7 +193,7 @@ K256:
 .word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 .size	K256,.-K256
 .word	0				@ terminator
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
 .LOPENSSL_armcap:
 .word	OPENSSL_armcap_P-sha256_block_data_order
 #endif
@@ -186,9 +202,12 @@ K256:
 .global	sha256_block_data_order
 .type	sha256_block_data_order,%function
 sha256_block_data_order:
+#if __ARM_ARCH__<7
 	sub	r3,pc,#8		@ sha256_block_data_order
-	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
-#if __ARM_MAX_ARCH__>=7
+#else
+	adr	r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
 	ldr	r12,.LOPENSSL_armcap
 	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
 	tst	r12,#ARMV8_SHA256
@@ -196,6 +215,7 @@ sha256_block_data_order:
 	tst	r12,#ARMV7_NEON
 	bne	.LNEON
 #endif
+	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
 	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
 	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
 	sub	$Ktbl,r3,#256+32	@ K256
@@ -213,6 +233,9 @@ for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
 $code.=".Lrounds_16_xx:\n";
 for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
 $code.=<<___;
+#if __ARM_ARCH__>=7
+	ite	eq			@ Thumb2 thing, sanity check in ARM
+#endif
 	ldreq	$t3,[sp,#16*4]		@ pull ctx
 	bne	.Lrounds_16_xx
 
@@ -429,16 +452,19 @@ $code.=<<___;
 .arch	armv7-a
 .fpu	neon
 
+.global	sha256_block_data_order_neon
 .type	sha256_block_data_order_neon,%function
 .align	4
 sha256_block_data_order_neon:
 .LNEON:
 	stmdb	sp!,{r4-r12,lr}
 
+	sub	$H,sp,#16*4+16
+	adr	$Ktbl,K256
+	bic	$H,$H,#15		@ align for 128-bit stores
 	mov	$t2,sp
-	sub	sp,sp,#16*4+16		@ alloca
-	sub	$Ktbl,r3,#256+32	@ K256
-	bic	sp,sp,#15		@ align for 128-bit stores
+	mov	sp,$H			@ alloca
+	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
 
 	vld1.8		{@X[0]},[$inp]!
 	vld1.8		{@X[1]},[$inp]!
@@ -490,11 +516,13 @@ $code.=<<___;
 	ldr		$t0,[sp,#72]
 	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
 	teq		$inp,$t0
+	it		eq
 	subeq		$inp,$inp,#64		@ avoid SEGV
 	vld1.8		{@X[0]},[$inp]!		@ load next input block
 	vld1.8		{@X[1]},[$inp]!
 	vld1.8		{@X[2]},[$inp]!
 	vld1.8		{@X[3]},[$inp]!
+	it		ne
 	strne		$inp,[sp,#68]
 	mov		$Xfer,sp
 ___
@@ -526,10 +554,12 @@ $code.=<<___;
 	str	$D,[$t1],#4
 	stmia	$t1,{$E-$H}
 
+	ittte	ne
 	movne	$Xfer,sp
 	ldrne	$t1,[sp,#0]
 	eorne	$t2,$t2,$t2
 	ldreq	sp,[sp,#76]			@ restore original sp
+	itt	ne
 	eorne	$t3,$B,$C
 	bne	.L_00_48
 
@@ -548,13 +578,26 @@ my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
 my $Ktbl="r3";
 
 $code.=<<___;
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
+# else
+#  define INST(a,b,c,d)	.byte	a,b,c,d
+# endif
+
 .type	sha256_block_data_order_armv8,%function
 .align	5
 sha256_block_data_order_armv8:
 .LARMv8:
 	vld1.32	{$ABCD,$EFGH},[$ctx]
-	sub	$Ktbl,r3,#sha256_block_data_order-K256
+# ifdef __thumb2__
+	adr	$Ktbl,.LARMv8
+	sub	$Ktbl,$Ktbl,#.LARMv8-K256
+# else
+	adrl	$Ktbl,K256
+# endif
+	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
 
 .Loop_v8:
 	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
@@ -607,6 +650,7 @@ $code.=<<___;
 
 	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
 	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
+	it		ne
 	bne		.Loop_v8
 
 	vst1.32		{$ABCD,$EFGH},[$ctx]
@@ -619,11 +663,19 @@ ___
 $code.=<<___;
 .asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
 .align	2
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
 .comm   OPENSSL_armcap_P,4,4
 #endif
 ___
 
+open SELF,$0;
+while(<SELF>) {
+	next if (/^#!/);
+	last if (!s/^#/@/ and !/^$/);
+	print;
+}
+close SELF;
+
 {   my  %opcode = (
 	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
 	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);
@@ -638,7 +690,7 @@ ___
 	    # since ARMv7 instructions are always encoded little-endian.
 	    # correct solution is to use .inst directive, but older
 	    # assemblers don't implement it:-(
-	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
 			$word&0xff,($word>>8)&0xff,
 			($word>>16)&0xff,($word>>24)&0xff,
 			$mnemonic,$arg;
diff --git a/openssl/crypto/sha/asm/sha512-armv8.pl b/openssl/crypto/sha/asm/sha512-armv8.pl
index bd7a0a566..f7b36b986 100755
--- a/openssl/crypto/sha/asm/sha512-armv8.pl
+++ b/openssl/crypto/sha/asm/sha512-armv8.pl
@@ -14,8 +14,10 @@
 #
 #		SHA256-hw	SHA256(*)	SHA512
 # Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
-# Cortex-A53	2.38		15.6 (+110%)	10.1 (+190%(***))
+# Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
 # Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
+# Denver	2.01		10.5 (+26%)	6.70 (+8%)
+# X-Gene			20.0 (+100%)	12.8 (+300%(***))
 # 
 # (*)	Software SHA256 results are of lesser relevance, presented
 #	mostly for informational purposes.
@@ -25,7 +27,7 @@
 # (***)	Super-impressive coefficients over gcc-generated code are
 #	indication of some compiler "pathology", most notably code
 #	generated with -mgeneral-regs-only is significanty faster
-#	and lags behind assembly only by 50-90%.
+#	and the gap is only 40-90%.
 
 $flavour=shift;
 $output=shift;