diff options
author | marha <marha@users.sourceforge.net> | 2011-01-23 20:17:10 +0000 |
---|---|---|
committer | marha <marha@users.sourceforge.net> | 2011-01-23 20:17:10 +0000 |
commit | 1b639dab951fb73d5031aa0c1afb13e8480d1dae (patch) | |
tree | 4e13a4d1e4c609bd01d0bf5d65c6a8acdc3d0c50 /openssl/crypto/sha/asm | |
parent | 51181fb7f4d135d214974bb6611d51f21475eec8 (diff) | |
parent | b680cf39ed5bc37e0eb7eb86ad8599bf92df3f2b (diff) | |
download | vcxsrv-1b639dab951fb73d5031aa0c1afb13e8480d1dae.tar.gz vcxsrv-1b639dab951fb73d5031aa0c1afb13e8480d1dae.tar.bz2 vcxsrv-1b639dab951fb73d5031aa0c1afb13e8480d1dae.zip |
svn merge ^/branches/released .
Diffstat (limited to 'openssl/crypto/sha/asm')
-rw-r--r-- | openssl/crypto/sha/asm/sha1-armv4-large.pl | 76 | ||||
-rw-r--r-- | openssl/crypto/sha/asm/sha1-sparcv9.pl | 1 | ||||
-rw-r--r-- | openssl/crypto/sha/asm/sha1-sparcv9a.pl | 1 | ||||
-rw-r--r-- | openssl/crypto/sha/asm/sha256-armv4.pl | 33 | ||||
-rw-r--r-- | openssl/crypto/sha/asm/sha512-armv4.pl | 32 | ||||
-rw-r--r-- | openssl/crypto/sha/asm/sha512-sparcv9.pl | 1 |
6 files changed, 75 insertions, 69 deletions
diff --git a/openssl/crypto/sha/asm/sha1-armv4-large.pl b/openssl/crypto/sha/asm/sha1-armv4-large.pl index 88861af64..6e65fe3e0 100644 --- a/openssl/crypto/sha/asm/sha1-armv4-large.pl +++ b/openssl/crypto/sha/asm/sha1-armv4-large.pl @@ -37,9 +37,18 @@ # modes are limited. As result it takes more instructions to do # the same job in Thumb, therefore the code is never twice as # small and always slower. -# [***] which is also ~35% better than compiler generated code. +# [***] which is also ~35% better than compiler generated code. Dual- +# issue Cortex A8 core was measured to process input block in +# ~990 cycles. -$output=shift; +# August 2010. +# +# Rescheduling for dual-issue pipeline resulted in 13% improvement on +# Cortex A8 core and in absolute terms ~870 cycles per input block +# [or 13.6 cycles per byte]. + + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; $ctx="r0"; @@ -58,43 +67,22 @@ $t3="r12"; $Xi="r14"; @V=($a,$b,$c,$d,$e); -# One can optimize this for aligned access on big-endian architecture, -# but code's endian neutrality makes it too pretty:-) -sub Xload { -my ($a,$b,$c,$d,$e)=@_; -$code.=<<___; - ldrb $t0,[$inp],#4 - ldrb $t1,[$inp,#-3] - ldrb $t2,[$inp,#-2] - ldrb $t3,[$inp,#-1] - add $e,$K,$e,ror#2 @ E+=K_00_19 - orr $t0,$t1,$t0,lsl#8 - add $e,$e,$a,ror#27 @ E+=ROR(A,27) - orr $t0,$t2,$t0,lsl#8 - eor $t1,$c,$d @ F_xx_xx - orr $t0,$t3,$t0,lsl#8 - add $e,$e,$t0 @ E+=X[i] - str $t0,[$Xi,#-4]! -___ -} sub Xupdate { -my ($a,$b,$c,$d,$e,$flag)=@_; +my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_; $code.=<<___; ldr $t0,[$Xi,#15*4] ldr $t1,[$Xi,#13*4] ldr $t2,[$Xi,#7*4] - ldr $t3,[$Xi,#2*4] add $e,$K,$e,ror#2 @ E+=K_xx_xx + ldr $t3,[$Xi,#2*4] eor $t0,$t0,$t1 - eor $t0,$t0,$t2 - eor $t0,$t0,$t3 - add $e,$e,$a,ror#27 @ E+=ROR(A,27) -___ -$code.=<<___ if (!defined($flag)); - eor $t1,$c,$d @ F_xx_xx, but not in 40_59 -___ -$code.=<<___; + eor $t2,$t2,$t3 + eor $t1,$c,$d @ F_xx_xx mov $t0,$t0,ror#31 + add $e,$e,$a,ror#27 @ E+=ROR(A,27) + eor $t0,$t0,$t2,ror#31 + $opt1 @ F_xx_xx + $opt2 @ F_xx_xx add $e,$e,$t0 @ E+=X[i] str $t0,[$Xi,#-4]! ___ @@ -102,19 +90,29 @@ ___ sub BODY_00_15 { my ($a,$b,$c,$d,$e)=@_; - &Xload(@_); $code.=<<___; + ldrb $t0,[$inp],#4 + ldrb $t1,[$inp,#-1] + ldrb $t2,[$inp,#-2] + add $e,$K,$e,ror#2 @ E+=K_00_19 + ldrb $t3,[$inp,#-3] + add $e,$e,$a,ror#27 @ E+=ROR(A,27) + orr $t0,$t1,$t0,lsl#24 + eor $t1,$c,$d @ F_xx_xx + orr $t0,$t0,$t2,lsl#8 + orr $t0,$t0,$t3,lsl#16 and $t1,$b,$t1,ror#2 + add $e,$e,$t0 @ E+=X[i] eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) + str $t0,[$Xi,#-4]! add $e,$e,$t1 @ E+=F_00_19(B,C,D) ___ } sub BODY_16_19 { my ($a,$b,$c,$d,$e)=@_; - &Xupdate(@_); + &Xupdate(@_,"and $t1,$b,$t1,ror#2"); $code.=<<___; - and $t1,$b,$t1,ror#2 eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) add $e,$e,$t1 @ E+=F_00_19(B,C,D) ___ @@ -122,22 +120,18 @@ ___ sub BODY_20_39 { my ($a,$b,$c,$d,$e)=@_; - &Xupdate(@_); + &Xupdate(@_,"eor $t1,$b,$t1,ror#2"); $code.=<<___; - eor $t1,$b,$t1,ror#2 @ F_20_39(B,C,D) add $e,$e,$t1 @ E+=F_20_39(B,C,D) ___ } sub BODY_40_59 { my ($a,$b,$c,$d,$e)=@_; - &Xupdate(@_,1); + &Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d"); $code.=<<___; - and $t1,$b,$c,ror#2 - orr $t2,$b,$c,ror#2 - and $t2,$t2,$d,ror#2 - orr $t1,$t1,$t2 @ F_40_59(B,C,D) add $e,$e,$t1 @ E+=F_40_59(B,C,D) + add $e,$e,$t2,ror#2 ___ } diff --git a/openssl/crypto/sha/asm/sha1-sparcv9.pl b/openssl/crypto/sha/asm/sha1-sparcv9.pl index 8306fc88c..5c161cecd 100644 --- a/openssl/crypto/sha/asm/sha1-sparcv9.pl +++ b/openssl/crypto/sha/asm/sha1-sparcv9.pl @@ -276,6 +276,7 @@ $code.=<<___; .type sha1_block_data_order,#function .size sha1_block_data_order,(.-sha1_block_data_order) .asciz "SHA1 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" +.align 4 ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/openssl/crypto/sha/asm/sha1-sparcv9a.pl b/openssl/crypto/sha/asm/sha1-sparcv9a.pl index 15eb854ba..85e8d6808 100644 --- a/openssl/crypto/sha/asm/sha1-sparcv9a.pl +++ b/openssl/crypto/sha/asm/sha1-sparcv9a.pl @@ -539,6 +539,7 @@ $code.=<<___; .type sha1_block_data_order,#function .size sha1_block_data_order,(.-sha1_block_data_order) .asciz "SHA1 block transform for SPARCv9a, CRYPTOGAMS by <appro\@openssl.org>" +.align 4 ___ # Purpose of these subroutines is to explicitly encode VIS instructions, diff --git a/openssl/crypto/sha/asm/sha256-armv4.pl b/openssl/crypto/sha/asm/sha256-armv4.pl index 48d846dee..492cb62bc 100644 --- a/openssl/crypto/sha/asm/sha256-armv4.pl +++ b/openssl/crypto/sha/asm/sha256-armv4.pl @@ -11,9 +11,14 @@ # Performance is ~2x better than gcc 3.4 generated code and in "abso- # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per -# byte. +# byte [on single-issue Xscale PXA250 core]. -$output=shift; +# July 2010. +# +# Rescheduling for dual-issue pipeline resulted in 22% improvement on +# Cortex A8 core and ~20 cycles per processed byte. + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; $ctx="r0"; $t0="r0"; @@ -52,27 +57,27 @@ $code.=<<___ if ($i<16); ___ $code.=<<___; ldr $t2,[$Ktbl],#4 @ *K256++ - str $T1,[sp,#`$i%16`*4] mov $t0,$e,ror#$Sigma1[0] + str $T1,[sp,#`$i%16`*4] eor $t0,$t0,$e,ror#$Sigma1[1] - eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e) - add $T1,$T1,$t0 eor $t1,$f,$g + eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e) and $t1,$t1,$e + add $T1,$T1,$t0 eor $t1,$t1,$g @ Ch(e,f,g) - add $T1,$T1,$t1 add $T1,$T1,$h - add $T1,$T1,$t2 mov $h,$a,ror#$Sigma0[0] + add $T1,$T1,$t1 eor $h,$h,$a,ror#$Sigma0[1] + add $T1,$T1,$t2 eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a) orr $t0,$a,$b - and $t0,$t0,$c and $t1,$a,$b + and $t0,$t0,$c + add $h,$h,$T1 orr $t0,$t0,$t1 @ Maj(a,b,c) - add $h,$h,$t0 add $d,$d,$T1 - add $h,$h,$T1 + add $h,$h,$t0 ___ } @@ -80,19 +85,19 @@ sub BODY_16_XX { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; $code.=<<___; - ldr $t1,[sp,#`($i+1)%16`*4] @ $i + ldr $t1,[sp,#`($i+1)%16`*4] @ $i ldr $t2,[sp,#`($i+14)%16`*4] ldr $T1,[sp,#`($i+0)%16`*4] - ldr $inp,[sp,#`($i+9)%16`*4] mov $t0,$t1,ror#$sigma0[0] + ldr $inp,[sp,#`($i+9)%16`*4] eor $t0,$t0,$t1,ror#$sigma0[1] eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) mov $t1,$t2,ror#$sigma1[0] + add $T1,$T1,$t0 eor $t1,$t1,$t2,ror#$sigma1[1] + add $T1,$T1,$inp eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14]) - add $T1,$T1,$t0 add $T1,$T1,$t1 - add $T1,$T1,$inp ___ &BODY_00_15(@_); } diff --git a/openssl/crypto/sha/asm/sha512-armv4.pl b/openssl/crypto/sha/asm/sha512-armv4.pl index 4fbb94a91..3a35861ac 100644 --- a/openssl/crypto/sha/asm/sha512-armv4.pl +++ b/openssl/crypto/sha/asm/sha512-armv4.pl @@ -10,7 +10,13 @@ # SHA512 block procedure for ARMv4. September 2007. # This code is ~4.5 (four and a half) times faster than code generated -# by gcc 3.4 and it spends ~72 clock cycles per byte. +# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue +# Xscale PXA250 core]. +# +# July 2010. +# +# Rescheduling for dual-issue pipeline resulted in 6% improvement on +# Cortex A8 core and ~40 cycles per processed byte. # Byte order [in]dependence. ========================================= # @@ -22,7 +28,7 @@ $hi=0; $lo=4; # ==================================================================== -$output=shift; +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; $ctx="r0"; @@ -73,33 +79,31 @@ $code.=<<___; eor $t0,$t0,$Elo,lsl#23 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e) adds $Tlo,$Tlo,$t0 - adc $Thi,$Thi,$t1 @ T += Sigma1(e) - adds $Tlo,$Tlo,$t2 - adc $Thi,$Thi,$t3 @ T += h - ldr $t0,[sp,#$Foff+0] @ f.lo + adc $Thi,$Thi,$t1 @ T += Sigma1(e) ldr $t1,[sp,#$Foff+4] @ f.hi + adds $Tlo,$Tlo,$t2 ldr $t2,[sp,#$Goff+0] @ g.lo + adc $Thi,$Thi,$t3 @ T += h ldr $t3,[sp,#$Goff+4] @ g.hi - str $Elo,[sp,#$Eoff+0] - str $Ehi,[sp,#$Eoff+4] - str $Alo,[sp,#$Aoff+0] - str $Ahi,[sp,#$Aoff+4] eor $t0,$t0,$t2 + str $Elo,[sp,#$Eoff+0] eor $t1,$t1,$t3 + str $Ehi,[sp,#$Eoff+4] and $t0,$t0,$Elo + str $Alo,[sp,#$Aoff+0] and $t1,$t1,$Ehi + str $Ahi,[sp,#$Aoff+4] eor $t0,$t0,$t2 - eor $t1,$t1,$t3 @ Ch(e,f,g) - ldr $t2,[$Ktbl,#4] @ K[i].lo + eor $t1,$t1,$t3 @ Ch(e,f,g) ldr $t3,[$Ktbl,#0] @ K[i].hi - ldr $Elo,[sp,#$Doff+0] @ d.lo - ldr $Ehi,[sp,#$Doff+4] @ d.hi adds $Tlo,$Tlo,$t0 + ldr $Elo,[sp,#$Doff+0] @ d.lo adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) + ldr $Ehi,[sp,#$Doff+4] @ d.hi adds $Tlo,$Tlo,$t2 adc $Thi,$Thi,$t3 @ T += K[i] adds $Elo,$Elo,$Tlo diff --git a/openssl/crypto/sha/asm/sha512-sparcv9.pl b/openssl/crypto/sha/asm/sha512-sparcv9.pl index 54241aab5..ec5d78135 100644 --- a/openssl/crypto/sha/asm/sha512-sparcv9.pl +++ b/openssl/crypto/sha/asm/sha512-sparcv9.pl @@ -586,6 +586,7 @@ $code.=<<___; .type sha${label}_block_data_order,#function .size sha${label}_block_data_order,(.-sha${label}_block_data_order) .asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" +.align 4 ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; |