diff options
Diffstat (limited to 'openssl/crypto/rc4/asm/rc4-parisc.pl')
-rw-r--r-- | openssl/crypto/rc4/asm/rc4-parisc.pl | 313 |
1 files changed, 313 insertions, 0 deletions
diff --git a/openssl/crypto/rc4/asm/rc4-parisc.pl b/openssl/crypto/rc4/asm/rc4-parisc.pl new file mode 100644 index 000000000..916506708 --- /dev/null +++ b/openssl/crypto/rc4/asm/rc4-parisc.pl @@ -0,0 +1,313 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# RC4 for PA-RISC. + +# June 2009. +# +# Performance is 33% better than gcc 3.2 generated code on PA-7100LC. +# For reference, [4x] unrolled loop is >40% faster than folded one. +# It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement +# is believed to be not sufficient to justify the effort... +# +# Special thanks to polarhome.com for providing HP-UX account. + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + +$flavour = shift; +$output = shift; +open STDOUT,">$output"; + +if ($flavour =~ /64/) { + $LEVEL ="2.0W"; + $SIZE_T =8; + $FRAME_MARKER =80; + $SAVED_RP =16; + $PUSH ="std"; + $PUSHMA ="std,ma"; + $POP ="ldd"; + $POPMB ="ldd,mb"; +} else { + $LEVEL ="1.0"; + $SIZE_T =4; + $FRAME_MARKER =48; + $SAVED_RP =20; + $PUSH ="stw"; + $PUSHMA ="stwm"; + $POP ="ldw"; + $POPMB ="ldwm"; +} + +$FRAME=4*$SIZE_T+$FRAME_MARKER; # 4 saved regs + frame marker + # [+ argument transfer] +$SZ=1; # defaults to RC4_CHAR +if (open CONF,"<${dir}../../opensslconf.h") { + while(<CONF>) { + if (m/#\s*define\s+RC4_INT\s+(.*)/) { + $SZ = ($1=~/char$/) ? 1 : 4; + last; + } + } + close CONF; +} + +if ($SZ==1) { # RC4_CHAR + $LD="ldb"; + $LDX="ldbx"; + $MKX="addl"; + $ST="stb"; +} else { # RC4_INT (~5% faster than RC4_CHAR on PA-7100LC) + $LD="ldw"; + $LDX="ldwx,s"; + $MKX="sh2addl"; + $ST="stw"; +} + +$key="%r26"; +$len="%r25"; +$inp="%r24"; +$out="%r23"; + +@XX=("%r19","%r20"); +@TX=("%r21","%r22"); +$YY="%r28"; +$TY="%r29"; + +$acc="%r1"; +$ix="%r2"; +$iy="%r3"; +$dat0="%r4"; +$dat1="%r5"; +$rem="%r6"; +$mask="%r31"; + +sub unrolledloopbody { +for ($i=0;$i<4;$i++) { +$code.=<<___; + ldo 1($XX[0]),$XX[1] + `sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)` + and $mask,$XX[1],$XX[1] + $LDX $YY($key),$TY + $MKX $YY,$key,$ix + $LDX $XX[1]($key),$TX[1] + $MKX $XX[0],$key,$iy + $ST $TX[0],0($ix) + comclr,<> $XX[1],$YY,%r0 ; conditional + copy $TX[0],$TX[1] ; move + `sprintf("%sdep %$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)` + $ST $TY,0($iy) + addl $TX[0],$TY,$TY + addl $TX[1],$YY,$YY + and $mask,$TY,$TY + and $mask,$YY,$YY +___ +push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers +} } + +sub foldedloop { +my ($label,$count)=@_; +$code.=<<___; +$label + $MKX $YY,$key,$iy + $LDX $YY($key),$TY + $MKX $XX[0],$key,$ix + $ST $TX[0],0($iy) + ldo 1($XX[0]),$XX[0] + $ST $TY,0($ix) + addl $TX[0],$TY,$TY + ldbx $inp($out),$dat1 + and $mask,$TY,$TY + and $mask,$XX[0],$XX[0] + $LDX $TY($key),$acc + $LDX $XX[0]($key),$TX[0] + ldo 1($out),$out + xor $dat1,$acc,$acc + addl $TX[0],$YY,$YY + stb $acc,-1($out) + addib,<> -1,$count,$label ; $count is always small + and $mask,$YY,$YY +___ +} + +$code=<<___; + .LEVEL $LEVEL + .SPACE \$TEXT\$ + .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY + + .EXPORT RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR +RC4 + .PROC + .CALLINFO FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6 + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue + $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) + + cmpib,*= 0,$len,L\$abort + sub $inp,$out,$inp ; distance between $inp and $out + + $LD `0*$SZ`($key),$XX[0] + $LD `1*$SZ`($key),$YY + ldo `2*$SZ`($key),$key + + ldi 0xff,$mask + ldi 3,$dat0 + + ldo 1($XX[0]),$XX[0] ; warm up loop + and $mask,$XX[0],$XX[0] + $LDX $XX[0]($key),$TX[0] + addl $TX[0],$YY,$YY + cmpib,*>>= 6,$len,L\$oop1 ; is $len large enough to bother? + and $mask,$YY,$YY + + and,<> $out,$dat0,$rem ; is $out aligned? + b L\$alignedout + subi 4,$rem,$rem + sub $len,$rem,$len +___ +&foldedloop("L\$alignout",$rem); # process till $out is aligned + +$code.=<<___; +L\$alignedout ; $len is at least 4 here + and,<> $inp,$dat0,$acc ; is $inp aligned? + b L\$oop4 + sub $inp,$acc,$rem ; align $inp + + sh3addl $acc,%r0,$acc + subi 32,$acc,$acc + mtctl $acc,%cr11 ; load %sar with vshd align factor + ldwx $rem($out),$dat0 + ldo 4($rem),$rem +L\$oop4misalignedinp +___ +&unrolledloopbody(); +$code.=<<___; + $LDX $TY($key),$ix + ldwx $rem($out),$dat1 + ldo -4($len),$len + or $ix,$acc,$acc ; last piece, no need to dep + vshd $dat0,$dat1,$iy ; align data + copy $dat1,$dat0 + xor $iy,$acc,$acc + stw $acc,0($out) + cmpib,*<< 3,$len,L\$oop4misalignedinp + ldo 4($out),$out + cmpib,*= 0,$len,L\$done + nop + b L\$oop1 + nop + + .ALIGN 8 +L\$oop4 +___ +&unrolledloopbody(); +$code.=<<___; + $LDX $TY($key),$ix + ldwx $inp($out),$dat0 + ldo -4($len),$len + or $ix,$acc,$acc ; last piece, no need to dep + xor $dat0,$acc,$acc + stw $acc,0($out) + cmpib,*<< 3,$len,L\$oop4 + ldo 4($out),$out + cmpib,*= 0,$len,L\$done + nop +___ +&foldedloop("L\$oop1",$len); +$code.=<<___; +L\$done + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 + ldo -1($XX[0]),$XX[0] ; chill out loop + sub $YY,$TX[0],$YY + and $mask,$XX[0],$XX[0] + and $mask,$YY,$YY + $ST $XX[0],`-2*$SZ`($key) + $ST $YY,`-1*$SZ`($key) + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 +L\$abort + bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND +___ + +$code.=<<___; + + .EXPORT private_RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR + .ALIGN 8 +private_RC4_set_key + .PROC + .CALLINFO NO_CALLS + .ENTRY + $ST %r0,`0*$SZ`($key) + $ST %r0,`1*$SZ`($key) + ldo `2*$SZ`($key),$key + copy %r0,@XX[0] +L\$1st + $ST @XX[0],0($key) + ldo 1(@XX[0]),@XX[0] + bb,>= @XX[0],`31-8`,L\$1st ; @XX[0]<256 + ldo $SZ($key),$key + + ldo `-256*$SZ`($key),$key ; rewind $key + addl $len,$inp,$inp ; $inp to point at the end + sub %r0,$len,%r23 ; inverse index + copy %r0,@XX[0] + copy %r0,@XX[1] + ldi 0xff,$mask + +L\$2nd + $LDX @XX[0]($key),@TX[0] + ldbx %r23($inp),@TX[1] + addi,nuv 1,%r23,%r23 ; increment and conditional + sub %r0,$len,%r23 ; inverse index + addl @TX[0],@XX[1],@XX[1] + addl @TX[1],@XX[1],@XX[1] + and $mask,@XX[1],@XX[1] + $MKX @XX[0],$key,$TY + $LDX @XX[1]($key),@TX[1] + $MKX @XX[1],$key,$YY + ldo 1(@XX[0]),@XX[0] + $ST @TX[0],0($YY) + bb,>= @XX[0],`31-8`,L\$2nd ; @XX[0]<256 + $ST @TX[1],0($TY) + + bv,n (%r2) + .EXIT + nop + .PROCEND + + .EXPORT RC4_options,ENTRY + .ALIGN 8 +RC4_options + .PROC + .CALLINFO NO_CALLS + .ENTRY + blr %r0,%r28 + ldi 3,%r1 +L\$pic + andcm %r28,%r1,%r28 + bv (%r2) + .EXIT + ldo L\$opts-L\$pic(%r28),%r28 + .PROCEND + .ALIGN 8 +L\$opts + .STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)" + .STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" +___ +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4); + +print $code; +close STDOUT; |