diff options
author | marha <marha@users.sourceforge.net> | 2009-06-28 22:07:26 +0000 |
---|---|---|
committer | marha <marha@users.sourceforge.net> | 2009-06-28 22:07:26 +0000 |
commit | 3562e78743202e43aec8727005182a2558117eca (patch) | |
tree | 8f9113a77d12470c5c851a2a8e4cb02e89df7d43 /openssl/crypto/rc4/asm/rc4-586.pl | |
download | vcxsrv-3562e78743202e43aec8727005182a2558117eca.tar.gz vcxsrv-3562e78743202e43aec8727005182a2558117eca.tar.bz2 vcxsrv-3562e78743202e43aec8727005182a2558117eca.zip |
Checked in the following released items:
xkeyboard-config-1.4.tar.gz
ttf-bitstream-vera-1.10.tar.gz
font-alias-1.0.1.tar.gz
font-sun-misc-1.0.0.tar.gz
font-sun-misc-1.0.0.tar.gz
font-sony-misc-1.0.0.tar.gz
font-schumacher-misc-1.0.0.tar.gz
font-mutt-misc-1.0.0.tar.gz
font-misc-misc-1.0.0.tar.gz
font-misc-meltho-1.0.0.tar.gz
font-micro-misc-1.0.0.tar.gz
font-jis-misc-1.0.0.tar.gz
font-isas-misc-1.0.0.tar.gz
font-dec-misc-1.0.0.tar.gz
font-daewoo-misc-1.0.0.tar.gz
font-cursor-misc-1.0.0.tar.gz
font-arabic-misc-1.0.0.tar.gz
font-winitzki-cyrillic-1.0.0.tar.gz
font-misc-cyrillic-1.0.0.tar.gz
font-cronyx-cyrillic-1.0.0.tar.gz
font-screen-cyrillic-1.0.1.tar.gz
font-xfree86-type1-1.0.1.tar.gz
font-adobe-utopia-type1-1.0.1.tar.gz
font-ibm-type1-1.0.0.tar.gz
font-bitstream-type1-1.0.0.tar.gz
font-bitstream-speedo-1.0.0.tar.gz
font-bh-ttf-1.0.0.tar.gz
font-bh-type1-1.0.0.tar.gz
font-bitstream-100dpi-1.0.0.tar.gz
font-bh-lucidatypewriter-100dpi-1.0.0.tar.gz
font-bh-100dpi-1.0.0.tar.gz
font-adobe-utopia-100dpi-1.0.1.tar.gz
font-adobe-100dpi-1.0.0.tar.gz
font-util-1.0.1.tar.gz
font-bitstream-75dpi-1.0.0.tar.gz
font-bh-lucidatypewriter-75dpi-1.0.0.tar.gz
font-adobe-utopia-75dpi-1.0.1.tar.gz
font-bh-75dpi-1.0.0.tar.gz
bdftopcf-1.0.1.tar.gz
font-adobe-75dpi-1.0.0.tar.gz
mkfontscale-1.0.6.tar.gz
openssl-0.9.8k.tar.gz
bigreqsproto-1.0.2.tar.gz
xtrans-1.2.2.tar.gz
resourceproto-1.0.2.tar.gz
inputproto-1.4.4.tar.gz
compositeproto-0.4.tar.gz
damageproto-1.1.0.tar.gz
zlib-1.2.3.tar.gz
xkbcomp-1.0.5.tar.gz
freetype-2.3.9.tar.gz
pthreads-w32-2-8-0-release.tar.gz
pixman-0.12.0.tar.gz
kbproto-1.0.3.tar.gz
evieext-1.0.2.tar.gz
fixesproto-4.0.tar.gz
recordproto-1.13.2.tar.gz
randrproto-1.2.2.tar.gz
scrnsaverproto-1.1.0.tar.gz
renderproto-0.9.3.tar.gz
xcmiscproto-1.1.2.tar.gz
fontsproto-2.0.2.tar.gz
xextproto-7.0.3.tar.gz
xproto-7.0.14.tar.gz
libXdmcp-1.0.2.tar.gz
libxkbfile-1.0.5.tar.gz
libfontenc-1.0.4.tar.gz
libXfont-1.3.4.tar.gz
libX11-1.1.5.tar.gz
libXau-1.0.4.tar.gz
libxcb-1.1.tar.gz
xorg-server-1.5.3.tar.gz
Diffstat (limited to 'openssl/crypto/rc4/asm/rc4-586.pl')
-rw-r--r-- | openssl/crypto/rc4/asm/rc4-586.pl | 230 |
1 files changed, 230 insertions, 0 deletions
diff --git a/openssl/crypto/rc4/asm/rc4-586.pl b/openssl/crypto/rc4/asm/rc4-586.pl new file mode 100644 index 000000000..ef7eee766 --- /dev/null +++ b/openssl/crypto/rc4/asm/rc4-586.pl @@ -0,0 +1,230 @@ +#!/usr/local/bin/perl + +# At some point it became apparent that the original SSLeay RC4 +# assembler implementation performs suboptimaly on latest IA-32 +# microarchitectures. After re-tuning performance has changed as +# following: +# +# Pentium +0% +# Pentium III +17% +# AMD +52%(*) +# P4 +180%(**) +# +# (*) This number is actually a trade-off:-) It's possible to +# achieve +72%, but at the cost of -48% off PIII performance. +# In other words code performing further 13% faster on AMD +# would perform almost 2 times slower on Intel PIII... +# For reference! This code delivers ~80% of rc4-amd64.pl +# performance on the same Opteron machine. +# (**) This number requires compressed key schedule set up by +# RC4_set_key and therefore doesn't apply to 0.9.7 [option for +# compressed key schedule is implemented in 0.9.8 and later, +# see commentary section in rc4_skey.c for further details]. +# +# <appro@fy.chalmers.se> + +push(@INC,"perlasm","../../perlasm"); +require "x86asm.pl"; + +&asm_init($ARGV[0],"rc4-586.pl"); + +$x="eax"; +$y="ebx"; +$tx="ecx"; +$ty="edx"; +$in="esi"; +$out="edi"; +$d="ebp"; + +&RC4("RC4"); + +&asm_finish(); + +sub RC4_loop + { + local($n,$p,$char)=@_; + + &comment("Round $n"); + + if ($char) + { + if ($p >= 0) + { + &mov($ty, &swtmp(2)); + &cmp($ty, $in); + &jbe(&label("finished")); + &inc($in); + } + else + { + &add($ty, 8); + &inc($in); + &cmp($ty, $in); + &jb(&label("finished")); + &mov(&swtmp(2), $ty); + } + } + # Moved out + # &mov( $tx, &DWP(0,$d,$x,4)) if $p < 0; + + &add( &LB($y), &LB($tx)); + &mov( $ty, &DWP(0,$d,$y,4)); + # XXX + &mov( &DWP(0,$d,$x,4),$ty); + &add( $ty, $tx); + &mov( &DWP(0,$d,$y,4),$tx); + &and( $ty, 0xff); + &inc( &LB($x)); # NEXT ROUND + &mov( $tx, &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND + &mov( $ty, &DWP(0,$d,$ty,4)); + + if (!$char) + { + #moved up into last round + if ($p >= 1) + { + &add( $out, 8) + } + &movb( &BP($n,"esp","",0), &LB($ty)); + } + else + { + # Note in+=8 has occured + &movb( &HB($ty), &BP(-1,$in,"",0)); + # XXX + &xorb(&LB($ty), &HB($ty)); + # XXX + &movb(&BP($n,$out,"",0),&LB($ty)); + } + } + + +sub RC4 + { + local($name)=@_; + + &function_begin_B($name,""); + + &mov($ty,&wparam(1)); # len + &cmp($ty,0); + &jne(&label("proceed")); + &ret(); + &set_label("proceed"); + + &comment(""); + + &push("ebp"); + &push("ebx"); + &push("esi"); + &xor( $x, $x); # avoid partial register stalls + &push("edi"); + &xor( $y, $y); # avoid partial register stalls + &mov( $d, &wparam(0)); # key + &mov( $in, &wparam(2)); + + &movb( &LB($x), &BP(0,$d,"",1)); + &movb( &LB($y), &BP(4,$d,"",1)); + + &mov( $out, &wparam(3)); + &inc( &LB($x)); + + &stack_push(3); # 3 temp variables + &add( $d, 8); + + # detect compressed schedule, see commentary section in rc4_skey.c... + # in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant, + # as compressed key schedule is set up in 0.9.8 and later. + &cmp(&DWP(256,$d),-1); + &je(&label("RC4_CHAR")); + + &lea( $ty, &DWP(-8,$ty,$in)); + + # check for 0 length input + + &mov( &swtmp(2), $ty); # this is now address to exit at + &mov( $tx, &DWP(0,$d,$x,4)); + + &cmp( $ty, $in); + &jb( &label("end")); # less than 8 bytes + + &set_label("start"); + + # filling DELAY SLOT + &add( $in, 8); + + &RC4_loop(0,-1,0); + &RC4_loop(1,0,0); + &RC4_loop(2,0,0); + &RC4_loop(3,0,0); + &RC4_loop(4,0,0); + &RC4_loop(5,0,0); + &RC4_loop(6,0,0); + &RC4_loop(7,1,0); + + &comment("apply the cipher text"); + # xor the cipher data with input + + #&add( $out, 8); #moved up into last round + + &mov( $tx, &swtmp(0)); + &mov( $ty, &DWP(-8,$in,"",0)); + &xor( $tx, $ty); + &mov( $ty, &DWP(-4,$in,"",0)); + &mov( &DWP(-8,$out,"",0), $tx); + &mov( $tx, &swtmp(1)); + &xor( $tx, $ty); + &mov( $ty, &swtmp(2)); # load end ptr; + &mov( &DWP(-4,$out,"",0), $tx); + &mov( $tx, &DWP(0,$d,$x,4)); + &cmp($in, $ty); + &jbe(&label("start")); + + &set_label("end"); + + # There is quite a bit of extra crap in RC4_loop() for this + # first round + &RC4_loop(0,-1,1); + &RC4_loop(1,0,1); + &RC4_loop(2,0,1); + &RC4_loop(3,0,1); + &RC4_loop(4,0,1); + &RC4_loop(5,0,1); + &RC4_loop(6,1,1); + + &jmp(&label("finished")); + + &align(16); + # this is essentially Intel P4 specific codepath, see rc4_skey.c, + # and is engaged in 0.9.8 and later context... + &set_label("RC4_CHAR"); + + &lea ($ty,&DWP(0,$in,$ty)); + &mov (&swtmp(2),$ty); + &movz ($tx,&BP(0,$d,$x)); + + # strangely enough unrolled loop performs over 20% slower... + &set_label("RC4_CHAR_loop"); + &add (&LB($y),&LB($tx)); + &movz ($ty,&BP(0,$d,$y)); + &movb (&BP(0,$d,$y),&LB($tx)); + &movb (&BP(0,$d,$x),&LB($ty)); + &add (&LB($ty),&LB($tx)); + &movz ($ty,&BP(0,$d,$ty)); + &add (&LB($x),1); + &xorb (&LB($ty),&BP(0,$in)); + &lea ($in,&DWP(1,$in)); + &movz ($tx,&BP(0,$d,$x)); + &cmp ($in,&swtmp(2)); + &movb (&BP(0,$out),&LB($ty)); + &lea ($out,&DWP(1,$out)); + &jb (&label("RC4_CHAR_loop")); + + &set_label("finished"); + &dec( $x); + &stack_pop(3); + &movb( &BP(-4,$d,"",0),&LB($y)); + &movb( &BP(-8,$d,"",0),&LB($x)); + + &function_end($name); + } + |