aboutsummaryrefslogtreecommitdiff
path: root/openssl/crypto/sha/asm/sha1-ppc.pl
blob: dcd0fcdfcfa20d8c6d32f9604fd3d5b7d6f9d68b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# I let hardware handle unaligned input(*), except on page boundaries
# (see below for details). Otherwise this is a straightforward
# implementation with the X vector kept in the register bank. The module
# is big-endian [which is not a big deal, as there are no little-endian
# targets left around].
#
# (*) Does this mean that this module is inappropriate for PPC403? Does
#     anybody know if pre-POWER3 can sustain unaligned loads?

# 			-m64	-m32
# ----------------------------------
# PPC970,gcc-4.0.0	+76%	+59%
# Power6,xlc-7		+68%	+33%

$flavour = shift;

# Word-size dependent parameters, probed in the order listed, so a
# flavour that matches both patterns is treated as 64-bit. Columns:
# pattern, $SIZE_T, $UCMP, $STU, $POP, $PUSH.
my @abi_table = (
	[ qr/64/, 8, "cmpld", "stdu", "ld",  "std" ],
	[ qr/32/, 4, "cmplw", "stwu", "lwz", "stw" ],
);
my ($abi) = grep { $flavour =~ $_->[0] } @abi_table;
die "nonsense $flavour" unless $abi;
(undef, $SIZE_T, $UCMP, $STU, $POP, $PUSH) = @$abi;

# Locate the perlasm translator, first next to this script and then in
# ../../perlasm relative to it. $dir keeps its trailing separator; when
# $0 carries no directory part it is empty, so the lookup is relative
# to the current directory.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Pipe everything printed to STDOUT through the translator; the next
# command-line argument is handed to it as its last argument. The
# parentheses matter: with "... ".shift || die the || applied to the
# command string, which is always true, so a failed open went unnoticed.
open(STDOUT,"| $^X $xlate $flavour ".shift(@ARGV)) or die "can't call $xlate: $!";

# Size of the register save frame: 24 pointer-sized slots.
$FRAME = $SIZE_T * 24;

# Fixed register roles. $K receives the round constant, $ctx/$inp/$num
# are the context pointer, input pointer and block count, and $t0/$t1
# are scratch registers.
($K, $sp, $toc, $ctx, $inp, $num) = map { "r$_" } (0 .. 5);
($t1, $t0) = ("r6", "r15");

# Five working variables plus one spare; @V is rotated after every
# round so that the spare becomes the next round's first variable.
($A, $B, $C, $D, $E, $T) = map { "r$_" } (7 .. 12);
@V = ($A, $B, $C, $D, $E, $T);

# Sixteen-word window of the message schedule, r16 through r31.
@X = map { "r$_" } (16 .. 31);

# BODY_00_19($i,$a,$b,$c,$d,$e,$f) -- append the instructions for round
# $i of the first twenty rounds to $code.
#
# $a..$e are the names of the registers holding the five working
# variables and $f is the spare register that receives the result:
#	$f = $K + $e + X[$i%16] + rotl($a,5) + (($b & $c) | (~$b & $d))
# Side effects on the generated code's registers: $b is rotated left by
# 30 in place and $e is reused as scratch for rotl($a,5), both after
# their old values have been consumed. Nothing is returned; the text is
# accumulated in the global $code.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;	# index of the message word prepared for the next round
# Round 0 has no preceding round to fetch its word, so load X[0] here.
$code.=<<___ if ($i==0);
	lwz	@X[$i],`$i*4`($inp)
___
# Rounds 0..14: load the next input word (X[$j]) one round ahead while
# the current round is computed.
$code.=<<___ if ($i<15);
	lwz	@X[$j],`$j*4`($inp)
	add	$f,$K,$e
	rotlwi	$e,$a,5
	add	$f,$f,@X[$i]
	and	$t0,$c,$b
	add	$f,$f,$e
	andc	$t1,$d,$b
	rotlwi	$b,$b,30
	or	$t0,$t0,$t1
	add	$f,$f,$t0
___
# Rounds 15..19: all sixteen input words are loaded, so instead derive
# the next schedule word in place:
#	X[$j%16] = rotl(X[$j%16] ^ X[($j+2)%16] ^ X[($j+8)%16] ^ X[($j+13)%16], 1)
# interleaved with the round arithmetic.
$code.=<<___ if ($i>=15);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	and	$t0,$c,$b
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	andc	$t1,$d,$b
	rotlwi	$b,$b,30
	or	$t0,$t0,$t1
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	add	$f,$f,$t0
	rotlwi	@X[$j%16],@X[$j%16],1
___
}

# BODY_20_39($i,$a,$b,$c,$d,$e,$f) -- append the instructions for one
# round that mixes with $b ^ $c ^ $d to $code. The driver loops call it
# for rounds 20..39 and again for rounds 60..79; only the constant
# preloaded into $K differs between the two ranges.
#
# $a..$e name the registers holding the working variables, $f the spare
# register that receives
#	$f = $K + $e + X[$i%16] + rotl($a,5) + ($b ^ $c ^ $d)
# $b is rotated left by 30 in place and $e is reused as scratch for
# rotl($a,5). Nothing is returned; the text is accumulated in $code.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;	# index of the schedule word prepared for the next round
# Every round but the last also derives the next schedule word in place:
#	X[$j%16] = rotl(X[$j%16] ^ X[($j+2)%16] ^ X[($j+8)%16] ^ X[($j+13)%16], 1)
$code.=<<___ if ($i<79);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	xor	$t0,$b,$c
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	rotlwi	$b,$b,30
	xor	$t0,$t0,$d
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	add	$f,$f,$t0
	rotlwi	@X[$j%16],@X[$j%16],1
___
# Round 79 needs no further schedule word. The slots that the schedule
# update would occupy are used to load the five context words into
# r16..r20 (the first five @X registers), ready for the final additions
# emitted after the last round.
$code.=<<___ if ($i==79);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	lwz	r16,0($ctx)
	add	$f,$f,@X[$i%16]
	xor	$t0,$b,$c
	lwz	r17,4($ctx)
	add	$f,$f,$e
	rotlwi	$b,$b,30
	lwz	r18,8($ctx)
	xor	$t0,$t0,$d
	lwz	r19,12($ctx)
	add	$f,$f,$t0
	lwz	r20,16($ctx)
___
}

# BODY_40_59($i,$a,$b,$c,$d,$e,$f) -- append the instructions for round
# $i of rounds 40..59 to $code.
#
# $a..$e name the registers holding the working variables, $f the spare
# register that receives
#	$f = $K + $e + X[$i%16] + rotl($a,5) + (($b & $c) | (($b | $c) & $d))
# The next schedule word is derived in place alongside:
#	X[$j%16] = rotl(X[$j%16] ^ X[($j+2)%16] ^ X[($j+8)%16] ^ X[($j+13)%16], 1)
# $b is rotated left by 30 in place (after both $t0 and $t1 have read
# its old value) and $e is reused as scratch for rotl($a,5). Nothing is
# returned; the text is accumulated in $code.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;	# index of the schedule word prepared for the next round
$code.=<<___;
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	and	$t0,$b,$c
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	or	$t1,$b,$c
	rotlwi	$b,$b,30
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	and	$t1,$t1,$d
	or	$t0,$t0,$t1
	rotlwi	@X[$j%16],@X[$j%16],1
	add	$f,$f,$t0
___
}

# Public entry point. This assignment (not .=) starts the $code buffer.
#
# Prologue: allocate $FRAME+64 bytes of stack, then save the link
# register (via r0) and r15..r31 in the eighteen pointer-sized slots
# ending at $sp+$FRAME. The extra 64 bytes starting at $sp+$FRAME are
# used by the Lunaligned path as a copy buffer for one input block.
#
# The five 32-bit state words are read from offsets 0..16 of $ctx into
# $A..$E. If the two low bits of $inp are clear, $num is moved into the
# count register and Lsha1_block_private hashes the input in place;
# otherwise control goes to Lunaligned, which eventually branches back
# to Laligned or Ldone.
#
# Epilogue (Ldone): restore the saved registers and link register,
# release the frame and return.
$code=<<___;
.machine	"any"
.text

.globl	.sha1_block_data_order
.align	4
.sha1_block_data_order:
	mflr	r0
	$STU	$sp,`-($FRAME+64)`($sp)
	$PUSH	r0,`$FRAME-$SIZE_T*18`($sp)
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	lwz	$A,0($ctx)
	lwz	$B,4($ctx)
	lwz	$C,8($ctx)
	lwz	$D,12($ctx)
	lwz	$E,16($ctx)
	andi.	r0,$inp,3
	bne	Lunaligned
Laligned:
	mtctr	$num
	bl	Lsha1_block_private
Ldone:
	$POP	r0,`$FRAME-$SIZE_T*18`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	mtlr	r0
	addi	$sp,$sp,`$FRAME+64`
	blr
___

# The PowerPC specification allows an implementation to be ill-behaved
# upon an unaligned access that crosses a page boundary. The "better
# safe than sorry" principle makes me treat that case specially. I
# don't look for the particular offending word, but rather for the
# 64-byte input block that crosses the boundary. Once found, that block
# is copied to an aligned buffer and hashed separately.
#
# Flow of the code below (pages are taken to be 4096 bytes):
#  - $t1 = number of whole 64-byte blocks between $inp and the next
#    page boundary.
#  - If $t1 is zero, the very next block straddles the boundary: go to
#    Lcross_page.
#  - If $num <= $t1, the remaining input ends before the boundary:
#    branch back to Laligned and hash it all in place.
#  - Otherwise hash $t1 blocks in place, subtract them from $num and
#    fall through to Lcross_page.
#  - Lcross_page copies one block byte by byte (16 iterations of 4
#    bytes, using r16..r19 as data scratch and r20 as the destination
#    pointer) into the 64 bytes at $sp+$FRAME, saves the advanced $inp
#    in the slot at $FRAME-$SIZE_T*19, hashes the copy as a single
#    block, restores $inp and decrements $num. It loops back to
#    Lunaligned while blocks remain, else branches to Ldone.
$code.=<<___;
.align	4
Lunaligned:
	subfic	$t1,$inp,4096
	andi.	$t1,$t1,4095	; distance to closest page boundary
	srwi.	$t1,$t1,6	; t1/=64
	beq	Lcross_page
	$UCMP	$num,$t1
	ble-	Laligned	; didn't cross the page boundary
	mtctr	$t1
	subfc	$num,$t1,$num
	bl	Lsha1_block_private
Lcross_page:
	li	$t1,16
	mtctr	$t1
	addi	r20,$sp,$FRAME	; spot below the frame
Lmemcpy:
	lbz	r16,0($inp)
	lbz	r17,1($inp)
	lbz	r18,2($inp)
	lbz	r19,3($inp)
	addi	$inp,$inp,4
	stb	r16,0(r20)
	stb	r17,1(r20)
	stb	r18,2(r20)
	stb	r19,3(r20)
	addi	r20,r20,4
	bdnz	Lmemcpy

	$PUSH	$inp,`$FRAME-$SIZE_T*19`($sp)
	li	$t1,1
	addi	$inp,$sp,$FRAME
	mtctr	$t1
	bl	Lsha1_block_private
	$POP	$inp,`$FRAME-$SIZE_T*19`($sp)
	addic.	$num,$num,-1
	bne-	Lunaligned
	b	Ldone
___

# Private block function with a tailored calling convention: on entry
# the five SHA_CTX words are already held in $A..$E, $inp points at the
# data and the count register holds the number of 64-byte chunks to
# digest. On exit $A..$E hold the updated state, which has also been
# stored back through $ctx, and $inp has advanced past the data.
$code.=<<___;
.align	4
Lsha1_block_private:
___

# One entry per group of twenty rounds: upper and lower halves of the
# round constant loaded into $K, the first round index NOT belonging to
# the group, and the generator that emits one round.
my @stages = (
	[ "0x5a82", "0x7999", 20, \&BODY_00_19 ],	# K_00_19
	[ "0x6ed9", "0xeba1", 40, \&BODY_20_39 ],	# K_20_39
	[ "0x8f1b", "0xbcdc", 60, \&BODY_40_59 ],	# K_40_59
	[ "0xca62", "0xc1d6", 80, \&BODY_20_39 ],	# K_60_79
);

$i=0;
foreach my $stage (@stages) {
	my ($k_hi,$k_lo,$limit,$emit_round)=@$stage;
	$code.="\tlis\t$K,$k_hi\n\tori\t$K,$K,$k_lo\n";
	while ($i<$limit) {
		$emit_round->($i,@V);
		# rotate the register list so that the register just
		# written becomes the first working variable
		unshift(@V,pop(@V));
		$i++;
	}
}

# After eighty rotations the working variables sit in $E,$T,$A,$B,$C.
# Add them to the context words that round 79 loaded into r16..r20,
# store the sums back, re-seed $A..$E for the next chunk, and loop
# while the count register is non-zero.
$code.=<<___;
	add	r16,r16,$E
	add	r17,r17,$T
	add	r18,r18,$A
	add	r19,r19,$B
	add	r20,r20,$C
	stw	r16,0($ctx)
	mr	$A,r16
	stw	r17,4($ctx)
	mr	$B,r17
	stw	r18,8($ctx)
	mr	$C,r18
	stw	r19,12($ctx)
	mr	$D,r19
	stw	r20,16($ctx)
	mr	$E,r20
	addi	$inp,$inp,`16*4`
	bdnz-	Lsha1_block_private
	blr
___
# Identification string embedded in the generated object.
$code.=<<___;
.asciz	"SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
___

# Replace every `expression` in the accumulated text with the value of
# evaluating it as Perl (frame offsets and similar arithmetic).
$code =~ s/\`([^\`]*)\`/eval $1/gem;

# STDOUT is a pipe into the translator, so failures only become visible
# here: a write error on print/close, or a non-zero exit status from
# the child reported by close (in which case $! is 0 and $? holds the
# wait status). Fail loudly rather than leave a truncated output file.
print($code) or die "can't write generated code: $!";
close(STDOUT) or die($! ? "error closing STDOUT: $!"
			: "translator exited with wait status $?");