diff options
Diffstat (limited to 'openssl/crypto/bn/asm/mips3.s')
-rw-r--r-- | openssl/crypto/bn/asm/mips3.s | 2201 |
1 files changed, 0 insertions, 2201 deletions
diff --git a/openssl/crypto/bn/asm/mips3.s b/openssl/crypto/bn/asm/mips3.s deleted file mode 100644 index dca4105c7..000000000 --- a/openssl/crypto/bn/asm/mips3.s +++ /dev/null @@ -1,2201 +0,0 @@ -.rdata -.asciiz "mips3.s, Version 1.1" -.asciiz "MIPS III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" - -/* - * ==================================================================== - * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL - * project. - * - * Rights for redistribution and usage in source and binary forms are - * granted according to the OpenSSL license. Warranty of any kind is - * disclaimed. - * ==================================================================== - */ - -/* - * This is my modest contributon to the OpenSSL project (see - * http://www.openssl.org/ for more information about it) and is - * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c - * module. For updates see http://fy.chalmers.se/~appro/hpe/. - * - * The module is designed to work with either of the "new" MIPS ABI(5), - * namely N32 or N64, offered by IRIX 6.x. It's not ment to work under - * IRIX 5.x not only because it doesn't support new ABIs but also - * because 5.x kernels put R4x00 CPU into 32-bit mode and all those - * 64-bit instructions (daddu, dmultu, etc.) found below gonna only - * cause illegal instruction exception:-( - * - * In addition the code depends on preprocessor flags set up by MIPSpro - * compiler driver (either as or cc) and therefore (probably?) can't be - * compiled by the GNU assembler. GNU C driver manages fine though... - * I mean as long as -mmips-as is specified or is the default option, - * because then it simply invokes /usr/bin/as which in turn takes - * perfect care of the preprocessor definitions. Another neat feature - * offered by the MIPSpro assembler is an optimization pass. This gave - * me the opportunity to have the code looking more regular as all those - * architecture dependent instruction rescheduling details were left to - * the assembler. Cool, huh? - * - * Performance improvement is astonishing! 'apps/openssl speed rsa dsa' - * goes way over 3 times faster! - * - * <appro@fy.chalmers.se> - */ -#include <asm.h> -#include <regdef.h> - -#if _MIPS_ISA>=4 -#define MOVNZ(cond,dst,src) \ - movn dst,src,cond -#else -#define MOVNZ(cond,dst,src) \ - .set noreorder; \ - bnezl cond,.+8; \ - move dst,src; \ - .set reorder -#endif - -.text - -.set noat -.set reorder - -#define MINUS4 v1 - -.align 5 -LEAF(bn_mul_add_words) - .set noreorder - bgtzl a2,.L_bn_mul_add_words_proceed - ld t0,0(a1) - jr ra - move v0,zero - .set reorder - -.L_bn_mul_add_words_proceed: - li MINUS4,-4 - and ta0,a2,MINUS4 - move v0,zero - beqz ta0,.L_bn_mul_add_words_tail - -.L_bn_mul_add_words_loop: - dmultu t0,a3 - ld t1,0(a0) - ld t2,8(a1) - ld t3,8(a0) - ld ta0,16(a1) - ld ta1,16(a0) - daddu t1,v0 - sltu v0,t1,v0 /* All manuals say it "compares 32-bit - * values", but it seems to work fine - * even on 64-bit registers. */ - mflo AT - mfhi t0 - daddu t1,AT - daddu v0,t0 - sltu AT,t1,AT - sd t1,0(a0) - daddu v0,AT - - dmultu t2,a3 - ld ta2,24(a1) - ld ta3,24(a0) - daddu t3,v0 - sltu v0,t3,v0 - mflo AT - mfhi t2 - daddu t3,AT - daddu v0,t2 - sltu AT,t3,AT - sd t3,8(a0) - daddu v0,AT - - dmultu ta0,a3 - subu a2,4 - PTR_ADD a0,32 - PTR_ADD a1,32 - daddu ta1,v0 - sltu v0,ta1,v0 - mflo AT - mfhi ta0 - daddu ta1,AT - daddu v0,ta0 - sltu AT,ta1,AT - sd ta1,-16(a0) - daddu v0,AT - - - dmultu ta2,a3 - and ta0,a2,MINUS4 - daddu ta3,v0 - sltu v0,ta3,v0 - mflo AT - mfhi ta2 - daddu ta3,AT - daddu v0,ta2 - sltu AT,ta3,AT - sd ta3,-8(a0) - daddu v0,AT - .set noreorder - bgtzl ta0,.L_bn_mul_add_words_loop - ld t0,0(a1) - - bnezl a2,.L_bn_mul_add_words_tail - ld t0,0(a1) - .set reorder - -.L_bn_mul_add_words_return: - jr ra - -.L_bn_mul_add_words_tail: - dmultu t0,a3 - ld t1,0(a0) - subu a2,1 - daddu t1,v0 - sltu v0,t1,v0 - mflo AT - mfhi t0 - daddu t1,AT - daddu v0,t0 - sltu AT,t1,AT - sd t1,0(a0) - daddu v0,AT - beqz a2,.L_bn_mul_add_words_return - - ld t0,8(a1) - dmultu t0,a3 - ld t1,8(a0) - subu a2,1 - daddu t1,v0 - sltu v0,t1,v0 - mflo AT - mfhi t0 - daddu t1,AT - daddu v0,t0 - sltu AT,t1,AT - sd t1,8(a0) - daddu v0,AT - beqz a2,.L_bn_mul_add_words_return - - ld t0,16(a1) - dmultu t0,a3 - ld t1,16(a0) - daddu t1,v0 - sltu v0,t1,v0 - mflo AT - mfhi t0 - daddu t1,AT - daddu v0,t0 - sltu AT,t1,AT - sd t1,16(a0) - daddu v0,AT - jr ra -END(bn_mul_add_words) - -.align 5 -LEAF(bn_mul_words) - .set noreorder - bgtzl a2,.L_bn_mul_words_proceed - ld t0,0(a1) - jr ra - move v0,zero - .set reorder - -.L_bn_mul_words_proceed: - li MINUS4,-4 - and ta0,a2,MINUS4 - move v0,zero - beqz ta0,.L_bn_mul_words_tail - -.L_bn_mul_words_loop: - dmultu t0,a3 - ld t2,8(a1) - ld ta0,16(a1) - ld ta2,24(a1) - mflo AT - mfhi t0 - daddu v0,AT - sltu t1,v0,AT - sd v0,0(a0) - daddu v0,t1,t0 - - dmultu t2,a3 - subu a2,4 - PTR_ADD a0,32 - PTR_ADD a1,32 - mflo AT - mfhi t2 - daddu v0,AT - sltu t3,v0,AT - sd v0,-24(a0) - daddu v0,t3,t2 - - dmultu ta0,a3 - mflo AT - mfhi ta0 - daddu v0,AT - sltu ta1,v0,AT - sd v0,-16(a0) - daddu v0,ta1,ta0 - - - dmultu ta2,a3 - and ta0,a2,MINUS4 - mflo AT - mfhi ta2 - daddu v0,AT - sltu ta3,v0,AT - sd v0,-8(a0) - daddu v0,ta3,ta2 - .set noreorder - bgtzl ta0,.L_bn_mul_words_loop - ld t0,0(a1) - - bnezl a2,.L_bn_mul_words_tail - ld t0,0(a1) - .set reorder - -.L_bn_mul_words_return: - jr ra - -.L_bn_mul_words_tail: - dmultu t0,a3 - subu a2,1 - mflo AT - mfhi t0 - daddu v0,AT - sltu t1,v0,AT - sd v0,0(a0) - daddu v0,t1,t0 - beqz a2,.L_bn_mul_words_return - - ld t0,8(a1) - dmultu t0,a3 - subu a2,1 - mflo AT - mfhi t0 - daddu v0,AT - sltu t1,v0,AT - sd v0,8(a0) - daddu v0,t1,t0 - beqz a2,.L_bn_mul_words_return - - ld t0,16(a1) - dmultu t0,a3 - mflo AT - mfhi t0 - daddu v0,AT - sltu t1,v0,AT - sd v0,16(a0) - daddu v0,t1,t0 - jr ra -END(bn_mul_words) - -.align 5 -LEAF(bn_sqr_words) - .set noreorder - bgtzl a2,.L_bn_sqr_words_proceed - ld t0,0(a1) - jr ra - move v0,zero - .set reorder - -.L_bn_sqr_words_proceed: - li MINUS4,-4 - and ta0,a2,MINUS4 - move v0,zero - beqz ta0,.L_bn_sqr_words_tail - -.L_bn_sqr_words_loop: - dmultu t0,t0 - ld t2,8(a1) - ld ta0,16(a1) - ld ta2,24(a1) - mflo t1 - mfhi t0 - sd t1,0(a0) - sd t0,8(a0) - - dmultu t2,t2 - subu a2,4 - PTR_ADD a0,64 - PTR_ADD a1,32 - mflo t3 - mfhi t2 - sd t3,-48(a0) - sd t2,-40(a0) - - dmultu ta0,ta0 - mflo ta1 - mfhi ta0 - sd ta1,-32(a0) - sd ta0,-24(a0) - - - dmultu ta2,ta2 - and ta0,a2,MINUS4 - mflo ta3 - mfhi ta2 - sd ta3,-16(a0) - sd ta2,-8(a0) - - .set noreorder - bgtzl ta0,.L_bn_sqr_words_loop - ld t0,0(a1) - - bnezl a2,.L_bn_sqr_words_tail - ld t0,0(a1) - .set reorder - -.L_bn_sqr_words_return: - move v0,zero - jr ra - -.L_bn_sqr_words_tail: - dmultu t0,t0 - subu a2,1 - mflo t1 - mfhi t0 - sd t1,0(a0) - sd t0,8(a0) - beqz a2,.L_bn_sqr_words_return - - ld t0,8(a1) - dmultu t0,t0 - subu a2,1 - mflo t1 - mfhi t0 - sd t1,16(a0) - sd t0,24(a0) - beqz a2,.L_bn_sqr_words_return - - ld t0,16(a1) - dmultu t0,t0 - mflo t1 - mfhi t0 - sd t1,32(a0) - sd t0,40(a0) - jr ra -END(bn_sqr_words) - -.align 5 -LEAF(bn_add_words) - .set noreorder - bgtzl a3,.L_bn_add_words_proceed - ld t0,0(a1) - jr ra - move v0,zero - .set reorder - -.L_bn_add_words_proceed: - li MINUS4,-4 - and AT,a3,MINUS4 - move v0,zero - beqz AT,.L_bn_add_words_tail - -.L_bn_add_words_loop: - ld ta0,0(a2) - subu a3,4 - ld t1,8(a1) - and AT,a3,MINUS4 - ld t2,16(a1) - PTR_ADD a2,32 - ld t3,24(a1) - PTR_ADD a0,32 - ld ta1,-24(a2) - PTR_ADD a1,32 - ld ta2,-16(a2) - ld ta3,-8(a2) - daddu ta0,t0 - sltu t8,ta0,t0 - daddu t0,ta0,v0 - sltu v0,t0,ta0 - sd t0,-32(a0) - daddu v0,t8 - - daddu ta1,t1 - sltu t9,ta1,t1 - daddu t1,ta1,v0 - sltu v0,t1,ta1 - sd t1,-24(a0) - daddu v0,t9 - - daddu ta2,t2 - sltu t8,ta2,t2 - daddu t2,ta2,v0 - sltu v0,t2,ta2 - sd t2,-16(a0) - daddu v0,t8 - - daddu ta3,t3 - sltu t9,ta3,t3 - daddu t3,ta3,v0 - sltu v0,t3,ta3 - sd t3,-8(a0) - daddu v0,t9 - - .set noreorder - bgtzl AT,.L_bn_add_words_loop - ld t0,0(a1) - - bnezl a3,.L_bn_add_words_tail - ld t0,0(a1) - .set reorder - -.L_bn_add_words_return: - jr ra - -.L_bn_add_words_tail: - ld ta0,0(a2) - daddu ta0,t0 - subu a3,1 - sltu t8,ta0,t0 - daddu t0,ta0,v0 - sltu v0,t0,ta0 - sd t0,0(a0) - daddu v0,t8 - beqz a3,.L_bn_add_words_return - - ld t1,8(a1) - ld ta1,8(a2) - daddu ta1,t1 - subu a3,1 - sltu t9,ta1,t1 - daddu t1,ta1,v0 - sltu v0,t1,ta1 - sd t1,8(a0) - daddu v0,t9 - beqz a3,.L_bn_add_words_return - - ld t2,16(a1) - ld ta2,16(a2) - daddu ta2,t2 - sltu t8,ta2,t2 - daddu t2,ta2,v0 - sltu v0,t2,ta2 - sd t2,16(a0) - daddu v0,t8 - jr ra -END(bn_add_words) - -.align 5 -LEAF(bn_sub_words) - .set noreorder - bgtzl a3,.L_bn_sub_words_proceed - ld t0,0(a1) - jr ra - move v0,zero - .set reorder - -.L_bn_sub_words_proceed: - li MINUS4,-4 - and AT,a3,MINUS4 - move v0,zero - beqz AT,.L_bn_sub_words_tail - -.L_bn_sub_words_loop: - ld ta0,0(a2) - subu a3,4 - ld t1,8(a1) - and AT,a3,MINUS4 - ld t2,16(a1) - PTR_ADD a2,32 - ld t3,24(a1) - PTR_ADD a0,32 - ld ta1,-24(a2) - PTR_ADD a1,32 - ld ta2,-16(a2) - ld ta3,-8(a2) - sltu t8,t0,ta0 - dsubu t0,ta0 - dsubu ta0,t0,v0 - sd ta0,-32(a0) - MOVNZ (t0,v0,t8) - - sltu t9,t1,ta1 - dsubu t1,ta1 - dsubu ta1,t1,v0 - sd ta1,-24(a0) - MOVNZ (t1,v0,t9) - - - sltu t8,t2,ta2 - dsubu t2,ta2 - dsubu ta2,t2,v0 - sd ta2,-16(a0) - MOVNZ (t2,v0,t8) - - sltu t9,t3,ta3 - dsubu t3,ta3 - dsubu ta3,t3,v0 - sd ta3,-8(a0) - MOVNZ (t3,v0,t9) - - .set noreorder - bgtzl AT,.L_bn_sub_words_loop - ld t0,0(a1) - - bnezl a3,.L_bn_sub_words_tail - ld t0,0(a1) - .set reorder - -.L_bn_sub_words_return: - jr ra - -.L_bn_sub_words_tail: - ld ta0,0(a2) - subu a3,1 - sltu t8,t0,ta0 - dsubu t0,ta0 - dsubu ta0,t0,v0 - MOVNZ (t0,v0,t8) - sd ta0,0(a0) - beqz a3,.L_bn_sub_words_return - - ld t1,8(a1) - subu a3,1 - ld ta1,8(a2) - sltu t9,t1,ta1 - dsubu t1,ta1 - dsubu ta1,t1,v0 - MOVNZ (t1,v0,t9) - sd ta1,8(a0) - beqz a3,.L_bn_sub_words_return - - ld t2,16(a1) - ld ta2,16(a2) - sltu t8,t2,ta2 - dsubu t2,ta2 - dsubu ta2,t2,v0 - MOVNZ (t2,v0,t8) - sd ta2,16(a0) - jr ra -END(bn_sub_words) - -#undef MINUS4 - -.align 5 -LEAF(bn_div_3_words) - .set reorder - move a3,a0 /* we know that bn_div_words doesn't - * touch a3, ta2, ta3 and preserves a2 - * so that we can save two arguments - * and return address in registers - * instead of stack:-) - */ - ld a0,(a3) - move ta2,a1 - ld a1,-8(a3) - bne a0,a2,.L_bn_div_3_words_proceed - li v0,-1 - jr ra -.L_bn_div_3_words_proceed: - move ta3,ra - bal bn_div_words - move ra,ta3 - dmultu ta2,v0 - ld t2,-16(a3) - move ta0,zero - mfhi t1 - mflo t0 - sltu t8,t1,v1 -.L_bn_div_3_words_inner_loop: - bnez t8,.L_bn_div_3_words_inner_loop_done - sgeu AT,t2,t0 - seq t9,t1,v1 - and AT,t9 - sltu t3,t0,ta2 - daddu v1,a2 - dsubu t1,t3 - dsubu t0,ta2 - sltu t8,t1,v1 - sltu ta0,v1,a2 - or t8,ta0 - .set noreorder - beqzl AT,.L_bn_div_3_words_inner_loop - dsubu v0,1 - .set reorder -.L_bn_div_3_words_inner_loop_done: - jr ra -END(bn_div_3_words) - -.align 5 -LEAF(bn_div_words) - .set noreorder - bnezl a2,.L_bn_div_words_proceed - move v1,zero - jr ra - li v0,-1 /* I'd rather signal div-by-zero - * which can be done with 'break 7' */ - -.L_bn_div_words_proceed: - bltz a2,.L_bn_div_words_body - move t9,v1 - dsll a2,1 - bgtz a2,.-4 - addu t9,1 - - .set reorder - negu t1,t9 - li t2,-1 - dsll t2,t1 - and t2,a0 - dsrl AT,a1,t1 - .set noreorder - bnezl t2,.+8 - break 6 /* signal overflow */ - .set reorder - dsll a0,t9 - dsll a1,t9 - or a0,AT - -#define QT ta0 -#define HH ta1 -#define DH v1 -.L_bn_div_words_body: - dsrl DH,a2,32 - sgeu AT,a0,a2 - .set noreorder - bnezl AT,.+8 - dsubu a0,a2 - .set reorder - - li QT,-1 - dsrl HH,a0,32 - dsrl QT,32 /* q=0xffffffff */ - beq DH,HH,.L_bn_div_words_skip_div1 - ddivu zero,a0,DH - mflo QT -.L_bn_div_words_skip_div1: - dmultu a2,QT - dsll t3,a0,32 - dsrl AT,a1,32 - or t3,AT - mflo t0 - mfhi t1 -.L_bn_div_words_inner_loop1: - sltu t2,t3,t0 - seq t8,HH,t1 - sltu AT,HH,t1 - and t2,t8 - sltu v0,t0,a2 - or AT,t2 - .set noreorder - beqz AT,.L_bn_div_words_inner_loop1_done - dsubu t1,v0 - dsubu t0,a2 - b .L_bn_div_words_inner_loop1 - dsubu QT,1 - .set reorder -.L_bn_div_words_inner_loop1_done: - - dsll a1,32 - dsubu a0,t3,t0 - dsll v0,QT,32 - - li QT,-1 - dsrl HH,a0,32 - dsrl QT,32 /* q=0xffffffff */ - beq DH,HH,.L_bn_div_words_skip_div2 - ddivu zero,a0,DH - mflo QT -.L_bn_div_words_skip_div2: -#undef DH - dmultu a2,QT - dsll t3,a0,32 - dsrl AT,a1,32 - or t3,AT - mflo t0 - mfhi t1 -.L_bn_div_words_inner_loop2: - sltu t2,t3,t0 - seq t8,HH,t1 - sltu AT,HH,t1 - and t2,t8 - sltu v1,t0,a2 - or AT,t2 - .set noreorder - beqz AT,.L_bn_div_words_inner_loop2_done - dsubu t1,v1 - dsubu t0,a2 - b .L_bn_div_words_inner_loop2 - dsubu QT,1 - .set reorder -.L_bn_div_words_inner_loop2_done: -#undef HH - - dsubu a0,t3,t0 - or v0,QT - dsrl v1,a0,t9 /* v1 contains remainder if anybody wants it */ - dsrl a2,t9 /* restore a2 */ - jr ra -#undef QT -END(bn_div_words) - -#define a_0 t0 -#define a_1 t1 -#define a_2 t2 -#define a_3 t3 -#define b_0 ta0 -#define b_1 ta1 -#define b_2 ta2 -#define b_3 ta3 - -#define a_4 s0 -#define a_5 s2 -#define a_6 s4 -#define a_7 a1 /* once we load a[7] we don't need a anymore */ -#define b_4 s1 -#define b_5 s3 -#define b_6 s5 -#define b_7 a2 /* once we load b[7] we don't need b anymore */ - -#define t_1 t8 -#define t_2 t9 - -#define c_1 v0 -#define c_2 v1 -#define c_3 a3 - -#define FRAME_SIZE 48 - -.align 5 -LEAF(bn_mul_comba8) - .set noreorder - PTR_SUB sp,FRAME_SIZE - .frame sp,64,ra - .set reorder - ld a_0,0(a1) /* If compiled with -mips3 option on - * R5000 box assembler barks on this - * line with "shouldn't have mult/div - * as last instruction in bb (R10K - * bug)" warning. If anybody out there - * has a clue about how to circumvent - * this do send me a note. - * <appro@fy.chalmers.se> - */ - ld b_0,0(a2) - ld a_1,8(a1) - ld a_2,16(a1) - ld a_3,24(a1) - ld b_1,8(a2) - ld b_2,16(a2) - ld b_3,24(a2) - dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ - sd s0,0(sp) - sd s1,8(sp) - sd s2,16(sp) - sd s3,24(sp) - sd s4,32(sp) - sd s5,40(sp) - mflo c_1 - mfhi c_2 - - dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */ - ld a_4,32(a1) - ld a_5,40(a1) - ld a_6,48(a1) - ld a_7,56(a1) - ld b_4,32(a2) - ld b_5,40(a2) - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu c_3,t_2,AT - dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */ - ld b_6,48(a2) - ld b_7,56(a2) - sd c_1,0(a0) /* r[0]=c1; */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - sd c_2,8(a0) /* r[1]=c2; */ - - dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu c_2,c_1,t_2 - dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,16(a0) /* r[2]=c3; */ - - dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu c_3,c_2,t_2 - dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,24(a0) /* r[3]=c1; */ - - dmultu a_4,b_0 /* mul_add_c(a[4],b[0],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_0,b_4 /* mul_add_c(a[0],b[4],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,32(a0) /* r[4]=c2; */ - - dmultu a_0,b_5 /* mul_add_c(a[0],b[5],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu c_2,c_1,t_2 - dmultu a_1,b_4 /* mul_add_c(a[1],b[4],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_4,b_1 /* mul_add_c(a[4],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_5,b_0 /* mul_add_c(a[5],b[0],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,40(a0) /* r[5]=c3; */ - - dmultu a_6,b_0 /* mul_add_c(a[6],b[0],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu c_3,c_2,t_2 - dmultu a_5,b_1 /* mul_add_c(a[5],b[1],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_4,b_2 /* mul_add_c(a[4],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_2,b_4 /* mul_add_c(a[2],b[4],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_1,b_5 /* mul_add_c(a[1],b[5],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_0,b_6 /* mul_add_c(a[0],b[6],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,48(a0) /* r[6]=c1; */ - - dmultu a_0,b_7 /* mul_add_c(a[0],b[7],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - dmultu a_1,b_6 /* mul_add_c(a[1],b[6],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_2,b_5 /* mul_add_c(a[2],b[5],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_3,b_4 /* mul_add_c(a[3],b[4],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_4,b_3 /* mul_add_c(a[4],b[3],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_5,b_2 /* mul_add_c(a[5],b[2],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_6,b_1 /* mul_add_c(a[6],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_7,b_0 /* mul_add_c(a[7],b[0],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,56(a0) /* r[7]=c2; */ - - dmultu a_7,b_1 /* mul_add_c(a[7],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu c_2,c_1,t_2 - dmultu a_6,b_2 /* mul_add_c(a[6],b[2],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_5,b_3 /* mul_add_c(a[5],b[3],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_4,b_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_3,b_5 /* mul_add_c(a[3],b[5],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_2,b_6 /* mul_add_c(a[2],b[6],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_1,b_7 /* mul_add_c(a[1],b[7],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,64(a0) /* r[8]=c3; */ - - dmultu a_2,b_7 /* mul_add_c(a[2],b[7],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu c_3,c_2,t_2 - dmultu a_3,b_6 /* mul_add_c(a[3],b[6],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_4,b_5 /* mul_add_c(a[4],b[5],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_5,b_4 /* mul_add_c(a[5],b[4],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_6,b_3 /* mul_add_c(a[6],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_7,b_2 /* mul_add_c(a[7],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,72(a0) /* r[9]=c1; */ - - dmultu a_7,b_3 /* mul_add_c(a[7],b[3],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - dmultu a_6,b_4 /* mul_add_c(a[6],b[4],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_5,b_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_4,b_6 /* mul_add_c(a[4],b[6],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_3,b_7 /* mul_add_c(a[3],b[7],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,80(a0) /* r[10]=c2; */ - - dmultu a_4,b_7 /* mul_add_c(a[4],b[7],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu c_2,c_1,t_2 - dmultu a_5,b_6 /* mul_add_c(a[5],b[6],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_6,b_5 /* mul_add_c(a[6],b[5],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_7,b_4 /* mul_add_c(a[7],b[4],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,88(a0) /* r[11]=c3; */ - - dmultu a_7,b_5 /* mul_add_c(a[7],b[5],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu c_3,c_2,t_2 - dmultu a_6,b_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_5,b_7 /* mul_add_c(a[5],b[7],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,96(a0) /* r[12]=c1; */ - - dmultu a_6,b_7 /* mul_add_c(a[6],b[7],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - dmultu a_7,b_6 /* mul_add_c(a[7],b[6],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,104(a0) /* r[13]=c2; */ - - dmultu a_7,b_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */ - ld s0,0(sp) - ld s1,8(sp) - ld s2,16(sp) - ld s3,24(sp) - ld s4,32(sp) - ld s5,40(sp) - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sd c_3,112(a0) /* r[14]=c3; */ - sd c_1,120(a0) /* r[15]=c1; */ - - PTR_ADD sp,FRAME_SIZE - - jr ra -END(bn_mul_comba8) - -.align 5 -LEAF(bn_mul_comba4) - .set reorder - ld a_0,0(a1) - ld b_0,0(a2) - ld a_1,8(a1) - ld a_2,16(a1) - dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ - ld a_3,24(a1) - ld b_1,8(a2) - ld b_2,16(a2) - ld b_3,24(a2) - mflo c_1 - mfhi c_2 - sd c_1,0(a0) - - dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu c_3,t_2,AT - dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - sd c_2,8(a0) - - dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu c_2,c_1,t_2 - dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,16(a0) - - dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu c_3,c_2,t_2 - dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,24(a0) - - dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,32(a0) - - dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu c_2,c_1,t_2 - dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,40(a0) - - dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sd c_1,48(a0) - sd c_2,56(a0) - - jr ra -END(bn_mul_comba4) - -#undef a_4 -#undef a_5 -#undef a_6 -#undef a_7 -#define a_4 b_0 -#define a_5 b_1 -#define a_6 b_2 -#define a_7 b_3 - -.align 5 -LEAF(bn_sqr_comba8) - .set reorder - ld a_0,0(a1) - ld a_1,8(a1) - ld a_2,16(a1) - ld a_3,24(a1) - - dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ - ld a_4,32(a1) - ld a_5,40(a1) - ld a_6,48(a1) - ld a_7,56(a1) - mflo c_1 - mfhi c_2 - sd c_1,0(a0) - - dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu c_3,t_2,AT - sd c_2,8(a0) - - dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt c_2,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,16(a0) - - dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt c_3,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_1,a_2 /* mul_add_c2(a[1],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_3,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,24(a0) - - dmultu a_4,a_0 /* mul_add_c2(a[4],b[0],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_1,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,32(a0) - - dmultu a_0,a_5 /* mul_add_c2(a[0],b[5],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt c_2,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_1,a_4 /* mul_add_c2(a[1],b[4],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_2,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_2,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,40(a0) - - dmultu a_6,a_0 /* mul_add_c2(a[6],b[0],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt c_3,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_5,a_1 /* mul_add_c2(a[5],b[1],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_3,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_4,a_2 /* mul_add_c2(a[4],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_3,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,48(a0) - - dmultu a_0,a_7 /* mul_add_c2(a[0],b[7],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_1,a_6 /* mul_add_c2(a[1],b[6],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_1,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_2,a_5 /* mul_add_c2(a[2],b[5],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_1,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_3,a_4 /* mul_add_c2(a[3],b[4],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_1,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,56(a0) - - dmultu a_7,a_1 /* mul_add_c2(a[7],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt c_2,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_6,a_2 /* mul_add_c2(a[6],b[2],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_2,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_5,a_3 /* mul_add_c2(a[5],b[3],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_2,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_4,a_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,64(a0) - - dmultu a_2,a_7 /* mul_add_c2(a[2],b[7],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt c_3,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_3,a_6 /* mul_add_c2(a[3],b[6],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_3,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_4,a_5 /* mul_add_c2(a[4],b[5],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_3,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,72(a0) - - dmultu a_7,a_3 /* mul_add_c2(a[7],b[3],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_6,a_4 /* mul_add_c2(a[6],b[4],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_1,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_5,a_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,80(a0) - - dmultu a_4,a_7 /* mul_add_c2(a[4],b[7],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt c_2,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_5,a_6 /* mul_add_c2(a[5],b[6],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_2,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,88(a0) - - dmultu a_7,a_5 /* mul_add_c2(a[7],b[5],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt c_3,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_6,a_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,96(a0) - - dmultu a_6,a_7 /* mul_add_c2(a[6],b[7],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,104(a0) - - dmultu a_7,a_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sd c_3,112(a0) - sd c_1,120(a0) - - jr ra -END(bn_sqr_comba8) - -.align 5 -LEAF(bn_sqr_comba4) - .set reorder - ld a_0,0(a1) - ld a_1,8(a1) - ld a_2,16(a1) - ld a_3,24(a1) - dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ - mflo c_1 - mfhi c_2 - sd c_1,0(a0) - - dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu c_3,t_2,AT - sd c_2,8(a0) - - dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt c_2,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,16(a0) - - dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt c_3,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_1,a_2 /* mul_add_c(a2[1],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_3,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,24(a0) - - dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,32(a0) - - dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt c_2,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,40(a0) - - dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sd c_1,48(a0) - sd c_2,56(a0) - - jr ra -END(bn_sqr_comba4) |