diff options
Diffstat (limited to 'pixman/pixman/pixman-arm-simd-asm.h')
-rw-r--r-- | pixman/pixman/pixman-arm-simd-asm.h | 89 |
1 files changed, 75 insertions, 14 deletions
diff --git a/pixman/pixman/pixman-arm-simd-asm.h b/pixman/pixman/pixman-arm-simd-asm.h index 24b1ad2f9..8de060a6b 100644 --- a/pixman/pixman/pixman-arm-simd-asm.h +++ b/pixman/pixman/pixman-arm-simd-asm.h @@ -76,6 +76,16 @@ .set FLAG_SPILL_LINE_VARS, 48 .set FLAG_PROCESS_CORRUPTS_SCRATCH, 0 .set FLAG_PROCESS_PRESERVES_SCRATCH, 64 +.set FLAG_PROCESS_PRESERVES_WK0, 0 +.set FLAG_PROCESS_CORRUPTS_WK0, 128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */ +.set FLAG_PRELOAD_DST, 0 +.set FLAG_NO_PRELOAD_DST, 256 + +/* + * Number of bytes by which to adjust preload offset of destination + * buffer (allows preload instruction to be moved before the load(s)) + */ +.set DST_PRELOAD_BIAS, 0 /* * Offset into stack where mask and source pointer/stride can be accessed. @@ -87,6 +97,11 @@ #endif /* + * Offset into stack where space allocated during init macro can be accessed. + */ +.set LOCALS_STACK_OFFSET, 0 + +/* * Constants for selecting preferable prefetch type. */ .set PREFETCH_TYPE_NONE, 0 @@ -359,23 +374,41 @@ .macro test_bits_1_0_ptr + .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 + movs SCRATCH, X, lsl #32-1 /* C,N = bits 1,0 of DST */ + .else movs SCRATCH, WK0, lsl #32-1 /* C,N = bits 1,0 of DST */ + .endif .endm .macro test_bits_3_2_ptr + .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 + movs SCRATCH, X, lsl #32-3 /* C,N = bits 3, 2 of DST */ + .else movs SCRATCH, WK0, lsl #32-3 /* C,N = bits 3, 2 of DST */ + .endif .endm .macro leading_15bytes process_head, process_tail /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */ + .set DECREMENT_X, 1 + .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 + .set DECREMENT_X, 0 + sub X, X, WK0, lsr #dst_bpp_shift + str X, [sp, #LINE_SAVED_REG_COUNT*4] + mov X, WK0 + .endif /* Use unaligned loads in all cases for simplicity */ .if dst_w_bpp == 8 - conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, 1 + conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X .elseif dst_w_bpp == 16 test_bits_1_0_ptr - conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, 1 + conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X + .endif + conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X + .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 + ldr X, [sp, #LINE_SAVED_REG_COUNT*4] .endif - conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, 1 .endm .macro test_bits_3_2_pix @@ -414,7 +447,7 @@ preload_middle src_bpp, SRC, 0 preload_middle mask_bpp, MASK, 0 .endif - .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) + .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0) /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that * destination prefetches are 32-byte aligned. It's also the easiest channel to offset * preloads for, to achieve staggered prefetches for multiple channels, because there are @@ -437,11 +470,11 @@ .if dst_r_bpp > 0 tst DST, #16 bne 111f - process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16 + process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS b 112f 111: .endif - process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0 + process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS 112: /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */ .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256) @@ -449,7 +482,9 @@ .endif preload_trailing src_bpp, src_bpp_shift, SRC preload_trailing mask_bpp, mask_bpp_shift, MASK + .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 preload_trailing dst_r_bpp, dst_bpp_shift, DST + .endif add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp /* The remainder of the line is handled identically to the medium case */ medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask @@ -561,13 +596,7 @@ process_tail, \ process_inner_loop - .func fname - .global fname - /* For ELF format also set function visibility to hidden */ -#ifdef __ELF__ - .hidden fname - .type fname, %function -#endif + pixman_asm_function fname /* * Make some macro arguments globally visible and accessible @@ -679,7 +708,6 @@ SCRATCH .req r12 ORIG_W .req r14 /* width (pixels) */ -fname: push {r4-r11, lr} /* save all registers */ subs Y, Y, #1 @@ -705,6 +733,13 @@ fname: #endif init + + .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 + /* Reserve a word in which to store X during leading pixels */ + sub sp, sp, #4 + .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4 + .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4 + .endif lsl STRIDE_D, #dst_bpp_shift /* stride in bytes */ sub STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift @@ -734,12 +769,16 @@ fname: .if (flags) & FLAG_SPILL_LINE_VARS_WIDE /* This is stmdb sp!,{} */ .word 0xE92D0000 | LINE_SAVED_REGS + .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 + .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 .endif 151: /* New line */ newline preload_leading_step1 src_bpp, WK1, SRC preload_leading_step1 mask_bpp, WK2, MASK + .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 preload_leading_step1 dst_r_bpp, WK3, DST + .endif ands WK0, DST, #15 beq 154f @@ -747,7 +786,9 @@ fname: preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK + .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST + .endif leading_15bytes process_head, process_tail @@ -767,6 +808,10 @@ fname: 157: /* Check for another line */ end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b + .if (flags) & FLAG_SPILL_LINE_VARS_WIDE + .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 + .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 + .endif .endif .ltorg @@ -776,12 +821,16 @@ fname: .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE /* This is stmdb sp!,{} */ .word 0xE92D0000 | LINE_SAVED_REGS + .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 + .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 .endif 161: /* New line */ newline preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ preload_line 0, mask_bpp, mask_bpp_shift, MASK + .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 preload_line 0, dst_r_bpp, dst_bpp_shift, DST + .endif sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */ ands WK0, DST, #15 @@ -810,7 +859,9 @@ fname: newline preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ preload_line 1, mask_bpp, mask_bpp_shift, MASK + .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 preload_line 1, dst_r_bpp, dst_bpp_shift, DST + .endif .if dst_w_bpp == 8 tst DST, #3 @@ -841,12 +892,22 @@ fname: 177: /* Check for another line */ end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one + .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE + .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 + .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 + .endif 197: .if (flags) & FLAG_SPILL_LINE_VARS add sp, sp, #LINE_SAVED_REG_COUNT*4 .endif 198: + .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 + .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4 + .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4 + add sp, sp, #4 + .endif + cleanup #ifdef DEBUG_PARAMS |