aboutsummaryrefslogtreecommitdiff
path: root/pixman
diff options
context:
space:
mode:
Diffstat (limited to 'pixman')
-rw-r--r--pixman/pixman/pixman-arm-neon-asm.h20
-rw-r--r--pixman/pixman/pixman-arm-simd-asm.S370
-rw-r--r--pixman/pixman/pixman-arm-simd-asm.h89
-rw-r--r--pixman/pixman/pixman-arm-simd.c15
-rw-r--r--pixman/pixman/pixman-bits-image.c2
5 files changed, 465 insertions, 31 deletions
diff --git a/pixman/pixman/pixman-arm-neon-asm.h b/pixman/pixman/pixman-arm-neon-asm.h
index d0d92d74c..bdcf6a9d4 100644
--- a/pixman/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman/pixman-arm-neon-asm.h
@@ -631,14 +631,8 @@ local skip1
src_basereg_ = 0, \
mask_basereg_ = 24
- .func fname
- .global fname
- /* For ELF format also set function visibility to hidden */
-#ifdef __ELF__
- .hidden fname
- .type fname, %function
-#endif
-fname:
+ pixman_asm_function fname
+
push {r4-r12, lr} /* save all registers */
/*
@@ -945,14 +939,8 @@ fname:
src_basereg_ = 0, \
mask_basereg_ = 24
- .func fname
- .global fname
- /* For ELF format also set function visibility to hidden */
-#ifdef __ELF__
- .hidden fname
- .type fname, %function
-#endif
-fname:
+ pixman_asm_function fname
+
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
/*
* Make some macro arguments globally visible and accessible
diff --git a/pixman/pixman/pixman-arm-simd-asm.S b/pixman/pixman/pixman-arm-simd-asm.S
index dd6f78817..bc02ebb57 100644
--- a/pixman/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman/pixman-arm-simd-asm.S
@@ -37,6 +37,7 @@
.altmacro
.p2align 2
+#include "pixman-arm-asm.h"
#include "pixman-arm-simd-asm.h"
/* A head macro should do all processing which results in an output of up to
@@ -689,3 +690,372 @@ generate_composite_function \
/******************************************************************************/
+.macro over_white_8888_8888_ca_init
+ HALF .req SRC
+ TMP0 .req STRIDE_D
+ TMP1 .req STRIDE_S
+ TMP2 .req STRIDE_M
+ TMP3 .req ORIG_W
+ WK4 .req SCRATCH
+ line_saved_regs STRIDE_D, STRIDE_M, ORIG_W
+ ldr SCRATCH, =0x800080
+ mov HALF, #0x80
+ /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+ uadd8 SCRATCH, SCRATCH, SCRATCH
+ .set DST_PRELOAD_BIAS, 8
+.endm
+
+.macro over_white_8888_8888_ca_cleanup
+ .set DST_PRELOAD_BIAS, 0
+ .unreq HALF
+ .unreq TMP0
+ .unreq TMP1
+ .unreq TMP2
+ .unreq TMP3
+ .unreq WK4
+.endm
+
+.macro over_white_8888_8888_ca_combine m, d
+ uxtb16 TMP1, TMP0 /* rb_notmask */
+ uxtb16 TMP2, d /* rb_dest; 1 stall follows */
+ smlatt TMP3, TMP2, TMP1, HALF /* red */
+ smlabb TMP2, TMP2, TMP1, HALF /* blue */
+ uxtb16 TMP0, TMP0, ror #8 /* ag_notmask */
+ uxtb16 TMP1, d, ror #8 /* ag_dest; 1 stall follows */
+ smlatt d, TMP1, TMP0, HALF /* alpha */
+ smlabb TMP1, TMP1, TMP0, HALF /* green */
+ pkhbt TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
+ pkhbt TMP1, TMP1, d, lsl #16 /* ag */
+ uxtab16 TMP0, TMP0, TMP0, ror #8
+ uxtab16 TMP1, TMP1, TMP1, ror #8
+ mov TMP0, TMP0, ror #8
+ sel d, TMP0, TMP1
+ uqadd8 d, d, m /* d is a late result */
+.endm
+
+.macro over_white_8888_8888_ca_1pixel_head
+ pixld , 4, 1, MASK, 0
+ pixld , 4, 3, DST, 0
+.endm
+
+.macro over_white_8888_8888_ca_1pixel_tail
+ mvn TMP0, WK1
+ teq WK1, WK1, asr #32
+ bne 01f
+ bcc 03f
+ mov WK3, WK1
+ b 02f
+01: over_white_8888_8888_ca_combine WK1, WK3
+02: pixst , 4, 3, DST
+03:
+.endm
+
+.macro over_white_8888_8888_ca_2pixels_head
+ pixld , 8, 1, MASK, 0
+.endm
+
+.macro over_white_8888_8888_ca_2pixels_tail
+ pixld , 8, 3, DST
+ mvn TMP0, WK1
+ teq WK1, WK1, asr #32
+ bne 01f
+ movcs WK3, WK1
+ bcs 02f
+ teq WK2, #0
+ beq 05f
+ b 02f
+01: over_white_8888_8888_ca_combine WK1, WK3
+02: mvn TMP0, WK2
+ teq WK2, WK2, asr #32
+ bne 03f
+ movcs WK4, WK2
+ b 04f
+03: over_white_8888_8888_ca_combine WK2, WK4
+04: pixst , 8, 3, DST
+05:
+.endm
+
+.macro over_white_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .if numbytes == 4
+ over_white_8888_8888_ca_1pixel_head
+ .else
+ .if numbytes == 16
+ over_white_8888_8888_ca_2pixels_head
+ over_white_8888_8888_ca_2pixels_tail
+ .endif
+ over_white_8888_8888_ca_2pixels_head
+ .endif
+.endm
+
+.macro over_white_8888_8888_ca_process_tail cond, numbytes, firstreg
+ .if numbytes == 4
+ over_white_8888_8888_ca_1pixel_tail
+ .else
+ over_white_8888_8888_ca_2pixels_tail
+ .endif
+.endm
+
+generate_composite_function \
+ pixman_composite_over_white_8888_8888_ca_asm_armv6, 0, 32, 32 \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH \
+ 2, /* prefetch distance */ \
+ over_white_8888_8888_ca_init, \
+ nop_macro, /* newline */ \
+ over_white_8888_8888_ca_cleanup, \
+ over_white_8888_8888_ca_process_head, \
+ over_white_8888_8888_ca_process_tail
+
+
+.macro over_n_8888_8888_ca_init
+ /* Set up constants. RB_SRC and AG_SRC are in registers;
+ * RB_FLDS, A_SRC, and the two HALF values need to go on the
+ * stack (and the ful SRC value is already there) */
+ ldr SCRATCH, [sp, #ARGS_STACK_OFFSET]
+ mov WK0, #0x00FF0000
+ orr WK0, WK0, #0xFF /* RB_FLDS (0x00FF00FF) */
+ mov WK1, #0x80 /* HALF default value */
+ mov WK2, SCRATCH, lsr #24 /* A_SRC */
+ orr WK3, WK1, WK1, lsl #16 /* HALF alternate value (0x00800080) */
+ push {WK0-WK3}
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+16
+ uxtb16 SRC, SCRATCH
+ uxtb16 STRIDE_S, SCRATCH, ror #8
+
+ /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+ uadd8 SCRATCH, WK3, WK3
+
+ .unreq WK0
+ .unreq WK1
+ .unreq WK2
+ .unreq WK3
+ WK0 .req Y
+ WK1 .req STRIDE_D
+ RB_SRC .req SRC
+ AG_SRC .req STRIDE_S
+ WK2 .req STRIDE_M
+ RB_FLDS .req r8 /* the reloaded constants have to be at consecutive registers starting at an even one */
+ A_SRC .req r8
+ HALF .req r9
+ WK3 .req r10
+ WK4 .req r11
+ WK5 .req SCRATCH
+ WK6 .req ORIG_W
+
+ line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
+.endm
+
+.macro over_n_8888_8888_ca_cleanup
+ add sp, sp, #16
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-16
+
+ .unreq WK0
+ .unreq WK1
+ .unreq RB_SRC
+ .unreq AG_SRC
+ .unreq WK2
+ .unreq RB_FLDS
+ .unreq A_SRC
+ .unreq HALF
+ .unreq WK3
+ .unreq WK4
+ .unreq WK5
+ .unreq WK6
+ WK0 .req r8
+ WK1 .req r9
+ WK2 .req r10
+ WK3 .req r11
+.endm
+
+.macro over_n_8888_8888_ca_1pixel_head
+ pixld , 4, 6, MASK, 0
+ pixld , 4, 0, DST, 0
+.endm
+
+.macro over_n_8888_8888_ca_1pixel_tail
+ ldrd A_SRC, HALF, [sp, #LOCALS_STACK_OFFSET+8]
+ uxtb16 WK1, WK6 /* rb_mask (first step of hard case placed in what would otherwise be a stall) */
+ teq WK6, WK6, asr #32 /* Zc if transparent, ZC if opaque */
+ bne 20f
+ bcc 40f
+ /* Mask is fully opaque (all channels) */
+ ldr WK6, [sp, #ARGS_STACK_OFFSET] /* get SRC back */
+ eors A_SRC, A_SRC, #0xFF
+ bne 10f
+ /* Source is also opaque - same as src_8888_8888 */
+ mov WK0, WK6
+ b 30f
+10: /* Same as over_8888_8888 */
+ mul_8888_8 WK0, A_SRC, WK5, HALF
+ uqadd8 WK0, WK0, WK6
+ b 30f
+20: /* No simplifications possible - do it the hard way */
+ uxtb16 WK2, WK6, ror #8 /* ag_mask */
+ mla WK3, WK1, A_SRC, HALF /* rb_mul; 2 cycles */
+ mla WK4, WK2, A_SRC, HALF /* ag_mul; 2 cycles */
+ ldrd RB_FLDS, HALF, [sp, #LOCALS_STACK_OFFSET]
+ uxtb16 WK5, WK0 /* rb_dest */
+ uxtab16 WK3, WK3, WK3, ror #8
+ uxtb16 WK6, WK0, ror #8 /* ag_dest */
+ uxtab16 WK4, WK4, WK4, ror #8
+ smlatt WK0, RB_SRC, WK1, HALF /* red1 */
+ smlabb WK1, RB_SRC, WK1, HALF /* blue1 */
+ bic WK3, RB_FLDS, WK3, lsr #8
+ bic WK4, RB_FLDS, WK4, lsr #8
+ pkhbt WK1, WK1, WK0, lsl #16 /* rb1 */
+ smlatt WK0, WK5, WK3, HALF /* red2 */
+ smlabb WK3, WK5, WK3, HALF /* blue2 */
+ uxtab16 WK1, WK1, WK1, ror #8
+ smlatt WK5, AG_SRC, WK2, HALF /* alpha1 */
+ pkhbt WK3, WK3, WK0, lsl #16 /* rb2 */
+ smlabb WK0, AG_SRC, WK2, HALF /* green1 */
+ smlatt WK2, WK6, WK4, HALF /* alpha2 */
+ smlabb WK4, WK6, WK4, HALF /* green2 */
+ pkhbt WK0, WK0, WK5, lsl #16 /* ag1 */
+ uxtab16 WK3, WK3, WK3, ror #8
+ pkhbt WK4, WK4, WK2, lsl #16 /* ag2 */
+ uxtab16 WK0, WK0, WK0, ror #8
+ uxtab16 WK4, WK4, WK4, ror #8
+ mov WK1, WK1, ror #8
+ mov WK3, WK3, ror #8
+ sel WK2, WK1, WK0 /* recombine source*mask */
+ sel WK1, WK3, WK4 /* recombine dest*(1-source_alpha*mask) */
+ uqadd8 WK0, WK1, WK2 /* followed by 1 stall */
+30: /* The destination buffer is already in the L1 cache, so
+ * there's little point in amalgamating writes */
+ pixst , 4, 0, DST
+40:
+.endm
+
+.macro over_n_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .rept (numbytes / 4) - 1
+ over_n_8888_8888_ca_1pixel_head
+ over_n_8888_8888_ca_1pixel_tail
+ .endr
+ over_n_8888_8888_ca_1pixel_head
+.endm
+
+.macro over_n_8888_8888_ca_process_tail cond, numbytes, firstreg
+ over_n_8888_8888_ca_1pixel_tail
+.endm
+
+pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
+ ldr ip, [sp]
+ cmp ip, #-1
+ beq pixman_composite_over_white_8888_8888_ca_asm_armv6
+ /* else drop through... */
+ .endfunc
+generate_composite_function \
+ pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32 \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0 \
+ 2, /* prefetch distance */ \
+ over_n_8888_8888_ca_init, \
+ nop_macro, /* newline */ \
+ over_n_8888_8888_ca_cleanup, \
+ over_n_8888_8888_ca_process_head, \
+ over_n_8888_8888_ca_process_tail
+
+/******************************************************************************/
+
+.macro in_reverse_8888_8888_init
+ /* Hold loop invariant in MASK */
+ ldr MASK, =0x00800080
+ /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+ uadd8 SCRATCH, MASK, MASK
+ /* Offset the source pointer: we only need the alpha bytes */
+ add SRC, SRC, #3
+ line_saved_regs ORIG_W
+.endm
+
+.macro in_reverse_8888_8888_head numbytes, reg1, reg2, reg3
+ ldrb ORIG_W, [SRC], #4
+ .if numbytes >= 8
+ ldrb WK&reg1, [SRC], #4
+ .if numbytes == 16
+ ldrb WK&reg2, [SRC], #4
+ ldrb WK&reg3, [SRC], #4
+ .endif
+ .endif
+ add DST, DST, #numbytes
+.endm
+
+.macro in_reverse_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ in_reverse_8888_8888_head numbytes, firstreg, %(firstreg+1), %(firstreg+2)
+.endm
+
+.macro in_reverse_8888_8888_1pixel s, d, offset, is_only
+ .if is_only != 1
+ movs s, ORIG_W
+ .if offset != 0
+ ldrb ORIG_W, [SRC, #offset]
+ .endif
+ beq 01f
+ teq STRIDE_M, #0xFF
+ beq 02f
+ .endif
+ uxtb16 SCRATCH, d /* rb_dest */
+ uxtb16 d, d, ror #8 /* ag_dest */
+ mla SCRATCH, SCRATCH, s, MASK
+ mla d, d, s, MASK
+ uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+ uxtab16 d, d, d, ror #8
+ mov SCRATCH, SCRATCH, ror #8
+ sel d, SCRATCH, d
+ b 02f
+ .if offset == 0
+48: /* Last mov d,#0 of the set - used as part of shortcut for
+ * source values all 0 */
+ .endif
+01: mov d, #0
+02:
+.endm
+
+.macro in_reverse_8888_8888_tail numbytes, reg1, reg2, reg3, reg4
+ .if numbytes == 4
+ teq ORIG_W, ORIG_W, asr #32
+ ldrne WK&reg1, [DST, #-4]
+ .elseif numbytes == 8
+ teq ORIG_W, WK&reg1
+ teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
+ ldmnedb DST, {WK&reg1-WK&reg2}
+ .else
+ teq ORIG_W, WK&reg1
+ teqeq ORIG_W, WK&reg2
+ teqeq ORIG_W, WK&reg3
+ teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
+ ldmnedb DST, {WK&reg1-WK&reg4}
+ .endif
+ cmnne DST, #0 /* clear C if NE */
+ bcs 49f /* no writes to dest if source all -1 */
+ beq 48f /* set dest to all 0 if source all 0 */
+ .if numbytes == 4
+ in_reverse_8888_8888_1pixel ORIG_W, WK&reg1, 0, 1
+ str WK&reg1, [DST, #-4]
+ .elseif numbytes == 8
+ in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -4, 0
+ in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, 0, 0
+ stmdb DST, {WK&reg1-WK&reg2}
+ .else
+ in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -12, 0
+ in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, -8, 0
+ in_reverse_8888_8888_1pixel STRIDE_M, WK&reg3, -4, 0
+ in_reverse_8888_8888_1pixel STRIDE_M, WK&reg4, 0, 0
+ stmdb DST, {WK&reg1-WK&reg4}
+ .endif
+49:
+.endm
+
+.macro in_reverse_8888_8888_process_tail cond, numbytes, firstreg
+ in_reverse_8888_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
+.endm
+
+generate_composite_function \
+ pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32 \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST \
+ 2, /* prefetch distance */ \
+ in_reverse_8888_8888_init, \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ in_reverse_8888_8888_process_head, \
+ in_reverse_8888_8888_process_tail
+
+/******************************************************************************/
diff --git a/pixman/pixman/pixman-arm-simd-asm.h b/pixman/pixman/pixman-arm-simd-asm.h
index 24b1ad2f9..8de060a6b 100644
--- a/pixman/pixman/pixman-arm-simd-asm.h
+++ b/pixman/pixman/pixman-arm-simd-asm.h
@@ -76,6 +76,16 @@
.set FLAG_SPILL_LINE_VARS, 48
.set FLAG_PROCESS_CORRUPTS_SCRATCH, 0
.set FLAG_PROCESS_PRESERVES_SCRATCH, 64
+.set FLAG_PROCESS_PRESERVES_WK0, 0
+.set FLAG_PROCESS_CORRUPTS_WK0, 128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */
+.set FLAG_PRELOAD_DST, 0
+.set FLAG_NO_PRELOAD_DST, 256
+
+/*
+ * Number of bytes by which to adjust preload offset of destination
+ * buffer (allows preload instruction to be moved before the load(s))
+ */
+.set DST_PRELOAD_BIAS, 0
/*
* Offset into stack where mask and source pointer/stride can be accessed.
@@ -87,6 +97,11 @@
#endif
/*
+ * Offset into stack where space allocated during init macro can be accessed.
+ */
+.set LOCALS_STACK_OFFSET, 0
+
+/*
* Constants for selecting preferable prefetch type.
*/
.set PREFETCH_TYPE_NONE, 0
@@ -359,23 +374,41 @@
.macro test_bits_1_0_ptr
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ movs SCRATCH, X, lsl #32-1 /* C,N = bits 1,0 of DST */
+ .else
movs SCRATCH, WK0, lsl #32-1 /* C,N = bits 1,0 of DST */
+ .endif
.endm
.macro test_bits_3_2_ptr
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ movs SCRATCH, X, lsl #32-3 /* C,N = bits 3, 2 of DST */
+ .else
movs SCRATCH, WK0, lsl #32-3 /* C,N = bits 3, 2 of DST */
+ .endif
.endm
.macro leading_15bytes process_head, process_tail
/* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
+ .set DECREMENT_X, 1
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ .set DECREMENT_X, 0
+ sub X, X, WK0, lsr #dst_bpp_shift
+ str X, [sp, #LINE_SAVED_REG_COUNT*4]
+ mov X, WK0
+ .endif
/* Use unaligned loads in all cases for simplicity */
.if dst_w_bpp == 8
- conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, 1
+ conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
.elseif dst_w_bpp == 16
test_bits_1_0_ptr
- conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, 1
+ conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X
+ .endif
+ conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ ldr X, [sp, #LINE_SAVED_REG_COUNT*4]
.endif
- conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, 1
.endm
.macro test_bits_3_2_pix
@@ -414,7 +447,7 @@
preload_middle src_bpp, SRC, 0
preload_middle mask_bpp, MASK, 0
.endif
- .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0)
+ .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0)
/* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
* destination prefetches are 32-byte aligned. It's also the easiest channel to offset
* preloads for, to achieve staggered prefetches for multiple channels, because there are
@@ -437,11 +470,11 @@
.if dst_r_bpp > 0
tst DST, #16
bne 111f
- process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16
+ process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS
b 112f
111:
.endif
- process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0
+ process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS
112:
/* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
.if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
@@ -449,7 +482,9 @@
.endif
preload_trailing src_bpp, src_bpp_shift, SRC
preload_trailing mask_bpp, mask_bpp_shift, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
preload_trailing dst_r_bpp, dst_bpp_shift, DST
+ .endif
add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
/* The remainder of the line is handled identically to the medium case */
medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
@@ -561,13 +596,7 @@
process_tail, \
process_inner_loop
- .func fname
- .global fname
- /* For ELF format also set function visibility to hidden */
-#ifdef __ELF__
- .hidden fname
- .type fname, %function
-#endif
+ pixman_asm_function fname
/*
* Make some macro arguments globally visible and accessible
@@ -679,7 +708,6 @@
SCRATCH .req r12
ORIG_W .req r14 /* width (pixels) */
-fname:
push {r4-r11, lr} /* save all registers */
subs Y, Y, #1
@@ -705,6 +733,13 @@ fname:
#endif
init
+
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ /* Reserve a word in which to store X during leading pixels */
+ sub sp, sp, #4
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4
+ .endif
lsl STRIDE_D, #dst_bpp_shift /* stride in bytes */
sub STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
@@ -734,12 +769,16 @@ fname:
.if (flags) & FLAG_SPILL_LINE_VARS_WIDE
/* This is stmdb sp!,{} */
.word 0xE92D0000 | LINE_SAVED_REGS
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.endif
151: /* New line */
newline
preload_leading_step1 src_bpp, WK1, SRC
preload_leading_step1 mask_bpp, WK2, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
preload_leading_step1 dst_r_bpp, WK3, DST
+ .endif
ands WK0, DST, #15
beq 154f
@@ -747,7 +786,9 @@ fname:
preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC
preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST
+ .endif
leading_15bytes process_head, process_tail
@@ -767,6 +808,10 @@ fname:
157: /* Check for another line */
end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
+ .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+ .endif
.endif
.ltorg
@@ -776,12 +821,16 @@ fname:
.if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
/* This is stmdb sp!,{} */
.word 0xE92D0000 | LINE_SAVED_REGS
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.endif
161: /* New line */
newline
preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
preload_line 0, mask_bpp, mask_bpp_shift, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
preload_line 0, dst_r_bpp, dst_bpp_shift, DST
+ .endif
sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */
ands WK0, DST, #15
@@ -810,7 +859,9 @@ fname:
newline
preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
preload_line 1, mask_bpp, mask_bpp_shift, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
preload_line 1, dst_r_bpp, dst_bpp_shift, DST
+ .endif
.if dst_w_bpp == 8
tst DST, #3
@@ -841,12 +892,22 @@ fname:
177: /* Check for another line */
end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
+ .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+ .endif
197:
.if (flags) & FLAG_SPILL_LINE_VARS
add sp, sp, #LINE_SAVED_REG_COUNT*4
.endif
198:
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4
+ add sp, sp, #4
+ .endif
+
cleanup
#ifdef DEBUG_PARAMS
diff --git a/pixman/pixman/pixman-arm-simd.c b/pixman/pixman/pixman-arm-simd.c
index 8fbc4397d..c17ce5a37 100644
--- a/pixman/pixman/pixman-arm-simd.c
+++ b/pixman/pixman/pixman-arm-simd.c
@@ -46,6 +46,8 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
uint8_t, 1, uint8_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, in_reverse_8888_8888,
+ uint32_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_DST (0, armv6, over_reverse_n_8888,
uint32_t, 1)
@@ -56,6 +58,9 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888,
PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888,
uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8888_8888_ca,
+ uint32_t, 1, uint32_t, 1)
+
PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
uint16_t, uint16_t)
PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
@@ -238,6 +243,16 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (IN_REVERSE, a8r8g8b8, null, a8r8g8b8, armv6_composite_in_reverse_8888_8888),
+ PIXMAN_STD_FAST_PATH (IN_REVERSE, a8r8g8b8, null, x8r8g8b8, armv6_composite_in_reverse_8888_8888),
+ PIXMAN_STD_FAST_PATH (IN_REVERSE, a8b8g8r8, null, a8b8g8r8, armv6_composite_in_reverse_8888_8888),
+ PIXMAN_STD_FAST_PATH (IN_REVERSE, a8b8g8r8, null, x8b8g8r8, armv6_composite_in_reverse_8888_8888),
+
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, armv6_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, armv6_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, armv6_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, armv6_composite_over_n_8888_8888_ca),
+
PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),
diff --git a/pixman/pixman/pixman-bits-image.c b/pixman/pixman/pixman-bits-image.c
index f9121a365..dcdcc6994 100644
--- a/pixman/pixman/pixman-bits-image.c
+++ b/pixman/pixman/pixman-bits-image.c
@@ -926,7 +926,7 @@ create_bits (pixman_format_code_t format,
if (_pixman_multiply_overflows_size (height, stride))
return NULL;
- buf_size = height * stride;
+ buf_size = (size_t)height * stride;
if (rowstride_bytes)
*rowstride_bytes = stride;