about | summary | refs | log | tree | commit | diff
path: root/pixman
diff options
context:
space:
mode:
Diffstat (limited to 'pixman')
-rw-r--r-- pixman/pixman/pixman-arm-neon-asm.S | 22
-rw-r--r-- pixman/pixman/pixman-arm-neon.c | 4
-rw-r--r-- pixman/pixman/pixman-arm-simd-asm.S | 660
-rw-r--r-- pixman/pixman/pixman-arm-simd.c | 834
-rw-r--r-- pixman/pixman/pixman-fast-path.c | 28
-rw-r--r-- pixman/pixman/pixman-mmx.c | 46
-rw-r--r-- pixman/pixman/pixman-sse2.c | 689
-rw-r--r-- pixman/test/lowlevel-blt-bench.c | 6
8 files changed, 823 insertions, 1466 deletions
diff --git a/pixman/pixman/pixman-arm-neon-asm.S b/pixman/pixman/pixman-arm-neon-asm.S
index fe128aa94..108abacd1 100644
--- a/pixman/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman/pixman-arm-neon-asm.S
@@ -495,15 +495,15 @@ generate_composite_function \
/******************************************************************************/
-.macro pixman_composite_add_8000_8000_process_pixblock_head
+.macro pixman_composite_add_8_8_process_pixblock_head
vqadd.u8 q14, q0, q2
vqadd.u8 q15, q1, q3
.endm
-.macro pixman_composite_add_8000_8000_process_pixblock_tail
+.macro pixman_composite_add_8_8_process_pixblock_tail
.endm
-.macro pixman_composite_add_8000_8000_process_pixblock_tail_head
+.macro pixman_composite_add_8_8_process_pixblock_tail_head
vld1.8 {d0, d1, d2, d3}, [SRC]!
PF add PF_X, PF_X, #32
PF tst PF_CTL, #0xF
@@ -523,15 +523,15 @@ generate_composite_function \
.endm
generate_composite_function \
- pixman_composite_add_8000_8000_asm_neon, 8, 0, 8, \
+ pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
FLAG_DST_READWRITE, \
32, /* number of pixels, processed in a single block */ \
10, /* prefetch distance */ \
default_init, \
default_cleanup, \
- pixman_composite_add_8000_8000_process_pixblock_head, \
- pixman_composite_add_8000_8000_process_pixblock_tail, \
- pixman_composite_add_8000_8000_process_pixblock_tail_head
+ pixman_composite_add_8_8_process_pixblock_head, \
+ pixman_composite_add_8_8_process_pixblock_tail, \
+ pixman_composite_add_8_8_process_pixblock_tail_head
/******************************************************************************/
@@ -561,8 +561,8 @@ generate_composite_function \
10, /* prefetch distance */ \
default_init, \
default_cleanup, \
- pixman_composite_add_8000_8000_process_pixblock_head, \
- pixman_composite_add_8000_8000_process_pixblock_tail, \
+ pixman_composite_add_8_8_process_pixblock_head, \
+ pixman_composite_add_8_8_process_pixblock_tail, \
pixman_composite_add_8888_8888_process_pixblock_tail_head
generate_composite_function_single_scanline \
@@ -571,8 +571,8 @@ generate_composite_function_single_scanline \
8, /* number of pixels, processed in a single block */ \
default_init, \
default_cleanup, \
- pixman_composite_add_8000_8000_process_pixblock_head, \
- pixman_composite_add_8000_8000_process_pixblock_tail, \
+ pixman_composite_add_8_8_process_pixblock_head, \
+ pixman_composite_add_8_8_process_pixblock_tail, \
pixman_composite_add_8888_8888_process_pixblock_tail_head
/******************************************************************************/
diff --git a/pixman/pixman/pixman-arm-neon.c b/pixman/pixman/pixman-arm-neon.c
index 28a66751a..231a183aa 100644
--- a/pixman/pixman/pixman-arm-neon.c
+++ b/pixman/pixman/pixman-arm-neon.c
@@ -52,7 +52,7 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_0565_rev,
uint8_t, 3, uint16_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_pixbuf_8888,
uint32_t, 1, uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8000_8000,
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8_8,
uint8_t, 1, uint8_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8888_8888,
uint32_t, 1, uint32_t, 1)
@@ -257,7 +257,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, neon_composite_add_n_8_8),
PIXMAN_STD_FAST_PATH (ADD, a8, a8, a8, neon_composite_add_8_8_8),
PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_add_8888_8888_8888),
- PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, neon_composite_add_8000_8000),
+ PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, neon_composite_add_8_8),
PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, neon_composite_add_8888_8888),
PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, neon_composite_add_8888_8888),
PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, neon_composite_over_reverse_n_8888),
diff --git a/pixman/pixman/pixman-arm-simd-asm.S b/pixman/pixman/pixman-arm-simd-asm.S
index 1a1a0d641..76647c6bc 100644
--- a/pixman/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman/pixman-arm-simd-asm.S
@@ -1,330 +1,330 @@
-/*
- * Copyright © 2008 Mozilla Corporation
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Mozilla Corporation not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission. Mozilla Corporation makes no
- * representations about the suitability of this software for any purpose. It
- * is provided "as is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- *
- * Author: Jeff Muizelaar (jeff@infidigm.net)
- *
- */
-
-/* Prevent the stack from becoming executable */
-#if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack,"",%progbits
-#endif
-
- .text
- .arch armv6
- .object_arch armv4
- .arm
- .altmacro
-
-/* Supplementary macro for setting function attributes */
-.macro pixman_asm_function fname
- .func fname
- .global fname
-#ifdef __ELF__
- .hidden fname
- .type fname, %function
-#endif
-fname:
-.endm
-
-/*
- * The code below was generated by gcc 4.3.4 from the commented out
- * functions in 'pixman-arm-simd.c' file with the following optimization
- * options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer"
- *
- * TODO: replace gcc generated code with hand tuned versions because
- * the code quality is not very good, introduce symbolic register
- * aliases for better readability and maintainability.
- */
-
-pixman_asm_function pixman_composite_add_8000_8000_asm_armv6
- push {r4, r5, r6, r7, r8, r9, r10, r11}
- mov r10, r1
- sub sp, sp, #4
- subs r10, r10, #1
- mov r11, r0
- mov r8, r2
- str r3, [sp]
- ldr r7, [sp, #36]
- bcc 0f
-6: cmp r11, #0
- beq 1f
- orr r3, r8, r7
- tst r3, #3
- beq 2f
- mov r1, r8
- mov r0, r7
- mov r12, r11
- b 3f
-5: tst r3, #3
- beq 4f
-3: ldrb r2, [r0], #1
- subs r12, r12, #1
- ldrb r3, [r1]
- uqadd8 r3, r2, r3
- strb r3, [r1], #1
- orr r3, r1, r0
- bne 5b
-1: ldr r3, [sp]
- add r8, r8, r3
- ldr r3, [sp, #40]
- add r7, r7, r3
-10: subs r10, r10, #1
- bcs 6b
-0: add sp, sp, #4
- pop {r4, r5, r6, r7, r8, r9, r10, r11}
- bx lr
-2: mov r12, r11
- mov r1, r8
- mov r0, r7
-4: cmp r12, #3
- subgt r6, r12, #4
- movgt r9, r12
- lsrgt r5, r6, #2
- addgt r3, r5, #1
- movgt r12, #0
- lslgt r4, r3, #2
- ble 7f
-8: ldr r3, [r0, r12]
- ldr r2, [r1, r12]
- uqadd8 r3, r3, r2
- str r3, [r1, r12]
- add r12, r12, #4
- cmp r12, r4
- bne 8b
- sub r3, r9, #4
- bic r3, r3, #3
- add r3, r3, #4
- subs r12, r6, r5, lsl #2
- add r1, r1, r3
- add r0, r0, r3
- beq 1b
-7: mov r4, #0
-9: ldrb r3, [r1, r4]
- ldrb r2, [r0, r4]
- uqadd8 r3, r2, r3
- strb r3, [r1, r4]
- add r4, r4, #1
- cmp r4, r12
- bne 9b
- ldr r3, [sp]
- add r8, r8, r3
- ldr r3, [sp, #40]
- add r7, r7, r3
- b 10b
-.endfunc
-
-pixman_asm_function pixman_composite_over_8888_8888_asm_armv6
- push {r4, r5, r6, r7, r8, r9, r10, r11}
- sub sp, sp, #20
- cmp r1, #0
- mov r12, r2
- str r1, [sp, #12]
- str r0, [sp, #16]
- ldr r2, [sp, #52]
- beq 0f
- lsl r3, r3, #2
- str r3, [sp]
- ldr r3, [sp, #56]
- mov r10, #0
- lsl r3, r3, #2
- str r3, [sp, #8]
- mov r11, r3
- b 1f
-6: ldr r11, [sp, #8]
-1: ldr r9, [sp]
- mov r0, r12
- add r12, r12, r9
- mov r1, r2
- str r12, [sp, #4]
- add r2, r2, r11
- ldr r12, [sp, #16]
- ldr r3, =0x00800080
- ldr r9, =0xff00ff00
- mov r11, #255
- cmp r12, #0
- beq 4f
-5: ldr r5, [r1], #4
- ldr r4, [r0]
- sub r8, r11, r5, lsr #24
- uxtb16 r6, r4
- uxtb16 r7, r4, ror #8
- mla r6, r6, r8, r3
- mla r7, r7, r8, r3
- uxtab16 r6, r6, r6, ror #8
- uxtab16 r7, r7, r7, ror #8
- and r7, r7, r9
- uxtab16 r6, r7, r6, ror #8
- uqadd8 r5, r6, r5
- str r5, [r0], #4
- subs r12, r12, #1
- bne 5b
-4: ldr r3, [sp, #12]
- add r10, r10, #1
- cmp r10, r3
- ldr r12, [sp, #4]
- bne 6b
-0: add sp, sp, #20
- pop {r4, r5, r6, r7, r8, r9, r10, r11}
- bx lr
-.endfunc
-
-pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6
- push {r4, r5, r6, r7, r8, r9, r10, r11}
- sub sp, sp, #28
- cmp r1, #0
- str r1, [sp, #12]
- ldrb r1, [sp, #71]
- mov r12, r2
- str r0, [sp, #16]
- ldr r2, [sp, #60]
- str r1, [sp, #24]
- beq 0f
- lsl r3, r3, #2
- str r3, [sp, #20]
- ldr r3, [sp, #64]
- mov r10, #0
- lsl r3, r3, #2
- str r3, [sp, #8]
- mov r11, r3
- b 1f
-5: ldr r11, [sp, #8]
-1: ldr r4, [sp, #20]
- mov r0, r12
- mov r1, r2
- add r12, r12, r4
- add r2, r2, r11
- str r12, [sp]
- str r2, [sp, #4]
- ldr r12, [sp, #16]
- ldr r2, =0x00800080
- ldr r3, [sp, #24]
- mov r11, #255
- cmp r12, #0
- beq 3f
-4: ldr r5, [r1], #4
- ldr r4, [r0]
- uxtb16 r6, r5
- uxtb16 r7, r5, ror #8
- mla r6, r6, r3, r2
- mla r7, r7, r3, r2
- uxtab16 r6, r6, r6, ror #8
- uxtab16 r7, r7, r7, ror #8
- uxtb16 r6, r6, ror #8
- uxtb16 r7, r7, ror #8
- orr r5, r6, r7, lsl #8
- uxtb16 r6, r4
- uxtb16 r7, r4, ror #8
- sub r8, r11, r5, lsr #24
- mla r6, r6, r8, r2
- mla r7, r7, r8, r2
- uxtab16 r6, r6, r6, ror #8
- uxtab16 r7, r7, r7, ror #8
- uxtb16 r6, r6, ror #8
- uxtb16 r7, r7, ror #8
- orr r6, r6, r7, lsl #8
- uqadd8 r5, r6, r5
- str r5, [r0], #4
- subs r12, r12, #1
- bne 4b
-3: ldr r1, [sp, #12]
- add r10, r10, #1
- cmp r10, r1
- ldr r12, [sp]
- ldr r2, [sp, #4]
- bne 5b
-0: add sp, sp, #28
- pop {r4, r5, r6, r7, r8, r9, r10, r11}
- bx lr
-.endfunc
-
-pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
- push {r4, r5, r6, r7, r8, r9, r10, r11}
- sub sp, sp, #28
- cmp r1, #0
- ldr r9, [sp, #60]
- str r1, [sp, #12]
- bic r1, r9, #-16777216
- str r1, [sp, #20]
- mov r12, r2
- lsr r1, r9, #8
- ldr r2, [sp, #20]
- bic r1, r1, #-16777216
- bic r2, r2, #65280
- bic r1, r1, #65280
- str r2, [sp, #20]
- str r0, [sp, #16]
- str r1, [sp, #4]
- ldr r2, [sp, #68]
- beq 0f
- lsl r3, r3, #2
- str r3, [sp, #24]
- mov r0, #0
- b 1f
-5: ldr r3, [sp, #24]
-1: ldr r4, [sp, #72]
- mov r10, r12
- mov r1, r2
- add r12, r12, r3
- add r2, r2, r4
- str r12, [sp, #8]
- str r2, [sp]
- ldr r12, [sp, #16]
- ldr r11, =0x00800080
- ldr r2, [sp, #4]
- ldr r3, [sp, #20]
- cmp r12, #0
- beq 3f
-4: ldrb r5, [r1], #1
- ldr r4, [r10]
- mla r6, r3, r5, r11
- mla r7, r2, r5, r11
- uxtab16 r6, r6, r6, ror #8
- uxtab16 r7, r7, r7, ror #8
- uxtb16 r6, r6, ror #8
- uxtb16 r7, r7, ror #8
- orr r5, r6, r7, lsl #8
- uxtb16 r6, r4
- uxtb16 r7, r4, ror #8
- mvn r8, r5
- lsr r8, r8, #24
- mla r6, r6, r8, r11
- mla r7, r7, r8, r11
- uxtab16 r6, r6, r6, ror #8
- uxtab16 r7, r7, r7, ror #8
- uxtb16 r6, r6, ror #8
- uxtb16 r7, r7, ror #8
- orr r6, r6, r7, lsl #8
- uqadd8 r5, r6, r5
- str r5, [r10], #4
- subs r12, r12, #1
- bne 4b
-3: ldr r4, [sp, #12]
- add r0, r0, #1
- cmp r0, r4
- ldr r12, [sp, #8]
- ldr r2, [sp]
- bne 5b
-0: add sp, sp, #28
- pop {r4, r5, r6, r7, r8, r9, r10, r11}
- bx lr
-.endfunc
+/*
+ * Copyright © 2008 Mozilla Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Jeff Muizelaar (jeff@infidigm.net)
+ *
+ */
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+ .text
+ .arch armv6
+ .object_arch armv4
+ .arm
+ .altmacro
+
+/* Supplementary macro for setting function attributes */
+.macro pixman_asm_function fname /* emit the symbol boilerplate and the entry label for function fname */
+ .func fname /* open the debug-info function scope; each function below closes it with .endfunc */
+ .global fname /* make the symbol visible to the linker */
+#ifdef __ELF__ /* visibility and symbol type are only emitted for ELF targets */
+ .hidden fname /* hidden visibility: the symbol is not exported from the final shared object */
+ .type fname, %function /* mark the symbol as a function */
+#endif
+fname: /* entry label; the function body follows the macro invocation */
+.endm
+
+/*
+ * The code below was generated by gcc 4.3.4 from the commented out
+ * functions in 'pixman-arm-simd.c' file with the following optimization
+ * options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer"
+ *
+ * TODO: replace gcc generated code with hand tuned versions because
+ * the code quality is not very good, introduce symbolic register
+ * aliases for better readability and maintainability.
+ */
+
+pixman_asm_function pixman_composite_add_8_8_asm_armv6 /* per-byte saturating add of src into dst over width x height; per the C prototype: r0 = width, r1 = height, r2 = dst_line, r3 = dst_stride, stack = src_line, src_stride */
+ push {r4, r5, r6, r7, r8, r9, r10, r11} /* save eight registers (32 bytes) */
+ mov r10, r1 /* r10 = height, used as the remaining-rows counter */
+ sub sp, sp, #4 /* one 4-byte spill slot */
+ subs r10, r10, #1 /* height--; carry is clear only when height was 0 */
+ mov r11, r0 /* r11 = width */
+ mov r8, r2 /* r8 = dst_line */
+ str r3, [sp] /* spill dst_stride */
+ ldr r7, [sp, #36] /* r7 = src_line: first stacked argument, 4 + 32 bytes above sp */
+ bcc 0f /* height was 0: nothing to do */
+6: cmp r11, #0 /* start of a row */
+ beq 1f /* width == 0: just advance to the next row */
+ orr r3, r8, r7 /* combine the low address bits of dst and src */
+ tst r3, #3 /* are both 4-byte aligned? */
+ beq 2f /* yes: skip the byte-wise lead-in */
+ mov r1, r8 /* r1 = dst */
+ mov r0, r7 /* r0 = src */
+ mov r12, r11 /* r12 = w */
+ b 3f
+5: tst r3, #3 /* r3 = dst | src after the increments below */
+ beq 4f /* both aligned now: continue with the word loop */
+3: ldrb r2, [r0], #1 /* s = *src++ */
+ subs r12, r12, #1 /* w--; these flags feed the bne below */
+ ldrb r3, [r1] /* d = *dst */
+ uqadd8 r3, r2, r3 /* unsigned saturating byte add */
+ strb r3, [r1], #1 /* *dst++ = d */
+ orr r3, r1, r0 /* dst | src for the alignment test at 5: */
+ bne 5b /* w != 0: keep going (stays byte-wise while either pointer is unaligned) */
+1: ldr r3, [sp] /* end of row: reload dst_stride */
+ add r8, r8, r3 /* dst_line += dst_stride */
+ ldr r3, [sp, #40] /* src_stride: second stacked argument */
+ add r7, r7, r3 /* src_line += src_stride */
+10: subs r10, r10, #1 /* height-- */
+ bcs 6b /* no borrow: another row remains */
+0: add sp, sp, #4 /* release the spill slot */
+ pop {r4, r5, r6, r7, r8, r9, r10, r11} /* restore the saved registers */
+ bx lr /* return */
+2: mov r12, r11 /* aligned from the start of the row: w = width */
+ mov r1, r8 /* r1 = dst */
+ mov r0, r7 /* r0 = src */
+4: cmp r12, #3 /* at least one full 32-bit word left? */
+ subgt r6, r12, #4 /* r6 = w - 4 */
+ movgt r9, r12 /* r9 = w */
+ lsrgt r5, r6, #2 /* r5 = (w - 4) / 4 */
+ addgt r3, r5, #1 /* number of whole words */
+ movgt r12, #0 /* r12 becomes the byte offset into the row */
+ lslgt r4, r3, #2 /* r4 = number of bytes the word loop covers */
+ ble 7f /* w <= 3: only the byte tail remains */
+8: ldr r3, [r0, r12] /* four src bytes */
+ ldr r2, [r1, r12] /* four dst bytes */
+ uqadd8 r3, r3, r2 /* four saturating byte adds at once */
+ str r3, [r1, r12] /* store four dst bytes */
+ add r12, r12, #4 /* next word */
+ cmp r12, r4
+ bne 8b /* until all whole words are done */
+ sub r3, r9, #4
+ bic r3, r3, #3
+ add r3, r3, #4 /* r3 = ((w - 4) & ~3) + 4 = bytes consumed by the word loop */
+ subs r12, r6, r5, lsl #2 /* r12 = (w - 4) - 4 * ((w - 4) / 4) = leftover bytes */
+ add r1, r1, r3 /* dst += bytes consumed */
+ add r0, r0, r3 /* src += bytes consumed */
+ beq 1b /* no leftover: go to the end-of-row code */
+7: mov r4, #0 /* byte tail: r4 = index, r12 = count */
+9: ldrb r3, [r1, r4] /* d = dst[i] */
+ ldrb r2, [r0, r4] /* s = src[i] */
+ uqadd8 r3, r2, r3 /* unsigned saturating byte add */
+ strb r3, [r1, r4] /* dst[i] = d */
+ add r4, r4, #1 /* i++ */
+ cmp r4, r12
+ bne 9b /* until the leftover bytes are done */
+ ldr r3, [sp] /* same end-of-row stepping as at label 1: */
+ add r8, r8, r3 /* dst_line += dst_stride */
+ ldr r3, [sp, #40]
+ add r7, r7, r3 /* src_line += src_stride */
+ b 10b /* back to the row counter */
+.endfunc
+
+pixman_asm_function pixman_composite_over_8888_8888_asm_armv6 /* OVER of 32-bit src onto 32-bit dst: dst = saturating_add (src, dst scaled by (255 - src alpha)); per the C prototype: r0 = width, r1 = height, r2 = dst_line, r3 = dst_stride, stack = src_line, src_stride */
+ push {r4, r5, r6, r7, r8, r9, r10, r11} /* save eight registers (32 bytes) */
+ sub sp, sp, #20 /* 20 bytes of spill slots */
+ cmp r1, #0 /* height == 0? (tested by the beq below) */
+ mov r12, r2 /* r12 = dst_line */
+ str r1, [sp, #12] /* spill height */
+ str r0, [sp, #16] /* spill width */
+ ldr r2, [sp, #52] /* r2 = src_line: first stacked argument, 20 + 32 bytes above sp */
+ beq 0f /* no rows: return */
+ lsl r3, r3, #2 /* dst_stride is in 32-bit pixels: convert to bytes */
+ str r3, [sp] /* spill dst stride in bytes */
+ ldr r3, [sp, #56] /* src_stride: second stacked argument */
+ mov r10, #0 /* r10 = rows done */
+ lsl r3, r3, #2 /* src stride in bytes */
+ str r3, [sp, #8] /* spill src stride in bytes */
+ mov r11, r3
+ b 1f
+6: ldr r11, [sp, #8] /* reload src stride (r11 is reused inside the row) */
+1: ldr r9, [sp] /* dst stride in bytes */
+ mov r0, r12 /* r0 = dst */
+ add r12, r12, r9 /* dst_line += dst_stride */
+ mov r1, r2 /* r1 = src */
+ str r12, [sp, #4] /* spill the next dst_line */
+ add r2, r2, r11 /* src_line += src_stride */
+ ldr r12, [sp, #16] /* r12 = w = width */
+ ldr r3, =0x00800080 /* component_half */
+ ldr r9, =0xff00ff00 /* upper_component_mask */
+ mov r11, #255 /* alpha_mask */
+ cmp r12, #0
+ beq 4f /* empty row */
+5: ldr r5, [r1], #4 /* load src pixel, advance src */
+ ldr r4, [r0] /* load dest pixel */
+ sub r8, r11, r5, lsr #24 /* r8 = 255 - src alpha */
+ uxtb16 r6, r4 /* dest bytes 0 and 2, one per 16-bit lane */
+ uxtb16 r7, r4, ror #8 /* dest bytes 1 and 3, one per 16-bit lane */
+ mla r6, r6, r8, r3 /* lane * (255 - alpha) + 0x80 */
+ mla r7, r7, r8, r3
+ uxtab16 r6, r6, r6, ror #8 /* add each lane's high byte to the lane ("multiply by 257 and divide by 65536" in the C source) */
+ uxtab16 r7, r7, r7, ror #8
+ and r7, r7, r9 /* keep the high byte of each r7 lane in place */
+ uxtab16 r6, r7, r6, ror #8 /* recombine: r7 high bytes plus r6 high bytes moved into the low byte positions */
+ uqadd8 r5, r6, r5 /* saturating per-byte add of scaled dest and src */
+ str r5, [r0], #4 /* store result, advance dest */
+ subs r12, r12, #1 /* w-- */
+ bne 5b /* next pixel */
+4: ldr r3, [sp, #12] /* height */
+ add r10, r10, #1 /* one more row done */
+ cmp r10, r3
+ ldr r12, [sp, #4] /* r12 = next dst_line */
+ bne 6b /* more rows */
+0: add sp, sp, #20 /* release the spill slots */
+ pop {r4, r5, r6, r7, r8, r9, r10, r11} /* restore the saved registers */
+ bx lr /* return */
+.endfunc
+
+pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6 /* OVER of 32-bit src, first scaled by the alpha of a constant mask, onto 32-bit dst; per the C prototype: r0 = width, r1 = height, r2 = dst_line, r3 = dst_stride, stack = src_line, src_stride, mask */
+ push {r4, r5, r6, r7, r8, r9, r10, r11} /* save eight registers (32 bytes) */
+ sub sp, sp, #28 /* 28 bytes of spill slots */
+ cmp r1, #0 /* height == 0? (tested by the beq below) */
+ str r1, [sp, #12] /* spill height */
+ ldrb r1, [sp, #71] /* byte 3 of the stacked mask word (offset 68 + 3); matches the C "mask >> 24" assuming little-endian layout */
+ mov r12, r2 /* r12 = dst_line */
+ str r0, [sp, #16] /* spill width */
+ ldr r2, [sp, #60] /* r2 = src_line: first stacked argument, 28 + 32 bytes above sp */
+ str r1, [sp, #24] /* spill the mask alpha */
+ beq 0f /* no rows: return */
+ lsl r3, r3, #2 /* dst_stride is in 32-bit pixels: convert to bytes */
+ str r3, [sp, #20] /* spill dst stride in bytes */
+ ldr r3, [sp, #64] /* src_stride: second stacked argument */
+ mov r10, #0 /* r10 = rows done */
+ lsl r3, r3, #2 /* src stride in bytes */
+ str r3, [sp, #8] /* spill src stride in bytes */
+ mov r11, r3
+ b 1f
+5: ldr r11, [sp, #8] /* reload src stride (r11 is reused inside the row) */
+1: ldr r4, [sp, #20] /* dst stride in bytes */
+ mov r0, r12 /* r0 = dst */
+ mov r1, r2 /* r1 = src */
+ add r12, r12, r4 /* dst_line += dst_stride */
+ add r2, r2, r11 /* src_line += src_stride */
+ str r12, [sp] /* spill the next dst_line */
+ str r2, [sp, #4] /* spill the next src_line */
+ ldr r12, [sp, #16] /* r12 = w = width */
+ ldr r2, =0x00800080 /* component_half */
+ ldr r3, [sp, #24] /* r3 = mask alpha */
+ mov r11, #255 /* alpha_mask */
+ cmp r12, #0
+ beq 3f /* empty row */
+4: ldr r5, [r1], #4 /* load src pixel, advance src */
+ ldr r4, [r0] /* load dest pixel */
+ uxtb16 r6, r5 /* src bytes 0 and 2, one per 16-bit lane */
+ uxtb16 r7, r5, ror #8 /* src bytes 1 and 3, one per 16-bit lane */
+ mla r6, r6, r3, r2 /* lane * mask alpha + 0x80 */
+ mla r7, r7, r3, r2
+ uxtab16 r6, r6, r6, ror #8 /* add each lane's high byte to the lane */
+ uxtab16 r7, r7, r7, ror #8
+ uxtb16 r6, r6, ror #8 /* take the high byte of each lane as the result byte */
+ uxtb16 r7, r7, ror #8
+ orr r5, r6, r7, lsl #8 /* recombine: r5 = src scaled by the mask alpha */
+ uxtb16 r6, r4 /* dest bytes 0 and 2 */
+ uxtb16 r7, r4, ror #8 /* dest bytes 1 and 3 */
+ sub r8, r11, r5, lsr #24 /* r8 = 255 - alpha of the scaled src */
+ mla r6, r6, r8, r2 /* lane * (255 - alpha) + 0x80 */
+ mla r7, r7, r8, r2
+ uxtab16 r6, r6, r6, ror #8 /* add each lane's high byte to the lane */
+ uxtab16 r7, r7, r7, ror #8
+ uxtb16 r6, r6, ror #8 /* take the high byte of each lane */
+ uxtb16 r7, r7, ror #8
+ orr r6, r6, r7, lsl #8 /* recombine: r6 = scaled dest */
+ uqadd8 r5, r6, r5 /* saturating per-byte add of scaled dest and scaled src */
+ str r5, [r0], #4 /* store result, advance dest */
+ subs r12, r12, #1 /* w-- */
+ bne 4b /* next pixel */
+3: ldr r1, [sp, #12] /* height */
+ add r10, r10, #1 /* one more row done */
+ cmp r10, r1
+ ldr r12, [sp] /* r12 = next dst_line */
+ ldr r2, [sp, #4] /* r2 = next src_line */
+ bne 5b /* more rows */
+0: add sp, sp, #28 /* release the spill slots */
+ pop {r4, r5, r6, r7, r8, r9, r10, r11} /* restore the saved registers */
+ bx lr /* return */
+.endfunc
+
+pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6 /* OVER of a constant src, scaled per pixel by an 8-bit mask, onto 32-bit dst; per the C prototype: r0 = width, r1 = height, r2 = dst_line, r3 = dst_stride, stack = src, unused, mask_line, mask_stride */
+ push {r4, r5, r6, r7, r8, r9, r10, r11} /* save eight registers (32 bytes) */
+ sub sp, sp, #28 /* 28 bytes of spill slots */
+ cmp r1, #0 /* height == 0? (tested by the beq below; nothing in between sets flags) */
+ ldr r9, [sp, #60] /* r9 = src: first stacked argument, 28 + 32 bytes above sp */
+ str r1, [sp, #12] /* spill height */
+ bic r1, r9, #-16777216 /* src & 0x00ffffff */
+ str r1, [sp, #20]
+ mov r12, r2 /* r12 = dst_line */
+ lsr r1, r9, #8 /* src >> 8 */
+ ldr r2, [sp, #20]
+ bic r1, r1, #-16777216 /* clear bits 24-31 */
+ bic r2, r2, #65280 /* clear bits 8-15: r2 = src & 0xff00ff (src_lo in the C source) */
+ bic r1, r1, #65280 /* r1 = (src >> 8) & 0xff00ff (src_hi in the C source) */
+ str r2, [sp, #20] /* spill src_lo */
+ str r0, [sp, #16] /* spill width */
+ str r1, [sp, #4] /* spill src_hi */
+ ldr r2, [sp, #68] /* r2 = mask_line: third stacked argument */
+ beq 0f /* no rows: return */
+ lsl r3, r3, #2 /* dst_stride is in 32-bit pixels: convert to bytes */
+ str r3, [sp, #24] /* spill dst stride in bytes */
+ mov r0, #0 /* r0 = rows done */
+ b 1f
+5: ldr r3, [sp, #24] /* reload dst stride (r3 is reused inside the row) */
+1: ldr r4, [sp, #72] /* mask_stride: fourth stacked argument (already in bytes) */
+ mov r10, r12 /* r10 = dst */
+ mov r1, r2 /* r1 = mask */
+ add r12, r12, r3 /* dst_line += dst_stride */
+ add r2, r2, r4 /* mask_line += mask_stride */
+ str r12, [sp, #8] /* spill the next dst_line */
+ str r2, [sp] /* spill the next mask_line */
+ ldr r12, [sp, #16] /* r12 = w = width */
+ ldr r11, =0x00800080 /* component_half */
+ ldr r2, [sp, #4] /* r2 = src_hi */
+ ldr r3, [sp, #20] /* r3 = src_lo */
+ cmp r12, #0
+ beq 3f /* empty row */
+4: ldrb r5, [r1], #1 /* load mask byte, advance mask */
+ ldr r4, [r10] /* load dest pixel */
+ mla r6, r3, r5, r11 /* src_lo * mask + 0x80 per lane */
+ mla r7, r2, r5, r11 /* src_hi * mask + 0x80 per lane */
+ uxtab16 r6, r6, r6, ror #8 /* add each lane's high byte to the lane */
+ uxtab16 r7, r7, r7, ror #8
+ uxtb16 r6, r6, ror #8 /* take the high byte of each lane as the result byte */
+ uxtb16 r7, r7, ror #8
+ orr r5, r6, r7, lsl #8 /* recombine: r5 = src scaled by the mask */
+ uxtb16 r6, r4 /* dest bytes 0 and 2 */
+ uxtb16 r7, r4, ror #8 /* dest bytes 1 and 3 */
+ mvn r8, r5 /* invert so the top byte becomes 255 - alpha */
+ lsr r8, r8, #24 /* r8 = 255 - alpha of the scaled src */
+ mla r6, r6, r8, r11 /* lane * (255 - alpha) + 0x80 */
+ mla r7, r7, r8, r11
+ uxtab16 r6, r6, r6, ror #8 /* add each lane's high byte to the lane */
+ uxtab16 r7, r7, r7, ror #8
+ uxtb16 r6, r6, ror #8 /* take the high byte of each lane */
+ uxtb16 r7, r7, ror #8
+ orr r6, r6, r7, lsl #8 /* recombine: r6 = scaled dest */
+ uqadd8 r5, r6, r5 /* saturating per-byte add of scaled dest and scaled src */
+ str r5, [r10], #4 /* store result, advance dest */
+ subs r12, r12, #1 /* w-- */
+ bne 4b /* next pixel */
+3: ldr r4, [sp, #12] /* height */
+ add r0, r0, #1 /* one more row done */
+ cmp r0, r4
+ ldr r12, [sp, #8] /* r12 = next dst_line */
+ ldr r2, [sp] /* r2 = next mask_line */
+ bne 5b /* more rows */
+0: add sp, sp, #28 /* release the spill slots */
+ pop {r4, r5, r6, r7, r8, r9, r10, r11} /* restore the saved registers */
+ bx lr /* return */
+.endfunc
diff --git a/pixman/pixman/pixman-arm-simd.c b/pixman/pixman/pixman-arm-simd.c
index 389c9e01a..76a7ffeab 100644
--- a/pixman/pixman/pixman-arm-simd.c
+++ b/pixman/pixman/pixman-arm-simd.c
@@ -1,417 +1,417 @@
-/*
- * Copyright © 2008 Mozilla Corporation
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Mozilla Corporation not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission. Mozilla Corporation makes no
- * representations about the suitability of this software for any purpose. It
- * is provided "as is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- *
- * Author: Jeff Muizelaar (jeff@infidigm.net)
- *
- */
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#include "pixman-private.h"
-#include "pixman-arm-common.h"
-
-#if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */
-
-void
-pixman_composite_add_8000_8000_asm_armv6 (int32_t width,
- int32_t height,
- uint8_t *dst_line,
- int32_t dst_stride,
- uint8_t *src_line,
- int32_t src_stride)
-{
- uint8_t *dst, *src;
- int32_t w;
- uint8_t s, d;
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- /* ensure both src and dst are properly aligned before doing 32 bit reads
- * we'll stay in this loop if src and dst have differing alignments
- */
- while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
- {
- s = *src;
- d = *dst;
- asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
- *dst = d;
-
- dst++;
- src++;
- w--;
- }
-
- while (w >= 4)
- {
- asm ("uqadd8 %0, %1, %2"
- : "=r" (*(uint32_t*)dst)
- : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
- dst += 4;
- src += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *src;
- d = *dst;
- asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
- *dst = d;
-
- dst++;
- src++;
- w--;
- }
- }
-
-}
-
-void
-pixman_composite_over_8888_8888_asm_armv6 (int32_t width,
- int32_t height,
- uint32_t *dst_line,
- int32_t dst_stride,
- uint32_t *src_line,
- int32_t src_stride)
-{
- uint32_t *dst;
- uint32_t *src;
- int32_t w;
- uint32_t component_half = 0x800080;
- uint32_t upper_component_mask = 0xff00ff00;
- uint32_t alpha_mask = 0xff;
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
-/* #define inner_branch */
- asm volatile (
- "cmp %[w], #0\n\t"
- "beq 2f\n\t"
- "1:\n\t"
- /* load src */
- "ldr r5, [%[src]], #4\n\t"
-#ifdef inner_branch
- /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
- * The 0x0 case also allows us to avoid doing an unecessary data
- * write which is more valuable so we only check for that
- */
- "cmp r5, #0\n\t"
- "beq 3f\n\t"
-
- /* = 255 - alpha */
- "sub r8, %[alpha_mask], r5, lsr #24\n\t"
-
- "ldr r4, [%[dest]] \n\t"
-
-#else
- "ldr r4, [%[dest]] \n\t"
-
- /* = 255 - alpha */
- "sub r8, %[alpha_mask], r5, lsr #24\n\t"
-#endif
- "uxtb16 r6, r4\n\t"
- "uxtb16 r7, r4, ror #8\n\t"
-
- /* multiply by 257 and divide by 65536 */
- "mla r6, r6, r8, %[component_half]\n\t"
- "mla r7, r7, r8, %[component_half]\n\t"
-
- "uxtab16 r6, r6, r6, ror #8\n\t"
- "uxtab16 r7, r7, r7, ror #8\n\t"
-
- /* recombine the 0xff00ff00 bytes of r6 and r7 */
- "and r7, r7, %[upper_component_mask]\n\t"
- "uxtab16 r6, r7, r6, ror #8\n\t"
-
- "uqadd8 r5, r6, r5\n\t"
-
-#ifdef inner_branch
- "3:\n\t"
-
-#endif
- "str r5, [%[dest]], #4\n\t"
- /* increment counter and jmp to top */
- "subs %[w], %[w], #1\n\t"
- "bne 1b\n\t"
- "2:\n\t"
- : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
- : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
- [alpha_mask] "r" (alpha_mask)
- : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
- );
- }
-}
-
-void
-pixman_composite_over_8888_n_8888_asm_armv6 (int32_t width,
- int32_t height,
- uint32_t *dst_line,
- int32_t dst_stride,
- uint32_t *src_line,
- int32_t src_stride,
- uint32_t mask)
-{
- uint32_t *dst;
- uint32_t *src;
- int32_t w;
- uint32_t component_half = 0x800080;
- uint32_t alpha_mask = 0xff;
-
- mask = (mask) >> 24;
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
-/* #define inner_branch */
- asm volatile (
- "cmp %[w], #0\n\t"
- "beq 2f\n\t"
- "1:\n\t"
- /* load src */
- "ldr r5, [%[src]], #4\n\t"
-#ifdef inner_branch
- /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
- * The 0x0 case also allows us to avoid doing an unecessary data
- * write which is more valuable so we only check for that
- */
- "cmp r5, #0\n\t"
- "beq 3f\n\t"
-
-#endif
- "ldr r4, [%[dest]] \n\t"
-
- "uxtb16 r6, r5\n\t"
- "uxtb16 r7, r5, ror #8\n\t"
-
- /* multiply by alpha (r8) then by 257 and divide by 65536 */
- "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
- "mla r7, r7, %[mask_alpha], %[component_half]\n\t"
-
- "uxtab16 r6, r6, r6, ror #8\n\t"
- "uxtab16 r7, r7, r7, ror #8\n\t"
-
- "uxtb16 r6, r6, ror #8\n\t"
- "uxtb16 r7, r7, ror #8\n\t"
-
- /* recombine */
- "orr r5, r6, r7, lsl #8\n\t"
-
- "uxtb16 r6, r4\n\t"
- "uxtb16 r7, r4, ror #8\n\t"
-
- /* 255 - alpha */
- "sub r8, %[alpha_mask], r5, lsr #24\n\t"
-
- /* multiply by alpha (r8) then by 257 and divide by 65536 */
- "mla r6, r6, r8, %[component_half]\n\t"
- "mla r7, r7, r8, %[component_half]\n\t"
-
- "uxtab16 r6, r6, r6, ror #8\n\t"
- "uxtab16 r7, r7, r7, ror #8\n\t"
-
- "uxtb16 r6, r6, ror #8\n\t"
- "uxtb16 r7, r7, ror #8\n\t"
-
- /* recombine */
- "orr r6, r6, r7, lsl #8\n\t"
-
- "uqadd8 r5, r6, r5\n\t"
-
-#ifdef inner_branch
- "3:\n\t"
-
-#endif
- "str r5, [%[dest]], #4\n\t"
- /* increment counter and jmp to top */
- "subs %[w], %[w], #1\n\t"
- "bne 1b\n\t"
- "2:\n\t"
- : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
- : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
- [alpha_mask] "r" (alpha_mask)
- : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
- );
- }
-}
-
-void
-pixman_composite_over_n_8_8888_asm_armv6 (int32_t width,
- int32_t height,
- uint32_t *dst_line,
- int32_t dst_stride,
- uint32_t src,
- int32_t unused,
- uint8_t *mask_line,
- int32_t mask_stride)
-{
- uint32_t srca;
- uint32_t *dst;
- uint8_t *mask;
- int32_t w;
-
- srca = src >> 24;
-
- uint32_t component_mask = 0xff00ff;
- uint32_t component_half = 0x800080;
-
- uint32_t src_hi = (src >> 8) & component_mask;
- uint32_t src_lo = src & component_mask;
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
-/* #define inner_branch */
- asm volatile (
- "cmp %[w], #0\n\t"
- "beq 2f\n\t"
- "1:\n\t"
- /* load mask */
- "ldrb r5, [%[mask]], #1\n\t"
-#ifdef inner_branch
- /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
- * The 0x0 case also allows us to avoid doing an unecessary data
- * write which is more valuable so we only check for that
- */
- "cmp r5, #0\n\t"
- "beq 3f\n\t"
-
-#endif
- "ldr r4, [%[dest]] \n\t"
-
- /* multiply by alpha (r8) then by 257 and divide by 65536 */
- "mla r6, %[src_lo], r5, %[component_half]\n\t"
- "mla r7, %[src_hi], r5, %[component_half]\n\t"
-
- "uxtab16 r6, r6, r6, ror #8\n\t"
- "uxtab16 r7, r7, r7, ror #8\n\t"
-
- "uxtb16 r6, r6, ror #8\n\t"
- "uxtb16 r7, r7, ror #8\n\t"
-
- /* recombine */
- "orr r5, r6, r7, lsl #8\n\t"
-
- "uxtb16 r6, r4\n\t"
- "uxtb16 r7, r4, ror #8\n\t"
-
- /* we could simplify this to use 'sub' if we were
- * willing to give up a register for alpha_mask
- */
- "mvn r8, r5\n\t"
- "mov r8, r8, lsr #24\n\t"
-
- /* multiply by alpha (r8) then by 257 and divide by 65536 */
- "mla r6, r6, r8, %[component_half]\n\t"
- "mla r7, r7, r8, %[component_half]\n\t"
-
- "uxtab16 r6, r6, r6, ror #8\n\t"
- "uxtab16 r7, r7, r7, ror #8\n\t"
-
- "uxtb16 r6, r6, ror #8\n\t"
- "uxtb16 r7, r7, ror #8\n\t"
-
- /* recombine */
- "orr r6, r6, r7, lsl #8\n\t"
-
- "uqadd8 r5, r6, r5\n\t"
-
-#ifdef inner_branch
- "3:\n\t"
-
-#endif
- "str r5, [%[dest]], #4\n\t"
- /* increment counter and jmp to top */
- "subs %[w], %[w], #1\n\t"
- "bne 1b\n\t"
- "2:\n\t"
- : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)
- : [component_half] "r" (component_half),
- [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
- : "r4", "r5", "r6", "r7", "r8", "cc", "memory");
- }
-}
-
-#endif
-
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8000_8000,
- uint8_t, 1, uint8_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
- uint32_t, 1, uint32_t, 1)
-
-PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (armv6, over_8888_n_8888,
- uint32_t, 1, uint32_t, 1)
-
-PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (armv6, over_n_8_8888,
- uint8_t, 1, uint32_t, 1)
-
-static const pixman_fast_path_t arm_simd_fast_paths[] =
-{
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, armv6_composite_over_8888_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, armv6_composite_over_8888_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, armv6_composite_over_8888_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888),
- PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888),
-
- PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8000_8000),
-
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, armv6_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, armv6_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
- PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
-
- { PIXMAN_OP_NONE },
-};
-
-pixman_implementation_t *
-_pixman_implementation_create_arm_simd (void)
-{
- pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
- pixman_implementation_t *imp = _pixman_implementation_create (general, arm_simd_fast_paths);
-
- return imp;
-}
+/*
+ * Copyright © 2008 Mozilla Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Jeff Muizelaar (jeff@infidigm.net)
+ *
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "pixman-private.h"
+#include "pixman-arm-common.h"
+
+#if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */
+
+void
+pixman_composite_add_8_8_asm_armv6 (int32_t width, /* bytes to process in each row */
+ int32_t height, /* number of rows */
+ uint8_t *dst_line, /* first destination row; read and written */
+ int32_t dst_stride, /* added to dst_line after each row, in bytes */
+ uint8_t *src_line, /* first source row; only read */
+ int32_t src_stride) /* added to src_line after each row, in bytes */
+{ /* Reference C: for every byte, dst = saturating_add (dst, src), using the ARMv6 uqadd8 instruction. */
+ uint8_t *dst, *src;
+ int32_t w;
+ uint8_t s, d;
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ /* Go byte by byte until both src and dst are 4-byte aligned, so that the
+ * 32-bit accesses below are aligned. If src and dst have differing
+ * alignments this loop never exits early and handles the whole row. */
+ while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
+ {
+ s = *src;
+ d = *dst;
+ asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s)); /* NOTE(review): the template names %2 but only two operands are listed; presumably relies on the "+r" operand also counting as an input - confirm */
+ *dst = d;
+
+ dst++;
+ src++;
+ w--;
+ }
+
+ while (w >= 4) /* aligned middle part: four bytes per uqadd8 */
+ {
+ asm ("uqadd8 %0, %1, %2"
+ : "=r" (*(uint32_t*)dst)
+ : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
+ dst += 4;
+ src += 4;
+ w -= 4;
+ }
+
+ while (w) /* at most three leftover bytes after the word loop */
+ {
+ s = *src;
+ d = *dst;
+ asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s)); /* same byte-wise saturating add as in the lead-in loop */
+ *dst = d;
+
+ dst++;
+ src++;
+ w--;
+ }
+ }
+
+}
+
+/*
+ * OVER for 32-bit pixels whose alpha is in the top byte:
+ * every channel of dst is scaled by (255 - source alpha), rounded and
+ * divided by 255, then the source pixel is added with byte-wise saturation.
+ *
+ * width, height - rectangle size in pixels
+ * dst_line      - first destination scanline (read and written)
+ * dst_stride    - step between destination scanlines, in uint32_t units
+ * src_line      - first source scanline (only read)
+ * src_stride    - step between source scanlines, in uint32_t units
+ */
+void
+pixman_composite_over_8888_8888_asm_armv6 (int32_t   width,
+                                           int32_t   height,
+                                           uint32_t *dst_line,
+                                           int32_t   dst_stride,
+                                           uint32_t *src_line,
+                                           int32_t   src_stride)
+{
+    uint32_t *dst;
+    uint32_t *src;
+    int32_t w;
+    uint32_t component_half = 0x800080;
+    uint32_t upper_component_mask = 0xff00ff00;
+    uint32_t alpha_mask = 0xff;
+
+    while (height--)
+    {
+        dst = dst_line;
+        dst_line += dst_stride;
+        src = src_line;
+        src_line += src_stride;
+        w = width;
+
+/* #define inner_branch */
+        asm volatile (
+            "cmp %[w], #0\n\t"
+            "beq 2f\n\t"
+            "1:\n\t"
+            /* load src */
+            "ldr r5, [%[src]], #4\n\t"
+#ifdef inner_branch
+            /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
+             * The 0x0 case also allows us to avoid doing an unnecessary data
+             * write which is more valuable so we only check for that
+             */
+            "cmp r5, #0\n\t"
+            "beq 3f\n\t"
+
+            /* = 255 - alpha */
+            "sub r8, %[alpha_mask], r5, lsr #24\n\t"
+
+            "ldr r4, [%[dest]] \n\t"
+
+#else
+            "ldr r4, [%[dest]] \n\t"
+
+            /* = 255 - alpha */
+            "sub r8, %[alpha_mask], r5, lsr #24\n\t"
+#endif
+            /* r6 = dest bytes 0 and 2, r7 = dest bytes 1 and 3 */
+            "uxtb16 r6, r4\n\t"
+            "uxtb16 r7, r4, ror #8\n\t"
+
+            /* multiply by 257 and divide by 65536 */
+            "mla r6, r6, r8, %[component_half]\n\t"
+            "mla r7, r7, r8, %[component_half]\n\t"
+
+            "uxtab16 r6, r6, r6, ror #8\n\t"
+            "uxtab16 r7, r7, r7, ror #8\n\t"
+
+            /* recombine the 0xff00ff00 bytes of r6 and r7 */
+            "and r7, r7, %[upper_component_mask]\n\t"
+            "uxtab16 r6, r7, r6, ror #8\n\t"
+
+            "uqadd8 r5, r6, r5\n\t"
+
+#ifdef inner_branch
+            /* store without write-back so that the transparent-source
+             * shortcut (label 3) skips the write and only advances dest,
+             * leaving the destination pixel unchanged
+             */
+            "str r5, [%[dest]]\n\t"
+            "3:\n\t"
+            "add %[dest], %[dest], #4\n\t"
+#else
+            "str r5, [%[dest]], #4\n\t"
+#endif
+            /* increment counter and jmp to top */
+            "subs %[w], %[w], #1\n\t"
+            "bne 1b\n\t"
+            "2:\n\t"
+            : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
+            : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
+              [alpha_mask] "r" (alpha_mask)
+            : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
+            );
+    }
+}
+
+/*
+ * OVER with a constant mask for 32-bit pixels whose alpha is in the top
+ * byte: every source pixel is first scaled by the mask alpha (top byte of
+ * 'mask'), then composited over the destination as in the unmasked case.
+ *
+ * width, height - rectangle size in pixels
+ * dst_line      - first destination scanline (read and written)
+ * dst_stride    - step between destination scanlines, in uint32_t units
+ * src_line      - first source scanline (only read)
+ * src_stride    - step between source scanlines, in uint32_t units
+ * mask          - constant mask pixel; only bits 24-31 are used
+ */
+void
+pixman_composite_over_8888_n_8888_asm_armv6 (int32_t   width,
+                                             int32_t   height,
+                                             uint32_t *dst_line,
+                                             int32_t   dst_stride,
+                                             uint32_t *src_line,
+                                             int32_t   src_stride,
+                                             uint32_t  mask)
+{
+    uint32_t *dst;
+    uint32_t *src;
+    int32_t w;
+    uint32_t component_half = 0x800080;
+    uint32_t alpha_mask = 0xff;
+
+    /* keep only the alpha byte of the mask */
+    mask = (mask) >> 24;
+
+    while (height--)
+    {
+        dst = dst_line;
+        dst_line += dst_stride;
+        src = src_line;
+        src_line += src_stride;
+        w = width;
+
+/* #define inner_branch */
+        asm volatile (
+            "cmp %[w], #0\n\t"
+            "beq 2f\n\t"
+            "1:\n\t"
+            /* load src */
+            "ldr r5, [%[src]], #4\n\t"
+#ifdef inner_branch
+            /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
+             * The 0x0 case also allows us to avoid doing an unnecessary data
+             * write which is more valuable so we only check for that
+             */
+            "cmp r5, #0\n\t"
+            "beq 3f\n\t"
+
+#endif
+            "ldr r4, [%[dest]] \n\t"
+
+            "uxtb16 r6, r5\n\t"
+            "uxtb16 r7, r5, ror #8\n\t"
+
+            /* multiply src by the mask alpha then by 257 and divide by 65536 */
+            "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
+            "mla r7, r7, %[mask_alpha], %[component_half]\n\t"
+
+            "uxtab16 r6, r6, r6, ror #8\n\t"
+            "uxtab16 r7, r7, r7, ror #8\n\t"
+
+            "uxtb16 r6, r6, ror #8\n\t"
+            "uxtb16 r7, r7, ror #8\n\t"
+
+            /* recombine */
+            "orr r5, r6, r7, lsl #8\n\t"
+
+            "uxtb16 r6, r4\n\t"
+            "uxtb16 r7, r4, ror #8\n\t"
+
+            /* 255 - alpha */
+            "sub r8, %[alpha_mask], r5, lsr #24\n\t"
+
+            /* multiply dest by (255 - alpha) (r8) then by 257 and divide by 65536 */
+            "mla r6, r6, r8, %[component_half]\n\t"
+            "mla r7, r7, r8, %[component_half]\n\t"
+
+            "uxtab16 r6, r6, r6, ror #8\n\t"
+            "uxtab16 r7, r7, r7, ror #8\n\t"
+
+            "uxtb16 r6, r6, ror #8\n\t"
+            "uxtb16 r7, r7, ror #8\n\t"
+
+            /* recombine */
+            "orr r6, r6, r7, lsl #8\n\t"
+
+            "uqadd8 r5, r6, r5\n\t"
+
+#ifdef inner_branch
+            /* store without write-back so that the transparent-source
+             * shortcut (label 3) skips the write and only advances dest,
+             * leaving the destination pixel unchanged
+             */
+            "str r5, [%[dest]]\n\t"
+            "3:\n\t"
+            "add %[dest], %[dest], #4\n\t"
+#else
+            "str r5, [%[dest]], #4\n\t"
+#endif
+            /* increment counter and jmp to top */
+            "subs %[w], %[w], #1\n\t"
+            "bne 1b\n\t"
+            "2:\n\t"
+            : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
+            : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
+              [alpha_mask] "r" (alpha_mask)
+            /* only r4-r8 are written by the code above */
+            : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
+            );
+    }
+}
+
+/*
+ * Constant source OVER a 32-bit destination through an 8-bit mask: for
+ * each pixel the constant 'src' is scaled by the mask byte, then composited
+ * over the destination (dest scaled by 255 minus the top byte of the scaled
+ * source, followed by a byte-wise saturating add).
+ *
+ * width, height - rectangle size in pixels
+ * dst_line      - first destination scanline (read and written)
+ * dst_stride    - step between destination scanlines, in uint32_t units
+ * src           - constant source pixel
+ * unused        - ignored
+ * mask_line     - first mask scanline, one byte per pixel (only read)
+ * mask_stride   - step between mask scanlines, in bytes
+ */
+void
+pixman_composite_over_n_8_8888_asm_armv6 (int32_t   width,
+                                          int32_t   height,
+                                          uint32_t *dst_line,
+                                          int32_t   dst_stride,
+                                          uint32_t  src,
+                                          int32_t   unused,
+                                          uint8_t  *mask_line,
+                                          int32_t   mask_stride)
+{
+    uint32_t *dst;
+    uint8_t *mask;
+    int32_t w;
+
+    uint32_t component_mask = 0xff00ff;
+    uint32_t component_half = 0x800080;
+
+    /* source bytes 1/3 and 0/2, each pair spread over two 16-bit lanes */
+    uint32_t src_hi = (src >> 8) & component_mask;
+    uint32_t src_lo = src & component_mask;
+
+    while (height--)
+    {
+        dst = dst_line;
+        dst_line += dst_stride;
+        mask = mask_line;
+        mask_line += mask_stride;
+        w = width;
+
+/* #define inner_branch */
+        asm volatile (
+            "cmp %[w], #0\n\t"
+            "beq 2f\n\t"
+            "1:\n\t"
+            /* load mask */
+            "ldrb r5, [%[mask]], #1\n\t"
+#ifdef inner_branch
+            /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
+             * The 0x0 case also allows us to avoid doing an unnecessary data
+             * write which is more valuable so we only check for that
+             */
+            "cmp r5, #0\n\t"
+            "beq 3f\n\t"
+
+#endif
+            "ldr r4, [%[dest]] \n\t"
+
+            /* multiply src by the mask (r5) then by 257 and divide by 65536 */
+            "mla r6, %[src_lo], r5, %[component_half]\n\t"
+            "mla r7, %[src_hi], r5, %[component_half]\n\t"
+
+            "uxtab16 r6, r6, r6, ror #8\n\t"
+            "uxtab16 r7, r7, r7, ror #8\n\t"
+
+            "uxtb16 r6, r6, ror #8\n\t"
+            "uxtb16 r7, r7, ror #8\n\t"
+
+            /* recombine */
+            "orr r5, r6, r7, lsl #8\n\t"
+
+            "uxtb16 r6, r4\n\t"
+            "uxtb16 r7, r4, ror #8\n\t"
+
+            /* we could simplify this to use 'sub' if we were
+             * willing to give up a register for alpha_mask
+             */
+            "mvn r8, r5\n\t"
+            "mov r8, r8, lsr #24\n\t"
+
+            /* multiply dest by (255 - alpha) (r8) then by 257 and divide by 65536 */
+            "mla r6, r6, r8, %[component_half]\n\t"
+            "mla r7, r7, r8, %[component_half]\n\t"
+
+            "uxtab16 r6, r6, r6, ror #8\n\t"
+            "uxtab16 r7, r7, r7, ror #8\n\t"
+
+            "uxtb16 r6, r6, ror #8\n\t"
+            "uxtb16 r7, r7, ror #8\n\t"
+
+            /* recombine */
+            "orr r6, r6, r7, lsl #8\n\t"
+
+            "uqadd8 r5, r6, r5\n\t"
+
+#ifdef inner_branch
+            /* store without write-back so that the zero-mask shortcut
+             * (label 3) skips the write and only advances dest, leaving
+             * the destination pixel unchanged
+             */
+            "str r5, [%[dest]]\n\t"
+            "3:\n\t"
+            "add %[dest], %[dest], #4\n\t"
+#else
+            "str r5, [%[dest]], #4\n\t"
+#endif
+            /* increment counter and jmp to top */
+            "subs %[w], %[w], #1\n\t"
+            "bne 1b\n\t"
+            "2:\n\t"
+            /* the constant source reaches the code only via src_hi/src_lo */
+            : [w] "+r" (w), [dest] "+r" (dst), [mask] "+r" (mask)
+            : [component_half] "r" (component_half),
+              [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
+            : "r4", "r5", "r6", "r7", "r8", "cc", "memory");
+    }
+}
+
+#endif
+/* Bind the four ARMv6 routines to pixman composite entry points.
+ * NOTE(review): the PIXMAN_ARM_BIND_FAST_PATH_* macros are not defined in
+ * this file (presumably they come from pixman-arm-common.h).  They are
+ * assumed to generate the armv6_composite_* functions referenced by the
+ * arm_simd_fast_paths table below -- confirm against the macro definitions,
+ * including the meaning of the type/count argument pairs.
+ */
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
+ uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
+ uint32_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (armv6, over_8888_n_8888,
+ uint32_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (armv6, over_n_8_8888,
+ uint8_t, 1, uint32_t, 1)
+/* Fast-path table handed to _pixman_implementation_create () by
+ * _pixman_implementation_create_arm_simd ().  The entry whose first field
+ * is PIXMAN_OP_NONE ends the list.
+ * NOTE(review): the PIXMAN_STD_FAST_PATH arguments appear to be
+ * (operator, source format, mask format, destination format, function),
+ * judging by the routine names -- confirm against the macro definition.
+ */
+static const pixman_fast_path_t arm_simd_fast_paths[] =
+{
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, armv6_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, armv6_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, armv6_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888),
+
+ PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8_8),
+
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, armv6_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, armv6_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
+
+ { PIXMAN_OP_NONE },
+};
+
+/*
+ * Create the ARM SIMD implementation: a new implementation built from the
+ * arm_simd_fast_paths table on top of the implementation returned by
+ * _pixman_implementation_create_fast_path ().
+ */
+pixman_implementation_t *
+_pixman_implementation_create_arm_simd (void)
+{
+    pixman_implementation_t *fast_path_imp;
+
+    fast_path_imp = _pixman_implementation_create_fast_path ();
+
+    return _pixman_implementation_create (fast_path_imp, arm_simd_fast_paths);
+}
diff --git a/pixman/pixman/pixman-fast-path.c b/pixman/pixman/pixman-fast-path.c
index 0b8a2526e..25ef9243b 100644
--- a/pixman/pixman/pixman-fast-path.c
+++ b/pixman/pixman/pixman-fast-path.c
@@ -910,19 +910,19 @@ fast_composite_src_x888_0565 (pixman_implementation_t *imp,
}
static void
-fast_composite_add_8000_8000 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
+fast_composite_add_8_8 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
{
uint8_t *dst_line, *dst;
uint8_t *src_line, *src;
@@ -1602,7 +1602,7 @@ static const pixman_fast_path_t c_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565),
PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888),
PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888),
- PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8000_8000),
+ PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8),
PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1000_1000),
PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca),
PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8),
diff --git a/pixman/pixman/pixman-mmx.c b/pixman/pixman/pixman-mmx.c
index c4e7fb9fb..0b98282cb 100644
--- a/pixman/pixman/pixman-mmx.c
+++ b/pixman/pixman/pixman-mmx.c
@@ -152,6 +152,9 @@ to_m64 (uint64_t x)
#endif
}
+#ifdef _MSC_VER
+/* Macro form used when _MSC_VER is defined: reads the M64_MEMBER field of
+ * the argument.  The argument and the whole expansion are parenthesised so
+ * that any expression can be passed and the result used in any context.
+ */
+#define to_uint64(arg) ((arg).M64_MEMBER)
+#else
static force_inline uint64_t
to_uint64 (__m64 x)
{
@@ -164,6 +167,7 @@ to_uint64 (__m64 x)
return (uint64_t)x;
#endif
}
+#endif
static force_inline __m64
shift (__m64 v,
@@ -310,11 +314,15 @@ pack8888 (__m64 lo, __m64 hi)
return _mm_packs_pu16 (lo, hi);
}
+#ifdef _MSC_VER
+#define store8888(v) _mm_cvtsi64_si32 (pack8888 (v, _mm_setzero_si64 ()))
+#else
static force_inline uint32_t
store8888 (__m64 v)
{
return _mm_cvtsi64_si32 (pack8888 (v, _mm_setzero_si64 ()));
}
+#endif
/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
*
@@ -417,6 +425,13 @@ pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
/* --------------- MMX code patch for fbcompose.c --------------------- */
+#ifdef _MSC_VER
+/* Macro form of combine () used when _MSC_VER is defined: yields *src when
+ * mask is a null pointer, otherwise *src multiplied (pix_multiply) by
+ * expand_alpha of the pixel at *mask.  Both arguments are evaluated more
+ * than once, so they must not have side effects.
+ */
+#define combine(src, mask) \
+    ((mask) ? \
+     store8888 (pix_multiply (load8888 (*(src)), expand_alpha (load8888 (*(mask))))) \
+     : \
+     *(src))
+#else
static force_inline uint32_t
combine (const uint32_t *src, const uint32_t *mask)
{
@@ -435,6 +450,7 @@ combine (const uint32_t *src, const uint32_t *mask)
return ssrc;
}
+#endif
static void
mmx_combine_over_u (pixman_implementation_t *imp,
@@ -448,7 +464,7 @@ mmx_combine_over_u (pixman_implementation_t *imp,
while (dest < end)
{
- uint32_t ssrc = combine (src, mask);
+ uint32_t ssrc = combine( src, mask);
uint32_t a = ssrc >> 24;
if (a == 0xff)
@@ -2845,19 +2861,19 @@ mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
}
static void
-mmx_composite_add_8000_8000 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
+mmx_composite_add_8_8 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
{
uint8_t *dst_line, *dst;
uint8_t *src_line, *src;
@@ -3268,7 +3284,7 @@ static const pixman_fast_path_t mmx_fast_paths[] =
PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mmx_composite_add_8888_8888 ),
PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, mmx_composite_add_8888_8888 ),
- PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8000_8000 ),
+ PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8_8 ),
PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mmx_composite_add_n_8_8 ),
PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, mmx_composite_src_n_8_8888 ),
diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c
index 8e175b78d..9d9b16543 100644
--- a/pixman/pixman/pixman-sse2.c
+++ b/pixman/pixman/pixman-sse2.c
@@ -357,34 +357,6 @@ in_over_2x128 (__m128i* src_lo,
over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}
-static force_inline void
-cache_prefetch (__m128i* addr)
-{
- _mm_prefetch ((void const*)addr, _MM_HINT_T0);
-}
-
-static force_inline void
-cache_prefetch_next (__m128i* addr)
-{
- _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
-}
-
-/* prefetching NULL is very slow on some systems. don't do that. */
-
-static force_inline void
-maybe_prefetch (__m128i* addr)
-{
- if (addr)
- cache_prefetch (addr);
-}
-
-static force_inline void
-maybe_prefetch_next (__m128i* addr)
-{
- if (addr)
- cache_prefetch_next (addr);
-}
-
/* load 4 pixels from a 16-byte boundary aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
@@ -649,11 +621,6 @@ core_combine_over_u_sse2 (uint32_t* pd,
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_alpha_lo, xmm_alpha_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
/* Align dst on a 16-byte boundary */
while (w && ((unsigned long)pd & 15))
{
@@ -667,18 +634,8 @@ core_combine_over_u_sse2 (uint32_t* pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
/* I'm loading unaligned because I'm not sure about
* the address alignment.
*/
@@ -740,11 +697,6 @@ core_combine_over_reverse_u_sse2 (uint32_t* pd,
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_alpha_lo, xmm_alpha_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
/* Align dst on a 16-byte boundary */
while (w &&
((unsigned long)pd & 15))
@@ -759,18 +711,8 @@ core_combine_over_reverse_u_sse2 (uint32_t* pd,
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
/* I'm loading unaligned because I'm not sure
* about the address alignment.
*/
@@ -842,11 +784,6 @@ core_combine_in_u_sse2 (uint32_t* pd,
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
s = combine1 (ps, pm);
@@ -859,18 +796,8 @@ core_combine_in_u_sse2 (uint32_t* pd,
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
@@ -916,11 +843,6 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd,
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
s = combine1 (ps, pm);
@@ -933,18 +855,8 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd,
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
@@ -985,11 +897,6 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd,
const uint32_t* pm,
int w)
{
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
uint32_t s = combine1 (ps, pm);
@@ -1006,21 +913,11 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1067,11 +964,6 @@ core_combine_out_u_sse2 (uint32_t* pd,
const uint32_t* pm,
int w)
{
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
uint32_t s = combine1 (ps, pm);
@@ -1087,21 +979,11 @@ core_combine_out_u_sse2 (uint32_t* pd,
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1167,11 +1049,6 @@ core_combine_atop_u_sse2 (uint32_t* pd,
__m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
s = combine1 (ps, pm);
@@ -1184,18 +1061,8 @@ core_combine_atop_u_sse2 (uint32_t* pd,
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1264,11 +1131,6 @@ core_combine_reverse_atop_u_sse2 (uint32_t* pd,
__m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
s = combine1 (ps, pm);
@@ -1281,18 +1143,8 @@ core_combine_reverse_atop_u_sse2 (uint32_t* pd,
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1365,11 +1217,6 @@ core_combine_xor_u_sse2 (uint32_t* dst,
__m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
s = combine1 (ps, pm);
@@ -1382,18 +1229,8 @@ core_combine_xor_u_sse2 (uint32_t* dst,
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
xmm_dst = load_128_aligned ((__m128i*) pd);
@@ -1450,11 +1287,6 @@ core_combine_add_u_sse2 (uint32_t* dst,
const uint32_t* ps = src;
const uint32_t* pm = mask;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = combine1 (ps, pm);
@@ -1468,20 +1300,10 @@ core_combine_add_u_sse2 (uint32_t* dst,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
__m128i s;
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
s = combine4 ((__m128i*)ps, (__m128i*)pm);
save_128_aligned (
@@ -1536,11 +1358,6 @@ core_combine_saturate_u_sse2 (uint32_t * pd,
uint32_t pack_cmp;
__m128i xmm_src, xmm_dst;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = combine1 (ps, pm);
@@ -1553,18 +1370,8 @@ core_combine_saturate_u_sse2 (uint32_t * pd,
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_dst = load_128_aligned ((__m128i*)pd);
xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
@@ -1637,11 +1444,6 @@ core_combine_src_ca_sse2 (uint32_t* pd,
__m128i xmm_mask_lo, xmm_mask_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -1651,18 +1453,8 @@ core_combine_src_ca_sse2 (uint32_t* pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1718,11 +1510,6 @@ core_combine_over_ca_sse2 (uint32_t* pd,
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -1733,18 +1520,8 @@ core_combine_over_ca_sse2 (uint32_t* pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1807,11 +1584,6 @@ core_combine_over_reverse_ca_sse2 (uint32_t* pd,
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -1822,18 +1594,8 @@ core_combine_over_reverse_ca_sse2 (uint32_t* pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1885,11 +1647,6 @@ core_combine_in_ca_sse2 (uint32_t * pd,
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -1904,18 +1661,8 @@ core_combine_in_ca_sse2 (uint32_t * pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1973,11 +1720,6 @@ core_combine_in_reverse_ca_sse2 (uint32_t * pd,
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -1992,18 +1734,8 @@ core_combine_in_reverse_ca_sse2 (uint32_t * pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2059,11 +1791,6 @@ core_combine_out_ca_sse2 (uint32_t * pd,
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -2078,18 +1805,8 @@ core_combine_out_ca_sse2 (uint32_t * pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2148,11 +1865,6 @@ core_combine_out_reverse_ca_sse2 (uint32_t * pd,
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -2168,18 +1880,8 @@ core_combine_out_reverse_ca_sse2 (uint32_t * pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2258,11 +1960,6 @@ core_combine_atop_ca_sse2 (uint32_t * pd,
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -2273,18 +1970,8 @@ core_combine_atop_ca_sse2 (uint32_t * pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2364,11 +2051,6 @@ core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -2379,18 +2061,8 @@ core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2473,11 +2145,6 @@ core_combine_xor_ca_sse2 (uint32_t * pd,
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -2488,18 +2155,8 @@ core_combine_xor_ca_sse2 (uint32_t * pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2562,11 +2219,6 @@ core_combine_add_ca_sse2 (uint32_t * pd,
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -2580,18 +2232,8 @@ core_combine_add_ca_sse2 (uint32_t * pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
@@ -2971,9 +2613,6 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
{
dst = dst_line;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
dst_line += dst_stride;
w = width;
@@ -2986,13 +2625,8 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
w--;
}
- cache_prefetch ((__m128i*)dst);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)dst);
-
xmm_dst = load_128_aligned ((__m128i*)dst);
unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -3062,9 +2696,6 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
{
dst = dst_line;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
dst_line += dst_stride;
w = width;
@@ -3079,14 +2710,8 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
while (w >= 8)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)dst);
-
xmm_dst = load_128_aligned ((__m128i*)dst);
unpack_565_128_4x128 (xmm_dst,
@@ -3177,10 +2802,6 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
dst_line += dst_stride;
mask_line += mask_stride;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
m = *pm++;
@@ -3200,16 +2821,8 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_mask = load_128_unaligned ((__m128i*)pm);
pack_cmp =
@@ -3316,10 +2929,6 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
dst_line += dst_stride;
mask_line += mask_stride;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
m = *pm++;
@@ -3340,16 +2949,8 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_mask = load_128_unaligned ((__m128i*)pm);
pack_cmp =
@@ -3447,10 +3048,6 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
src_line += src_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
- cache_prefetch ((__m128i*)src);
-
while (w && (unsigned long)dst & 15)
{
uint32_t s = *src++;
@@ -3467,16 +3064,8 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
- cache_prefetch ((__m128i*)src);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)dst);
- cache_prefetch_next ((__m128i*)src);
-
xmm_src = load_128_unaligned ((__m128i*)src);
xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -3556,25 +3145,16 @@ sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
src_line += src_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
-
while (w && (unsigned long)dst & 15)
{
*dst++ = *src++ | 0xff000000;
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
-
while (w >= 16)
{
__m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)src);
-
xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
@@ -3646,10 +3226,6 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
src_line += src_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
- cache_prefetch ((__m128i*)src);
-
while (w && (unsigned long)dst & 15)
{
uint32_t s = (*src++) | 0xff000000;
@@ -3666,16 +3242,8 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
- cache_prefetch ((__m128i*)src);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)dst);
- cache_prefetch_next ((__m128i*)src);
-
xmm_src = _mm_or_si128 (
load_128_unaligned ((__m128i*)src), mask_ff000000);
xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -3815,10 +3383,6 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
dst = dst_line;
src = src_line;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
dst_line += dst_stride;
src_line += src_stride;
w = width;
@@ -3834,17 +3398,9 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
/* It's a 8 pixel loop */
while (w >= 8)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)src);
- cache_prefetch_next ((__m128i*)dst);
-
/* I'm loading unaligned because I'm not sure
* about the address alignment.
*/
@@ -3954,10 +3510,6 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
mask_line += mask_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w && (unsigned long)dst & 15)
{
uint8_t m = *mask++;
@@ -3978,16 +3530,8 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
dst++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)mask);
- cache_prefetch_next ((__m128i*)dst);
-
m = *((uint32_t*)mask);
if (srca == 0xff && m == 0xffffffff)
@@ -4099,7 +3643,6 @@ pixman_fill_sse2 (uint32_t *bits,
return FALSE;
}
- cache_prefetch ((__m128i*)byte_line);
xmm_def = create_mask_2x32_128 (data, data);
while (height--)
@@ -4109,8 +3652,6 @@ pixman_fill_sse2 (uint32_t *bits,
byte_line += stride;
w = byte_width;
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 1 && ((unsigned long)d & 1))
{
*(uint8_t *)d = data;
@@ -4133,12 +3674,8 @@ pixman_fill_sse2 (uint32_t *bits,
d += 4;
}
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 128)
{
- cache_prefetch (((__m128i*)d) + 12);
-
save_128_aligned ((__m128i*)(d), xmm_def);
save_128_aligned ((__m128i*)(d + 16), xmm_def);
save_128_aligned ((__m128i*)(d + 32), xmm_def);
@@ -4154,8 +3691,6 @@ pixman_fill_sse2 (uint32_t *bits,
if (w >= 64)
{
- cache_prefetch (((__m128i*)d) + 8);
-
save_128_aligned ((__m128i*)(d), xmm_def);
save_128_aligned ((__m128i*)(d + 16), xmm_def);
save_128_aligned ((__m128i*)(d + 32), xmm_def);
@@ -4165,8 +3700,6 @@ pixman_fill_sse2 (uint32_t *bits,
w -= 64;
}
- cache_prefetch_next ((__m128i*)d);
-
if (w >= 32)
{
save_128_aligned ((__m128i*)(d), xmm_def);
@@ -4184,8 +3717,6 @@ pixman_fill_sse2 (uint32_t *bits,
w -= 16;
}
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 4)
{
*(uint32_t *)d = data;
@@ -4265,10 +3796,6 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
mask_line += mask_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w && (unsigned long)dst & 15)
{
uint8_t m = *mask++;
@@ -4288,16 +3815,8 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
dst++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)mask);
- cache_prefetch_next ((__m128i*)dst);
-
m = *((uint32_t*)mask);
if (srca == 0xff && m == 0xffffffff)
@@ -4410,10 +3929,6 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
mask_line += mask_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w && (unsigned long)dst & 15)
{
m = *mask++;
@@ -4434,16 +3949,8 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
dst++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 8)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)mask);
- cache_prefetch_next ((__m128i*)dst);
-
xmm_dst = load_128_aligned ((__m128i*) dst);
unpack_565_128_4x128 (xmm_dst,
&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
@@ -4570,10 +4077,6 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
src_line += src_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
while (w && (unsigned long)dst & 15)
{
s = *src++;
@@ -4587,16 +4090,8 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 8)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)src);
- cache_prefetch_next ((__m128i*)dst);
-
/* First round */
xmm_src = load_128_unaligned ((__m128i*)src);
xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -4715,10 +4210,6 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
src_line += src_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
while (w && (unsigned long)dst & 15)
{
s = *src++;
@@ -4731,16 +4222,8 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)src);
- cache_prefetch_next ((__m128i*)dst);
-
xmm_src_hi = load_128_unaligned ((__m128i*)src);
opaque = is_opaque (xmm_src_hi);
@@ -4845,10 +4328,6 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
mask_line += mask_stride;
dst_line += dst_stride;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w && ((unsigned long)dst & 15))
{
m = *(uint32_t *) mask;
@@ -4870,16 +4349,8 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
mask++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 8)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)mask);
- cache_prefetch_next ((__m128i*)dst);
-
/* First round */
xmm_mask = load_128_unaligned ((__m128i*)mask);
xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5001,10 +4472,6 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
mask_line += mask_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w && ((unsigned long)dst & 15))
{
m = (uint32_t) *mask++;
@@ -5018,16 +4485,8 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 16)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)mask);
- cache_prefetch_next ((__m128i*)dst);
-
xmm_mask = load_128_unaligned ((__m128i*)mask);
xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5121,9 +4580,6 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
dst_line += dst_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
while (w && ((unsigned long)dst & 15))
{
d = (uint32_t) *dst;
@@ -5135,14 +4591,8 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
while (w >= 16)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)dst);
-
xmm_dst = load_128_aligned ((__m128i*)dst);
unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -5214,10 +4664,6 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
src_line += src_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
while (w && ((unsigned long)dst & 15))
{
s = (uint32_t) *src++;
@@ -5229,16 +4675,8 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 16)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)src);
- cache_prefetch_next ((__m128i*)dst);
-
xmm_src = load_128_unaligned ((__m128i*)src);
xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5321,10 +4759,6 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
mask_line += mask_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w && ((unsigned long)dst & 15))
{
m = (uint32_t) *mask++;
@@ -5338,16 +4772,8 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 16)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)mask);
- cache_prefetch_next ((__m128i*)dst);
-
xmm_mask = load_128_unaligned ((__m128i*)mask);
xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5440,9 +4866,6 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
dst_line += dst_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
while (w && ((unsigned long)dst & 15))
{
*dst = (uint8_t)_mm_cvtsi64_si32 (
@@ -5454,14 +4877,8 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
dst++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
while (w >= 16)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)dst);
-
save_128_aligned (
(__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
@@ -5485,23 +4902,23 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
}
/* ----------------------------------------------------------------------
- * composite_add_8000_8000
+ * composite_add_8_8
*/
static void
-sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
+sse2_composite_add_8_8 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
{
uint8_t *dst_line, *dst;
uint8_t *src_line, *src;
@@ -5519,10 +4936,6 @@ sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
dst = dst_line;
src = src_line;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
dst_line += dst_stride;
src_line += src_stride;
w = width;
@@ -5644,9 +5057,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
return FALSE;
}
- cache_prefetch ((__m128i*)src_bytes);
- cache_prefetch ((__m128i*)dst_bytes);
-
while (height--)
{
int w;
@@ -5656,9 +5066,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
dst_bytes += dst_stride;
w = byte_width;
- cache_prefetch_next ((__m128i*)s);
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 2 && ((unsigned long)d & 3))
{
*(uint16_t *)d = *(uint16_t *)s;
@@ -5676,17 +5083,10 @@ pixman_blt_sse2 (uint32_t *src_bits,
d += 4;
}
- cache_prefetch_next ((__m128i*)s);
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 64)
{
__m128i xmm0, xmm1, xmm2, xmm3;
- /* 128 bytes ahead */
- cache_prefetch (((__m128i*)s) + 8);
- cache_prefetch (((__m128i*)d) + 8);
-
xmm0 = load_128_unaligned ((__m128i*)(s));
xmm1 = load_128_unaligned ((__m128i*)(s + 16));
xmm2 = load_128_unaligned ((__m128i*)(s + 32));
@@ -5702,9 +5102,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
w -= 64;
}
- cache_prefetch_next ((__m128i*)s);
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 16)
{
save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
@@ -5714,9 +5111,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
s += 16;
}
- cache_prefetch_next ((__m128i*)s);
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 4)
{
*(uint32_t *)d = *(uint32_t *)s;
@@ -5809,11 +5203,6 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
- cache_prefetch ((__m128i*)mask);
-
while (w && (unsigned long)dst & 15)
{
s = 0xff000000 | *src++;
@@ -5833,18 +5222,8 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
- cache_prefetch ((__m128i*)mask);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)src);
- cache_prefetch_next ((__m128i*)dst);
- cache_prefetch_next ((__m128i*)mask);
-
m = *(uint32_t*) mask;
xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
@@ -5955,11 +5334,6 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i *)src);
- cache_prefetch ((__m128i *)dst);
- cache_prefetch ((__m128i *)mask);
-
while (w && (unsigned long)dst & 15)
{
uint32_t sa;
@@ -5994,18 +5368,8 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i *)src);
- cache_prefetch ((__m128i *)dst);
- cache_prefetch ((__m128i *)mask);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i *)src);
- cache_prefetch_next ((__m128i *)dst);
- cache_prefetch_next ((__m128i *)mask);
-
m = *(uint32_t *) mask;
if (m)
@@ -6117,9 +5481,6 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
{
dst = dst_line;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
dst_line += dst_stride;
w = width;
@@ -6135,15 +5496,10 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
dst++;
}
- cache_prefetch ((__m128i*)dst);
-
while (w >= 4)
{
__m128i tmp_lo, tmp_hi;
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)(dst + 4));
-
xmm_dst = load_128_aligned ((__m128i*)dst);
unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -6224,11 +5580,6 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i *)src);
- cache_prefetch ((__m128i *)dst);
- cache_prefetch ((__m128i *)mask);
-
while (w && (unsigned long)dst & 15)
{
uint32_t sa;
@@ -6263,18 +5614,8 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i *)src);
- cache_prefetch ((__m128i *)dst);
- cache_prefetch ((__m128i *)mask);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i *)src);
- cache_prefetch_next ((__m128i *)dst);
- cache_prefetch_next ((__m128i *)mask);
-
xmm_mask = load_128_unaligned ((__m128i*)mask);
if (!is_transparent (xmm_mask))
@@ -6504,7 +5845,7 @@ static const pixman_fast_path_t sse2_fast_paths[] =
/* PIXMAN_OP_ADD */
PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
- PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8000_8000),
+ PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
diff --git a/pixman/test/lowlevel-blt-bench.c b/pixman/test/lowlevel-blt-bench.c
index d4df81507..343aafee9 100644
--- a/pixman/test/lowlevel-blt-bench.c
+++ b/pixman/test/lowlevel-blt-bench.c
@@ -544,7 +544,7 @@ struct
tests_tbl[] =
{
{ "add_8_8_8", PIXMAN_a8, 0, PIXMAN_OP_ADD, PIXMAN_a8, 0, PIXMAN_a8 },
- { "add_n_8_8000", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_a8, 0, PIXMAN_a8 },
+ { "add_n_8_8", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_a8, 0, PIXMAN_a8 },
{ "add_n_8_8888", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_a8, 0, PIXMAN_a8r8g8b8 },
{ "add_n_8_x888", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_a8, 0, PIXMAN_x8r8g8b8 },
{ "add_n_8_0565", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_a8, 0, PIXMAN_r5g6b5 },
@@ -553,7 +553,7 @@ tests_tbl[] =
{ "add_n_8_2222", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_a8, 0, PIXMAN_a2r2g2b2 },
{ "add_n_8_2x10", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_a8, 0, PIXMAN_x2r10g10b10 },
{ "add_n_8_2a10", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_a8, 0, PIXMAN_a2r10g10b10 },
- { "add_n_8000", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_a8 },
+ { "add_n_8", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_a8 },
{ "add_n_8888", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_a8r8g8b8 },
{ "add_n_x888", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_x8r8g8b8 },
{ "add_n_0565", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_r5g6b5 },
@@ -562,7 +562,7 @@ tests_tbl[] =
{ "add_n_2222", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_a2r2g2b2 },
{ "add_n_2x10", PIXMAN_a2r10g10b10, 1, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_x2r10g10b10 },
{ "add_n_2a10", PIXMAN_a2r10g10b10, 1, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_a2r10g10b10 },
- { "add_8000_8000", PIXMAN_a8, 0, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_a8 },
+ { "add_8_8", PIXMAN_a8, 0, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_a8 },
{ "add_x888_x888", PIXMAN_x8r8g8b8, 0, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_x8r8g8b8 },
{ "add_8888_8888", PIXMAN_a8r8g8b8, 0, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_a8r8g8b8 },
{ "add_8888_0565", PIXMAN_a8r8g8b8, 0, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_r5g6b5 },