8 files changed, 823 insertions, 1466 deletions
diff --git a/pixman/pixman/pixman-arm-neon-asm.S b/pixman/pixman/pixman-arm-neon-asm.S
index fe128aa94..108abacd1 100644
--- a/pixman/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman/pixman-arm-neon-asm.S
@@ -495,15 +495,15 @@ generate_composite_function \
 
 /******************************************************************************/
 
-.macro pixman_composite_add_8000_8000_process_pixblock_head
+.macro pixman_composite_add_8_8_process_pixblock_head
     vqadd.u8    q14, q0, q2
     vqadd.u8    q15, q1, q3
 .endm
 
-.macro pixman_composite_add_8000_8000_process_pixblock_tail
+.macro pixman_composite_add_8_8_process_pixblock_tail
 .endm
 
-.macro pixman_composite_add_8000_8000_process_pixblock_tail_head
+.macro pixman_composite_add_8_8_process_pixblock_tail_head
     vld1.8      {d0, d1, d2, d3}, [SRC]!
                                     PF add PF_X, PF_X, #32
                                     PF tst PF_CTL, #0xF
@@ -523,15 +523,15 @@ generate_composite_function \
 .endm
 
 generate_composite_function \
-    pixman_composite_add_8000_8000_asm_neon, 8, 0, 8, \
+    pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
     FLAG_DST_READWRITE, \
     32, /* number of pixels, processed in a single block */ \
     10, /* prefetch distance */ \
     default_init, \
     default_cleanup, \
-    pixman_composite_add_8000_8000_process_pixblock_head, \
-    pixman_composite_add_8000_8000_process_pixblock_tail, \
-    pixman_composite_add_8000_8000_process_pixblock_tail_head
+    pixman_composite_add_8_8_process_pixblock_head, \
+    pixman_composite_add_8_8_process_pixblock_tail, \
+    pixman_composite_add_8_8_process_pixblock_tail_head
 
 /******************************************************************************/
 
@@ -561,8 +561,8 @@ generate_composite_function \
     10, /* prefetch distance */ \
     default_init, \
     default_cleanup, \
-    pixman_composite_add_8000_8000_process_pixblock_head, \
-    pixman_composite_add_8000_8000_process_pixblock_tail, \
+    pixman_composite_add_8_8_process_pixblock_head, \
+    pixman_composite_add_8_8_process_pixblock_tail, \
     pixman_composite_add_8888_8888_process_pixblock_tail_head
 
 generate_composite_function_single_scanline \
@@ -571,8 +571,8 @@ generate_composite_function_single_scanline \
     8, /* number of pixels, processed in a single block */ \
     default_init, \
     default_cleanup, \
-    pixman_composite_add_8000_8000_process_pixblock_head, \
-    pixman_composite_add_8000_8000_process_pixblock_tail, \
+    pixman_composite_add_8_8_process_pixblock_head, \
+    pixman_composite_add_8_8_process_pixblock_tail, \
     pixman_composite_add_8888_8888_process_pixblock_tail_head
 
 /******************************************************************************/
diff --git a/pixman/pixman/pixman-arm-neon.c b/pixman/pixman/pixman-arm-neon.c
index 28a66751a..231a183aa 100644
--- a/pixman/pixman/pixman-arm-neon.c
+++ b/pixman/pixman/pixman-arm-neon.c
@@ -52,7 +52,7 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_0565_rev,
                                    uint8_t, 3, uint16_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_pixbuf_8888,
                                    uint32_t, 1, uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8000_8000,
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8_8,
                                    uint8_t, 1, uint8_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8888_8888,
                                    uint32_t, 1, uint32_t, 1)
@@ -257,7 +257,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
     PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8,       neon_composite_add_n_8_8),
     PIXMAN_STD_FAST_PATH (ADD,  a8,       a8,       a8,       neon_composite_add_8_8_8),
     PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_add_8888_8888_8888),
-    PIXMAN_STD_FAST_PATH (ADD,  a8,       null,     a8,       neon_composite_add_8000_8000),
+    PIXMAN_STD_FAST_PATH (ADD,  a8,       null,     a8,       neon_composite_add_8_8),
     PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, null,     a8r8g8b8, neon_composite_add_8888_8888),
     PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, null,     a8b8g8r8, neon_composite_add_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, neon_composite_over_reverse_n_8888),
diff --git a/pixman/pixman/pixman-arm-simd-asm.S b/pixman/pixman/pixman-arm-simd-asm.S
index 1a1a0d641..76647c6bc 100644
--- a/pixman/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman/pixman-arm-simd-asm.S
@@ -1,330 +1,330 @@
-/*
- * Copyright © 2008 Mozilla Corporation
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Mozilla Corporation not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission.  Mozilla Corporation makes no
- * representations about the suitability of this software for any purpose.  It
- * is provided "as is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- *
- * Author:  Jeff Muizelaar (jeff@infidigm.net)
- *
- */
-
-/* Prevent the stack from becoming executable */
-#if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack,"",%progbits
-#endif
-
-	.text
-	.arch armv6
-	.object_arch armv4
-	.arm
-	.altmacro
-
-/* Supplementary macro for setting function attributes */
-.macro pixman_asm_function fname
-	.func fname
-	.global fname
-#ifdef __ELF__
-	.hidden fname
-	.type fname, %function
-#endif
-fname:
-.endm
-
-/*
- * The code below was generated by gcc 4.3.4 from the commented out
- * functions in 'pixman-arm-simd.c' file with the following optimization
- * options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer"
- *
- * TODO: replace gcc generated code with hand tuned versions because
- * the code quality is not very good, introduce symbolic register
- * aliases for better readability and maintainability.
- */
-
-pixman_asm_function pixman_composite_add_8000_8000_asm_armv6
-	push	{r4, r5, r6, r7, r8, r9, r10, r11}
-	mov	r10, r1
-	sub	sp, sp, #4
-	subs	r10, r10, #1
-	mov	r11, r0
-	mov	r8, r2
-	str	r3, [sp]
-	ldr	r7, [sp, #36]
-	bcc	0f
-6:	cmp	r11, #0
-	beq	1f
-	orr	r3, r8, r7
-	tst	r3, #3
-	beq	2f
-	mov	r1, r8
-	mov	r0, r7
-	mov	r12, r11
-	b	3f
-5:	tst	r3, #3
-	beq	4f
-3:	ldrb	r2, [r0], #1
-	subs	r12, r12, #1
-	ldrb	r3, [r1]
-	uqadd8	r3, r2, r3
-	strb	r3, [r1], #1
-	orr	r3, r1, r0
-	bne	5b
-1:	ldr	r3, [sp]
-	add	r8, r8, r3
-	ldr	r3, [sp, #40]
-	add	r7, r7, r3
-10:	subs	r10, r10, #1
-	bcs	6b
-0:	add	sp, sp, #4
-	pop	{r4, r5, r6, r7, r8, r9, r10, r11}
-	bx	lr
-2:	mov	r12, r11
-	mov	r1, r8
-	mov	r0, r7
-4:	cmp	r12, #3
-	subgt	r6, r12, #4
-	movgt	r9, r12
-	lsrgt	r5, r6, #2
-	addgt	r3, r5, #1
-	movgt	r12, #0
-	lslgt	r4, r3, #2
-	ble	7f
-8:	ldr	r3, [r0, r12]
-	ldr	r2, [r1, r12]
-	uqadd8	r3, r3, r2
-	str	r3, [r1, r12]
-	add	r12, r12, #4
-	cmp	r12, r4
-	bne	8b
-	sub	r3, r9, #4
-	bic	r3, r3, #3
-	add	r3, r3, #4
-	subs	r12, r6, r5, lsl #2
-	add	r1, r1, r3
-	add	r0, r0, r3
-	beq	1b
-7:	mov	r4, #0
-9:	ldrb	r3, [r1, r4]
-	ldrb	r2, [r0, r4]
-	uqadd8	r3, r2, r3
-	strb	r3, [r1, r4]
-	add	r4, r4, #1
-	cmp	r4, r12
-	bne	9b
-	ldr	r3, [sp]
-	add	r8, r8, r3
-	ldr	r3, [sp, #40]
-	add	r7, r7, r3
-	b	10b
-.endfunc
-
-pixman_asm_function pixman_composite_over_8888_8888_asm_armv6
-	push	{r4, r5, r6, r7, r8, r9, r10, r11}
-	sub	sp, sp, #20
-	cmp	r1, #0
-	mov	r12, r2
-	str	r1, [sp, #12]
-	str	r0, [sp, #16]
-	ldr	r2, [sp, #52]
-	beq	0f
-	lsl	r3, r3, #2
-	str	r3, [sp]
-	ldr	r3, [sp, #56]
-	mov	r10, #0
-	lsl	r3, r3, #2
-	str	r3, [sp, #8]
-	mov	r11, r3
-	b	1f
-6:	ldr	r11, [sp, #8]
-1:	ldr	r9, [sp]
-	mov	r0, r12
-	add	r12, r12, r9
-	mov	r1, r2
-	str	r12, [sp, #4]
-	add	r2, r2, r11
-	ldr	r12, [sp, #16]
-	ldr	r3, =0x00800080
-	ldr	r9, =0xff00ff00
-	mov	r11, #255
-	cmp	r12, #0
-	beq	4f
-5:	ldr	r5, [r1], #4
-	ldr	r4, [r0]
-	sub	r8, r11, r5, lsr #24
-	uxtb16	r6, r4
-	uxtb16	r7, r4, ror #8
-	mla	r6, r6, r8, r3
-	mla	r7, r7, r8, r3
-	uxtab16	r6, r6, r6, ror #8
-	uxtab16	r7, r7, r7, ror #8
-	and	r7, r7, r9
-	uxtab16	r6, r7, r6, ror #8
-	uqadd8	r5, r6, r5
-	str	r5, [r0], #4
-	subs	r12, r12, #1
-	bne	5b
-4:	ldr	r3, [sp, #12]
-	add	r10, r10, #1
-	cmp	r10, r3
-	ldr	r12, [sp, #4]
-	bne	6b
-0:	add	sp, sp, #20
-	pop	{r4, r5, r6, r7, r8, r9, r10, r11}
-	bx	lr
-.endfunc
-
-pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6
-	push	{r4, r5, r6, r7, r8, r9, r10, r11}
-	sub	sp, sp, #28
-	cmp	r1, #0
-	str	r1, [sp, #12]
-	ldrb	r1, [sp, #71]
-	mov	r12, r2
-	str	r0, [sp, #16]
-	ldr	r2, [sp, #60]
-	str	r1, [sp, #24]
-	beq	0f
-	lsl	r3, r3, #2
-	str	r3, [sp, #20]
-	ldr	r3, [sp, #64]
-	mov	r10, #0
-	lsl	r3, r3, #2
-	str	r3, [sp, #8]
-	mov	r11, r3
-	b	1f
-5:	ldr	r11, [sp, #8]
-1:	ldr	r4, [sp, #20]
-	mov	r0, r12
-	mov	r1, r2
-	add	r12, r12, r4
-	add	r2, r2, r11
-	str	r12, [sp]
-	str	r2, [sp, #4]
-	ldr	r12, [sp, #16]
-	ldr	r2, =0x00800080
-	ldr	r3, [sp, #24]
-	mov	r11, #255
-	cmp	r12, #0
-	beq	3f
-4:	ldr	r5, [r1], #4
-	ldr	r4, [r0]
-	uxtb16	r6, r5
-	uxtb16	r7, r5, ror #8
-	mla	r6, r6, r3, r2
-	mla	r7, r7, r3, r2
-	uxtab16	r6, r6, r6, ror #8
-	uxtab16	r7, r7, r7, ror #8
-	uxtb16	r6, r6, ror #8
-	uxtb16	r7, r7, ror #8
-	orr	r5, r6, r7, lsl #8
-	uxtb16	r6, r4
-	uxtb16	r7, r4, ror #8
-	sub	r8, r11, r5, lsr #24
-	mla	r6, r6, r8, r2
-	mla	r7, r7, r8, r2
-	uxtab16	r6, r6, r6, ror #8
-	uxtab16	r7, r7, r7, ror #8
-	uxtb16	r6, r6, ror #8
-	uxtb16	r7, r7, ror #8
-	orr	r6, r6, r7, lsl #8
-	uqadd8	r5, r6, r5
-	str	r5, [r0], #4
-	subs	r12, r12, #1
-	bne	4b
-3:	ldr	r1, [sp, #12]
-	add	r10, r10, #1
-	cmp	r10, r1
-	ldr	r12, [sp]
-	ldr	r2, [sp, #4]
-	bne	5b
-0:	add	sp, sp, #28
-	pop	{r4, r5, r6, r7, r8, r9, r10, r11}
-	bx	lr
-.endfunc
-
-pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
-	push	{r4, r5, r6, r7, r8, r9, r10, r11}
-	sub	sp, sp, #28
-	cmp	r1, #0
-	ldr	r9, [sp, #60]
-	str	r1, [sp, #12]
-	bic	r1, r9, #-16777216
-	str	r1, [sp, #20]
-	mov	r12, r2
-	lsr	r1, r9, #8
-	ldr	r2, [sp, #20]
-	bic	r1, r1, #-16777216
-	bic	r2, r2, #65280
-	bic	r1, r1, #65280
-	str	r2, [sp, #20]
-	str	r0, [sp, #16]
-	str	r1, [sp, #4]
-	ldr	r2, [sp, #68]
-	beq	0f
-	lsl	r3, r3, #2
-	str	r3, [sp, #24]
-	mov	r0, #0
-	b	1f
-5:	ldr	r3, [sp, #24]
-1:	ldr	r4, [sp, #72]
-	mov	r10, r12
-	mov	r1, r2
-	add	r12, r12, r3
-	add	r2, r2, r4
-	str	r12, [sp, #8]
-	str	r2, [sp]
-	ldr	r12, [sp, #16]
-	ldr	r11, =0x00800080
-	ldr	r2, [sp, #4]
-	ldr	r3, [sp, #20]
-	cmp	r12, #0
-	beq	3f
-4:	ldrb	r5, [r1], #1
-	ldr	r4, [r10]
-	mla	r6, r3, r5, r11
-	mla	r7, r2, r5, r11
-	uxtab16	r6, r6, r6, ror #8
-	uxtab16	r7, r7, r7, ror #8
-	uxtb16	r6, r6, ror #8
-	uxtb16	r7, r7, ror #8
-	orr	r5, r6, r7, lsl #8
-	uxtb16	r6, r4
-	uxtb16	r7, r4, ror #8
-	mvn	r8, r5
-	lsr	r8, r8, #24
-	mla	r6, r6, r8, r11
-	mla	r7, r7, r8, r11
-	uxtab16	r6, r6, r6, ror #8
-	uxtab16	r7, r7, r7, ror #8
-	uxtb16	r6, r6, ror #8
-	uxtb16	r7, r7, ror #8
-	orr	r6, r6, r7, lsl #8
-	uqadd8	r5, r6, r5
-	str	r5, [r10], #4
-	subs	r12, r12, #1
-	bne	4b
-3:	ldr	r4, [sp, #12]
-	add	r0, r0, #1
-	cmp	r0, r4
-	ldr	r12, [sp, #8]
-	ldr	r2, [sp]
-	bne	5b
-0:	add	sp, sp, #28
-	pop	{r4, r5, r6, r7, r8, r9, r10, r11}
-	bx	lr
-.endfunc
+/*
+ * Copyright © 2008 Mozilla Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Jeff Muizelaar (jeff@infidigm.net)
+ *
+ */
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+	.text
+	.arch armv6
+	.object_arch armv4
+	.arm
+	.altmacro
+
+/* Supplementary macro: opens a function named 'fname' and sets its symbol attributes */
+.macro pixman_asm_function fname
+	.func fname	/* start of function for debug info; paired with the '.endfunc' after each body */
+	.global fname	/* make the symbol visible to the linker */
+#ifdef __ELF__
+	.hidden fname	/* ELF only: hidden visibility */
+	.type fname, %function	/* ELF only: mark the symbol as a function */
+#endif
+fname:	/* the entry point label itself */
+.endm
+
+/*
+ * The code below was generated by gcc 4.3.4 from the commented-out
+ * functions in the 'pixman-arm-simd.c' file, using the following
+ * optimization options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer"
+ *
+ * TODO: replace the gcc-generated code with hand-tuned versions, because
+ * the code quality is not very good; also introduce symbolic register
+ * aliases for better readability and maintainability.
+ */
+
+pixman_asm_function pixman_composite_add_8_8_asm_armv6	/* ADD for 8-bit pixels: dst = saturating (dst + src); r0=width, r1=height, r2=dst_line, r3=dst_stride, stack: src_line, src_stride */
+	push	{r4, r5, r6, r7, r8, r9, r10, r11}	/* save callee-saved registers (32 bytes) */
+	mov	r10, r1	/* r10 = height (row countdown) */
+	sub	sp, sp, #4	/* one stack slot, used for dst_stride */
+	subs	r10, r10, #1	/* height--; carry is clear only if height was 0 */
+	mov	r11, r0	/* r11 = width */
+	mov	r8, r2	/* r8 = dst_line */
+	str	r3, [sp]	/* spill dst_stride */
+	ldr	r7, [sp, #36]	/* r7 = src_line (first stack argument: 32 + 4 bytes above sp) */
+	bcc	0f	/* no rows: return */
+6:	cmp	r11, #0	/* row loop: is the row empty? */
+	beq	1f	/* yes: only advance the line pointers */
+	orr	r3, r8, r7
+	tst	r3, #3	/* are both dst and src 4-byte aligned? */
+	beq	2f	/* yes: go straight to the word loop setup */
+	mov	r1, r8	/* r1 = dst */
+	mov	r0, r7	/* r0 = src */
+	mov	r12, r11	/* r12 = w (pixels left in this row) */
+	b	3f
+5:	tst	r3, #3	/* r3 = dst | src: are both pointers aligned now? */
+	beq	4f	/* yes: switch to the word loop */
+3:	ldrb	r2, [r0], #1	/* s = *src++ */
+	subs	r12, r12, #1	/* w--; the Z flag is consumed by 'bne 5b' below */
+	ldrb	r3, [r1]	/* d = *dst */
+	uqadd8	r3, r2, r3	/* d = saturating s + d */
+	strb	r3, [r1], #1	/* *dst++ = d */
+	orr	r3, r1, r0	/* r3 = dst | src for the alignment test at label 5 */
+	bne	5b	/* pixels left: keep going */
+1:	ldr	r3, [sp]	/* r3 = dst_stride */
+	add	r8, r8, r3	/* dst_line += dst_stride */
+	ldr	r3, [sp, #40]	/* r3 = src_stride (second stack argument) */
+	add	r7, r7, r3	/* src_line += src_stride */
+10:	subs	r10, r10, #1	/* height-- */
+	bcs	6b	/* rows left: process the next row */
+0:	add	sp, sp, #4	/* release the local slot */
+	pop	{r4, r5, r6, r7, r8, r9, r10, r11}	/* restore callee-saved registers */
+	bx	lr	/* return */
+2:	mov	r12, r11	/* row starts aligned: w = width */
+	mov	r1, r8	/* r1 = dst */
+	mov	r0, r7	/* r0 = src */
+4:	cmp	r12, #3	/* at least 4 pixels left? */
+	subgt	r6, r12, #4	/* r6 = w - 4 */
+	movgt	r9, r12	/* r9 = w */
+	lsrgt	r5, r6, #2	/* r5 = (w - 4) / 4 */
+	addgt	r3, r5, #1	/* r3 = number of whole words (w / 4) */
+	movgt	r12, #0	/* r12 = byte offset into the row */
+	lslgt	r4, r3, #2	/* r4 = byte count handled by the word loop */
+	ble	7f	/* fewer than 4 pixels: byte tail only */
+8:	ldr	r3, [r0, r12]	/* load 4 src pixels */
+	ldr	r2, [r1, r12]	/* load 4 dst pixels */
+	uqadd8	r3, r3, r2	/* per-byte saturating add */
+	str	r3, [r1, r12]	/* store 4 dst pixels */
+	add	r12, r12, #4
+	cmp	r12, r4
+	bne	8b	/* until all whole words are done */
+	sub	r3, r9, #4
+	bic	r3, r3, #3
+	add	r3, r3, #4	/* r3 = ((w - 4) & ~3) + 4 = bytes consumed by the word loop */
+	subs	r12, r6, r5, lsl #2	/* r12 = leftover pixels (w % 4); sets Z if there are none */
+	add	r1, r1, r3	/* dst += bytes consumed */
+	add	r0, r0, r3	/* src += bytes consumed */
+	beq	1b	/* no leftover: advance to the next row */
+7:	mov	r4, #0	/* byte tail: r4 = index i */
+9:	ldrb	r3, [r1, r4]	/* d = dst[i] */
+	ldrb	r2, [r0, r4]	/* s = src[i] */
+	uqadd8	r3, r2, r3	/* d = saturating s + d */
+	strb	r3, [r1, r4]	/* dst[i] = d */
+	add	r4, r4, #1
+	cmp	r4, r12
+	bne	9b	/* until the r12 leftover pixels are done */
+	ldr	r3, [sp]	/* same line-pointer advance as at label 1 */
+	add	r8, r8, r3
+	ldr	r3, [sp, #40]
+	add	r7, r7, r3
+	b	10b	/* back to the row countdown */
+.endfunc
+
+pixman_asm_function pixman_composite_over_8888_8888_asm_armv6	/* OVER for 32-bit pixels: dst = saturating (src + dst scaled by (255 - src alpha)); r0=width, r1=height, r2=dst_line, r3=dst_stride, stack: src_line, src_stride */
+	push	{r4, r5, r6, r7, r8, r9, r10, r11}	/* save callee-saved registers (32 bytes) */
+	sub	sp, sp, #20	/* five stack slots for locals */
+	cmp	r1, #0	/* height == 0? (flags are consumed by 'beq 0f' below) */
+	mov	r12, r2	/* r12 = dst_line */
+	str	r1, [sp, #12]	/* spill height */
+	str	r0, [sp, #16]	/* spill width */
+	ldr	r2, [sp, #52]	/* r2 = src_line (first stack argument: 32 + 20 bytes above sp) */
+	beq	0f	/* no rows: return */
+	lsl	r3, r3, #2	/* dst_stride: uint32_t units -> bytes */
+	str	r3, [sp]	/* spill dst stride in bytes */
+	ldr	r3, [sp, #56]	/* r3 = src_stride (second stack argument) */
+	mov	r10, #0	/* r10 = number of rows done */
+	lsl	r3, r3, #2	/* src_stride: uint32_t units -> bytes */
+	str	r3, [sp, #8]	/* spill src stride in bytes */
+	mov	r11, r3	/* r11 = src stride for the first row */
+	b	1f
+6:	ldr	r11, [sp, #8]	/* reload src stride (r11 is reused as alpha_mask in the pixel loop) */
+1:	ldr	r9, [sp]	/* reload dst stride (r9 is reused as upper_component_mask) */
+	mov	r0, r12	/* r0 = dst for this row */
+	add	r12, r12, r9	/* dst_line += dst_stride */
+	mov	r1, r2	/* r1 = src for this row */
+	str	r12, [sp, #4]	/* spill next dst_line; r12 becomes the pixel counter */
+	add	r2, r2, r11	/* src_line += src_stride */
+	ldr	r12, [sp, #16]	/* r12 = w = width */
+	ldr	r3, =0x00800080	/* component_half: rounding bias for both 16-bit lanes */
+	ldr	r9, =0xff00ff00	/* upper_component_mask */
+	mov	r11, #255	/* alpha_mask */
+	cmp	r12, #0
+	beq	4f	/* empty row: skip the pixel loop */
+5:	ldr	r5, [r1], #4	/* r5 = *src++ */
+	ldr	r4, [r0]	/* r4 = *dst */
+	sub	r8, r11, r5, lsr #24	/* r8 = 255 - src alpha */
+	uxtb16	r6, r4	/* r6 = dst bytes 0 and 2, one per 16-bit lane */
+	uxtb16	r7, r4, ror #8	/* r7 = dst bytes 1 and 3, one per 16-bit lane */
+	mla	r6, r6, r8, r3	/* each lane: component * (255 - alpha) + 0x80 */
+	mla	r7, r7, r8, r3
+	uxtab16	r6, r6, r6, ror #8	/* each lane: t += t >> 8; the result is the lane's high byte */
+	uxtab16	r7, r7, r7, ror #8
+	and	r7, r7, r9	/* keep the high byte of each r7 lane (result bytes 1 and 3) */
+	uxtab16	r6, r7, r6, ror #8	/* merge in the high bytes of the r6 lanes as result bytes 0 and 2 */
+	uqadd8	r5, r6, r5	/* r5 = per-byte saturating (scaled dst + src) */
+	str	r5, [r0], #4	/* *dst++ = r5 */
+	subs	r12, r12, #1	/* w-- */
+	bne	5b	/* pixels left in this row */
+4:	ldr	r3, [sp, #12]	/* r3 = height */
+	add	r10, r10, #1	/* one more row done */
+	cmp	r10, r3
+	ldr	r12, [sp, #4]	/* r12 = dst_line for the next row */
+	bne	6b	/* rows left */
+0:	add	sp, sp, #20	/* release locals */
+	pop	{r4, r5, r6, r7, r8, r9, r10, r11}	/* restore callee-saved registers */
+	bx	lr	/* return */
+.endfunc
+
+pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6	/* OVER with a solid mask: src is first scaled by the mask alpha, then composited over dst; r0=width, r1=height, r2=dst_line, r3=dst_stride, stack: src_line, src_stride, mask */
+	push	{r4, r5, r6, r7, r8, r9, r10, r11}	/* save callee-saved registers (32 bytes) */
+	sub	sp, sp, #28	/* seven stack slots for locals */
+	cmp	r1, #0	/* height == 0? (flags are consumed by 'beq 0f' below) */
+	str	r1, [sp, #12]	/* spill height */
+	ldrb	r1, [sp, #71]	/* r1 = byte 3 of the 'mask' stack argument; equals mask >> 24 assuming little-endian */
+	mov	r12, r2	/* r12 = dst_line */
+	str	r0, [sp, #16]	/* spill width */
+	ldr	r2, [sp, #60]	/* r2 = src_line (first stack argument: 32 + 28 bytes above sp) */
+	str	r1, [sp, #24]	/* spill mask alpha */
+	beq	0f	/* no rows: return */
+	lsl	r3, r3, #2	/* dst_stride: uint32_t units -> bytes */
+	str	r3, [sp, #20]	/* spill dst stride in bytes */
+	ldr	r3, [sp, #64]	/* r3 = src_stride (second stack argument) */
+	mov	r10, #0	/* r10 = number of rows done */
+	lsl	r3, r3, #2	/* src_stride: uint32_t units -> bytes */
+	str	r3, [sp, #8]	/* spill src stride in bytes */
+	mov	r11, r3	/* r11 = src stride for the first row */
+	b	1f
+5:	ldr	r11, [sp, #8]	/* reload src stride (r11 is reused as alpha_mask in the pixel loop) */
+1:	ldr	r4, [sp, #20]	/* r4 = dst stride in bytes */
+	mov	r0, r12	/* r0 = dst for this row */
+	mov	r1, r2	/* r1 = src for this row */
+	add	r12, r12, r4	/* dst_line += dst_stride */
+	add	r2, r2, r11	/* src_line += src_stride */
+	str	r12, [sp]	/* spill next dst_line; r12 becomes the pixel counter */
+	str	r2, [sp, #4]	/* spill next src_line; r2 becomes component_half */
+	ldr	r12, [sp, #16]	/* r12 = w = width */
+	ldr	r2, =0x00800080	/* component_half: rounding bias for both 16-bit lanes */
+	ldr	r3, [sp, #24]	/* r3 = mask alpha */
+	mov	r11, #255	/* alpha_mask */
+	cmp	r12, #0
+	beq	3f	/* empty row: skip the pixel loop */
+4:	ldr	r5, [r1], #4	/* r5 = *src++ */
+	ldr	r4, [r0]	/* r4 = *dst */
+	uxtb16	r6, r5	/* r6 = src bytes 0 and 2, one per 16-bit lane */
+	uxtb16	r7, r5, ror #8	/* r7 = src bytes 1 and 3, one per 16-bit lane */
+	mla	r6, r6, r3, r2	/* each lane: component * mask alpha + 0x80 */
+	mla	r7, r7, r3, r2
+	uxtab16	r6, r6, r6, ror #8	/* each lane: t += t >> 8 */
+	uxtab16	r7, r7, r7, ror #8
+	uxtb16	r6, r6, ror #8	/* take the high byte of each lane as the scaled component */
+	uxtb16	r7, r7, ror #8
+	orr	r5, r6, r7, lsl #8	/* recombine: r5 = src scaled by the mask alpha */
+	uxtb16	r6, r4	/* r6 = dst bytes 0 and 2 */
+	uxtb16	r7, r4, ror #8	/* r7 = dst bytes 1 and 3 */
+	sub	r8, r11, r5, lsr #24	/* r8 = 255 - alpha of the scaled src */
+	mla	r6, r6, r8, r2	/* each lane: dst component * (255 - alpha) + 0x80 */
+	mla	r7, r7, r8, r2
+	uxtab16	r6, r6, r6, ror #8	/* each lane: t += t >> 8 */
+	uxtab16	r7, r7, r7, ror #8
+	uxtb16	r6, r6, ror #8	/* take the high byte of each lane */
+	uxtb16	r7, r7, ror #8
+	orr	r6, r6, r7, lsl #8	/* recombine: r6 = scaled dst */
+	uqadd8	r5, r6, r5	/* r5 = per-byte saturating (scaled dst + scaled src) */
+	str	r5, [r0], #4	/* *dst++ = r5 */
+	subs	r12, r12, #1	/* w-- */
+	bne	4b	/* pixels left in this row */
+3:	ldr	r1, [sp, #12]	/* r1 = height */
+	add	r10, r10, #1	/* one more row done */
+	cmp	r10, r1
+	ldr	r12, [sp]	/* r12 = dst_line for the next row */
+	ldr	r2, [sp, #4]	/* r2 = src_line for the next row */
+	bne	5b	/* rows left */
+0:	add	sp, sp, #28	/* release locals */
+	pop	{r4, r5, r6, r7, r8, r9, r10, r11}	/* restore callee-saved registers */
+	bx	lr	/* return */
+.endfunc
+
+pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6	/* OVER of a solid color through an 8-bit mask onto 32-bit dst; r0=width, r1=height, r2=dst_line, r3=dst_stride, stack: src, unused, mask_line, mask_stride */
+	push	{r4, r5, r6, r7, r8, r9, r10, r11}	/* save callee-saved registers (32 bytes) */
+	sub	sp, sp, #28	/* seven stack slots for locals */
+	cmp	r1, #0	/* height == 0? (flags are consumed by 'beq 0f' below) */
+	ldr	r9, [sp, #60]	/* r9 = src color (first stack argument: 32 + 28 bytes above sp) */
+	str	r1, [sp, #12]	/* spill height */
+	bic	r1, r9, #-16777216	/* r1 = src & 0x00ffffff (clear the top byte) */
+	str	r1, [sp, #20]
+	mov	r12, r2	/* r12 = dst_line */
+	lsr	r1, r9, #8	/* r1 = src >> 8 */
+	ldr	r2, [sp, #20]	/* r2 = src & 0x00ffffff again */
+	bic	r1, r1, #-16777216	/* clear the top byte (already zero after the shift) */
+	bic	r2, r2, #65280	/* r2 = src_lo = src & 0x00ff00ff */
+	bic	r1, r1, #65280	/* r1 = src_hi = (src >> 8) & 0x00ff00ff */
+	str	r2, [sp, #20]	/* spill src_lo */
+	str	r0, [sp, #16]	/* spill width */
+	str	r1, [sp, #4]	/* spill src_hi */
+	ldr	r2, [sp, #68]	/* r2 = mask_line (third stack argument) */
+	beq	0f	/* no rows: return */
+	lsl	r3, r3, #2	/* dst_stride: uint32_t units -> bytes */
+	str	r3, [sp, #24]	/* spill dst stride in bytes */
+	mov	r0, #0	/* r0 = number of rows done */
+	b	1f
+5:	ldr	r3, [sp, #24]	/* reload dst stride (r3 is reused as src_lo in the pixel loop) */
+1:	ldr	r4, [sp, #72]	/* r4 = mask_stride (fourth stack argument; used unscaled, in bytes) */
+	mov	r10, r12	/* r10 = dst for this row */
+	mov	r1, r2	/* r1 = mask for this row */
+	add	r12, r12, r3	/* dst_line += dst_stride */
+	add	r2, r2, r4	/* mask_line += mask_stride */
+	str	r12, [sp, #8]	/* spill next dst_line; r12 becomes the pixel counter */
+	str	r2, [sp]	/* spill next mask_line; r2 becomes src_hi */
+	ldr	r12, [sp, #16]	/* r12 = w = width */
+	ldr	r11, =0x00800080	/* component_half: rounding bias for both 16-bit lanes */
+	ldr	r2, [sp, #4]	/* r2 = src_hi */
+	ldr	r3, [sp, #20]	/* r3 = src_lo */
+	cmp	r12, #0
+	beq	3f	/* empty row: skip the pixel loop */
+4:	ldrb	r5, [r1], #1	/* r5 = *mask++ */
+	ldr	r4, [r10]	/* r4 = *dst */
+	mla	r6, r3, r5, r11	/* each lane: src_lo component * mask + 0x80 */
+	mla	r7, r2, r5, r11	/* each lane: src_hi component * mask + 0x80 */
+	uxtab16	r6, r6, r6, ror #8	/* each lane: t += t >> 8 */
+	uxtab16	r7, r7, r7, ror #8
+	uxtb16	r6, r6, ror #8	/* take the high byte of each lane as the scaled component */
+	uxtb16	r7, r7, ror #8
+	orr	r5, r6, r7, lsl #8	/* recombine: r5 = src scaled by the mask value */
+	uxtb16	r6, r4	/* r6 = dst bytes 0 and 2 */
+	uxtb16	r7, r4, ror #8	/* r7 = dst bytes 1 and 3 */
+	mvn	r8, r5
+	lsr	r8, r8, #24	/* r8 = 255 - alpha of the scaled src (top byte of ~r5) */
+	mla	r6, r6, r8, r11	/* each lane: dst component * (255 - alpha) + 0x80 */
+	mla	r7, r7, r8, r11
+	uxtab16	r6, r6, r6, ror #8	/* each lane: t += t >> 8 */
+	uxtab16	r7, r7, r7, ror #8
+	uxtb16	r6, r6, ror #8	/* take the high byte of each lane */
+	uxtb16	r7, r7, ror #8
+	orr	r6, r6, r7, lsl #8	/* recombine: r6 = scaled dst */
+	uqadd8	r5, r6, r5	/* r5 = per-byte saturating (scaled dst + scaled src) */
+	str	r5, [r10], #4	/* *dst++ = r5 */
+	subs	r12, r12, #1	/* w-- */
+	bne	4b	/* pixels left in this row */
+3:	ldr	r4, [sp, #12]	/* r4 = height */
+	add	r0, r0, #1	/* one more row done */
+	cmp	r0, r4
+	ldr	r12, [sp, #8]	/* r12 = dst_line for the next row */
+	ldr	r2, [sp]	/* r2 = mask_line for the next row */
+	bne	5b	/* rows left */
+0:	add	sp, sp, #28	/* release locals */
+	pop	{r4, r5, r6, r7, r8, r9, r10, r11}	/* restore callee-saved registers */
+	bx	lr	/* return */
+.endfunc
diff --git a/pixman/pixman/pixman-arm-simd.c b/pixman/pixman/pixman-arm-simd.c
index 389c9e01a..76a7ffeab 100644
--- a/pixman/pixman/pixman-arm-simd.c
+++ b/pixman/pixman/pixman-arm-simd.c
@@ -1,417 +1,417 @@
-/*
- * Copyright © 2008 Mozilla Corporation
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Mozilla Corporation not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission.  Mozilla Corporation makes no
- * representations about the suitability of this software for any purpose.  It
- * is provided "as is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- *
- * Author:  Jeff Muizelaar (jeff@infidigm.net)
- *
- */
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#include "pixman-private.h"
-#include "pixman-arm-common.h"
-
-#if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */
-
-void
-pixman_composite_add_8000_8000_asm_armv6 (int32_t  width,
-                                          int32_t  height,
-                                          uint8_t *dst_line,
-                                          int32_t  dst_stride,
-                                          uint8_t *src_line,
-                                          int32_t  src_stride)
-{
-    uint8_t *dst, *src;
-    int32_t w;
-    uint8_t s, d;
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
-
-	/* ensure both src and dst are properly aligned before doing 32 bit reads
-	 * we'll stay in this loop if src and dst have differing alignments
-	 */
-	while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
-	{
-	    s = *src;
-	    d = *dst;
-	    asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
-	    *dst = d;
-
-	    dst++;
-	    src++;
-	    w--;
-	}
-
-	while (w >= 4)
-	{
-	    asm ("uqadd8 %0, %1, %2"
-		 : "=r" (*(uint32_t*)dst)
-		 : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
-	    dst += 4;
-	    src += 4;
-	    w -= 4;
-	}
-
-	while (w)
-	{
-	    s = *src;
-	    d = *dst;
-	    asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
-	    *dst = d;
-
-	    dst++;
-	    src++;
-	    w--;
-	}
-    }
-
-}
-
-void
-pixman_composite_over_8888_8888_asm_armv6 (int32_t   width,
-                                           int32_t   height,
-                                           uint32_t *dst_line,
-                                           int32_t   dst_stride,
-                                           uint32_t *src_line,
-                                           int32_t   src_stride)
-{
-    uint32_t    *dst;
-    uint32_t    *src;
-    int32_t w;
-    uint32_t component_half = 0x800080;
-    uint32_t upper_component_mask = 0xff00ff00;
-    uint32_t alpha_mask = 0xff;
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
-
-/* #define inner_branch */
-	asm volatile (
-	    "cmp %[w], #0\n\t"
-	    "beq 2f\n\t"
-	    "1:\n\t"
-	    /* load src */
-	    "ldr r5, [%[src]], #4\n\t"
-#ifdef inner_branch
-	    /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
-	     * The 0x0 case also allows us to avoid doing an unecessary data
-	     * write which is more valuable so we only check for that
-	     */
-	    "cmp r5, #0\n\t"
-	    "beq 3f\n\t"
-
-	    /* = 255 - alpha */
-	    "sub r8, %[alpha_mask], r5, lsr #24\n\t"
-
-	    "ldr r4, [%[dest]] \n\t"
-
-#else
-	    "ldr r4, [%[dest]] \n\t"
-
-	    /* = 255 - alpha */
-	    "sub r8, %[alpha_mask], r5, lsr #24\n\t"
-#endif
-	    "uxtb16 r6, r4\n\t"
-	    "uxtb16 r7, r4, ror #8\n\t"
-
-	    /* multiply by 257 and divide by 65536 */
-	    "mla r6, r6, r8, %[component_half]\n\t"
-	    "mla r7, r7, r8, %[component_half]\n\t"
-
-	    "uxtab16 r6, r6, r6, ror #8\n\t"
-	    "uxtab16 r7, r7, r7, ror #8\n\t"
-
-	    /* recombine the 0xff00ff00 bytes of r6 and r7 */
-	    "and r7, r7, %[upper_component_mask]\n\t"
-	    "uxtab16 r6, r7, r6, ror #8\n\t"
-
-	    "uqadd8 r5, r6, r5\n\t"
-
-#ifdef inner_branch
-	    "3:\n\t"
-
-#endif
-	    "str r5, [%[dest]], #4\n\t"
-	    /* increment counter and jmp to top */
-	    "subs	%[w], %[w], #1\n\t"
-	    "bne	1b\n\t"
-	    "2:\n\t"
-	    : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
-	    : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
-	      [alpha_mask] "r" (alpha_mask)
-	    : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
-	    );
-    }
-}
-
-void
-pixman_composite_over_8888_n_8888_asm_armv6 (int32_t   width,
-                                             int32_t   height,
-                                             uint32_t *dst_line,
-                                             int32_t   dst_stride,
-                                             uint32_t *src_line,
-                                             int32_t   src_stride,
-                                             uint32_t  mask)
-{
-    uint32_t *dst;
-    uint32_t *src;
-    int32_t w;
-    uint32_t component_half = 0x800080;
-    uint32_t alpha_mask = 0xff;
-
-    mask = (mask) >> 24;
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
-
-/* #define inner_branch */
-	asm volatile (
-	    "cmp %[w], #0\n\t"
-	    "beq 2f\n\t"
-	    "1:\n\t"
-	    /* load src */
-	    "ldr r5, [%[src]], #4\n\t"
-#ifdef inner_branch
-	    /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
-	     * The 0x0 case also allows us to avoid doing an unecessary data
-	     * write which is more valuable so we only check for that
-	     */
-	    "cmp r5, #0\n\t"
-	    "beq 3f\n\t"
-
-#endif
-	    "ldr r4, [%[dest]] \n\t"
-
-	    "uxtb16 r6, r5\n\t"
-	    "uxtb16 r7, r5, ror #8\n\t"
-
-	    /* multiply by alpha (r8) then by 257 and divide by 65536 */
-	    "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
-	    "mla r7, r7, %[mask_alpha], %[component_half]\n\t"
-
-	    "uxtab16 r6, r6, r6, ror #8\n\t"
-	    "uxtab16 r7, r7, r7, ror #8\n\t"
-
-	    "uxtb16 r6, r6, ror #8\n\t"
-	    "uxtb16 r7, r7, ror #8\n\t"
-
-	    /* recombine */
-	    "orr r5, r6, r7, lsl #8\n\t"
-
-	    "uxtb16 r6, r4\n\t"
-	    "uxtb16 r7, r4, ror #8\n\t"
-
-	    /* 255 - alpha */
-	    "sub r8, %[alpha_mask], r5, lsr #24\n\t"
-
-	    /* multiply by alpha (r8) then by 257 and divide by 65536 */
-	    "mla r6, r6, r8, %[component_half]\n\t"
-	    "mla r7, r7, r8, %[component_half]\n\t"
-
-	    "uxtab16 r6, r6, r6, ror #8\n\t"
-	    "uxtab16 r7, r7, r7, ror #8\n\t"
-
-	    "uxtb16 r6, r6, ror #8\n\t"
-	    "uxtb16 r7, r7, ror #8\n\t"
-
-	    /* recombine */
-	    "orr r6, r6, r7, lsl #8\n\t"
-
-	    "uqadd8 r5, r6, r5\n\t"
-
-#ifdef inner_branch
-	    "3:\n\t"
-
-#endif
-	    "str r5, [%[dest]], #4\n\t"
-	    /* increment counter and jmp to top */
-	    "subs	%[w], %[w], #1\n\t"
-	    "bne	1b\n\t"
-	    "2:\n\t"
-	    : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
-	    : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
-	      [alpha_mask] "r" (alpha_mask)
-	    : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
-	    );
-    }
-}
-
-void
-pixman_composite_over_n_8_8888_asm_armv6 (int32_t   width,
-                                          int32_t   height,
-                                          uint32_t *dst_line,
-                                          int32_t   dst_stride,
-                                          uint32_t  src,
-                                          int32_t   unused,
-                                          uint8_t  *mask_line,
-                                          int32_t   mask_stride)
-{
-    uint32_t  srca;
-    uint32_t *dst;
-    uint8_t  *mask;
-    int32_t w;
-
-    srca = src >> 24;
-
-    uint32_t component_mask = 0xff00ff;
-    uint32_t component_half = 0x800080;
-
-    uint32_t src_hi = (src >> 8) & component_mask;
-    uint32_t src_lo = src & component_mask;
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	mask = mask_line;
-	mask_line += mask_stride;
-	w = width;
-
-/* #define inner_branch */
-	asm volatile (
-	    "cmp %[w], #0\n\t"
-	    "beq 2f\n\t"
-	    "1:\n\t"
-	    /* load mask */
-	    "ldrb r5, [%[mask]], #1\n\t"
-#ifdef inner_branch
-	    /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
-	     * The 0x0 case also allows us to avoid doing an unecessary data
-	     * write which is more valuable so we only check for that
-	     */
-	    "cmp r5, #0\n\t"
-	    "beq 3f\n\t"
-
-#endif
-	    "ldr r4, [%[dest]] \n\t"
-
-	    /* multiply by alpha (r8) then by 257 and divide by 65536 */
-	    "mla r6, %[src_lo], r5, %[component_half]\n\t"
-	    "mla r7, %[src_hi], r5, %[component_half]\n\t"
-
-	    "uxtab16 r6, r6, r6, ror #8\n\t"
-	    "uxtab16 r7, r7, r7, ror #8\n\t"
-
-	    "uxtb16 r6, r6, ror #8\n\t"
-	    "uxtb16 r7, r7, ror #8\n\t"
-
-	    /* recombine */
-	    "orr r5, r6, r7, lsl #8\n\t"
-
-	    "uxtb16 r6, r4\n\t"
-	    "uxtb16 r7, r4, ror #8\n\t"
-
-	    /* we could simplify this to use 'sub' if we were
-	     * willing to give up a register for alpha_mask
-	     */
-	    "mvn r8, r5\n\t"
-	    "mov r8, r8, lsr #24\n\t"
-
-	    /* multiply by alpha (r8) then by 257 and divide by 65536 */
-	    "mla r6, r6, r8, %[component_half]\n\t"
-	    "mla r7, r7, r8, %[component_half]\n\t"
-
-	    "uxtab16 r6, r6, r6, ror #8\n\t"
-	    "uxtab16 r7, r7, r7, ror #8\n\t"
-
-	    "uxtb16 r6, r6, ror #8\n\t"
-	    "uxtb16 r7, r7, ror #8\n\t"
-
-	    /* recombine */
-	    "orr r6, r6, r7, lsl #8\n\t"
-
-	    "uqadd8 r5, r6, r5\n\t"
-
-#ifdef inner_branch
-	    "3:\n\t"
-
-#endif
-	    "str r5, [%[dest]], #4\n\t"
-	    /* increment counter and jmp to top */
-	    "subs	%[w], %[w], #1\n\t"
-	    "bne	1b\n\t"
-	    "2:\n\t"
-	    : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)
-	    : [component_half] "r" (component_half),
-	      [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
-	    : "r4", "r5", "r6", "r7", "r8", "cc", "memory");
-    }
-}
-
-#endif
-
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8000_8000,
-                                   uint8_t, 1, uint8_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
-                                   uint32_t, 1, uint32_t, 1)
-
-PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (armv6, over_8888_n_8888,
-                                     uint32_t, 1, uint32_t, 1)
-
-PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (armv6, over_n_8_8888,
-                                      uint8_t, 1, uint32_t, 1)
-
-static const pixman_fast_path_t arm_simd_fast_paths[] =
-{
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, armv6_composite_over_8888_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, armv6_composite_over_8888_n_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, armv6_composite_over_8888_n_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888),
-
-    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8000_8000),
-
-    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, armv6_composite_over_n_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, armv6_composite_over_n_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
-
-    { PIXMAN_OP_NONE },
-};
-
-pixman_implementation_t *
-_pixman_implementation_create_arm_simd (void)
-{
-    pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
-    pixman_implementation_t *imp = _pixman_implementation_create (general, arm_simd_fast_paths);
-
-    return imp;
-}
+/*
+ * Copyright © 2008 Mozilla Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Jeff Muizelaar (jeff@infidigm.net)
+ *
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "pixman-private.h"
+#include "pixman-arm-common.h"
+
+#if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */
+
+void
+pixman_composite_add_8_8_asm_armv6 (int32_t  width,
+				    int32_t  height,
+				    uint8_t *dst_line,
+				    int32_t  dst_stride,
+				    uint8_t *src_line,
+				    int32_t  src_stride)
+{
+    uint8_t *dst, *src;
+    int32_t w;
+    uint8_t s, d;
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	/* ensure both src and dst are properly aligned before doing 32 bit reads
+	 * we'll stay in this loop if src and dst have differing alignments
+	 */
+	while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
+	{
+	    s = *src;
+	    d = *dst;
+	    asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
+	    *dst = d;
+
+	    dst++;
+	    src++;
+	    w--;
+	}
+
+	while (w >= 4)
+	{
+	    asm ("uqadd8 %0, %1, %2"
+		 : "=r" (*(uint32_t*)dst)
+		 : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
+	    dst += 4;
+	    src += 4;
+	    w -= 4;
+	}
+
+	while (w)
+	{
+	    s = *src;
+	    d = *dst;
+	    asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
+	    *dst = d;
+
+	    dst++;
+	    src++;
+	    w--;
+	}
+    }
+
+}
+
+void
+pixman_composite_over_8888_8888_asm_armv6 (int32_t   width,
+                                           int32_t   height,
+                                           uint32_t *dst_line,
+                                           int32_t   dst_stride,
+                                           uint32_t *src_line,
+                                           int32_t   src_stride)
+{
+    uint32_t    *dst;
+    uint32_t    *src;
+    int32_t w;
+    uint32_t component_half = 0x800080;
+    uint32_t upper_component_mask = 0xff00ff00;
+    uint32_t alpha_mask = 0xff;
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+/* #define inner_branch */
+	asm volatile (
+	    "cmp %[w], #0\n\t"
+	    "beq 2f\n\t"
+	    "1:\n\t"
+	    /* load src */
+	    "ldr r5, [%[src]], #4\n\t"
+#ifdef inner_branch
+	    /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
+	     * The 0x0 case also allows us to avoid doing an unnecessary data
+	     * write which is more valuable so we only check for that
+	     */
+	    "cmp r5, #0\n\t"
+	    "beq 3f\n\t"
+
+	    /* = 255 - alpha */
+	    "sub r8, %[alpha_mask], r5, lsr #24\n\t"
+
+	    "ldr r4, [%[dest]] \n\t"
+
+#else
+	    "ldr r4, [%[dest]] \n\t"
+
+	    /* = 255 - alpha */
+	    "sub r8, %[alpha_mask], r5, lsr #24\n\t"
+#endif
+	    "uxtb16 r6, r4\n\t"
+	    "uxtb16 r7, r4, ror #8\n\t"
+
+	    /* multiply by 257 and divide by 65536 */
+	    "mla r6, r6, r8, %[component_half]\n\t"
+	    "mla r7, r7, r8, %[component_half]\n\t"
+
+	    "uxtab16 r6, r6, r6, ror #8\n\t"
+	    "uxtab16 r7, r7, r7, ror #8\n\t"
+
+	    /* recombine the 0xff00ff00 bytes of r6 and r7 */
+	    "and r7, r7, %[upper_component_mask]\n\t"
+	    "uxtab16 r6, r7, r6, ror #8\n\t"
+
+	    "uqadd8 r5, r6, r5\n\t"
+
+#ifdef inner_branch
+	    "3:\n\t"
+
+#endif
+	    "str r5, [%[dest]], #4\n\t"
+	    /* decrement the pixel counter and loop back to 1: while it is non-zero */
+	    "subs	%[w], %[w], #1\n\t"
+	    "bne	1b\n\t"
+	    "2:\n\t"
+	    : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
+	    : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
+	      [alpha_mask] "r" (alpha_mask)
+	    : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
+	    );
+    }
+}
+
+void
+pixman_composite_over_8888_n_8888_asm_armv6 (int32_t   width,
+                                             int32_t   height,
+                                             uint32_t *dst_line,
+                                             int32_t   dst_stride,
+                                             uint32_t *src_line,
+                                             int32_t   src_stride,
+                                             uint32_t  mask)
+{
+    uint32_t *dst;
+    uint32_t *src;
+    int32_t w;
+    uint32_t component_half = 0x800080;
+    uint32_t alpha_mask = 0xff;
+
+    mask = (mask) >> 24;
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+/* #define inner_branch */
+	asm volatile (
+	    "cmp %[w], #0\n\t"
+	    "beq 2f\n\t"
+	    "1:\n\t"
+	    /* load src */
+	    "ldr r5, [%[src]], #4\n\t"
+#ifdef inner_branch
+	    /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
+	     * The 0x0 case also allows us to avoid doing an unnecessary data
+	     * write which is more valuable so we only check for that
+	     */
+	    "cmp r5, #0\n\t"
+	    "beq 3f\n\t"
+
+#endif
+	    "ldr r4, [%[dest]] \n\t"
+
+	    "uxtb16 r6, r5\n\t"
+	    "uxtb16 r7, r5, ror #8\n\t"
+
+	    /* multiply src by the mask alpha, then by 257 and divide by 65536 */
+	    "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
+	    "mla r7, r7, %[mask_alpha], %[component_half]\n\t"
+
+	    "uxtab16 r6, r6, r6, ror #8\n\t"
+	    "uxtab16 r7, r7, r7, ror #8\n\t"
+
+	    "uxtb16 r6, r6, ror #8\n\t"
+	    "uxtb16 r7, r7, ror #8\n\t"
+
+	    /* recombine */
+	    "orr r5, r6, r7, lsl #8\n\t"
+
+	    "uxtb16 r6, r4\n\t"
+	    "uxtb16 r7, r4, ror #8\n\t"
+
+	    /* 255 - alpha */
+	    "sub r8, %[alpha_mask], r5, lsr #24\n\t"
+
+	    /* multiply by alpha (r8) then by 257 and divide by 65536 */
+	    "mla r6, r6, r8, %[component_half]\n\t"
+	    "mla r7, r7, r8, %[component_half]\n\t"
+
+	    "uxtab16 r6, r6, r6, ror #8\n\t"
+	    "uxtab16 r7, r7, r7, ror #8\n\t"
+
+	    "uxtb16 r6, r6, ror #8\n\t"
+	    "uxtb16 r7, r7, ror #8\n\t"
+
+	    /* recombine */
+	    "orr r6, r6, r7, lsl #8\n\t"
+
+	    "uqadd8 r5, r6, r5\n\t"
+
+#ifdef inner_branch
+	    "3:\n\t"
+
+#endif
+	    "str r5, [%[dest]], #4\n\t"
+	    /* decrement the pixel counter and loop back to 1: while it is non-zero */
+	    "subs	%[w], %[w], #1\n\t"
+	    "bne	1b\n\t"
+	    "2:\n\t"
+	    : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
+	    : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
+	      [alpha_mask] "r" (alpha_mask)
+	    : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
+	    );
+    }
+}
+
+void
+pixman_composite_over_n_8_8888_asm_armv6 (int32_t   width,
+                                          int32_t   height,
+                                          uint32_t *dst_line,
+                                          int32_t   dst_stride,
+                                          uint32_t  src,
+                                          int32_t   unused,
+                                          uint8_t  *mask_line,
+                                          int32_t   mask_stride)
+{
+    uint32_t  srca;
+    uint32_t *dst;
+    uint8_t  *mask;
+    int32_t w;
+
+    srca = src >> 24;
+
+    uint32_t component_mask = 0xff00ff;
+    uint32_t component_half = 0x800080;
+
+    uint32_t src_hi = (src >> 8) & component_mask;
+    uint32_t src_lo = src & component_mask;
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+/* #define inner_branch */
+	asm volatile (
+	    "cmp %[w], #0\n\t"
+	    "beq 2f\n\t"
+	    "1:\n\t"
+	    /* load mask */
+	    "ldrb r5, [%[mask]], #1\n\t"
+#ifdef inner_branch
+	    /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
+	     * The 0x0 case also allows us to avoid doing an unnecessary data
+	     * write which is more valuable so we only check for that
+	     */
+	    "cmp r5, #0\n\t"
+	    "beq 3f\n\t"
+
+#endif
+	    "ldr r4, [%[dest]] \n\t"
+
+	    /* multiply src by the mask byte (r5), then by 257 and divide by 65536 */
+	    "mla r6, %[src_lo], r5, %[component_half]\n\t"
+	    "mla r7, %[src_hi], r5, %[component_half]\n\t"
+
+	    "uxtab16 r6, r6, r6, ror #8\n\t"
+	    "uxtab16 r7, r7, r7, ror #8\n\t"
+
+	    "uxtb16 r6, r6, ror #8\n\t"
+	    "uxtb16 r7, r7, ror #8\n\t"
+
+	    /* recombine */
+	    "orr r5, r6, r7, lsl #8\n\t"
+
+	    "uxtb16 r6, r4\n\t"
+	    "uxtb16 r7, r4, ror #8\n\t"
+
+	    /* we could simplify this to use 'sub' if we were
+	     * willing to give up a register for alpha_mask
+	     */
+	    "mvn r8, r5\n\t"
+	    "mov r8, r8, lsr #24\n\t"
+
+	    /* multiply by alpha (r8) then by 257 and divide by 65536 */
+	    "mla r6, r6, r8, %[component_half]\n\t"
+	    "mla r7, r7, r8, %[component_half]\n\t"
+
+	    "uxtab16 r6, r6, r6, ror #8\n\t"
+	    "uxtab16 r7, r7, r7, ror #8\n\t"
+
+	    "uxtb16 r6, r6, ror #8\n\t"
+	    "uxtb16 r7, r7, ror #8\n\t"
+
+	    /* recombine */
+	    "orr r6, r6, r7, lsl #8\n\t"
+
+	    "uqadd8 r5, r6, r5\n\t"
+
+#ifdef inner_branch
+	    "3:\n\t"
+
+#endif
+	    "str r5, [%[dest]], #4\n\t"
+	    /* decrement the pixel counter and loop back to 1: while it is non-zero */
+	    "subs	%[w], %[w], #1\n\t"
+	    "bne	1b\n\t"
+	    "2:\n\t"
+	    : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)
+	    : [component_half] "r" (component_half),
+	      [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
+	    : "r4", "r5", "r6", "r7", "r8", "cc", "memory");
+    }
+}
+
+#endif
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
+                                   uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (armv6, over_8888_n_8888,
+                                     uint32_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (armv6, over_n_8_8888,
+                                      uint8_t, 1, uint32_t, 1)
+
+static const pixman_fast_path_t arm_simd_fast_paths[] =
+{
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, armv6_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, armv6_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, armv6_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888),
+
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8_8),
+
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, armv6_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, armv6_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
+
+    { PIXMAN_OP_NONE },
+};
+
+pixman_implementation_t *
+_pixman_implementation_create_arm_simd (void)
+{
+    pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
+    pixman_implementation_t *imp = _pixman_implementation_create (general, arm_simd_fast_paths);
+
+    return imp;
+}
diff --git a/pixman/pixman/pixman-fast-path.c b/pixman/pixman/pixman-fast-path.c
index 0b8a2526e..25ef9243b 100644
--- a/pixman/pixman/pixman-fast-path.c
+++ b/pixman/pixman/pixman-fast-path.c
@@ -910,19 +910,19 @@ fast_composite_src_x888_0565 (pixman_implementation_t *imp,
 }
 
 static void
-fast_composite_add_8000_8000 (pixman_implementation_t *imp,
-                              pixman_op_t              op,
-                              pixman_image_t *         src_image,
-                              pixman_image_t *         mask_image,
-                              pixman_image_t *         dst_image,
-                              int32_t                  src_x,
-                              int32_t                  src_y,
-                              int32_t                  mask_x,
-                              int32_t                  mask_y,
-                              int32_t                  dest_x,
-                              int32_t                  dest_y,
-                              int32_t                  width,
-                              int32_t                  height)
+fast_composite_add_8_8 (pixman_implementation_t *imp,
+			pixman_op_t              op,
+			pixman_image_t *         src_image,
+			pixman_image_t *         mask_image,
+			pixman_image_t *         dst_image,
+			int32_t                  src_x,
+			int32_t                  src_y,
+			int32_t                  mask_x,
+			int32_t                  mask_y,
+			int32_t                  dest_x,
+			int32_t                  dest_y,
+			int32_t                  width,
+			int32_t                  height)
 {
     uint8_t     *dst_line, *dst;
     uint8_t     *src_line, *src;
@@ -1602,7 +1602,7 @@ static const pixman_fast_path_t c_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565),
     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888),
     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888),
-    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8000_8000),
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8),
     PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1000_1000),
     PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca),
     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8),
diff --git a/pixman/pixman/pixman-mmx.c b/pixman/pixman/pixman-mmx.c
index c4e7fb9fb..0b98282cb 100644
--- a/pixman/pixman/pixman-mmx.c
+++ b/pixman/pixman/pixman-mmx.c
@@ -152,6 +152,9 @@ to_m64 (uint64_t x)
 #endif
 }
 
+#ifdef _MSC_VER
+#define to_uint64(arg)  ((arg).M64_MEMBER)  /* MSVC: read the 64-bit member; arg parenthesized so any expression works */
+#else
 static force_inline uint64_t
 to_uint64 (__m64 x)
 {
@@ -164,6 +167,7 @@ to_uint64 (__m64 x)
     return (uint64_t)x;
 #endif
 }
+#endif
 
 static force_inline __m64
 shift (__m64 v,
@@ -310,11 +314,15 @@ pack8888 (__m64 lo, __m64 hi)
     return _mm_packs_pu16 (lo, hi);
 }
 
+#ifdef _MSC_VER
+#define store8888(v) _mm_cvtsi64_si32 (pack8888 (v, _mm_setzero_si64 ()))
+#else
 static force_inline uint32_t
 store8888 (__m64 v)
 {
     return _mm_cvtsi64_si32 (pack8888 (v, _mm_setzero_si64 ()));
 }
+#endif
 
 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
  *
@@ -417,6 +425,13 @@ pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
 
 /* --------------- MMX code patch for fbcompose.c --------------------- */
 
+#ifdef _MSC_VER
+#define combine(src, mask)  /* MSVC macro form; evaluates src/mask more than once */ \
+  ((mask) ?                                                                         \
+      store8888 (pix_multiply (load8888 (*(src)), expand_alpha (load8888 (*(mask))))) \
+    :                                                                              \
+      *(src))
+#else
 static force_inline uint32_t
 combine (const uint32_t *src, const uint32_t *mask)
 {
@@ -435,6 +450,7 @@ combine (const uint32_t *src, const uint32_t *mask)
 
     return ssrc;
 }
+#endif
 
 static void
 mmx_combine_over_u (pixman_implementation_t *imp,
@@ -448,7 +464,7 @@ mmx_combine_over_u (pixman_implementation_t *imp,
 
     while (dest < end)
     {
-	uint32_t ssrc = combine (src, mask);
+	uint32_t ssrc = combine( src, mask);
 	uint32_t a = ssrc >> 24;
 
 	if (a == 0xff)
@@ -2845,19 +2861,19 @@ mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
 }
 
 static void
-mmx_composite_add_8000_8000 (pixman_implementation_t *imp,
-                             pixman_op_t              op,
-                             pixman_image_t *         src_image,
-                             pixman_image_t *         mask_image,
-                             pixman_image_t *         dst_image,
-                             int32_t                  src_x,
-                             int32_t                  src_y,
-                             int32_t                  mask_x,
-                             int32_t                  mask_y,
-                             int32_t                  dest_x,
-                             int32_t                  dest_y,
-                             int32_t                  width,
-                             int32_t                  height)
+mmx_composite_add_8_8 (pixman_implementation_t *imp,
+		       pixman_op_t              op,
+		       pixman_image_t *         src_image,
+		       pixman_image_t *         mask_image,
+		       pixman_image_t *         dst_image,
+		       int32_t                  src_x,
+		       int32_t                  src_y,
+		       int32_t                  mask_x,
+		       int32_t                  mask_y,
+		       int32_t                  dest_x,
+		       int32_t                  dest_y,
+		       int32_t                  width,
+		       int32_t                  height)
 {
     uint8_t *dst_line, *dst;
     uint8_t *src_line, *src;
@@ -3268,7 +3284,7 @@ static const pixman_fast_path_t mmx_fast_paths[] =
 
     PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
     PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
-    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8000_8000       ),
+    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8		   ),
     PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),
 
     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c
index 8e175b78d..9d9b16543 100644
--- a/pixman/pixman/pixman-sse2.c
+++ b/pixman/pixman/pixman-sse2.c
@@ -357,34 +357,6 @@ in_over_2x128 (__m128i* src_lo,
     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
 }
 
-static force_inline void
-cache_prefetch (__m128i* addr)
-{
-    _mm_prefetch ((void const*)addr, _MM_HINT_T0);
-}
-
-static force_inline void
-cache_prefetch_next (__m128i* addr)
-{
-    _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
-}
-
-/* prefetching NULL is very slow on some systems. don't do that. */
-
-static force_inline void
-maybe_prefetch (__m128i* addr)
-{
-    if (addr)
-	cache_prefetch (addr);
-}
-
-static force_inline void
-maybe_prefetch_next (__m128i* addr)
-{
-    if (addr)
-	cache_prefetch_next (addr);
-}
-
 /* load 4 pixels from a 16-byte boundary aligned address */
 static force_inline __m128i
 load_128_aligned (__m128i* src)
@@ -649,11 +621,6 @@ core_combine_over_u_sse2 (uint32_t*       pd,
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_alpha_lo, xmm_alpha_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     /* Align dst on a 16-byte boundary */
     while (w && ((unsigned long)pd & 15))
     {
@@ -667,18 +634,8 @@ core_combine_over_u_sse2 (uint32_t*       pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	/* I'm loading unaligned because I'm not sure about
 	 * the address alignment.
 	 */
@@ -740,11 +697,6 @@ core_combine_over_reverse_u_sse2 (uint32_t*       pd,
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_alpha_lo, xmm_alpha_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     /* Align dst on a 16-byte boundary */
     while (w &&
            ((unsigned long)pd & 15))
@@ -759,18 +711,8 @@ core_combine_over_reverse_u_sse2 (uint32_t*       pd,
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	/* I'm loading unaligned because I'm not sure
 	 * about the address alignment.
 	 */
@@ -842,11 +784,6 @@ core_combine_in_u_sse2 (uint32_t*       pd,
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst_lo, xmm_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	s = combine1 (ps, pm);
@@ -859,18 +796,8 @@ core_combine_in_u_sse2 (uint32_t*       pd,
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
 
@@ -916,11 +843,6 @@ core_combine_reverse_in_u_sse2 (uint32_t*       pd,
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst_lo, xmm_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	s = combine1 (ps, pm);
@@ -933,18 +855,8 @@ core_combine_reverse_in_u_sse2 (uint32_t*       pd,
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
 
@@ -985,11 +897,6 @@ core_combine_reverse_out_u_sse2 (uint32_t*       pd,
                                  const uint32_t* pm,
                                  int             w)
 {
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	uint32_t s = combine1 (ps, pm);
@@ -1006,21 +913,11 @@ core_combine_reverse_out_u_sse2 (uint32_t*       pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
 	__m128i xmm_src_lo, xmm_src_hi;
 	__m128i xmm_dst_lo, xmm_dst_hi;
 
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 
@@ -1067,11 +964,6 @@ core_combine_out_u_sse2 (uint32_t*       pd,
                          const uint32_t* pm,
                          int             w)
 {
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	uint32_t s = combine1 (ps, pm);
@@ -1087,21 +979,11 @@ core_combine_out_u_sse2 (uint32_t*       pd,
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
 	__m128i xmm_src_lo, xmm_src_hi;
 	__m128i xmm_dst_lo, xmm_dst_hi;
 
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 
@@ -1167,11 +1049,6 @@ core_combine_atop_u_sse2 (uint32_t*       pd,
     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	s = combine1 (ps, pm);
@@ -1184,18 +1061,8 @@ core_combine_atop_u_sse2 (uint32_t*       pd,
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 
@@ -1264,11 +1131,6 @@ core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	s = combine1 (ps, pm);
@@ -1281,18 +1143,8 @@ core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 
@@ -1365,11 +1217,6 @@ core_combine_xor_u_sse2 (uint32_t*       dst,
     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	s = combine1 (ps, pm);
@@ -1382,18 +1229,8 @@ core_combine_xor_u_sse2 (uint32_t*       dst,
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
 	xmm_dst = load_128_aligned ((__m128i*) pd);
 
@@ -1450,11 +1287,6 @@ core_combine_add_u_sse2 (uint32_t*       dst,
     const uint32_t* ps = src;
     const uint32_t* pm = mask;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = combine1 (ps, pm);
@@ -1468,20 +1300,10 @@ core_combine_add_u_sse2 (uint32_t*       dst,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
 	__m128i s;
 
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	s = combine4 ((__m128i*)ps, (__m128i*)pm);
 
 	save_128_aligned (
@@ -1536,11 +1358,6 @@ core_combine_saturate_u_sse2 (uint32_t *      pd,
     uint32_t pack_cmp;
     __m128i xmm_src, xmm_dst;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = combine1 (ps, pm);
@@ -1553,18 +1370,8 @@ core_combine_saturate_u_sse2 (uint32_t *      pd,
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_dst = load_128_aligned  ((__m128i*)pd);
 	xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
 
@@ -1637,11 +1444,6 @@ core_combine_src_ca_sse2 (uint32_t*       pd,
     __m128i xmm_mask_lo, xmm_mask_hi;
     __m128i xmm_dst_lo, xmm_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -1651,18 +1453,8 @@ core_combine_src_ca_sse2 (uint32_t*       pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
 
@@ -1718,11 +1510,6 @@ core_combine_over_ca_sse2 (uint32_t*       pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -1733,18 +1520,8 @@ core_combine_over_ca_sse2 (uint32_t*       pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1807,11 +1584,6 @@ core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -1822,18 +1594,8 @@ core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1885,11 +1647,6 @@ core_combine_in_ca_sse2 (uint32_t *      pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -1904,18 +1661,8 @@ core_combine_in_ca_sse2 (uint32_t *      pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1973,11 +1720,6 @@ core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -1992,18 +1734,8 @@ core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2059,11 +1791,6 @@ core_combine_out_ca_sse2 (uint32_t *      pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -2078,18 +1805,8 @@ core_combine_out_ca_sse2 (uint32_t *      pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2148,11 +1865,6 @@ core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -2168,18 +1880,8 @@ core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2258,11 +1960,6 @@ core_combine_atop_ca_sse2 (uint32_t *      pd,
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -2273,18 +1970,8 @@ core_combine_atop_ca_sse2 (uint32_t *      pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2364,11 +2051,6 @@ core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -2379,18 +2061,8 @@ core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2473,11 +2145,6 @@ core_combine_xor_ca_sse2 (uint32_t *      pd,
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -2488,18 +2155,8 @@ core_combine_xor_ca_sse2 (uint32_t *      pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2562,11 +2219,6 @@ core_combine_add_ca_sse2 (uint32_t *      pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -2580,18 +2232,8 @@ core_combine_add_ca_sse2 (uint32_t *      pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
@@ -2971,9 +2613,6 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
     {
 	dst = dst_line;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	dst_line += dst_stride;
 	w = width;
 
@@ -2986,13 +2625,8 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -3062,9 +2696,6 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
     {
 	dst = dst_line;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	dst_line += dst_stride;
 	w = width;
 
@@ -3079,14 +2710,8 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 8)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
 	    unpack_565_128_4x128 (xmm_dst,
@@ -3177,10 +2802,6 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
 	dst_line += dst_stride;
 	mask_line += mask_stride;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)pd);
-	cache_prefetch ((__m128i*)pm);
-
 	while (w && (unsigned long)pd & 15)
 	{
 	    m = *pm++;
@@ -3200,16 +2821,8 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)pd);
-	cache_prefetch ((__m128i*)pm);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)pd);
-	    cache_prefetch_next ((__m128i*)pm);
-
 	    xmm_mask = load_128_unaligned ((__m128i*)pm);
 
 	    pack_cmp =
@@ -3316,10 +2929,6 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
 	dst_line += dst_stride;
 	mask_line += mask_stride;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)pd);
-	cache_prefetch ((__m128i*)pm);
-
 	while (w && (unsigned long)pd & 15)
 	{
 	    m = *pm++;
@@ -3340,16 +2949,8 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)pd);
-	cache_prefetch ((__m128i*)pm);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)pd);
-	    cache_prefetch_next ((__m128i*)pm);
-
 	    xmm_mask = load_128_unaligned ((__m128i*)pm);
 
 	    pack_cmp =
@@ -3447,10 +3048,6 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
 	src_line += src_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-	cache_prefetch ((__m128i*)src);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    uint32_t s = *src++;
@@ -3467,16 +3064,8 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-	cache_prefetch ((__m128i*)src);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)dst);
-	    cache_prefetch_next ((__m128i*)src);
-
 	    xmm_src = load_128_unaligned ((__m128i*)src);
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
@@ -3556,25 +3145,16 @@ sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
 	src_line += src_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    *dst++ = *src++ | 0xff000000;
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-
 	while (w >= 16)
 	{
 	    __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
 	    
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)src);
-
 	    xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
 	    xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
 	    xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
@@ -3646,10 +3226,6 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
 	src_line += src_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-	cache_prefetch ((__m128i*)src);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    uint32_t s = (*src++) | 0xff000000;
@@ -3666,16 +3242,8 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-	cache_prefetch ((__m128i*)src);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)dst);
-	    cache_prefetch_next ((__m128i*)src);
-
 	    xmm_src = _mm_or_si128 (
 		load_128_unaligned ((__m128i*)src), mask_ff000000);
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -3815,10 +3383,6 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
 	dst = dst_line;
 	src = src_line;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	dst_line += dst_stride;
 	src_line += src_stride;
 	w = width;
@@ -3834,17 +3398,9 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	/* It's a 8 pixel loop */
 	while (w >= 8)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)src);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    /* I'm loading unaligned because I'm not sure
 	     * about the address alignment.
 	     */
@@ -3954,10 +3510,6 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
 	mask_line += mask_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    uint8_t m = *mask++;
@@ -3978,16 +3530,8 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
 	    dst++;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)mask);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    m = *((uint32_t*)mask);
 
 	    if (srca == 0xff && m == 0xffffffff)
@@ -4099,7 +3643,6 @@ pixman_fill_sse2 (uint32_t *bits,
 	return FALSE;
     }
 
-    cache_prefetch ((__m128i*)byte_line);
     xmm_def = create_mask_2x32_128 (data, data);
 
     while (height--)
@@ -4109,8 +3652,6 @@ pixman_fill_sse2 (uint32_t *bits,
 	byte_line += stride;
 	w = byte_width;
 
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 1 && ((unsigned long)d & 1))
 	{
 	    *(uint8_t *)d = data;
@@ -4133,12 +3674,8 @@ pixman_fill_sse2 (uint32_t *bits,
 	    d += 4;
 	}
 
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 128)
 	{
-	    cache_prefetch (((__m128i*)d) + 12);
-
 	    save_128_aligned ((__m128i*)(d),     xmm_def);
 	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
 	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
@@ -4154,8 +3691,6 @@ pixman_fill_sse2 (uint32_t *bits,
 
 	if (w >= 64)
 	{
-	    cache_prefetch (((__m128i*)d) + 8);
-
 	    save_128_aligned ((__m128i*)(d),     xmm_def);
 	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
 	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
@@ -4165,8 +3700,6 @@ pixman_fill_sse2 (uint32_t *bits,
 	    w -= 64;
 	}
 
-	cache_prefetch_next ((__m128i*)d);
-
 	if (w >= 32)
 	{
 	    save_128_aligned ((__m128i*)(d),     xmm_def);
@@ -4184,8 +3717,6 @@ pixman_fill_sse2 (uint32_t *bits,
 	    w -= 16;
 	}
 
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 4)
 	{
 	    *(uint32_t *)d = data;
@@ -4265,10 +3796,6 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
 	mask_line += mask_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    uint8_t m = *mask++;
@@ -4288,16 +3815,8 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
 	    dst++;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)mask);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    m = *((uint32_t*)mask);
 
 	    if (srca == 0xff && m == 0xffffffff)
@@ -4410,10 +3929,6 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
 	mask_line += mask_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    m = *mask++;
@@ -4434,16 +3949,8 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
 	    dst++;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 8)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)mask);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_dst = load_128_aligned ((__m128i*) dst);
 	    unpack_565_128_4x128 (xmm_dst,
 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
@@ -4570,10 +4077,6 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
 	src_line += src_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    s = *src++;
@@ -4587,16 +4090,8 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 8)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)src);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    /* First round */
 	    xmm_src = load_128_unaligned ((__m128i*)src);
 	    xmm_dst = load_128_aligned  ((__m128i*)dst);
@@ -4715,10 +4210,6 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
 	src_line += src_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    s = *src++;
@@ -4731,16 +4222,8 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)src);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_src_hi = load_128_unaligned ((__m128i*)src);
 
 	    opaque = is_opaque (xmm_src_hi);
@@ -4845,10 +4328,6 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
 	mask_line += mask_stride;
 	dst_line += dst_stride;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && ((unsigned long)dst & 15))
 	{
 	    m = *(uint32_t *) mask;
@@ -4870,16 +4349,8 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
 	    mask++;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 8)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)mask);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    /* First round */
 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5001,10 +4472,6 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
 	mask_line += mask_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && ((unsigned long)dst & 15))
 	{
 	    m = (uint32_t) *mask++;
@@ -5018,16 +4485,8 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 16)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)mask);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
@@ -5121,9 +4580,6 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
 	dst_line += dst_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && ((unsigned long)dst & 15))
 	{
 	    d = (uint32_t) *dst;
@@ -5135,14 +4591,8 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 16)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -5214,10 +4664,6 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
 	src_line += src_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && ((unsigned long)dst & 15))
 	{
 	    s = (uint32_t) *src++;
@@ -5229,16 +4675,8 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 16)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)src);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_src = load_128_unaligned ((__m128i*)src);
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
@@ -5321,10 +4759,6 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
 	mask_line += mask_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && ((unsigned long)dst & 15))
 	{
 	    m = (uint32_t) *mask++;
@@ -5338,16 +4772,8 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 16)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)mask);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
@@ -5440,9 +4866,6 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
 	dst_line += dst_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && ((unsigned long)dst & 15))
 	{
 	    *dst = (uint8_t)_mm_cvtsi64_si32 (
@@ -5454,14 +4877,8 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
 	    dst++;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 16)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    save_128_aligned (
 		(__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
 
@@ -5485,23 +4902,23 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
 }
 
 /* ----------------------------------------------------------------------
- * composite_add_8000_8000
+ * composite_add_8_8
  */
 
 static void
-sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
-                              pixman_op_t              op,
-                              pixman_image_t *         src_image,
-                              pixman_image_t *         mask_image,
-                              pixman_image_t *         dst_image,
-                              int32_t                  src_x,
-                              int32_t                  src_y,
-                              int32_t                  mask_x,
-                              int32_t                  mask_y,
-                              int32_t                  dest_x,
-                              int32_t                  dest_y,
-                              int32_t                  width,
-                              int32_t                  height)
+sse2_composite_add_8_8 (pixman_implementation_t *imp,
+			pixman_op_t              op,
+			pixman_image_t *         src_image,
+			pixman_image_t *         mask_image,
+			pixman_image_t *         dst_image,
+			int32_t                  src_x,
+			int32_t                  src_y,
+			int32_t                  mask_x,
+			int32_t                  mask_y,
+			int32_t                  dest_x,
+			int32_t                  dest_y,
+			int32_t                  width,
+			int32_t                  height)
 {
     uint8_t     *dst_line, *dst;
     uint8_t     *src_line, *src;
@@ -5519,10 +4936,6 @@ sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
 	dst = dst_line;
 	src = src_line;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	dst_line += dst_stride;
 	src_line += src_stride;
 	w = width;
@@ -5644,9 +5057,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
 	return FALSE;
     }
 
-    cache_prefetch ((__m128i*)src_bytes);
-    cache_prefetch ((__m128i*)dst_bytes);
-
     while (height--)
     {
 	int w;
@@ -5656,9 +5066,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
 	dst_bytes += dst_stride;
 	w = byte_width;
 
-	cache_prefetch_next ((__m128i*)s);
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 2 && ((unsigned long)d & 3))
 	{
 	    *(uint16_t *)d = *(uint16_t *)s;
@@ -5676,17 +5083,10 @@ pixman_blt_sse2 (uint32_t *src_bits,
 	    d += 4;
 	}
 
-	cache_prefetch_next ((__m128i*)s);
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 64)
 	{
 	    __m128i xmm0, xmm1, xmm2, xmm3;
 
-	    /* 128 bytes ahead */
-	    cache_prefetch (((__m128i*)s) + 8);
-	    cache_prefetch (((__m128i*)d) + 8);
-
 	    xmm0 = load_128_unaligned ((__m128i*)(s));
 	    xmm1 = load_128_unaligned ((__m128i*)(s + 16));
 	    xmm2 = load_128_unaligned ((__m128i*)(s + 32));
@@ -5702,9 +5102,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
 	    w -= 64;
 	}
 
-	cache_prefetch_next ((__m128i*)s);
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 16)
 	{
 	    save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
@@ -5714,9 +5111,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
 	    s += 16;
 	}
 
-	cache_prefetch_next ((__m128i*)s);
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 4)
 	{
 	    *(uint32_t *)d = *(uint32_t *)s;
@@ -5809,11 +5203,6 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
 
         w = width;
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)mask);
-
         while (w && (unsigned long)dst & 15)
         {
             s = 0xff000000 | *src++;
@@ -5833,18 +5222,8 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
             w--;
         }
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)mask);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)src);
-            cache_prefetch_next ((__m128i*)dst);
-            cache_prefetch_next ((__m128i*)mask);
-
             m = *(uint32_t*) mask;
             xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
 
@@ -5955,11 +5334,6 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
 
         w = width;
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w && (unsigned long)dst & 15)
         {
 	    uint32_t sa;
@@ -5994,18 +5368,8 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
             w--;
         }
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i *)src);
-            cache_prefetch_next ((__m128i *)dst);
-            cache_prefetch_next ((__m128i *)mask);
-
             m = *(uint32_t *) mask;
 
 	    if (m)
@@ -6117,9 +5481,6 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
     {
 	dst = dst_line;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	dst_line += dst_stride;
 	w = width;
 
@@ -6135,15 +5496,10 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
 	    dst++;
 	}
 
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 4)
 	{
 	    __m128i tmp_lo, tmp_hi;
 
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)(dst + 4));
-
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -6224,11 +5580,6 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
 
         w = width;
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w && (unsigned long)dst & 15)
         {
 	    uint32_t sa;
@@ -6263,18 +5614,8 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
             w--;
         }
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i *)src);
-            cache_prefetch_next ((__m128i *)dst);
-            cache_prefetch_next ((__m128i *)mask);
-
 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
 
 	    if (!is_transparent (xmm_mask))
@@ -6504,7 +5845,7 @@ static const pixman_fast_path_t sse2_fast_paths[] =
 
     /* PIXMAN_OP_ADD */
     PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
-    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8000_8000),
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
diff --git a/pixman/test/lowlevel-blt-bench.c b/pixman/test/lowlevel-blt-bench.c
index d4df81507..343aafee9 100644
--- a/pixman/test/lowlevel-blt-bench.c
+++ b/pixman/test/lowlevel-blt-bench.c
@@ -544,7 +544,7 @@ struct
 tests_tbl[] =
 {
     { "add_8_8_8",             PIXMAN_a8,          0, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a8 },
-    { "add_n_8_8000",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a8 },
+    { "add_n_8_8",             PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a8 },
     { "add_n_8_8888",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a8r8g8b8 },
     { "add_n_8_x888",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_x8r8g8b8 },
     { "add_n_8_0565",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_r5g6b5 },
@@ -553,7 +553,7 @@ tests_tbl[] =
     { "add_n_8_2222",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a2r2g2b2 },
     { "add_n_8_2x10",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_x2r10g10b10 },
     { "add_n_8_2a10",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a2r10g10b10 },
-    { "add_n_8000",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a8 },
+    { "add_n_8",               PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a8 },
     { "add_n_8888",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
     { "add_n_x888",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
     { "add_n_0565",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
@@ -562,7 +562,7 @@ tests_tbl[] =
     { "add_n_2222",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a2r2g2b2 },
     { "add_n_2x10",            PIXMAN_a2r10g10b10, 1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_x2r10g10b10 },
     { "add_n_2a10",            PIXMAN_a2r10g10b10, 1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a2r10g10b10 },
-    { "add_8000_8000",         PIXMAN_a8,          0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a8 },
+    { "add_8_8",               PIXMAN_a8,          0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a8 },
     { "add_x888_x888",         PIXMAN_x8r8g8b8,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
     { "add_8888_8888",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
     { "add_8888_0565",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_r5g6b5 },