10 files changed, 1928 insertions, 825 deletions
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index 88ff897be..5137c9ea3 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -10,7 +10,7 @@ snapshot:
 	test -d "$(srcdir)/.git" && distdir=$$distdir-`cd "$(srcdir)" && git rev-parse HEAD | cut -c 1-6`; \
 	$(MAKE) $(AM_MAKEFLAGS) distdir="$$distdir" dist
 
-GPGKEY=6FF7C1A8
+GPGKEY=3892336E
 USERNAME=$$USER
 RELEASE_OR_SNAPSHOT = $$(if test "x$(PIXMAN_VERSION_MINOR)" = "x$$(echo "$(PIXMAN_VERSION_MINOR)/2*2" | bc)" ; then echo release; else echo snapshot; fi)
 RELEASE_CAIRO_HOST =	$(USERNAME)@cairographics.org
@@ -121,7 +121,7 @@ release-publish-message: $(HASHFILES) ensure-prev
 	@echo ""
 	@echo "GPG signature:"
 	@echo "	$(RELEASE_CAIRO_URL)/$(gpg_file)"
-	@echo "	(signed by `git config --get user.name` <`git config --get user.email`>)"
+	@echo "	(signed by`gpg --list-keys $(GPGKEY) | grep uid | cut -b4- | tr -s " "`)"
 	@echo ""
 	@echo "Git:"
 	@echo "	git://git.freedesktop.org/git/pixman"
diff --git a/pixman/configure.ac b/pixman/configure.ac
index a93e2905b..38f89b31e 100644
--- a/pixman/configure.ac
+++ b/pixman/configure.ac
@@ -54,7 +54,7 @@ AC_PREREQ([2.57])
 
 m4_define([pixman_major], 0)
 m4_define([pixman_minor], 29)
-m4_define([pixman_micro], 1)
+m4_define([pixman_micro], 3)
 
 m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
 
diff --git a/pixman/pixman/Makefile.am b/pixman/pixman/Makefile.am
index beebdd086..b9ea75424 100644
--- a/pixman/pixman/Makefile.am
+++ b/pixman/pixman/Makefile.am
@@ -58,7 +58,9 @@ noinst_LTLIBRARIES += libpixman-arm-simd.la
 libpixman_arm_simd_la_SOURCES = \
 	pixman-arm-simd.c	\
 	pixman-arm-common.h	\
-	pixman-arm-simd-asm.S
+	pixman-arm-simd-asm.S   \
+	pixman-arm-simd-asm-scaled.S \
+	pixman-arm-simd-asm.h
 libpixman_1_la_LIBADD += libpixman-arm-simd.la
 
 ASM_CFLAGS_arm_simd=
diff --git a/pixman/pixman/pixman-arm-simd-asm-scaled.S b/pixman/pixman/pixman-arm-simd-asm-scaled.S
new file mode 100644
index 000000000..711099548
--- /dev/null
+++ b/pixman/pixman/pixman-arm-simd-asm-scaled.S
@@ -0,0 +1,165 @@
+/*
+ * Copyright © 2008 Mozilla Corporation
+ * Copyright © 2010 Nokia Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Jeff Muizelaar (jeff@infidigm.net)
+ *
+ */
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+	.text
+	.arch armv6
+	.object_arch armv4
+	.arm
+	.altmacro
+	.p2align 2
+
+/* Supplementary macro for setting function attributes */
+.macro pixman_asm_function fname
+	.func fname
+	.global fname
+#ifdef __ELF__
+	.hidden fname
+	.type fname, %function
+#endif
+fname:
+.endm
+
+/*
+ * Note: This code is only using armv5te instructions (not even armv6),
+ *       but is scheduled for ARM Cortex-A8 pipeline. So it might need to
+ *       be split into a few variants, tuned for each microarchitecture.
+ *
+ * TODO: In order to get good performance on ARM9/ARM11 cores (which don't
+ * have efficient write combining), it needs to be changed to use 16-byte
+ * aligned writes using STM instruction.
+ *
+ * Nearest scanline scaler macro template uses the following arguments:
+ *  fname                     - name of the function to generate
+ *  bpp_shift                 - (1 << bpp_shift) is the size of pixel in bytes
+ *  t                         - type suffix for LDR/STR instructions
+ *  prefetch_distance         - prefetch in the source image by that many
+ *                              pixels ahead
+ *  prefetch_braking_distance - stop prefetching when that many pixels are
+ *                              remaining before the end of scanline
+ */
+
+.macro generate_nearest_scanline_func fname, bpp_shift, t,      \
+                                      prefetch_distance,        \
+                                      prefetch_braking_distance
+
+pixman_asm_function fname
+	W		.req	r0
+	DST		.req	r1
+	SRC		.req	r2
+	VX		.req	r3
+	UNIT_X		.req	ip
+	TMP1		.req	r4
+	TMP2		.req	r5
+	VXMASK		.req	r6
+	PF_OFFS		.req	r7
+	SRC_WIDTH_FIXED	.req	r8
+
+	ldr	UNIT_X, [sp]
+	push	{r4, r5, r6, r7, r8, r10}
+	mvn	VXMASK, #((1 << bpp_shift) - 1)
+	ldr	SRC_WIDTH_FIXED, [sp, #28]
+
+	/* define helper macro */
+	.macro	scale_2_pixels
+		ldr&t	TMP1, [SRC, TMP1]
+		and	TMP2, VXMASK, VX, asr #(16 - bpp_shift)
+		adds	VX, VX, UNIT_X
+		str&t	TMP1, [DST], #(1 << bpp_shift)
+9:		subpls	VX, VX, SRC_WIDTH_FIXED
+		bpl	9b
+
+		ldr&t	TMP2, [SRC, TMP2]
+		and	TMP1, VXMASK, VX, asr #(16 - bpp_shift)
+		adds	VX, VX, UNIT_X
+		str&t	TMP2, [DST], #(1 << bpp_shift)
+9:		subpls	VX, VX, SRC_WIDTH_FIXED
+		bpl	9b
+	.endm
+
+	/* now do the scaling */
+	and	TMP1, VXMASK, VX, asr #(16 - bpp_shift)
+	adds	VX, VX, UNIT_X
+9:	subpls	VX, VX, SRC_WIDTH_FIXED
+	bpl	9b
+	subs	W, W, #(8 + prefetch_braking_distance)
+	blt	2f
+	/* calculate prefetch offset */
+	mov	PF_OFFS, #prefetch_distance
+	mla	PF_OFFS, UNIT_X, PF_OFFS, VX
+1:	/* main loop, process 8 pixels per iteration with prefetch */
+	pld	[SRC, PF_OFFS, asr #(16 - bpp_shift)]
+	add	PF_OFFS, UNIT_X, lsl #3
+	scale_2_pixels
+	scale_2_pixels
+	scale_2_pixels
+	scale_2_pixels
+	subs	W, W, #8
+	bge	1b
+2:
+	subs	W, W, #(4 - 8 - prefetch_braking_distance)
+	blt	2f
+1:	/* process the remaining pixels */
+	scale_2_pixels
+	scale_2_pixels
+	subs	W, W, #4
+	bge	1b
+2:
+	tst	W, #2
+	beq	2f
+	scale_2_pixels
+2:
+	tst	W, #1
+	ldrne&t	TMP1, [SRC, TMP1]
+	strne&t	TMP1, [DST]
+	/* cleanup helper macro */
+	.purgem	scale_2_pixels
+	.unreq	DST
+	.unreq	SRC
+	.unreq	W
+	.unreq	VX
+	.unreq	UNIT_X
+	.unreq	TMP1
+	.unreq	TMP2
+	.unreq	VXMASK
+	.unreq	PF_OFFS
+	.unreq  SRC_WIDTH_FIXED
+	/* return */
+	pop	{r4, r5, r6, r7, r8, r10}
+	bx	lr
+.endfunc
+.endm
+
+generate_nearest_scanline_func \
+    pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32
+
+generate_nearest_scanline_func \
+    pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2,  , 48, 32
diff --git a/pixman/pixman/pixman-arm-simd-asm.S b/pixman/pixman/pixman-arm-simd-asm.S
index b438001d3..c20968879 100644
--- a/pixman/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman/pixman-arm-simd-asm.S
@@ -1,14 +1,14 @@
 /*
- * Copyright © 2008 Mozilla Corporation
- * Copyright © 2010 Nokia Corporation
+ * Copyright © 2012 Raspberry Pi Foundation
+ * Copyright © 2012 RISC OS Open Ltd
  *
  * Permission to use, copy, modify, distribute, and sell this software and its
  * documentation for any purpose is hereby granted without fee, provided that
  * the above copyright notice appear in all copies and that both that
  * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Mozilla Corporation not be used in
+ * documentation, and that the name of the copyright holders not be used in
  * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission.  Mozilla Corporation makes no
+ * specific, written prior permission.  The copyright holders make no
  * representations about the suitability of this software for any purpose.  It
  * is provided "as is" without express or implied warranty.
  *
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
  * SOFTWARE.
  *
- * Author:  Jeff Muizelaar (jeff@infidigm.net)
+ * Author:  Ben Avison (bavison@riscosopen.org)
  *
  */
 
@@ -37,412 +37,577 @@
 	.altmacro
 	.p2align 2
 
-/* Supplementary macro for setting function attributes */
-.macro pixman_asm_function fname
-	.func fname
-	.global fname
-#ifdef __ELF__
-	.hidden fname
-	.type fname, %function
-#endif
-fname:
-.endm
+#include "pixman-arm-simd-asm.h"
 
-/*
- * The code below was generated by gcc 4.3.4 from the commented out
- * functions in 'pixman-arm-simd.c' file with the following optimization
- * options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer"
- *
- * TODO: replace gcc generated code with hand tuned versions because
- * the code quality is not very good, introduce symbolic register
- * aliases for better readability and maintainability.
+/* A head macro should do all processing which results in an output of up to
+ * 16 bytes, as far as the final load instruction. The corresponding tail macro
+ * should complete the processing of the up-to-16 bytes. The calling macro will
+ * sometimes choose to insert a preload or a decrement of X between them.
+ *   cond           ARM condition code for code block
+ *   numbytes       Number of output bytes that should be generated this time
+ *   firstreg       First WK register in which to place output
+ *   unaligned_src  Whether to use non-wordaligned loads of source image
+ *   unaligned_mask Whether to use non-wordaligned loads of mask image
+ *   preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
  */
 
-pixman_asm_function pixman_composite_add_8_8_asm_armv6
-	push	{r4, r5, r6, r7, r8, r9, r10, r11}
-	mov	r10, r1
-	sub	sp, sp, #4
-	subs	r10, r10, #1
-	mov	r11, r0
-	mov	r8, r2
-	str	r3, [sp]
-	ldr	r7, [sp, #36]
-	bcc	0f
-6:	cmp	r11, #0
-	beq	1f
-	orr	r3, r8, r7
-	tst	r3, #3
-	beq	2f
-	mov	r1, r8
-	mov	r0, r7
-	mov	r12, r11
-	b	3f
-5:	tst	r3, #3
-	beq	4f
-3:	ldrb	r2, [r0], #1
-	subs	r12, r12, #1
-	ldrb	r3, [r1]
-	uqadd8	r3, r2, r3
-	strb	r3, [r1], #1
-	orr	r3, r1, r0
-	bne	5b
-1:	ldr	r3, [sp]
-	add	r8, r8, r3
-	ldr	r3, [sp, #40]
-	add	r7, r7, r3
-10:	subs	r10, r10, #1
-	bcs	6b
-0:	add	sp, sp, #4
-	pop	{r4, r5, r6, r7, r8, r9, r10, r11}
-	bx	lr
-2:	mov	r12, r11
-	mov	r1, r8
-	mov	r0, r7
-4:	cmp	r12, #3
-	subgt	r6, r12, #4
-	movgt	r9, r12
-	lsrgt	r5, r6, #2
-	addgt	r3, r5, #1
-	movgt	r12, #0
-	lslgt	r4, r3, #2
-	ble	7f
-8:	ldr	r3, [r0, r12]
-	ldr	r2, [r1, r12]
-	uqadd8	r3, r3, r2
-	str	r3, [r1, r12]
-	add	r12, r12, #4
-	cmp	r12, r4
-	bne	8b
-	sub	r3, r9, #4
-	bic	r3, r3, #3
-	add	r3, r3, #4
-	subs	r12, r6, r5, lsl #2
-	add	r1, r1, r3
-	add	r0, r0, r3
-	beq	1b
-7:	mov	r4, #0
-9:	ldrb	r3, [r1, r4]
-	ldrb	r2, [r0, r4]
-	uqadd8	r3, r2, r3
-	strb	r3, [r1, r4]
-	add	r4, r4, #1
-	cmp	r4, r12
-	bne	9b
-	ldr	r3, [sp]
-	add	r8, r8, r3
-	ldr	r3, [sp, #40]
-	add	r7, r7, r3
-	b	10b
-.endfunc
-
-pixman_asm_function pixman_composite_over_8888_8888_asm_armv6
-	push	{r4, r5, r6, r7, r8, r9, r10, r11}
-	sub	sp, sp, #20
-	cmp	r1, #0
-	mov	r12, r2
-	str	r1, [sp, #12]
-	str	r0, [sp, #16]
-	ldr	r2, [sp, #52]
-	beq	0f
-	lsl	r3, r3, #2
-	str	r3, [sp]
-	ldr	r3, [sp, #56]
-	mov	r10, #0
-	lsl	r3, r3, #2
-	str	r3, [sp, #8]
-	mov	r11, r3
-	b	1f
-6:	ldr	r11, [sp, #8]
-1:	ldr	r9, [sp]
-	mov	r0, r12
-	add	r12, r12, r9
-	mov	r1, r2
-	str	r12, [sp, #4]
-	add	r2, r2, r11
-	ldr	r12, [sp, #16]
-	ldr	r3, =0x00800080
-	ldr	r9, =0xff00ff00
-	mov	r11, #255
-	cmp	r12, #0
-	beq	4f
-5:	ldr	r5, [r1], #4
-	ldr	r4, [r0]
-	sub	r8, r11, r5, lsr #24
-	uxtb16	r6, r4
-	uxtb16	r7, r4, ror #8
-	mla	r6, r6, r8, r3
-	mla	r7, r7, r8, r3
-	uxtab16	r6, r6, r6, ror #8
-	uxtab16	r7, r7, r7, ror #8
-	and	r7, r7, r9
-	uxtab16	r6, r7, r6, ror #8
-	uqadd8	r5, r6, r5
-	str	r5, [r0], #4
-	subs	r12, r12, #1
-	bne	5b
-4:	ldr	r3, [sp, #12]
-	add	r10, r10, #1
-	cmp	r10, r3
-	ldr	r12, [sp, #4]
-	bne	6b
-0:	add	sp, sp, #20
-	pop	{r4, r5, r6, r7, r8, r9, r10, r11}
-	bx	lr
-.endfunc
-
-pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6
-	push	{r4, r5, r6, r7, r8, r9, r10, r11}
-	sub	sp, sp, #28
-	cmp	r1, #0
-	str	r1, [sp, #12]
-	ldrb	r1, [sp, #71]
-	mov	r12, r2
-	str	r0, [sp, #16]
-	ldr	r2, [sp, #60]
-	str	r1, [sp, #24]
-	beq	0f
-	lsl	r3, r3, #2
-	str	r3, [sp, #20]
-	ldr	r3, [sp, #64]
-	mov	r10, #0
-	lsl	r3, r3, #2
-	str	r3, [sp, #8]
-	mov	r11, r3
-	b	1f
-5:	ldr	r11, [sp, #8]
-1:	ldr	r4, [sp, #20]
-	mov	r0, r12
-	mov	r1, r2
-	add	r12, r12, r4
-	add	r2, r2, r11
-	str	r12, [sp]
-	str	r2, [sp, #4]
-	ldr	r12, [sp, #16]
-	ldr	r2, =0x00800080
-	ldr	r3, [sp, #24]
-	mov	r11, #255
-	cmp	r12, #0
-	beq	3f
-4:	ldr	r5, [r1], #4
-	ldr	r4, [r0]
-	uxtb16	r6, r5
-	uxtb16	r7, r5, ror #8
-	mla	r6, r6, r3, r2
-	mla	r7, r7, r3, r2
-	uxtab16	r6, r6, r6, ror #8
-	uxtab16	r7, r7, r7, ror #8
-	uxtb16	r6, r6, ror #8
-	uxtb16	r7, r7, ror #8
-	orr	r5, r6, r7, lsl #8
-	uxtb16	r6, r4
-	uxtb16	r7, r4, ror #8
-	sub	r8, r11, r5, lsr #24
-	mla	r6, r6, r8, r2
-	mla	r7, r7, r8, r2
-	uxtab16	r6, r6, r6, ror #8
-	uxtab16	r7, r7, r7, ror #8
-	uxtb16	r6, r6, ror #8
-	uxtb16	r7, r7, ror #8
-	orr	r6, r6, r7, lsl #8
-	uqadd8	r5, r6, r5
-	str	r5, [r0], #4
-	subs	r12, r12, #1
-	bne	4b
-3:	ldr	r1, [sp, #12]
-	add	r10, r10, #1
-	cmp	r10, r1
-	ldr	r12, [sp]
-	ldr	r2, [sp, #4]
-	bne	5b
-0:	add	sp, sp, #28
-	pop	{r4, r5, r6, r7, r8, r9, r10, r11}
-	bx	lr
-.endfunc
-
-pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
-	push	{r4, r5, r6, r7, r8, r9, r10, r11}
-	sub	sp, sp, #28
-	cmp	r1, #0
-	ldr	r9, [sp, #60]
-	str	r1, [sp, #12]
-	bic	r1, r9, #-16777216
-	str	r1, [sp, #20]
-	mov	r12, r2
-	lsr	r1, r9, #8
-	ldr	r2, [sp, #20]
-	bic	r1, r1, #-16777216
-	bic	r2, r2, #65280
-	bic	r1, r1, #65280
-	str	r2, [sp, #20]
-	str	r0, [sp, #16]
-	str	r1, [sp, #4]
-	ldr	r2, [sp, #68]
-	beq	0f
-	lsl	r3, r3, #2
-	str	r3, [sp, #24]
-	mov	r0, #0
-	b	1f
-5:	ldr	r3, [sp, #24]
-1:	ldr	r4, [sp, #72]
-	mov	r10, r12
-	mov	r1, r2
-	add	r12, r12, r3
-	add	r2, r2, r4
-	str	r12, [sp, #8]
-	str	r2, [sp]
-	ldr	r12, [sp, #16]
-	ldr	r11, =0x00800080
-	ldr	r2, [sp, #4]
-	ldr	r3, [sp, #20]
-	cmp	r12, #0
-	beq	3f
-4:	ldrb	r5, [r1], #1
-	ldr	r4, [r10]
-	mla	r6, r3, r5, r11
-	mla	r7, r2, r5, r11
-	uxtab16	r6, r6, r6, ror #8
-	uxtab16	r7, r7, r7, ror #8
-	uxtb16	r6, r6, ror #8
-	uxtb16	r7, r7, ror #8
-	orr	r5, r6, r7, lsl #8
-	uxtb16	r6, r4
-	uxtb16	r7, r4, ror #8
-	mvn	r8, r5
-	lsr	r8, r8, #24
-	mla	r6, r6, r8, r11
-	mla	r7, r7, r8, r11
-	uxtab16	r6, r6, r6, ror #8
-	uxtab16	r7, r7, r7, ror #8
-	uxtb16	r6, r6, ror #8
-	uxtb16	r7, r7, ror #8
-	orr	r6, r6, r7, lsl #8
-	uqadd8	r5, r6, r5
-	str	r5, [r10], #4
-	subs	r12, r12, #1
-	bne	4b
-3:	ldr	r4, [sp, #12]
-	add	r0, r0, #1
-	cmp	r0, r4
-	ldr	r12, [sp, #8]
-	ldr	r2, [sp]
-	bne	5b
-0:	add	sp, sp, #28
-	pop	{r4, r5, r6, r7, r8, r9, r10, r11}
-	bx	lr
-.endfunc
+.macro blit_init
+        line_saved_regs STRIDE_D, STRIDE_S
+.endm
 
-/*
- * Note: This code is only using armv5te instructions (not even armv6),
- *       but is scheduled for ARM Cortex-A8 pipeline. So it might need to
- *       be split into a few variants, tuned for each microarchitecture.
- *
- * TODO: In order to get good performance on ARM9/ARM11 cores (which don't
- * have efficient write combining), it needs to be changed to use 16-byte
- * aligned writes using STM instruction.
- *
- * Nearest scanline scaler macro template uses the following arguments:
- *  fname                     - name of the function to generate
- *  bpp_shift                 - (1 << bpp_shift) is the size of pixel in bytes
- *  t                         - type suffix for LDR/STR instructions
- *  prefetch_distance         - prefetch in the source image by that many
- *                              pixels ahead
- *  prefetch_braking_distance - stop prefetching when that many pixels are
- *                              remaining before the end of scanline
+.macro blit_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+        pixld   cond, numbytes, firstreg, SRC, unaligned_src
+.endm
+
+.macro blit_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
+    WK4     .req    STRIDE_D
+    WK5     .req    STRIDE_S
+    WK6     .req    MASK
+    WK7     .req    STRIDE_M
+110:    pixld   , 16, 0, SRC, unaligned_src
+        pixld   , 16, 4, SRC, unaligned_src
+        pld     [SRC, SCRATCH]
+        pixst   , 16, 0, DST
+        pixst   , 16, 4, DST
+        subs    X, X, #32*8/src_bpp
+        bhs     110b
+    .unreq  WK4
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+generate_composite_function \
+    pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
+    4, /* prefetch distance */ \
+    blit_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    blit_process_head, \
+    nop_macro, /* process tail */ \
+    blit_inner_loop
+
+generate_composite_function \
+    pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
+    4, /* prefetch distance */ \
+    blit_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    blit_process_head, \
+    nop_macro, /* process tail */ \
+    blit_inner_loop
+
+generate_composite_function \
+    pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
+    3, /* prefetch distance */ \
+    blit_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    blit_process_head, \
+    nop_macro, /* process tail */ \
+    blit_inner_loop
+
+/******************************************************************************/
+
+.macro src_n_8888_init
+        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
+        mov     STRIDE_S, SRC
+        mov     MASK, SRC
+        mov     STRIDE_M, SRC
+.endm
+
+.macro src_n_0565_init
+        ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
+        orr     SRC, SRC, lsl #16
+        mov     STRIDE_S, SRC
+        mov     MASK, SRC
+        mov     STRIDE_M, SRC
+.endm
+
+.macro src_n_8_init
+        ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
+        orr     SRC, SRC, lsl #8
+        orr     SRC, SRC, lsl #16
+        mov     STRIDE_S, SRC
+        mov     MASK, SRC
+        mov     STRIDE_M, SRC
+.endm
+
+.macro fill_process_tail  cond, numbytes, firstreg
+    WK4     .req    SRC
+    WK5     .req    STRIDE_S
+    WK6     .req    MASK
+    WK7     .req    STRIDE_M
+        pixst   cond, numbytes, 4, DST
+    .unreq  WK4
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
+    0, /* prefetch distance doesn't apply */ \
+    src_n_8888_init \
+    nop_macro, /* newline */ \
+    nop_macro /* cleanup */ \
+    nop_macro /* process head */ \
+    fill_process_tail
+
+generate_composite_function \
+    pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
+    0, /* prefetch distance doesn't apply */ \
+    src_n_0565_init \
+    nop_macro, /* newline */ \
+    nop_macro /* cleanup */ \
+    nop_macro /* process head */ \
+    fill_process_tail
+
+generate_composite_function \
+    pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
+    0, /* prefetch distance doesn't apply */ \
+    src_n_8_init \
+    nop_macro, /* newline */ \
+    nop_macro /* cleanup */ \
+    nop_macro /* process head */ \
+    fill_process_tail
+
+/******************************************************************************/
+
+.macro src_x888_8888_pixel, cond, reg
+        orr&cond WK&reg, WK&reg, #0xFF000000
+.endm
+
+.macro pixman_composite_src_x888_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+        pixld   cond, numbytes, firstreg, SRC, unaligned_src
+.endm
+
+.macro pixman_composite_src_x888_8888_process_tail   cond, numbytes, firstreg
+        src_x888_8888_pixel cond, %(firstreg+0)
+ .if numbytes >= 8
+        src_x888_8888_pixel cond, %(firstreg+1)
+  .if numbytes == 16
+        src_x888_8888_pixel cond, %(firstreg+2)
+        src_x888_8888_pixel cond, %(firstreg+3)
+  .endif
+ .endif
+.endm
+
+generate_composite_function \
+    pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
+    3, /* prefetch distance */ \
+    nop_macro, /* init */ \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    pixman_composite_src_x888_8888_process_head, \
+    pixman_composite_src_x888_8888_process_tail
+
+/******************************************************************************/
+
+.macro src_0565_8888_init
+        /* Hold loop invariants in MASK and STRIDE_M */
+        ldr     MASK, =0x07E007E0
+        mov     STRIDE_M, #0xFF000000
+        /* Set GE[3:0] to 1010 so SEL instructions do what we want */
+        ldr     SCRATCH, =0x80008000
+        uadd8   SCRATCH, SCRATCH, SCRATCH
+.endm
+
+.macro src_0565_8888_2pixels, reg1, reg2
+        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
+        bic     WK&reg2, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
+        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
+        mov     WK&reg1, WK&reg2, lsl #16          @ rrrrr000000bbbbb0000000000000000
+        mov     SCRATCH, SCRATCH, ror #19          @ GGGG0000ggggggggggg00000GGGGGGGG
+        bic     WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
+        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ rrrrrrrrrr0bbbbbbbbbb00000000000
+        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ RRRRRRRRRR0BBBBBBBBBB00000000000
+        pkhtb   WK&reg1, WK&reg1, WK&reg1, asr #5  @ rrrrrrrr--------bbbbbbbb--------
+        sel     WK&reg1, WK&reg1, SCRATCH          @ rrrrrrrrggggggggbbbbbbbb--------
+        mov     SCRATCH, SCRATCH, ror #16          @ ggg00000GGGGGGGGGGGG0000gggggggg
+        pkhtb   WK&reg2, WK&reg2, WK&reg2, asr #5  @ RRRRRRRR--------BBBBBBBB--------
+        sel     WK&reg2, WK&reg2, SCRATCH          @ RRRRRRRRGGGGGGGGBBBBBBBB--------
+        orr     WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
+        orr     WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+.endm
+
+/* This version doesn't need STRIDE_M, but is one instruction longer.
+   It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
+        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
+        bic     WK&reg1, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
+        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
+        mov     WK&reg2, WK&reg1, lsr #16          @ 0000000000000000RRRRR000000BBBBB
+        mov     SCRATCH, SCRATCH, ror #27          @ GGGGGGGGGGGG0000ggggggggggg00000
+        bic     WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
+        mov     WK&reg2, WK&reg2, lsl #3           @ 0000000000000RRRRR000000BBBBB000
+        mov     WK&reg1, WK&reg1, lsl #3           @ 0000000000000rrrrr000000bbbbb000
+        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ 0000000000000RRRRRRRRRR0BBBBBBBB
+        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
+        pkhbt   WK&reg2, WK&reg2, WK&reg2, lsl #5  @ --------RRRRRRRR--------BBBBBBBB
+        pkhbt   WK&reg1, WK&reg1, WK&reg1, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
+        sel     WK&reg2, SCRATCH, WK&reg2          @ --------RRRRRRRRGGGGGGGGBBBBBBBB
+        sel     WK&reg1, SCRATCH, WK&reg1          @ --------rrrrrrrrggggggggbbbbbbbb
+        orr     WK&reg2, WK&reg2, #0xFF000000      @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+        orr     WK&reg1, WK&reg1, #0xFF000000      @ 11111111rrrrrrrrggggggggbbbbbbbb
+*/
+
+.macro src_0565_8888_1pixel, reg
+        bic     SCRATCH, WK&reg, MASK              @ 0000000000000000rrrrr000000bbbbb
+        and     WK&reg, WK&reg, MASK               @ 000000000000000000000gggggg00000
+        mov     SCRATCH, SCRATCH, lsl #3           @ 0000000000000rrrrr000000bbbbb000
+        mov     WK&reg, WK&reg, lsl #5             @ 0000000000000000gggggg0000000000
+        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
+        orr     WK&reg, WK&reg, WK&reg, lsr #6     @ 000000000000000gggggggggggg00000
+        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
+        sel     WK&reg, WK&reg, SCRATCH            @ --------rrrrrrrrggggggggbbbbbbbb
+        orr     WK&reg, WK&reg, #0xFF000000        @ 11111111rrrrrrrrggggggggbbbbbbbb
+.endm
+
+.macro src_0565_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .if numbytes == 16
+        pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
+ .elseif numbytes == 8
+        pixld   , 4, firstreg, SRC, unaligned_src
+ .elseif numbytes == 4
+        pixld   , 2, firstreg, SRC, unaligned_src
+ .endif
+.endm
+
+.macro src_0565_8888_process_tail   cond, numbytes, firstreg
+ .if numbytes == 16
+        src_0565_8888_2pixels firstreg, %(firstreg+1)
+        src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
+ .elseif numbytes == 8
+        src_0565_8888_2pixels firstreg, %(firstreg+1)
+ .else
+        src_0565_8888_1pixel firstreg
+ .endif
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
+    3, /* prefetch distance */ \
+    src_0565_8888_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    src_0565_8888_process_head, \
+    src_0565_8888_process_tail
+
+/******************************************************************************/
+
+.macro add_8_8_8pixels  cond, dst1, dst2
+        uqadd8&cond  WK&dst1, WK&dst1, MASK
+        uqadd8&cond  WK&dst2, WK&dst2, STRIDE_M
+.endm
+
+.macro add_8_8_4pixels  cond, dst
+        uqadd8&cond  WK&dst, WK&dst, MASK
+.endm
+
+.macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+    WK4     .req    MASK
+    WK5     .req    STRIDE_M
+ .if numbytes == 16
+        pixld   cond, 8, 4, SRC, unaligned_src
+        pixld   cond, 16, firstreg, DST, 0
+        add_8_8_8pixels cond, firstreg, %(firstreg+1)
+        pixld   cond, 8, 4, SRC, unaligned_src
+ .else
+        pixld   cond, numbytes, 4, SRC, unaligned_src
+        pixld   cond, numbytes, firstreg, DST, 0
+ .endif
+    .unreq  WK4
+    .unreq  WK5
+.endm
+
+.macro add_8_8_process_tail  cond, numbytes, firstreg
+ .if numbytes == 16
+        add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
+ .elseif numbytes == 8
+        add_8_8_8pixels cond, firstreg, %(firstreg+1)
+ .else
+        add_8_8_4pixels cond, firstreg
+ .endif
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
+    2, /* prefetch distance */ \
+    nop_macro, /* init */ \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    add_8_8_process_head, \
+    add_8_8_process_tail
+
+/******************************************************************************/
+
+.macro over_8888_8888_init
+        /* Hold loop invariant in MASK */
+        ldr     MASK, =0x00800080
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, MASK, MASK
+        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
+.endm
+
+.macro over_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+    WK4     .req    STRIDE_D
+    WK5     .req    STRIDE_S
+    WK6     .req    STRIDE_M
+    WK7     .req    ORIG_W
+        pixld   , numbytes, %(4+firstreg), SRC, unaligned_src
+        pixld   , numbytes, firstreg, DST, 0
+    .unreq  WK4
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+.macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3
+        /* Since these colours a premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
+        teq     WK&reg0, #0
+ .if numbytes > 4
+        teqeq   WK&reg1, #0
+  .if numbytes > 8
+        teqeq   WK&reg2, #0
+        teqeq   WK&reg3, #0
+  .endif
+ .endif
+.endm
+
+.macro over_8888_8888_prepare  next
+        mov     WK&next, WK&next, lsr #24
+.endm
+
+.macro over_8888_8888_1pixel src, dst, offset, next
+        /* src = destination component multiplier */
+        rsb     WK&src, WK&src, #255
+        /* Split even/odd bytes of dst into SCRATCH/dst */
+        uxtb16  SCRATCH, WK&dst
+        uxtb16  WK&dst, WK&dst, ror #8
+        /* Multiply through, adding 0.5 to the upper byte of result for rounding */
+        mla     SCRATCH, SCRATCH, WK&src, MASK
+        mla     WK&dst, WK&dst, WK&src, MASK
+        /* Where we would have had a stall between the result of the first MLA and the shifter input,
+         * reload the complete source pixel */
+        ldr     WK&src, [SRC, #offset]
+        /* Multiply by 257/256 to approximate 256/255 */
+        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+        /* In this stall, start processing the next pixel */
+ .if offset < -4
+        mov     WK&next, WK&next, lsr #24
+ .endif
+        uxtab16 WK&dst, WK&dst, WK&dst, ror #8
+        /* Recombine even/odd bytes of multiplied destination */
+        mov     SCRATCH, SCRATCH, ror #8
+        sel     WK&dst, SCRATCH, WK&dst
+        /* Saturated add of source to multiplied destination */
+        uqadd8  WK&dst, WK&dst, WK&src
+.endm
+
+.macro over_8888_8888_process_tail  cond, numbytes, firstreg
+    WK4     .req    STRIDE_D
+    WK5     .req    STRIDE_S
+    WK6     .req    STRIDE_M
+    WK7     .req    ORIG_W
+        over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
+        beq     10f
+        over_8888_8888_prepare  %(4+firstreg)
+ .set PROCESS_REG, firstreg
+ .set PROCESS_OFF, -numbytes
+ .rept numbytes / 4
+        over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
+  .set PROCESS_REG, PROCESS_REG+1
+  .set PROCESS_OFF, PROCESS_OFF+4
+ .endr
+        pixst   , numbytes, firstreg, DST
+10:
+    .unreq  WK4
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32 \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
+    2, /* prefetch distance */ \
+    over_8888_8888_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    over_8888_8888_process_head, \
+    over_8888_8888_process_tail
+
+/******************************************************************************/
+
+/* Multiply each byte of a word by a byte.
+ * Useful when there aren't any obvious ways to fill the stalls with other instructions.
+ * word  Register containing 4 bytes
+ * byte  Register containing byte multiplier (bits 8-31 must be 0)
+ * tmp   Scratch register
+ * half  Register containing the constant 0x00800080
+ * GE[3:0] bits must contain 0101
  */
+.macro mul_8888_8  word, byte, tmp, half
+        /* Split even/odd bytes of word apart */
+        uxtb16  tmp, word
+        uxtb16  word, word, ror #8
+        /* Multiply bytes together with rounding, then by 257/256 */
+        mla     tmp, tmp, byte, half
+        mla     word, word, byte, half /* 1 stall follows */
+        uxtab16 tmp, tmp, tmp, ror #8  /* 1 stall follows */
+        uxtab16 word, word, word, ror #8
+        /* Recombine bytes */
+        mov     tmp, tmp, ror #8
+        sel     word, tmp, word
+.endm
+
+/******************************************************************************/
+
+.macro over_8888_n_8888_init
+        /* Mask is constant */
+        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
+        /* Hold loop invariant in STRIDE_M */
+        ldr     STRIDE_M, =0x00800080
+        /* We only want the alpha bits of the constant mask */
+        mov     MASK, MASK, lsr #24
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, STRIDE_M, STRIDE_M
+        line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
+.endm
+
+.macro over_8888_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+    WK4     .req    Y
+    WK5     .req    STRIDE_D
+    WK6     .req    STRIDE_S
+    WK7     .req    ORIG_W
+        pixld   , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
+        pixld   , numbytes, firstreg, DST, 0
+    .unreq  WK4
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+.macro over_8888_n_8888_1pixel src, dst
+        mul_8888_8  WK&src, MASK, SCRATCH, STRIDE_M
+        sub     WK7, WK6, WK&src, lsr #24
+        mul_8888_8  WK&dst, WK7, SCRATCH, STRIDE_M
+        uqadd8  WK&dst, WK&dst, WK&src
+.endm
+
+.macro over_8888_n_8888_process_tail  cond, numbytes, firstreg
+    WK4     .req    Y
+    WK5     .req    STRIDE_D
+    WK6     .req    STRIDE_S
+    WK7     .req    ORIG_W
+        over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
+        beq     10f
+        mov     WK6, #255
+ .set PROCESS_REG, firstreg
+ .rept numbytes / 4
+  .if numbytes == 16 && PROCESS_REG == 2
+        /* We're using WK6 and WK7 as temporaries, so half way through
+         * 4 pixels, reload the second two source pixels but this time
+         * into WK4 and WK5 */
+        ldmdb   SRC, {WK4, WK5}
+  .endif
+        over_8888_n_8888_1pixel  %(4+(PROCESS_REG%2)), %(PROCESS_REG)
+  .set PROCESS_REG, PROCESS_REG+1
+ .endr
+        pixst   , numbytes, firstreg, DST
+10:
+    .unreq  WK4
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32 \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
+    2, /* prefetch distance */ \
+    over_8888_n_8888_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    over_8888_n_8888_process_head, \
+    over_8888_n_8888_process_tail
+
+/******************************************************************************/
+
+.macro over_n_8_8888_init
+        /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
+        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
+        /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
+        ldr     SCRATCH, =0x00800080
+        uxtb16  STRIDE_S, SRC
+        uxtb16  SRC, SRC, ror #8
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, SCRATCH, SCRATCH
+        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
+.endm
+
+.macro over_n_8_8888_newline
+        ldr     STRIDE_D, =0x00800080
+        b       1f
+ .ltorg
+1:
+.endm
+
+.macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+    WK4     .req    STRIDE_M
+        pixld   , numbytes/4, 4, MASK, unaligned_mask
+        pixld   , numbytes, firstreg, DST, 0
+    .unreq  WK4
+.endm
+
+.macro over_n_8_8888_1pixel src, dst
+        uxtb    Y, WK4, ror #src*8
+        /* Trailing part of multiplication of source */
+        mla     SCRATCH, STRIDE_S, Y, STRIDE_D
+        mla     Y, SRC, Y, STRIDE_D
+        mov     ORIG_W, #255
+        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+        uxtab16 Y, Y, Y, ror #8
+        mov     SCRATCH, SCRATCH, ror #8
+        sub     ORIG_W, ORIG_W, Y, lsr #24
+        sel     Y, SCRATCH, Y
+        /* Then multiply the destination */
+        mul_8888_8  WK&dst, ORIG_W, SCRATCH, STRIDE_D
+        uqadd8  WK&dst, WK&dst, Y
+.endm
+
+.macro over_n_8_8888_process_tail  cond, numbytes, firstreg
+    WK4     .req    STRIDE_M
+        teq     WK4, #0
+        beq     10f
+ .set PROCESS_REG, firstreg
+ .rept numbytes / 4
+        over_n_8_8888_1pixel  %(PROCESS_REG-firstreg), %(PROCESS_REG)
+  .set PROCESS_REG, PROCESS_REG+1
+ .endr
+        pixst   , numbytes, firstreg, DST
+10:
+    .unreq  WK4
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
+    2, /* prefetch distance */ \
+    over_n_8_8888_init, \
+    over_n_8_8888_newline, \
+    nop_macro, /* cleanup */ \
+    over_n_8_8888_process_head, \
+    over_n_8_8888_process_tail
+
+/******************************************************************************/
 
-.macro generate_nearest_scanline_func fname, bpp_shift, t,      \
-                                      prefetch_distance,        \
-                                      prefetch_braking_distance
-
-pixman_asm_function fname
-	W		.req	r0
-	DST		.req	r1
-	SRC		.req	r2
-	VX		.req	r3
-	UNIT_X		.req	ip
-	TMP1		.req	r4
-	TMP2		.req	r5
-	VXMASK		.req	r6
-	PF_OFFS		.req	r7
-	SRC_WIDTH_FIXED	.req	r8
-
-	ldr	UNIT_X, [sp]
-	push	{r4, r5, r6, r7, r8, r10}
-	mvn	VXMASK, #((1 << bpp_shift) - 1)
-	ldr	SRC_WIDTH_FIXED, [sp, #28]
-
-	/* define helper macro */
-	.macro	scale_2_pixels
-		ldr&t	TMP1, [SRC, TMP1]
-		and	TMP2, VXMASK, VX, asr #(16 - bpp_shift)
-		adds	VX, VX, UNIT_X
-		str&t	TMP1, [DST], #(1 << bpp_shift)
-9:		subpls	VX, VX, SRC_WIDTH_FIXED
-		bpl	9b
-
-		ldr&t	TMP2, [SRC, TMP2]
-		and	TMP1, VXMASK, VX, asr #(16 - bpp_shift)
-		adds	VX, VX, UNIT_X
-		str&t	TMP2, [DST], #(1 << bpp_shift)
-9:		subpls	VX, VX, SRC_WIDTH_FIXED
-		bpl	9b
-	.endm
-
-	/* now do the scaling */
-	and	TMP1, VXMASK, VX, asr #(16 - bpp_shift)
-	adds	VX, VX, UNIT_X
-9:	subpls	VX, VX, SRC_WIDTH_FIXED
-	bpl	9b
-	subs	W, W, #(8 + prefetch_braking_distance)
-	blt	2f
-	/* calculate prefetch offset */
-	mov	PF_OFFS, #prefetch_distance
-	mla	PF_OFFS, UNIT_X, PF_OFFS, VX
-1:	/* main loop, process 8 pixels per iteration with prefetch */
-	pld	[SRC, PF_OFFS, asr #(16 - bpp_shift)]
-	add	PF_OFFS, UNIT_X, lsl #3
-	scale_2_pixels
-	scale_2_pixels
-	scale_2_pixels
-	scale_2_pixels
-	subs	W, W, #8
-	bge	1b
-2:
-	subs	W, W, #(4 - 8 - prefetch_braking_distance)
-	blt	2f
-1:	/* process the remaining pixels */
-	scale_2_pixels
-	scale_2_pixels
-	subs	W, W, #4
-	bge	1b
-2:
-	tst	W, #2
-	beq	2f
-	scale_2_pixels
-2:
-	tst	W, #1
-	ldrne&t	TMP1, [SRC, TMP1]
-	strne&t	TMP1, [DST]
-	/* cleanup helper macro */
-	.purgem	scale_2_pixels
-	.unreq	DST
-	.unreq	SRC
-	.unreq	W
-	.unreq	VX
-	.unreq	UNIT_X
-	.unreq	TMP1
-	.unreq	TMP2
-	.unreq	VXMASK
-	.unreq	PF_OFFS
-	.unreq  SRC_WIDTH_FIXED
-	/* return */
-	pop	{r4, r5, r6, r7, r8, r10}
-	bx	lr
-.endfunc
-.endm
-
-generate_nearest_scanline_func \
-    pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32
-
-generate_nearest_scanline_func \
-    pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2,  , 48, 32
diff --git a/pixman/pixman/pixman-arm-simd-asm.h b/pixman/pixman/pixman-arm-simd-asm.h
new file mode 100644
index 000000000..65436062b
--- /dev/null
+++ b/pixman/pixman/pixman-arm-simd-asm.h
@@ -0,0 +1,908 @@
+/*
+ * Copyright © 2012 Raspberry Pi Foundation
+ * Copyright © 2012 RISC OS Open Ltd
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of the copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  The copyright holders make no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Ben Avison (bavison@riscosopen.org)
+ *
+ */
+
+/*
+ * Because the alignment of pixel data to cachelines, and even the number of
+ * cachelines per row can vary from row to row, and because of the need to
+ * preload each scanline once and only once, this prefetch strategy treats
+ * each row of pixels independently. When a pixel row is long enough, there
+ * are three distinct phases of prefetch:
+ * * an inner loop section, where each time a cacheline of data is
+ *    processed, another cacheline is preloaded (the exact distance ahead is
+ *    determined empirically using profiling results from lowlevel-blt-bench)
+ * * a leading section, where enough cachelines are preloaded to ensure no
+ *    cachelines escape being preloaded when the inner loop starts
+ * * a trailing section, where a limited number (0 or more) of cachelines
+ *    are preloaded to deal with data (if any) that hangs off the end of the
+ *    last iteration of the inner loop, plus any trailing bytes that were not
+ *    enough to make up one whole iteration of the inner loop
+ * 
+ * There are (in general) three distinct code paths, selected between
+ * depending upon how long the pixel row is. If it is long enough that there
+ * is at least one iteration of the inner loop (as described above) then
+ * this is described as the "wide" case. If it is shorter than that, but
+ * there are still enough bytes output that there is at least one 16-byte-
+ * long, 16-byte-aligned write to the destination (the optimum type of
+ * write), then this is the "medium" case. If it is not even this long, then
+ * this is the "narrow" case, and there is no attempt to align writes to
+ * 16-byte boundaries. In the "medium" and "narrow" cases, all the
+ * cachelines containing data from the pixel row are prefetched up-front.
+ */
+
+/*
+ * Determine whether we put the arguments on the stack for debugging.
+ */
+#undef DEBUG_PARAMS
+
+/*
+ * Bit flags for 'generate_composite_function' macro which are used
+ * to tune generated functions behavior.
+ */
+.set FLAG_DST_WRITEONLY,         0
+.set FLAG_DST_READWRITE,         1
+.set FLAG_COND_EXEC,             0
+.set FLAG_BRANCH_OVER,           2
+.set FLAG_PROCESS_PRESERVES_PSR, 0
+.set FLAG_PROCESS_CORRUPTS_PSR,  4
+.set FLAG_PROCESS_DOESNT_STORE,  0
+.set FLAG_PROCESS_DOES_STORE,    8 /* usually because it needs to conditionally skip it */
+.set FLAG_NO_SPILL_LINE_VARS,        0
+.set FLAG_SPILL_LINE_VARS_WIDE,      16
+.set FLAG_SPILL_LINE_VARS_NON_WIDE,  32
+.set FLAG_SPILL_LINE_VARS,           48
+.set FLAG_PROCESS_CORRUPTS_SCRATCH,  0
+.set FLAG_PROCESS_PRESERVES_SCRATCH, 64
+
+/*
+ * Offset into stack where mask and source pointer/stride can be accessed.
+ */
+#ifdef DEBUG_PARAMS
+.set ARGS_STACK_OFFSET,        (9*4+9*4)
+#else
+.set ARGS_STACK_OFFSET,        (9*4)
+#endif
+
+/*
+ * Constants for selecting preferable prefetch type.
+ */
+.set PREFETCH_TYPE_NONE,       0
+.set PREFETCH_TYPE_STANDARD,   1
+
+/*
+ * Definitions of macros for load/store of pixel data.
+ */
+
+.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
+ .if numbytes == 16
+  .if unaligned == 1
+        op&r&cond    WK&reg0, [base], #4
+        op&r&cond    WK&reg1, [base], #4
+        op&r&cond    WK&reg2, [base], #4
+        op&r&cond    WK&reg3, [base], #4
+  .else
+        op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
+  .endif
+ .elseif numbytes == 8
+  .if unaligned == 1
+        op&r&cond    WK&reg0, [base], #4
+        op&r&cond    WK&reg1, [base], #4
+  .else
+        op&m&cond&ia base!, {WK&reg0,WK&reg1}
+  .endif
+ .elseif numbytes == 4
+        op&r&cond    WK&reg0, [base], #4
+ .elseif numbytes == 2
+        op&r&cond&h  WK&reg0, [base], #2
+ .elseif numbytes == 1
+        op&r&cond&b  WK&reg0, [base], #1
+ .else
+  .error "unsupported size: numbytes"
+ .endif
+.endm
+
+.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
+ .if numbytes == 16
+        stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
+ .elseif numbytes == 8
+        stm&cond&db base, {WK&reg0,WK&reg1}
+ .elseif numbytes == 4
+        str&cond    WK&reg0, [base, #-4]
+ .elseif numbytes == 2
+        str&cond&h  WK&reg0, [base, #-2]
+ .elseif numbytes == 1
+        str&cond&b  WK&reg0, [base, #-1]
+ .else
+  .error "unsupported size: numbytes"
+ .endif
+.endm
+
+.macro pixld cond, numbytes, firstreg, base, unaligned
+        pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
+.endm
+
+.macro pixst cond, numbytes, firstreg, base
+ .if (flags) & FLAG_DST_READWRITE
+        pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
+ .else
+        pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
+ .endif
+.endm
+
+.macro PF a, x:vararg
+ .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
+        a x
+ .endif
+.endm
+
+
+.macro preload_leading_step1  bpp, ptr, base
+/* If the destination is already 16-byte aligned, then we need to preload
+ * between 0 and prefetch_distance (inclusive) cache lines ahead so there
+ * are no gaps when the inner loop starts.
+ */
+ .if bpp > 0
+        PF  bic,    ptr, base, #31
+  .set OFFSET, 0
+  .rept prefetch_distance+1
+        PF  pld,    [ptr, #OFFSET]
+   .set OFFSET, OFFSET+32
+  .endr
+ .endif
+.endm
+
+.macro preload_leading_step2  bpp, bpp_shift, ptr, base
+/* However, if the destination is not 16-byte aligned, we may need to
+ * preload more cache lines than that. The question we need to ask is:
+ * are the bytes corresponding to the leading pixels more than the amount
+ * by which the source pointer will be rounded down for preloading, and if
+ * so, by how many cache lines? Effectively, we want to calculate
+ *     leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
+ *     inner_loop_offset = (src+leading_bytes)&31
+ *     extra_needed = leading_bytes - inner_loop_offset
+ * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
+ * possible when there are 4 src bytes for every 1 dst byte).
+ */
+ .if bpp > 0
+  .ifc base,DST
+        /* The test can be simplified further when preloading the destination */
+        PF  tst,    base, #16
+        PF  beq,    61f
+  .else
+   .if bpp/dst_w_bpp == 4
+        PF  add,    SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
+        PF  and,    SCRATCH, SCRATCH, #31
+        PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
+        PF  sub,    SCRATCH, SCRATCH, #1    /* so now ranges are -16..-1 / 0..31 / 32..63 */
+        PF  movs,   SCRATCH, SCRATCH, #32-6 /* so this sets         NC   /  nc   /   Nc   */
+        PF  bcs,    61f
+        PF  bpl,    60f
+        PF  pld,    [ptr, #32*(prefetch_distance+2)]
+   .else
+        PF  mov,    SCRATCH, base, lsl #32-5
+        PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
+        PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
+        PF  bls,    61f
+   .endif
+  .endif
+60:     PF  pld,    [ptr, #32*(prefetch_distance+1)]
+61:
+ .endif
+.endm
+
+#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
+.macro preload_middle   bpp, base, scratch_holds_offset
+ .if bpp > 0
+        /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
+  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
+   .if scratch_holds_offset
+        PF  pld,    [base, SCRATCH]
+   .else
+        PF  bic,    SCRATCH, base, #31
+        PF  pld,    [SCRATCH, #32*prefetch_distance]
+   .endif
+  .endif
+ .endif
+.endm
+
+.macro preload_trailing  bpp, bpp_shift, base
+ .if bpp > 0
+  .if bpp*pix_per_block > 256
+        /* Calculations are more complex if more than one fetch per block */
+        PF  and,    WK1, base, #31
+        PF  add,    WK1, WK1, WK0, lsl #bpp_shift
+        PF  add,    WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
+        PF  bic,    SCRATCH, base, #31
+80:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
+        PF  add,    SCRATCH, SCRATCH, #32
+        PF  subs,   WK1, WK1, #32
+        PF  bhi,    80b
+  .else
+        /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
+        PF  mov,    SCRATCH, base, lsl #32-5
+        PF  adds,   SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
+        PF  adceqs, SCRATCH, SCRATCH, #0
+        /* The instruction above has two effects: ensures Z is only
+         * set if C was clear (so Z indicates that both shifted quantities
+         * were 0), and clears C if Z was set (so C indicates that the sum
+         * of the shifted quantities was greater and not equal to 32) */
+        PF  beq,    82f
+        PF  bic,    SCRATCH, base, #31
+        PF  bcc,    81f
+        PF  pld,    [SCRATCH, #32*(prefetch_distance+2)]
+81:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
+82:
+  .endif
+ .endif
+.endm
+
+
+.macro preload_line    narrow_case, bpp, bpp_shift, base
+/* "narrow_case" - just means that the macro was invoked from the "narrow"
+ *    code path rather than the "medium" one - because in the narrow case,
+ *    the row of pixels is known to output no more than 30 bytes, then
+ *    (assuming the source pixels are no wider than the the destination
+ *    pixels) they cannot possibly straddle more than 2 32-byte cachelines,
+ *    meaning there's no need for a loop.
+ * "bpp" - number of bits per pixel in the channel (source, mask or
+ *    destination) that's being preloaded, or 0 if this channel is not used
+ *    for reading
+ * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
+ * "base" - base address register of channel to preload (SRC, MASK or DST)
+ */
+ .if bpp > 0
+  .if narrow_case && (bpp <= dst_w_bpp)
+        /* In these cases, each line for each channel is in either 1 or 2 cache lines */
+        PF  bic,    WK0, base, #31
+        PF  pld,    [WK0]
+        PF  add,    WK1, base, X, LSL #bpp_shift
+        PF  sub,    WK1, WK1, #1
+        PF  bic,    WK1, WK1, #31
+        PF  cmp,    WK1, WK0
+        PF  beq,    90f
+        PF  pld,    [WK1]
+90:
+  .else
+        PF  bic,    WK0, base, #31
+        PF  pld,    [WK0]
+        PF  add,    WK1, base, X, lsl #bpp_shift
+        PF  sub,    WK1, WK1, #1
+        PF  bic,    WK1, WK1, #31
+        PF  cmp,    WK1, WK0
+        PF  beq,    92f
+91:     PF  add,    WK0, WK0, #32
+        PF  cmp,    WK0, WK1
+        PF  pld,    [WK0]
+        PF  bne,    91b
+92:
+  .endif
+ .endif
+.endm
+
+
+.macro conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+        process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
+ .if decrementx
+        sub&cond X, X, #8*numbytes/dst_w_bpp
+ .endif
+        process_tail  cond, numbytes, firstreg
+ .if !((flags) & FLAG_PROCESS_DOES_STORE)
+        pixst   cond, numbytes, firstreg, DST
+ .endif
+.endm
+
+.macro conditional_process1  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+ .if (flags) & FLAG_BRANCH_OVER
+  .ifc cond,mi
+        bpl     100f
+  .endif
+  .ifc cond,cs
+        bcc     100f
+  .endif
+  .ifc cond,ne
+        beq     100f
+  .endif
+        conditional_process1_helper  , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+100:
+ .else
+        conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+ .endif
+.endm
+
+.macro conditional_process2  test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
+ .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
+        /* Can't interleave reads and writes */
+        test
+        conditional_process1  cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
+  .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
+        test
+  .endif
+        conditional_process1  cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
+ .else
+        /* Can interleave reads and writes for better scheduling */
+        test
+        process_head  cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
+        process_head  cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
+  .if decrementx
+        sub&cond1 X, X, #8*numbytes1/dst_w_bpp
+        sub&cond2 X, X, #8*numbytes2/dst_w_bpp
+  .endif
+        process_tail  cond1, numbytes1, firstreg1
+        process_tail  cond2, numbytes2, firstreg2
+        pixst   cond1, numbytes1, firstreg1, DST
+        pixst   cond2, numbytes2, firstreg2, DST
+ .endif
+.endm
+
+
+.macro test_bits_1_0_ptr
+        movs    SCRATCH, WK0, lsl #32-1  /* C,N = bits 1,0 of DST */
+.endm
+
+.macro test_bits_3_2_ptr
+        movs    SCRATCH, WK0, lsl #32-3  /* C,N = bits 3, 2 of DST */
+.endm
+
+.macro leading_15bytes  process_head, process_tail
+        /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
+        /* Use unaligned loads in all cases for simplicity */
+ .if dst_w_bpp == 8
+        conditional_process2  test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, 1
+ .elseif dst_w_bpp == 16
+        test_bits_1_0_ptr
+        conditional_process1  cs, process_head, process_tail, 2, 2, 1, 1, 1
+ .endif
+        conditional_process2  test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, 1
+.endm
+
+.macro test_bits_3_2_pix
+        movs    SCRATCH, X, lsl #dst_bpp_shift+32-3
+.endm
+
+.macro test_bits_1_0_pix
+ .if dst_w_bpp == 8
+        movs    SCRATCH, X, lsl #dst_bpp_shift+32-1
+ .else
+        movs    SCRATCH, X, lsr #1
+ .endif
+.endm
+
+.macro trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
+        conditional_process2  test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
+ .if dst_w_bpp == 16
+        test_bits_1_0_pix
+        conditional_process1  cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
+ .elseif dst_w_bpp == 8
+        conditional_process2  test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
+ .endif
+.endm
+
+
+.macro wide_case_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
+110:
+ .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
+ .rept pix_per_block*dst_w_bpp/128
+        process_head  , 16, 0, unaligned_src, unaligned_mask, 1
+  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
+        preload_middle  src_bpp, SRC, 1
+  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
+        preload_middle  mask_bpp, MASK, 1
+  .else
+        preload_middle  src_bpp, SRC, 0
+        preload_middle  mask_bpp, MASK, 0
+  .endif
+  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0)
+        /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
+         * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
+         * preloads for, to achieve staggered prefetches for multiple channels, because there are
+         * always two STMs per prefetch, so there is always an opposite STM on which to put the
+         * preload. Note, no need to BIC the base register here */
+        PF  pld,    [DST, #32*prefetch_distance - dst_alignment]
+  .endif
+        process_tail  , 16, 0
+  .if !((flags) & FLAG_PROCESS_DOES_STORE)
+        pixst   , 16, 0, DST
+  .endif
+  .set SUBBLOCK, SUBBLOCK+1
+ .endr
+        subs    X, X, #pix_per_block
+        bhs     110b
+.endm
+
+.macro wide_case_inner_loop_and_trailing_pixels  process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
+        /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
+ .if dst_r_bpp > 0
+        tst     DST, #16
+        bne     111f
+        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 16
+        b       112f
+111:
+ .endif
+        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 0
+112:
+        /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
+ .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
+        PF  and,    WK0, X, #pix_per_block-1
+ .endif
+        preload_trailing  src_bpp, src_bpp_shift, SRC
+        preload_trailing  mask_bpp, mask_bpp_shift, MASK
+        preload_trailing  dst_r_bpp, dst_bpp_shift, DST
+        add     X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
+        /* The remainder of the line is handled identically to the medium case */
+        medium_case_inner_loop_and_trailing_pixels  process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
+.endm
+
+.macro medium_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
+120:
+        process_head  , 16, 0, unaligned_src, unaligned_mask, 0
+        process_tail  , 16, 0
+ .if !((flags) & FLAG_PROCESS_DOES_STORE)
+        pixst   , 16, 0, DST
+ .endif
+        subs    X, X, #128/dst_w_bpp
+        bhs     120b
+        /* Trailing pixels */
+        tst     X, #128/dst_w_bpp - 1
+        beq     exit_label
+        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
+.endm
+
+.macro narrow_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
+        tst     X, #16*8/dst_w_bpp
+        conditional_process1  ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
+        /* Trailing pixels */
+        /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
+        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
+.endm
+
+.macro switch_on_alignment  action, process_head, process_tail, process_inner_loop, exit_label
+ /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
+ .if mask_bpp == 8 || mask_bpp == 16
+        tst     MASK, #3
+        bne     141f
+ .endif
+  .if src_bpp == 8 || src_bpp == 16
+        tst     SRC, #3
+        bne     140f
+  .endif
+        action  process_head, process_tail, process_inner_loop, exit_label, 0, 0
+  .if src_bpp == 8 || src_bpp == 16
+        b       exit_label
+140:
+        action  process_head, process_tail, process_inner_loop, exit_label, 1, 0
+  .endif
+ .if mask_bpp == 8 || mask_bpp == 16
+        b       exit_label
+141:
+  .if src_bpp == 8 || src_bpp == 16
+        tst     SRC, #3
+        bne     142f
+  .endif
+        action  process_head, process_tail, process_inner_loop, exit_label, 0, 1
+  .if src_bpp == 8 || src_bpp == 16
+        b       exit_label
+142:
+        action  process_head, process_tail, process_inner_loop, exit_label, 1, 1
+  .endif
+ .endif
+.endm
+
+
+.macro end_of_line      restore_x, vars_spilled, loop_label, last_one
+ .if vars_spilled
+        /* Sadly, GAS doesn't seem have an equivalent of the DCI directive? */
+        /* This is ldmia sp,{} */
+        .word   0xE89D0000 | LINE_SAVED_REGS
+ .endif
+        subs    Y, Y, #1
+ .if vars_spilled
+  .if (LINE_SAVED_REGS) & (1<<1)
+        str     Y, [sp]
+  .endif
+ .endif
+        add     DST, DST, STRIDE_D
+ .if src_bpp > 0
+        add     SRC, SRC, STRIDE_S
+ .endif
+ .if mask_bpp > 0
+        add     MASK, MASK, STRIDE_M
+ .endif
+ .if restore_x
+        mov     X, ORIG_W
+ .endif
+        bhs     loop_label
+ .ifc "last_one",""
+  .if vars_spilled
+        b       197f
+  .else
+        b       198f
+  .endif
+ .else
+  .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
+        b       198f
+  .endif
+ .endif
+.endm
+
+
+.macro generate_composite_function fname, \
+                                   src_bpp_, \
+                                   mask_bpp_, \
+                                   dst_w_bpp_, \
+                                   flags_, \
+                                   prefetch_distance_, \
+                                   init, \
+                                   newline, \
+                                   cleanup, \
+                                   process_head, \
+                                   process_tail, \
+                                   process_inner_loop
+
+ .func fname
+ .global fname
+ /* For ELF format also set function visibility to hidden */
+#ifdef __ELF__
+ .hidden fname
+ .type fname, %function
+#endif
+
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+ .set src_bpp, src_bpp_
+ .set mask_bpp, mask_bpp_
+ .set dst_w_bpp, dst_w_bpp_
+ .set flags, flags_
+ .set prefetch_distance, prefetch_distance_
+
+/*
+ * Select prefetch type for this function.
+ */
+ .if prefetch_distance == 0
+  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+ .else
+  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
+ .endif
+
+ .if src_bpp == 32
+  .set src_bpp_shift, 2
+ .elseif src_bpp == 24
+  .set src_bpp_shift, 0
+ .elseif src_bpp == 16
+  .set src_bpp_shift, 1
+ .elseif src_bpp == 8
+  .set src_bpp_shift, 0
+ .elseif src_bpp == 0
+  .set src_bpp_shift, -1
+ .else
+  .error "requested src bpp (src_bpp) is not supported"
+ .endif
+
+ .if mask_bpp == 32
+  .set mask_bpp_shift, 2
+ .elseif mask_bpp == 24
+  .set mask_bpp_shift, 0
+ .elseif mask_bpp == 8
+  .set mask_bpp_shift, 0
+ .elseif mask_bpp == 0
+  .set mask_bpp_shift, -1
+ .else
+  .error "requested mask bpp (mask_bpp) is not supported"
+ .endif
+
+ .if dst_w_bpp == 32
+  .set dst_bpp_shift, 2
+ .elseif dst_w_bpp == 24
+  .set dst_bpp_shift, 0
+ .elseif dst_w_bpp == 16
+  .set dst_bpp_shift, 1
+ .elseif dst_w_bpp == 8
+  .set dst_bpp_shift, 0
+ .else
+  .error "requested dst bpp (dst_w_bpp) is not supported"
+ .endif
+
+ .if (((flags) & FLAG_DST_READWRITE) != 0)
+  .set dst_r_bpp, dst_w_bpp
+ .else
+  .set dst_r_bpp, 0
+ .endif
+
+ .set pix_per_block, 16*8/dst_w_bpp
+ .if src_bpp != 0
+  .if 32*8/src_bpp > pix_per_block
+   .set pix_per_block, 32*8/src_bpp
+  .endif
+ .endif
+ .if mask_bpp != 0
+  .if 32*8/mask_bpp > pix_per_block
+   .set pix_per_block, 32*8/mask_bpp
+  .endif
+ .endif
+ .if dst_r_bpp != 0
+  .if 32*8/dst_r_bpp > pix_per_block
+   .set pix_per_block, 32*8/dst_r_bpp
+  .endif
+ .endif
+
+/* The standard entry conditions set up by pixman-arm-common.h are:
+ * r0 = width (pixels)
+ * r1 = height (rows)
+ * r2 = pointer to top-left pixel of destination
+ * r3 = destination stride (pixels)
+ * [sp] = source pixel value, or pointer to top-left pixel of source
+ * [sp,#4] = 0 or source stride (pixels)
+ * The following arguments are unused for non-mask operations
+ * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
+ * [sp,#12] = 0 or mask stride (pixels)
+ */
+
+/*
+ * Assign symbolic names to registers
+ */
+    X           .req    r0  /* pixels to go on this line */
+    Y           .req    r1  /* lines to go */
+    DST         .req    r2  /* destination pixel pointer */
+    STRIDE_D    .req    r3  /* destination stride (bytes, minus width) */
+    SRC         .req    r4  /* source pixel pointer */
+    STRIDE_S    .req    r5  /* source stride (bytes, minus width) */
+    MASK        .req    r6  /* mask pixel pointer (if applicable) */
+    STRIDE_M    .req    r7  /* mask stride (bytes, minus width) */
+    WK0         .req    r8  /* pixel data registers */
+    WK1         .req    r9
+    WK2         .req    r10
+    WK3         .req    r11
+    SCRATCH     .req    r12
+    ORIG_W      .req    r14 /* width (pixels) */
+
+fname:
+        push    {r4-r11, lr}        /* save all registers */
+
+        subs    Y, Y, #1
+        blo     199f
+
+#ifdef DEBUG_PARAMS
+        sub     sp, sp, #9*4
+#endif
+
+ .if src_bpp > 0
+        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
+        ldr     STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
+ .endif
+ .if mask_bpp > 0
+        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
+        ldr     STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
+ .endif
+        
+#ifdef DEBUG_PARAMS
+        add     Y, Y, #1
+        stmia   sp, {r0-r7,pc}
+        sub     Y, Y, #1
+#endif
+
+        init
+        
+        lsl     STRIDE_D, #dst_bpp_shift /* stride in bytes */
+        sub     STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
+ .if src_bpp > 0
+        lsl     STRIDE_S, #src_bpp_shift
+        sub     STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
+ .endif
+ .if mask_bpp > 0
+        lsl     STRIDE_M, #mask_bpp_shift
+        sub     STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
+ .endif
+ 
+        /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
+        cmp     X, #2*16*8/dst_w_bpp - 1
+        blo     170f
+ .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
+        /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
+        cmp     X, #(prefetch_distance+3)*pix_per_block - 1
+        blo     160f
+
+        /* Wide case */
+        /* Adjust X so that the decrement instruction can also test for
+         * inner loop termination. We want it to stop when there are
+         * (prefetch_distance+1) complete blocks to go. */
+        sub     X, X, #(prefetch_distance+2)*pix_per_block
+        mov     ORIG_W, X
+  .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
+        /* This is stmdb sp!,{} */
+        .word   0xE92D0000 | LINE_SAVED_REGS
+  .endif
+151:    /* New line */
+        newline
+        preload_leading_step1  src_bpp, WK1, SRC
+        preload_leading_step1  mask_bpp, WK2, MASK
+        preload_leading_step1  dst_r_bpp, WK3, DST
+        
+        tst     DST, #15
+        beq     154f
+        rsb     WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
+  .if (src_bpp != 0 && src_bpp != 2*dst_w_bpp) || (mask_bpp != 0 && mask_bpp != 2*dst_w_bpp)
+        PF  and,    WK0, WK0, #15
+  .endif
+
+        preload_leading_step2  src_bpp, src_bpp_shift, WK1, SRC
+        preload_leading_step2  mask_bpp, mask_bpp_shift, WK2, MASK
+        preload_leading_step2  dst_r_bpp, dst_bpp_shift, WK3, DST
+
+        leading_15bytes  process_head, process_tail
+        
+154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
+ .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
+        and     SCRATCH, SRC, #31
+        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
+ .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
+        and     SCRATCH, MASK, #31
+        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
+ .endif
+ .ifc "process_inner_loop",""
+        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
+ .else
+        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
+ .endif
+
+157:    /* Check for another line */
+        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
+ .endif
+
+ .ltorg
+
+160:    /* Medium case */
+        mov     ORIG_W, X
+ .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
+        /* This is stmdb sp!,{} */
+        .word   0xE92D0000 | LINE_SAVED_REGS
+ .endif
+161:    /* New line */
+        newline
+        preload_line 0, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
+        preload_line 0, mask_bpp, mask_bpp_shift, MASK
+        preload_line 0, dst_r_bpp, dst_bpp_shift, DST
+        
+        sub     X, X, #128/dst_w_bpp     /* simplifies inner loop termination */
+        tst     DST, #15
+        beq     164f
+        rsb     WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
+        
+        leading_15bytes  process_head, process_tail
+        
+164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
+        switch_on_alignment  medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f
+        
+167:    /* Check for another line */
+        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b
+
+ .ltorg
+
+170:    /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
+ .if dst_w_bpp < 32
+        mov     ORIG_W, X
+ .endif
+ .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
+        /* This is stmdb sp!,{} */
+        .word   0xE92D0000 | LINE_SAVED_REGS
+ .endif
+171:    /* New line */
+        newline
+        preload_line 1, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
+        preload_line 1, mask_bpp, mask_bpp_shift, MASK
+        preload_line 1, dst_r_bpp, dst_bpp_shift, DST
+        
+ .if dst_w_bpp == 8
+        tst     DST, #3
+        beq     174f
+172:    subs    X, X, #1
+        blo     177f
+        process_head  , 1, 0, 1, 1, 0
+        process_tail  , 1, 0
+  .if !((flags) & FLAG_PROCESS_DOES_STORE)
+        pixst   , 1, 0, DST
+  .endif
+        tst     DST, #3
+        bne     172b
+ .elseif dst_w_bpp == 16
+        tst     DST, #2
+        beq     174f
+        subs    X, X, #1
+        blo     177f
+        process_head  , 2, 0, 1, 1, 0
+        process_tail  , 2, 0
+  .if !((flags) & FLAG_PROCESS_DOES_STORE)
+        pixst   , 2, 0, DST
+  .endif
+ .endif
+
+174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
+        switch_on_alignment  narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f
+
+177:    /* Check for another line */
+        end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
+
+197:
+ .if (flags) & FLAG_SPILL_LINE_VARS
+        add     sp, sp, #LINE_SAVED_REG_COUNT*4
+ .endif
+198:
+        cleanup
+
+#ifdef DEBUG_PARAMS
+        add     sp, sp, #9*4 /* junk the debug copy of arguments */
+#endif
+199:
+        pop     {r4-r11, pc}  /* exit */
+
+ .ltorg
+
+    .unreq  X
+    .unreq  Y
+    .unreq  DST
+    .unreq  STRIDE_D
+    .unreq  SRC
+    .unreq  STRIDE_S
+    .unreq  MASK
+    .unreq  STRIDE_M
+    .unreq  WK0
+    .unreq  WK1
+    .unreq  WK2
+    .unreq  WK3
+    .unreq  SCRATCH
+    .unreq  ORIG_W
+    .endfunc
+.endm
+
+.macro line_saved_regs  x:vararg
+ .set LINE_SAVED_REGS, 0
+ .set LINE_SAVED_REG_COUNT, 0
+ .irp SAVED_REG,x
+  .ifc "SAVED_REG","Y"
+   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
+   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+  .endif
+  .ifc "SAVED_REG","STRIDE_D"
+   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
+   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+  .endif
+  .ifc "SAVED_REG","STRIDE_S"
+   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
+   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+  .endif
+  .ifc "SAVED_REG","STRIDE_M"
+   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
+   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+  .endif
+  .ifc "SAVED_REG","ORIG_W"
+   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
+   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+  .endif
+ .endr
+.endm
+
+.macro nop_macro x:vararg
+.endm
diff --git a/pixman/pixman/pixman-arm-simd.c b/pixman/pixman/pixman-arm-simd.c
index 94f9a0caf..af062e19d 100644
--- a/pixman/pixman/pixman-arm-simd.c
+++ b/pixman/pixman/pixman-arm-simd.c
@@ -31,369 +31,191 @@
 #include "pixman-arm-common.h"
 #include "pixman-inlines.h"
 
-#if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */
-
-void
-pixman_composite_add_8_8_asm_armv6 (int32_t  width,
-				    int32_t  height,
-				    uint8_t *dst_line,
-				    int32_t  dst_stride,
-				    uint8_t *src_line,
-				    int32_t  src_stride)
-{
-    uint8_t *dst, *src;
-    int32_t w;
-    uint8_t s, d;
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
-
-	/* ensure both src and dst are properly aligned before doing 32 bit reads
-	 * we'll stay in this loop if src and dst have differing alignments
-	 */
-	while (w && (((uintptr_t)dst & 3) || ((uintptr_t)src & 3)))
-	{
-	    s = *src;
-	    d = *dst;
-	    asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
-	    *dst = d;
-
-	    dst++;
-	    src++;
-	    w--;
-	}
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8888_8888,
+		                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_x888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_0565_0565,
+                                   uint16_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8_8,
+                                   uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_0565_8888,
+                                   uint16_t, 1, uint32_t, 1)
 
-	while (w >= 4)
-	{
-	    asm ("uqadd8 %0, %1, %2"
-		 : "=r" (*(uint32_t*)dst)
-		 : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
-	    dst += 4;
-	    src += 4;
-	    w -= 4;
-	}
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
+                                   uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
+                                   uint32_t, 1, uint32_t, 1)
 
-	while (w)
-	{
-	    s = *src;
-	    d = *dst;
-	    asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
-	    *dst = d;
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888,
+                                     uint32_t, 1, uint32_t, 1)
 
-	    dst++;
-	    src++;
-	    w--;
-	}
-    }
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888,
+                                      uint8_t, 1, uint32_t, 1)
 
-}
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
+                                        uint16_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
+                                        uint32_t, uint32_t)
 
 void
-pixman_composite_over_8888_8888_asm_armv6 (int32_t   width,
-                                           int32_t   height,
-                                           uint32_t *dst_line,
-                                           int32_t   dst_stride,
-                                           uint32_t *src_line,
-                                           int32_t   src_stride)
-{
-    uint32_t    *dst;
-    uint32_t    *src;
-    int32_t w;
-    uint32_t component_half = 0x800080;
-    uint32_t upper_component_mask = 0xff00ff00;
-    uint32_t alpha_mask = 0xff;
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
+pixman_composite_src_n_8888_asm_armv6 (int32_t   w,
+                                       int32_t   h,
+                                       uint32_t *dst,
+                                       int32_t   dst_stride,
+                                       uint32_t  src);
 
-/* #define inner_branch */
-	asm volatile (
-	    "cmp %[w], #0\n\t"
-	    "beq 2f\n\t"
-	    "1:\n\t"
-	    /* load src */
-	    "ldr r5, [%[src]], #4\n\t"
-#ifdef inner_branch
-	    /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
-	     * The 0x0 case also allows us to avoid doing an unecessary data
-	     * write which is more valuable so we only check for that
-	     */
-	    "cmp r5, #0\n\t"
-	    "beq 3f\n\t"
-
-	    /* = 255 - alpha */
-	    "sub r8, %[alpha_mask], r5, lsr #24\n\t"
-
-	    "ldr r4, [%[dest]] \n\t"
-
-#else
-	    "ldr r4, [%[dest]] \n\t"
-
-	    /* = 255 - alpha */
-	    "sub r8, %[alpha_mask], r5, lsr #24\n\t"
-#endif
-	    "uxtb16 r6, r4\n\t"
-	    "uxtb16 r7, r4, ror #8\n\t"
-
-	    /* multiply by 257 and divide by 65536 */
-	    "mla r6, r6, r8, %[component_half]\n\t"
-	    "mla r7, r7, r8, %[component_half]\n\t"
-
-	    "uxtab16 r6, r6, r6, ror #8\n\t"
-	    "uxtab16 r7, r7, r7, ror #8\n\t"
-
-	    /* recombine the 0xff00ff00 bytes of r6 and r7 */
-	    "and r7, r7, %[upper_component_mask]\n\t"
-	    "uxtab16 r6, r7, r6, ror #8\n\t"
-
-	    "uqadd8 r5, r6, r5\n\t"
-
-#ifdef inner_branch
-	    "3:\n\t"
-
-#endif
-	    "str r5, [%[dest]], #4\n\t"
-	    /* increment counter and jmp to top */
-	    "subs	%[w], %[w], #1\n\t"
-	    "bne	1b\n\t"
-	    "2:\n\t"
-	    : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
-	    : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
-	      [alpha_mask] "r" (alpha_mask)
-	    : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
-	    );
-    }
-}
+void
+pixman_composite_src_n_0565_asm_armv6 (int32_t   w,
+                                       int32_t   h,
+                                       uint16_t *dst,
+                                       int32_t   dst_stride,
+                                       uint16_t  src);
 
 void
-pixman_composite_over_8888_n_8888_asm_armv6 (int32_t   width,
-                                             int32_t   height,
-                                             uint32_t *dst_line,
-                                             int32_t   dst_stride,
-                                             uint32_t *src_line,
-                                             int32_t   src_stride,
-                                             uint32_t  mask)
+pixman_composite_src_n_8_asm_armv6 (int32_t   w,
+                                    int32_t   h,
+                                    uint8_t  *dst,
+                                    int32_t   dst_stride,
+                                    uint8_t  src);
+
+static pixman_bool_t
+arm_simd_fill (pixman_implementation_t *imp,
+               uint32_t *               bits,
+               int                      stride, /* in 32-bit words */
+               int                      bpp,
+               int                      x,
+               int                      y,
+               int                      width,
+               int                      height,
+               uint32_t                 _xor)
 {
-    uint32_t *dst;
-    uint32_t *src;
-    int32_t w;
-    uint32_t component_half = 0x800080;
-    uint32_t alpha_mask = 0xff;
-
-    mask = (mask) >> 24;
+    /* stride is always multiple of 32bit units in pixman */
+    uint32_t byte_stride = stride * sizeof(uint32_t);
 
-    while (height--)
+    switch (bpp)
     {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
-
-/* #define inner_branch */
-	asm volatile (
-	    "cmp %[w], #0\n\t"
-	    "beq 2f\n\t"
-	    "1:\n\t"
-	    /* load src */
-	    "ldr r5, [%[src]], #4\n\t"
-#ifdef inner_branch
-	    /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
-	     * The 0x0 case also allows us to avoid doing an unecessary data
-	     * write which is more valuable so we only check for that
-	     */
-	    "cmp r5, #0\n\t"
-	    "beq 3f\n\t"
-
-#endif
-	    "ldr r4, [%[dest]] \n\t"
-
-	    "uxtb16 r6, r5\n\t"
-	    "uxtb16 r7, r5, ror #8\n\t"
-
-	    /* multiply by alpha (r8) then by 257 and divide by 65536 */
-	    "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
-	    "mla r7, r7, %[mask_alpha], %[component_half]\n\t"
-
-	    "uxtab16 r6, r6, r6, ror #8\n\t"
-	    "uxtab16 r7, r7, r7, ror #8\n\t"
-
-	    "uxtb16 r6, r6, ror #8\n\t"
-	    "uxtb16 r7, r7, ror #8\n\t"
-
-	    /* recombine */
-	    "orr r5, r6, r7, lsl #8\n\t"
-
-	    "uxtb16 r6, r4\n\t"
-	    "uxtb16 r7, r4, ror #8\n\t"
-
-	    /* 255 - alpha */
-	    "sub r8, %[alpha_mask], r5, lsr #24\n\t"
-
-	    /* multiply by alpha (r8) then by 257 and divide by 65536 */
-	    "mla r6, r6, r8, %[component_half]\n\t"
-	    "mla r7, r7, r8, %[component_half]\n\t"
-
-	    "uxtab16 r6, r6, r6, ror #8\n\t"
-	    "uxtab16 r7, r7, r7, ror #8\n\t"
-
-	    "uxtb16 r6, r6, ror #8\n\t"
-	    "uxtb16 r7, r7, ror #8\n\t"
-
-	    /* recombine */
-	    "orr r6, r6, r7, lsl #8\n\t"
-
-	    "uqadd8 r5, r6, r5\n\t"
-
-#ifdef inner_branch
-	    "3:\n\t"
-
-#endif
-	    "str r5, [%[dest]], #4\n\t"
-	    /* increment counter and jmp to top */
-	    "subs	%[w], %[w], #1\n\t"
-	    "bne	1b\n\t"
-	    "2:\n\t"
-	    : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
-	    : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
-	      [alpha_mask] "r" (alpha_mask)
-	    : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
-	    );
+    case 8:
+	pixman_composite_src_n_8_asm_armv6 (
+		width,
+		height,
+		(uint8_t *)(((char *) bits) + y * byte_stride + x),
+		byte_stride,
+		_xor & 0xff);
+	return TRUE;
+    case 16:
+	pixman_composite_src_n_0565_asm_armv6 (
+		width,
+		height,
+		(uint16_t *)(((char *) bits) + y * byte_stride + x * 2),
+		byte_stride / 2,
+		_xor & 0xffff);
+	return TRUE;
+    case 32:
+	pixman_composite_src_n_8888_asm_armv6 (
+		width,
+		height,
+		(uint32_t *)(((char *) bits) + y * byte_stride + x * 4),
+		byte_stride / 4,
+		_xor);
+	return TRUE;
+    default:
+	return FALSE;
     }
 }
 
-void
-pixman_composite_over_n_8_8888_asm_armv6 (int32_t   width,
-                                          int32_t   height,
-                                          uint32_t *dst_line,
-                                          int32_t   dst_stride,
-                                          uint32_t  src,
-                                          int32_t   unused,
-                                          uint8_t  *mask_line,
-                                          int32_t   mask_stride)
+static pixman_bool_t
+arm_simd_blt (pixman_implementation_t *imp,
+              uint32_t *               src_bits,
+              uint32_t *               dst_bits,
+              int                      src_stride, /* in 32-bit words */
+              int                      dst_stride, /* in 32-bit words */
+              int                      src_bpp,
+              int                      dst_bpp,
+              int                      src_x,
+              int                      src_y,
+              int                      dest_x,
+              int                      dest_y,
+              int                      width,
+              int                      height)
 {
-    uint32_t  srca;
-    uint32_t *dst;
-    uint8_t  *mask;
-    int32_t w;
-
-    srca = src >> 24;
-
-    uint32_t component_mask = 0xff00ff;
-    uint32_t component_half = 0x800080;
-
-    uint32_t src_hi = (src >> 8) & component_mask;
-    uint32_t src_lo = src & component_mask;
+    if (src_bpp != dst_bpp)
+	return FALSE;
 
-    while (height--)
+    switch (src_bpp)
     {
-	dst = dst_line;
-	dst_line += dst_stride;
-	mask = mask_line;
-	mask_line += mask_stride;
-	w = width;
-
-/* #define inner_branch */
-	asm volatile (
-	    "cmp %[w], #0\n\t"
-	    "beq 2f\n\t"
-	    "1:\n\t"
-	    /* load mask */
-	    "ldrb r5, [%[mask]], #1\n\t"
-#ifdef inner_branch
-	    /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
-	     * The 0x0 case also allows us to avoid doing an unecessary data
-	     * write which is more valuable so we only check for that
-	     */
-	    "cmp r5, #0\n\t"
-	    "beq 3f\n\t"
-
-#endif
-	    "ldr r4, [%[dest]] \n\t"
-
-	    /* multiply by alpha (r8) then by 257 and divide by 65536 */
-	    "mla r6, %[src_lo], r5, %[component_half]\n\t"
-	    "mla r7, %[src_hi], r5, %[component_half]\n\t"
-
-	    "uxtab16 r6, r6, r6, ror #8\n\t"
-	    "uxtab16 r7, r7, r7, ror #8\n\t"
-
-	    "uxtb16 r6, r6, ror #8\n\t"
-	    "uxtb16 r7, r7, ror #8\n\t"
-
-	    /* recombine */
-	    "orr r5, r6, r7, lsl #8\n\t"
-
-	    "uxtb16 r6, r4\n\t"
-	    "uxtb16 r7, r4, ror #8\n\t"
-
-	    /* we could simplify this to use 'sub' if we were
-	     * willing to give up a register for alpha_mask
-	     */
-	    "mvn r8, r5\n\t"
-	    "mov r8, r8, lsr #24\n\t"
-
-	    /* multiply by alpha (r8) then by 257 and divide by 65536 */
-	    "mla r6, r6, r8, %[component_half]\n\t"
-	    "mla r7, r7, r8, %[component_half]\n\t"
-
-	    "uxtab16 r6, r6, r6, ror #8\n\t"
-	    "uxtab16 r7, r7, r7, ror #8\n\t"
-
-	    "uxtb16 r6, r6, ror #8\n\t"
-	    "uxtb16 r7, r7, ror #8\n\t"
-
-	    /* recombine */
-	    "orr r6, r6, r7, lsl #8\n\t"
-
-	    "uqadd8 r5, r6, r5\n\t"
-
-#ifdef inner_branch
-	    "3:\n\t"
-
-#endif
-	    "str r5, [%[dest]], #4\n\t"
-	    /* increment counter and jmp to top */
-	    "subs	%[w], %[w], #1\n\t"
-	    "bne	1b\n\t"
-	    "2:\n\t"
-	    : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)
-	    : [component_half] "r" (component_half),
-	      [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
-	    : "r4", "r5", "r6", "r7", "r8", "cc", "memory");
+    case 8:
+        pixman_composite_src_8_8_asm_armv6 (
+                width, height,
+                (uint8_t *)(((char *) dst_bits) +
+                dest_y * dst_stride * 4 + dest_x * 1), dst_stride * 4,
+                (uint8_t *)(((char *) src_bits) +
+                src_y * src_stride * 4 + src_x * 1), src_stride * 4);
+        return TRUE;
+    case 16:
+	pixman_composite_src_0565_0565_asm_armv6 (
+		width, height,
+		(uint16_t *)(((char *) dst_bits) +
+		dest_y * dst_stride * 4 + dest_x * 2), dst_stride * 2,
+		(uint16_t *)(((char *) src_bits) +
+		src_y * src_stride * 4 + src_x * 2), src_stride * 2);
+	return TRUE;
+    case 32:
+	pixman_composite_src_8888_8888_asm_armv6 (
+		width, height,
+		(uint32_t *)(((char *) dst_bits) +
+		dest_y * dst_stride * 4 + dest_x * 4), dst_stride,
+		(uint32_t *)(((char *) src_bits) +
+		src_y * src_stride * 4 + src_x * 4), src_stride);
+	return TRUE;
+    default:
+	return FALSE;
     }
 }
 
-#endif
-
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
-                                   uint8_t, 1, uint8_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
-                                   uint32_t, 1, uint32_t, 1)
-
-PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888,
-                                     uint32_t, 1, uint32_t, 1)
-
-PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888,
-                                      uint8_t, 1, uint32_t, 1)
-
-PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
-                                        uint16_t, uint16_t)
-PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
-                                        uint32_t, uint32_t)
-
 static const pixman_fast_path_t arm_simd_fast_paths[] =
 {
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, armv6_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, armv6_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, armv6_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, armv6_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, armv6_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, armv6_composite_src_8888_8888),
+
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, armv6_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, armv6_composite_src_x888_8888),
+
+    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, armv6_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, armv6_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, a1r5g5b5, armv6_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a1b5g5r5, null, a1b5g5r5, armv6_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, armv6_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a1b5g5r5, null, x1b5g5r5, armv6_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, armv6_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x1b5g5r5, null, x1b5g5r5, armv6_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a4r4g4b4, null, a4r4g4b4, armv6_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a4b4g4r4, null, a4b4g4r4, armv6_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a4r4g4b4, null, x4r4g4b4, armv6_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a4b4g4r4, null, x4b4g4r4, armv6_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x4r4g4b4, null, x4r4g4b4, armv6_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x4b4g4r4, null, x4b4g4r4, armv6_composite_src_0565_0565),
+
+    PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, armv6_composite_src_8_8),
+    PIXMAN_STD_FAST_PATH (SRC, r3g3b2, null, r3g3b2, armv6_composite_src_8_8),
+    PIXMAN_STD_FAST_PATH (SRC, b2g3r3, null, b2g3r3, armv6_composite_src_8_8),
+    PIXMAN_STD_FAST_PATH (SRC, a2r2g2b2, null, a2r2g2b2, armv6_composite_src_8_8),
+    PIXMAN_STD_FAST_PATH (SRC, a2b2g2r2, null, a2b2g2r2, armv6_composite_src_8_8),
+    PIXMAN_STD_FAST_PATH (SRC, c8, null, c8, armv6_composite_src_8_8),
+    PIXMAN_STD_FAST_PATH (SRC, g8, null, g8, armv6_composite_src_8_8),
+    PIXMAN_STD_FAST_PATH (SRC, x4a4, null, x4a4, armv6_composite_src_8_8),
+    PIXMAN_STD_FAST_PATH (SRC, x4c4, null, x4c4, armv6_composite_src_8_8),
+    PIXMAN_STD_FAST_PATH (SRC, x4g4, null, x4g4, armv6_composite_src_8_8),
+
+    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, a8r8g8b8, armv6_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, x8r8g8b8, armv6_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, a8b8g8r8, armv6_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, x8b8g8r8, armv6_composite_src_0565_8888),
+
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888),
@@ -428,5 +250,8 @@ _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
 {
     pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths);
 
+    imp->blt = arm_simd_blt;
+    imp->fill = arm_simd_fill;
+
     return imp;
 }
diff --git a/pixman/pixman/pixman-combine-float.c b/pixman/pixman/pixman-combine-float.c
index c916df8a3..06ce2037e 100644
--- a/pixman/pixman/pixman-combine-float.c
+++ b/pixman/pixman/pixman-combine-float.c
@@ -653,10 +653,12 @@ clip_color (rgb_t *color, float a)
     float l = get_lum (color);
     float n = channel_min (color);
     float x = channel_max (color);
+    float t;
 
     if (n < 0.0f)
     {
-	if ((l - n) < 4 * FLT_EPSILON)
+	t = l - n;
+	if (IS_ZERO (t))
 	{
 	    color->r = 0.0f;
 	    color->g = 0.0f;
@@ -664,14 +666,15 @@ clip_color (rgb_t *color, float a)
 	}
 	else
 	{
-	    color->r = l + (((color->r - l) * l) / (l - n));
-	    color->g = l + (((color->g - l) * l) / (l - n));
-	    color->b = l + (((color->b - l) * l) / (l - n));
+	    color->r = l + (((color->r - l) * l) / t);
+	    color->g = l + (((color->g - l) * l) / t);
+	    color->b = l + (((color->b - l) * l) / t);
 	}
     }
     if (x > a)
     {
-	if ((x - l) < 4 * FLT_EPSILON)
+	t = x - l;
+	if (IS_ZERO (t))
 	{
 	    color->r = a;
 	    color->g = a;
@@ -679,9 +682,9 @@ clip_color (rgb_t *color, float a)
 	}
 	else
 	{
-	    color->r = l + (((color->r - l) * (a - l) / (x - l)));
-	    color->g = l + (((color->g - l) * (a - l) / (x - l)));
-	    color->b = l + (((color->b - l) * (a - l) / (x - l)));
+	    color->r = l + (((color->r - l) * (a - l) / t));
+	    color->g = l + (((color->g - l) * (a - l) / t));
+	    color->b = l + (((color->b - l) * (a - l) / t));
 	}
     }
 }
@@ -702,6 +705,7 @@ static void
 set_sat (rgb_t *src, float sat)
 {
     float *max, *mid, *min;
+    float t;
 
     if (src->r > src->g)
     {
@@ -752,14 +756,16 @@ set_sat (rgb_t *src, float sat)
 	}
     }
 
-    if (*max > *min)
+    t = *max - *min;
+
+    if (IS_ZERO (t))
     {
-	*mid = (((*mid - *min) * sat) / (*max - *min));
-	*max = sat;
+	*mid = *max = 0.0f;
     }
     else
     {
-	*mid = *max = 0.0f;
+	*mid = ((*mid - *min) * sat) / t;
+	*max = sat;
     }
 
     *min = 0.0f;
diff --git a/pixman/test/lowlevel-blt-bench.c b/pixman/test/lowlevel-blt-bench.c
index 7336fa0d5..8e80b4280 100644
--- a/pixman/test/lowlevel-blt-bench.c
+++ b/pixman/test/lowlevel-blt-bench.c
@@ -33,6 +33,14 @@
 #define L1CACHE_SIZE (8 * 1024)
 #define L2CACHE_SIZE (128 * 1024)
 
+/* This is applied to both L1 and L2 tests - alternatively, you could
+ * parameterise bench_L or split it into two functions. It could be
+ * read at runtime on some architectures, but it only really matters
+ * that it's a number that's an integer divisor of both cacheline
+ * lengths, and further, it only really matters for caches that don't
+ * do allocate0on-write. */
+#define CACHELINE_LENGTH (32) /* bytes */
+
 #define WIDTH  1920
 #define HEIGHT 1080
 #define BUFSIZE (WIDTH * HEIGHT * 4)
@@ -168,18 +176,29 @@ bench_L  (pixman_op_t              op,
           int                      width,
           int                      lines_count)
 {
-    int64_t      i, j;
+    int64_t      i, j, k;
     int          x = 0;
     int          q = 0;
     volatile int qx;
 
     for (i = 0; i < n; i++)
     {
-	/* touch destination buffer to fetch it into L1 cache */
-	for (j = 0; j < width + 64; j += 16) {
-	    q += dst[j];
-	    q += src[j];
-	}
+        /* For caches without allocate-on-write, we need to force the
+         * destination buffer back into the cache on each iteration,
+         * otherwise if they are evicted during the test, they remain
+         * uncached. This doesn't matter for tests which read the
+         * destination buffer, or for caches that do allocate-on-write,
+         * but in those cases this loop just adds constant time, which
+         * should be successfully cancelled out.
+         */
+        for (j = 0; j < lines_count; j++)
+        {
+            for (k = 0; k < width + 62; k += CACHELINE_LENGTH / sizeof *dst)
+            {
+                q += dst[j * WIDTH + k];
+            }
+            q += dst[j * WIDTH + width + 62];
+        }
 	if (++x >= 64)
 	    x = 0;
 	call_func (func, op, src_img, mask_img, dst_img, x, 0, x, 0, 63 - x, 0, width, lines_count);
diff --git a/pixman/test/stress-test.c b/pixman/test/stress-test.c
index 9d949afad..1f03c7543 100644
--- a/pixman/test/stress-test.c
+++ b/pixman/test/stress-test.c
@@ -205,13 +205,21 @@ rand_y (pixman_image_t *image)
 	return log_rand ();
 }
 
+typedef enum
+{
+    DONT_CARE,
+    PREFER_ALPHA,
+    REQUIRE_ALPHA
+} alpha_preference_t;
+
 static pixman_format_code_t
-random_format (pixman_bool_t prefer_alpha)
+random_format (alpha_preference_t alpha)
 {
     pixman_format_code_t format;
     int n = prng_rand_n (ARRAY_LENGTH (image_formats));
 
-    if (prefer_alpha && prng_rand_n (4) != 0)
+    if (alpha >= PREFER_ALPHA &&
+	(alpha == REQUIRE_ALPHA || prng_rand_n (4) != 0))
     {
         do
         {
@@ -227,7 +235,7 @@ random_format (pixman_bool_t prefer_alpha)
 }
 
 static pixman_image_t *
-create_random_bits_image (pixman_bool_t prefer_alpha)
+create_random_bits_image (alpha_preference_t alpha_preference)
 {
     pixman_format_code_t format;
     pixman_indexed_t *indexed;
@@ -241,7 +249,7 @@ create_random_bits_image (pixman_bool_t prefer_alpha)
     int n_coefficients = 0;
 
     /* format */
-    format = random_format (prefer_alpha);
+    format = random_format (alpha_preference);
 
     indexed = NULL;
     if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_COLOR)
@@ -410,7 +418,7 @@ set_general_properties (pixman_image_t *image, pixman_bool_t allow_alpha_map)
 	pixman_image_t *alpha_map;
 	int16_t x, y;
 
-	alpha_map = create_random_bits_image (FALSE);
+	alpha_map = create_random_bits_image (DONT_CARE);
 
 	if (alpha_map)
 	{
@@ -493,7 +501,7 @@ set_general_properties (pixman_image_t *image, pixman_bool_t allow_alpha_map)
 		    width = INT32_MAX - x;
 		if (height + y < y)
 		    height = INT32_MAX - y;
-		
+
 		pixman_region32_union_rect (
 		    &region, &region, x, y, width, height);
 	    }
@@ -716,7 +724,7 @@ create_random_image (void)
     {
     default:
     case 0:
-	result = create_random_bits_image (FALSE);
+	result = create_random_bits_image (DONT_CARE);
 	break;
 
     case 1:
@@ -848,62 +856,66 @@ run_test (uint32_t seed, pixman_bool_t verbose, uint32_t mod)
     }
 
     source = mask = dest = NULL;
-    
+
     prng_srand (seed);
 
     if (prng_rand_n (8) == 0)
     {
         int n_traps;
         pixman_trapezoid_t *trapezoids;
+	int p = prng_rand_n (3);
 
-        dest = create_random_bits_image (TRUE);
+	if (p == 0)
+	    dest = create_random_bits_image (DONT_CARE);
+	else
+	    dest = create_random_bits_image (REQUIRE_ALPHA);
 
-        if (dest)
-        {
-            set_general_properties (dest, TRUE);
+	if (!dest)
+	    goto out;
+
+	set_general_properties (dest, TRUE);
+
+	if (!(trapezoids = create_random_trapezoids (
+		  &n_traps, dest->bits.width, dest->bits.height)))
+	{
+	    goto out;
+	}
 
-            trapezoids = create_random_trapezoids (
-                &n_traps, dest->bits.width, dest->bits.height);
-
-            if (trapezoids)
-            {
-                switch (prng_rand_n (3))
-                {
-                case 0:
-                    pixman_rasterize_trapezoid (
-                        dest, &trapezoids[prng_rand_n (n_traps)],
-                        rand_x (dest), rand_y (dest));
-                    break;
-
-                case 1:
-                    source = create_random_image ();
-
-                    if (source)
-                    {
-                        op = op_list [prng_rand_n (ARRAY_LENGTH (op_list))];
-                        
-                        pixman_composite_trapezoids (
-                            op, source, dest,
-                            random_format (TRUE),
-                            rand_x (source), rand_y (source),
-                            rand_x (dest), rand_y (dest),
-                            n_traps, trapezoids);
-                    }
-                    break;
-
-                case 2:
-                    pixman_add_trapezoids (
-                        dest, rand_x (dest), rand_y (dest), n_traps, trapezoids);
-                    break;
-                }
-
-                free (trapezoids);
-            }
+	switch (p)
+	{
+	case 0:
+	    source = create_random_image ();
+
+	    if (source)
+	    {
+		op = op_list [prng_rand_n (ARRAY_LENGTH (op_list))];
+
+		pixman_composite_trapezoids (
+		    op, source, dest,
+		    random_format (REQUIRE_ALPHA),
+		    rand_x (source), rand_y (source),
+		    rand_x (dest), rand_y (dest),
+		    n_traps, trapezoids);
+	    }
+	    break;
+
+	case 1:
+	    pixman_rasterize_trapezoid (
+		dest, &trapezoids[prng_rand_n (n_traps)],
+		rand_x (dest), rand_y (dest));
+	    break;
+
+	case 2:
+	    pixman_add_trapezoids (
+		dest, rand_x (dest), rand_y (dest), n_traps, trapezoids);
+	    break;
         }
+
+	free (trapezoids);
     }
     else
     {
-        dest = create_random_bits_image (FALSE);
+        dest = create_random_bits_image (DONT_CARE);
         source = create_random_image ();
         mask = create_random_image ();
 
@@ -917,16 +929,17 @@ run_test (uint32_t seed, pixman_bool_t verbose, uint32_t mod)
                                       source, mask, dest,
                                       rand_x (source), rand_y (source),
                                       rand_x (mask), rand_y (mask),
-                                      0, 0, 
+                                      0, 0,
                                       dest->bits.width,
                                       dest->bits.height);
         }
     }
 
-        if (source)
-            pixman_image_unref (source);
-        if (mask)
-            pixman_image_unref (mask);
+out:
+    if (source)
+	pixman_image_unref (source);
+    if (mask)
+	pixman_image_unref (mask);
     if (dest)
 	pixman_image_unref (dest);
 }