aboutsummaryrefslogtreecommitdiff
path: root/pixman/pixman/pixman-arm-simd-asm.S
diff options
context:
space:
mode:
Diffstat (limited to 'pixman/pixman/pixman-arm-simd-asm.S')
-rw-r--r--pixman/pixman/pixman-arm-simd-asm.S877
1 files changed, 439 insertions, 438 deletions
diff --git a/pixman/pixman/pixman-arm-simd-asm.S b/pixman/pixman/pixman-arm-simd-asm.S
index e00836b6c..8fe1b5038 100644
--- a/pixman/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman/pixman-arm-simd-asm.S
@@ -1,438 +1,439 @@
-/*
- * Copyright © 2008 Mozilla Corporation
- * Copyright © 2010 Nokia Corporation
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Mozilla Corporation not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission. Mozilla Corporation makes no
- * representations about the suitability of this software for any purpose. It
- * is provided "as is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- *
- * Author: Jeff Muizelaar (jeff@infidigm.net)
- *
- */
-
-/* Prevent the stack from becoming executable */
-#if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack,"",%progbits
-#endif
-
- .text
- .arch armv6
- .object_arch armv4
- .arm
- .altmacro
-
-/* Supplementary macro for setting function attributes */
-.macro pixman_asm_function fname
- .func fname
- .global fname
-#ifdef __ELF__
- .hidden fname
- .type fname, %function
-#endif
-fname:
-.endm
-
-/*
- * The code below was generated by gcc 4.3.4 from the commented out
- * functions in 'pixman-arm-simd.c' file with the following optimization
- * options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer"
- *
- * TODO: replace gcc generated code with hand tuned versions because
- * the code quality is not very good, introduce symbolic register
- * aliases for better readability and maintainability.
- */
-
-pixman_asm_function pixman_composite_add_8_8_asm_armv6
- push {r4, r5, r6, r7, r8, r9, r10, r11}
- mov r10, r1
- sub sp, sp, #4
- subs r10, r10, #1
- mov r11, r0
- mov r8, r2
- str r3, [sp]
- ldr r7, [sp, #36]
- bcc 0f
-6: cmp r11, #0
- beq 1f
- orr r3, r8, r7
- tst r3, #3
- beq 2f
- mov r1, r8
- mov r0, r7
- mov r12, r11
- b 3f
-5: tst r3, #3
- beq 4f
-3: ldrb r2, [r0], #1
- subs r12, r12, #1
- ldrb r3, [r1]
- uqadd8 r3, r2, r3
- strb r3, [r1], #1
- orr r3, r1, r0
- bne 5b
-1: ldr r3, [sp]
- add r8, r8, r3
- ldr r3, [sp, #40]
- add r7, r7, r3
-10: subs r10, r10, #1
- bcs 6b
-0: add sp, sp, #4
- pop {r4, r5, r6, r7, r8, r9, r10, r11}
- bx lr
-2: mov r12, r11
- mov r1, r8
- mov r0, r7
-4: cmp r12, #3
- subgt r6, r12, #4
- movgt r9, r12
- lsrgt r5, r6, #2
- addgt r3, r5, #1
- movgt r12, #0
- lslgt r4, r3, #2
- ble 7f
-8: ldr r3, [r0, r12]
- ldr r2, [r1, r12]
- uqadd8 r3, r3, r2
- str r3, [r1, r12]
- add r12, r12, #4
- cmp r12, r4
- bne 8b
- sub r3, r9, #4
- bic r3, r3, #3
- add r3, r3, #4
- subs r12, r6, r5, lsl #2
- add r1, r1, r3
- add r0, r0, r3
- beq 1b
-7: mov r4, #0
-9: ldrb r3, [r1, r4]
- ldrb r2, [r0, r4]
- uqadd8 r3, r2, r3
- strb r3, [r1, r4]
- add r4, r4, #1
- cmp r4, r12
- bne 9b
- ldr r3, [sp]
- add r8, r8, r3
- ldr r3, [sp, #40]
- add r7, r7, r3
- b 10b
-.endfunc
-
-pixman_asm_function pixman_composite_over_8888_8888_asm_armv6
- push {r4, r5, r6, r7, r8, r9, r10, r11}
- sub sp, sp, #20
- cmp r1, #0
- mov r12, r2
- str r1, [sp, #12]
- str r0, [sp, #16]
- ldr r2, [sp, #52]
- beq 0f
- lsl r3, r3, #2
- str r3, [sp]
- ldr r3, [sp, #56]
- mov r10, #0
- lsl r3, r3, #2
- str r3, [sp, #8]
- mov r11, r3
- b 1f
-6: ldr r11, [sp, #8]
-1: ldr r9, [sp]
- mov r0, r12
- add r12, r12, r9
- mov r1, r2
- str r12, [sp, #4]
- add r2, r2, r11
- ldr r12, [sp, #16]
- ldr r3, =0x00800080
- ldr r9, =0xff00ff00
- mov r11, #255
- cmp r12, #0
- beq 4f
-5: ldr r5, [r1], #4
- ldr r4, [r0]
- sub r8, r11, r5, lsr #24
- uxtb16 r6, r4
- uxtb16 r7, r4, ror #8
- mla r6, r6, r8, r3
- mla r7, r7, r8, r3
- uxtab16 r6, r6, r6, ror #8
- uxtab16 r7, r7, r7, ror #8
- and r7, r7, r9
- uxtab16 r6, r7, r6, ror #8
- uqadd8 r5, r6, r5
- str r5, [r0], #4
- subs r12, r12, #1
- bne 5b
-4: ldr r3, [sp, #12]
- add r10, r10, #1
- cmp r10, r3
- ldr r12, [sp, #4]
- bne 6b
-0: add sp, sp, #20
- pop {r4, r5, r6, r7, r8, r9, r10, r11}
- bx lr
-.endfunc
-
-pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6
- push {r4, r5, r6, r7, r8, r9, r10, r11}
- sub sp, sp, #28
- cmp r1, #0
- str r1, [sp, #12]
- ldrb r1, [sp, #71]
- mov r12, r2
- str r0, [sp, #16]
- ldr r2, [sp, #60]
- str r1, [sp, #24]
- beq 0f
- lsl r3, r3, #2
- str r3, [sp, #20]
- ldr r3, [sp, #64]
- mov r10, #0
- lsl r3, r3, #2
- str r3, [sp, #8]
- mov r11, r3
- b 1f
-5: ldr r11, [sp, #8]
-1: ldr r4, [sp, #20]
- mov r0, r12
- mov r1, r2
- add r12, r12, r4
- add r2, r2, r11
- str r12, [sp]
- str r2, [sp, #4]
- ldr r12, [sp, #16]
- ldr r2, =0x00800080
- ldr r3, [sp, #24]
- mov r11, #255
- cmp r12, #0
- beq 3f
-4: ldr r5, [r1], #4
- ldr r4, [r0]
- uxtb16 r6, r5
- uxtb16 r7, r5, ror #8
- mla r6, r6, r3, r2
- mla r7, r7, r3, r2
- uxtab16 r6, r6, r6, ror #8
- uxtab16 r7, r7, r7, ror #8
- uxtb16 r6, r6, ror #8
- uxtb16 r7, r7, ror #8
- orr r5, r6, r7, lsl #8
- uxtb16 r6, r4
- uxtb16 r7, r4, ror #8
- sub r8, r11, r5, lsr #24
- mla r6, r6, r8, r2
- mla r7, r7, r8, r2
- uxtab16 r6, r6, r6, ror #8
- uxtab16 r7, r7, r7, ror #8
- uxtb16 r6, r6, ror #8
- uxtb16 r7, r7, ror #8
- orr r6, r6, r7, lsl #8
- uqadd8 r5, r6, r5
- str r5, [r0], #4
- subs r12, r12, #1
- bne 4b
-3: ldr r1, [sp, #12]
- add r10, r10, #1
- cmp r10, r1
- ldr r12, [sp]
- ldr r2, [sp, #4]
- bne 5b
-0: add sp, sp, #28
- pop {r4, r5, r6, r7, r8, r9, r10, r11}
- bx lr
-.endfunc
-
-pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
- push {r4, r5, r6, r7, r8, r9, r10, r11}
- sub sp, sp, #28
- cmp r1, #0
- ldr r9, [sp, #60]
- str r1, [sp, #12]
- bic r1, r9, #-16777216
- str r1, [sp, #20]
- mov r12, r2
- lsr r1, r9, #8
- ldr r2, [sp, #20]
- bic r1, r1, #-16777216
- bic r2, r2, #65280
- bic r1, r1, #65280
- str r2, [sp, #20]
- str r0, [sp, #16]
- str r1, [sp, #4]
- ldr r2, [sp, #68]
- beq 0f
- lsl r3, r3, #2
- str r3, [sp, #24]
- mov r0, #0
- b 1f
-5: ldr r3, [sp, #24]
-1: ldr r4, [sp, #72]
- mov r10, r12
- mov r1, r2
- add r12, r12, r3
- add r2, r2, r4
- str r12, [sp, #8]
- str r2, [sp]
- ldr r12, [sp, #16]
- ldr r11, =0x00800080
- ldr r2, [sp, #4]
- ldr r3, [sp, #20]
- cmp r12, #0
- beq 3f
-4: ldrb r5, [r1], #1
- ldr r4, [r10]
- mla r6, r3, r5, r11
- mla r7, r2, r5, r11
- uxtab16 r6, r6, r6, ror #8
- uxtab16 r7, r7, r7, ror #8
- uxtb16 r6, r6, ror #8
- uxtb16 r7, r7, ror #8
- orr r5, r6, r7, lsl #8
- uxtb16 r6, r4
- uxtb16 r7, r4, ror #8
- mvn r8, r5
- lsr r8, r8, #24
- mla r6, r6, r8, r11
- mla r7, r7, r8, r11
- uxtab16 r6, r6, r6, ror #8
- uxtab16 r7, r7, r7, ror #8
- uxtb16 r6, r6, ror #8
- uxtb16 r7, r7, ror #8
- orr r6, r6, r7, lsl #8
- uqadd8 r5, r6, r5
- str r5, [r10], #4
- subs r12, r12, #1
- bne 4b
-3: ldr r4, [sp, #12]
- add r0, r0, #1
- cmp r0, r4
- ldr r12, [sp, #8]
- ldr r2, [sp]
- bne 5b
-0: add sp, sp, #28
- pop {r4, r5, r6, r7, r8, r9, r10, r11}
- bx lr
-.endfunc
-
-/*
- * Note: This code is only using armv5te instructions (not even armv6),
- * but is scheduled for ARM Cortex-A8 pipeline. So it might need to
- * be split into a few variants, tuned for each microarchitecture.
- *
- * TODO: In order to get good performance on ARM9/ARM11 cores (which don't
- * have efficient write combining), it needs to be changed to use 16-byte
- * aligned writes using STM instruction.
- *
- * Nearest scanline scaler macro template uses the following arguments:
- * fname - name of the function to generate
- * bpp_shift - (1 << bpp_shift) is the size of pixel in bytes
- * t - type suffix for LDR/STR instructions
- * prefetch_distance - prefetch in the source image by that many
- * pixels ahead
- * prefetch_braking_distance - stop prefetching when that many pixels are
- * remaining before the end of scanline
- */
-
-.macro generate_nearest_scanline_func fname, bpp_shift, t, \
- prefetch_distance, \
- prefetch_braking_distance
-
-pixman_asm_function fname
- W .req r0
- DST .req r1
- SRC .req r2
- VX .req r3
- UNIT_X .req ip
- TMP1 .req r4
- TMP2 .req r5
- VXMASK .req r6
- PF_OFFS .req r7
-
- ldr UNIT_X, [sp]
- push {r4, r5, r6, r7}
- mvn VXMASK, #((1 << bpp_shift) - 1)
-
- /* define helper macro */
- .macro scale_2_pixels
- ldr&t TMP1, [SRC, TMP1]
- and TMP2, VXMASK, VX, lsr #(16 - bpp_shift)
- add VX, VX, UNIT_X
- str&t TMP1, [DST], #(1 << bpp_shift)
-
- ldr&t TMP2, [SRC, TMP2]
- and TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
- add VX, VX, UNIT_X
- str&t TMP2, [DST], #(1 << bpp_shift)
- .endm
-
- /* now do the scaling */
- and TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
- add VX, VX, UNIT_X
- subs W, W, #(8 + prefetch_braking_distance)
- blt 2f
- /* calculate prefetch offset */
- mov PF_OFFS, #prefetch_distance
- mla PF_OFFS, UNIT_X, PF_OFFS, VX
-1: /* main loop, process 8 pixels per iteration with prefetch */
- subs W, W, #8
- add PF_OFFS, UNIT_X, lsl #3
- scale_2_pixels
- scale_2_pixels
- scale_2_pixels
- scale_2_pixels
- pld [SRC, PF_OFFS, lsr #(16 - bpp_shift)]
- bge 1b
-2:
- subs W, W, #(4 - 8 - prefetch_braking_distance)
- blt 2f
-1: /* process the remaining pixels */
- scale_2_pixels
- scale_2_pixels
- subs W, W, #4
- bge 1b
-2:
- tst W, #2
- beq 2f
- scale_2_pixels
-2:
- tst W, #1
- ldrne&t TMP1, [SRC, TMP1]
- strne&t TMP1, [DST]
- /* cleanup helper macro */
- .purgem scale_2_pixels
- .unreq DST
- .unreq SRC
- .unreq W
- .unreq VX
- .unreq UNIT_X
- .unreq TMP1
- .unreq TMP2
- .unreq VXMASK
- .unreq PF_OFFS
- /* return */
- pop {r4, r5, r6, r7}
- bx lr
-.endfunc
-.endm
-
-generate_nearest_scanline_func \
- pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32
-
-generate_nearest_scanline_func \
- pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32
+/*
+ * Copyright © 2008 Mozilla Corporation
+ * Copyright © 2010 Nokia Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Jeff Muizelaar (jeff@infidigm.net)
+ *
+ */
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+ .text
+ .arch armv6
+ .object_arch armv4
+ .arm
+ .altmacro
+ .p2align 2
+
+/* Supplementary macro for setting function attributes */
+.macro pixman_asm_function fname
+ .func fname
+ .global fname
+#ifdef __ELF__
+ .hidden fname
+ .type fname, %function
+#endif
+fname:
+.endm
+
+/*
+ * The code below was generated by gcc 4.3.4 from the commented out
+ * functions in 'pixman-arm-simd.c' file with the following optimization
+ * options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer"
+ *
+ * TODO: replace gcc generated code with hand tuned versions because
+ * the code quality is not very good, introduce symbolic register
+ * aliases for better readability and maintainability.
+ */
+
+pixman_asm_function pixman_composite_add_8_8_asm_armv6
+ push {r4, r5, r6, r7, r8, r9, r10, r11}
+ mov r10, r1
+ sub sp, sp, #4
+ subs r10, r10, #1
+ mov r11, r0
+ mov r8, r2
+ str r3, [sp]
+ ldr r7, [sp, #36]
+ bcc 0f
+6: cmp r11, #0
+ beq 1f
+ orr r3, r8, r7
+ tst r3, #3
+ beq 2f
+ mov r1, r8
+ mov r0, r7
+ mov r12, r11
+ b 3f
+5: tst r3, #3
+ beq 4f
+3: ldrb r2, [r0], #1
+ subs r12, r12, #1
+ ldrb r3, [r1]
+ uqadd8 r3, r2, r3
+ strb r3, [r1], #1
+ orr r3, r1, r0
+ bne 5b
+1: ldr r3, [sp]
+ add r8, r8, r3
+ ldr r3, [sp, #40]
+ add r7, r7, r3
+10: subs r10, r10, #1
+ bcs 6b
+0: add sp, sp, #4
+ pop {r4, r5, r6, r7, r8, r9, r10, r11}
+ bx lr
+2: mov r12, r11
+ mov r1, r8
+ mov r0, r7
+4: cmp r12, #3
+ subgt r6, r12, #4
+ movgt r9, r12
+ lsrgt r5, r6, #2
+ addgt r3, r5, #1
+ movgt r12, #0
+ lslgt r4, r3, #2
+ ble 7f
+8: ldr r3, [r0, r12]
+ ldr r2, [r1, r12]
+ uqadd8 r3, r3, r2
+ str r3, [r1, r12]
+ add r12, r12, #4
+ cmp r12, r4
+ bne 8b
+ sub r3, r9, #4
+ bic r3, r3, #3
+ add r3, r3, #4
+ subs r12, r6, r5, lsl #2
+ add r1, r1, r3
+ add r0, r0, r3
+ beq 1b
+7: mov r4, #0
+9: ldrb r3, [r1, r4]
+ ldrb r2, [r0, r4]
+ uqadd8 r3, r2, r3
+ strb r3, [r1, r4]
+ add r4, r4, #1
+ cmp r4, r12
+ bne 9b
+ ldr r3, [sp]
+ add r8, r8, r3
+ ldr r3, [sp, #40]
+ add r7, r7, r3
+ b 10b
+.endfunc
+
+pixman_asm_function pixman_composite_over_8888_8888_asm_armv6
+ push {r4, r5, r6, r7, r8, r9, r10, r11}
+ sub sp, sp, #20
+ cmp r1, #0
+ mov r12, r2
+ str r1, [sp, #12]
+ str r0, [sp, #16]
+ ldr r2, [sp, #52]
+ beq 0f
+ lsl r3, r3, #2
+ str r3, [sp]
+ ldr r3, [sp, #56]
+ mov r10, #0
+ lsl r3, r3, #2
+ str r3, [sp, #8]
+ mov r11, r3
+ b 1f
+6: ldr r11, [sp, #8]
+1: ldr r9, [sp]
+ mov r0, r12
+ add r12, r12, r9
+ mov r1, r2
+ str r12, [sp, #4]
+ add r2, r2, r11
+ ldr r12, [sp, #16]
+ ldr r3, =0x00800080
+ ldr r9, =0xff00ff00
+ mov r11, #255
+ cmp r12, #0
+ beq 4f
+5: ldr r5, [r1], #4
+ ldr r4, [r0]
+ sub r8, r11, r5, lsr #24
+ uxtb16 r6, r4
+ uxtb16 r7, r4, ror #8
+ mla r6, r6, r8, r3
+ mla r7, r7, r8, r3
+ uxtab16 r6, r6, r6, ror #8
+ uxtab16 r7, r7, r7, ror #8
+ and r7, r7, r9
+ uxtab16 r6, r7, r6, ror #8
+ uqadd8 r5, r6, r5
+ str r5, [r0], #4
+ subs r12, r12, #1
+ bne 5b
+4: ldr r3, [sp, #12]
+ add r10, r10, #1
+ cmp r10, r3
+ ldr r12, [sp, #4]
+ bne 6b
+0: add sp, sp, #20
+ pop {r4, r5, r6, r7, r8, r9, r10, r11}
+ bx lr
+.endfunc
+
+pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6
+ push {r4, r5, r6, r7, r8, r9, r10, r11}
+ sub sp, sp, #28
+ cmp r1, #0
+ str r1, [sp, #12]
+ ldrb r1, [sp, #71]
+ mov r12, r2
+ str r0, [sp, #16]
+ ldr r2, [sp, #60]
+ str r1, [sp, #24]
+ beq 0f
+ lsl r3, r3, #2
+ str r3, [sp, #20]
+ ldr r3, [sp, #64]
+ mov r10, #0
+ lsl r3, r3, #2
+ str r3, [sp, #8]
+ mov r11, r3
+ b 1f
+5: ldr r11, [sp, #8]
+1: ldr r4, [sp, #20]
+ mov r0, r12
+ mov r1, r2
+ add r12, r12, r4
+ add r2, r2, r11
+ str r12, [sp]
+ str r2, [sp, #4]
+ ldr r12, [sp, #16]
+ ldr r2, =0x00800080
+ ldr r3, [sp, #24]
+ mov r11, #255
+ cmp r12, #0
+ beq 3f
+4: ldr r5, [r1], #4
+ ldr r4, [r0]
+ uxtb16 r6, r5
+ uxtb16 r7, r5, ror #8
+ mla r6, r6, r3, r2
+ mla r7, r7, r3, r2
+ uxtab16 r6, r6, r6, ror #8
+ uxtab16 r7, r7, r7, ror #8
+ uxtb16 r6, r6, ror #8
+ uxtb16 r7, r7, ror #8
+ orr r5, r6, r7, lsl #8
+ uxtb16 r6, r4
+ uxtb16 r7, r4, ror #8
+ sub r8, r11, r5, lsr #24
+ mla r6, r6, r8, r2
+ mla r7, r7, r8, r2
+ uxtab16 r6, r6, r6, ror #8
+ uxtab16 r7, r7, r7, ror #8
+ uxtb16 r6, r6, ror #8
+ uxtb16 r7, r7, ror #8
+ orr r6, r6, r7, lsl #8
+ uqadd8 r5, r6, r5
+ str r5, [r0], #4
+ subs r12, r12, #1
+ bne 4b
+3: ldr r1, [sp, #12]
+ add r10, r10, #1
+ cmp r10, r1
+ ldr r12, [sp]
+ ldr r2, [sp, #4]
+ bne 5b
+0: add sp, sp, #28
+ pop {r4, r5, r6, r7, r8, r9, r10, r11}
+ bx lr
+.endfunc
+
+pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
+ push {r4, r5, r6, r7, r8, r9, r10, r11}
+ sub sp, sp, #28
+ cmp r1, #0
+ ldr r9, [sp, #60]
+ str r1, [sp, #12]
+ bic r1, r9, #-16777216
+ str r1, [sp, #20]
+ mov r12, r2
+ lsr r1, r9, #8
+ ldr r2, [sp, #20]
+ bic r1, r1, #-16777216
+ bic r2, r2, #65280
+ bic r1, r1, #65280
+ str r2, [sp, #20]
+ str r0, [sp, #16]
+ str r1, [sp, #4]
+ ldr r2, [sp, #68]
+ beq 0f
+ lsl r3, r3, #2
+ str r3, [sp, #24]
+ mov r0, #0
+ b 1f
+5: ldr r3, [sp, #24]
+1: ldr r4, [sp, #72]
+ mov r10, r12
+ mov r1, r2
+ add r12, r12, r3
+ add r2, r2, r4
+ str r12, [sp, #8]
+ str r2, [sp]
+ ldr r12, [sp, #16]
+ ldr r11, =0x00800080
+ ldr r2, [sp, #4]
+ ldr r3, [sp, #20]
+ cmp r12, #0
+ beq 3f
+4: ldrb r5, [r1], #1
+ ldr r4, [r10]
+ mla r6, r3, r5, r11
+ mla r7, r2, r5, r11
+ uxtab16 r6, r6, r6, ror #8
+ uxtab16 r7, r7, r7, ror #8
+ uxtb16 r6, r6, ror #8
+ uxtb16 r7, r7, ror #8
+ orr r5, r6, r7, lsl #8
+ uxtb16 r6, r4
+ uxtb16 r7, r4, ror #8
+ mvn r8, r5
+ lsr r8, r8, #24
+ mla r6, r6, r8, r11
+ mla r7, r7, r8, r11
+ uxtab16 r6, r6, r6, ror #8
+ uxtab16 r7, r7, r7, ror #8
+ uxtb16 r6, r6, ror #8
+ uxtb16 r7, r7, ror #8
+ orr r6, r6, r7, lsl #8
+ uqadd8 r5, r6, r5
+ str r5, [r10], #4
+ subs r12, r12, #1
+ bne 4b
+3: ldr r4, [sp, #12]
+ add r0, r0, #1
+ cmp r0, r4
+ ldr r12, [sp, #8]
+ ldr r2, [sp]
+ bne 5b
+0: add sp, sp, #28
+ pop {r4, r5, r6, r7, r8, r9, r10, r11}
+ bx lr
+.endfunc
+
+/*
+ * Note: This code is only using armv5te instructions (not even armv6),
+ * but is scheduled for ARM Cortex-A8 pipeline. So it might need to
+ * be split into a few variants, tuned for each microarchitecture.
+ *
+ * TODO: In order to get good performance on ARM9/ARM11 cores (which don't
+ * have efficient write combining), it needs to be changed to use 16-byte
+ * aligned writes using STM instruction.
+ *
+ * Nearest scanline scaler macro template uses the following arguments:
+ * fname - name of the function to generate
+ * bpp_shift - (1 << bpp_shift) is the size of pixel in bytes
+ * t - type suffix for LDR/STR instructions
+ * prefetch_distance - prefetch in the source image by that many
+ * pixels ahead
+ * prefetch_braking_distance - stop prefetching when that many pixels are
+ * remaining before the end of scanline
+ */
+
+.macro generate_nearest_scanline_func fname, bpp_shift, t, \
+ prefetch_distance, \
+ prefetch_braking_distance
+
+pixman_asm_function fname
+ W .req r0
+ DST .req r1
+ SRC .req r2
+ VX .req r3
+ UNIT_X .req ip
+ TMP1 .req r4
+ TMP2 .req r5
+ VXMASK .req r6
+ PF_OFFS .req r7
+
+ ldr UNIT_X, [sp]
+ push {r4, r5, r6, r7}
+ mvn VXMASK, #((1 << bpp_shift) - 1)
+
+ /* define helper macro */
+ .macro scale_2_pixels
+ ldr&t TMP1, [SRC, TMP1]
+ and TMP2, VXMASK, VX, lsr #(16 - bpp_shift)
+ add VX, VX, UNIT_X
+ str&t TMP1, [DST], #(1 << bpp_shift)
+
+ ldr&t TMP2, [SRC, TMP2]
+ and TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
+ add VX, VX, UNIT_X
+ str&t TMP2, [DST], #(1 << bpp_shift)
+ .endm
+
+ /* now do the scaling */
+ and TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
+ add VX, VX, UNIT_X
+ subs W, W, #(8 + prefetch_braking_distance)
+ blt 2f
+ /* calculate prefetch offset */
+ mov PF_OFFS, #prefetch_distance
+ mla PF_OFFS, UNIT_X, PF_OFFS, VX
+1: /* main loop, process 8 pixels per iteration with prefetch */
+ subs W, W, #8
+ add PF_OFFS, UNIT_X, lsl #3
+ scale_2_pixels
+ scale_2_pixels
+ scale_2_pixels
+ scale_2_pixels
+ pld [SRC, PF_OFFS, lsr #(16 - bpp_shift)]
+ bge 1b
+2:
+ subs W, W, #(4 - 8 - prefetch_braking_distance)
+ blt 2f
+1: /* process the remaining pixels */
+ scale_2_pixels
+ scale_2_pixels
+ subs W, W, #4
+ bge 1b
+2:
+ tst W, #2
+ beq 2f
+ scale_2_pixels
+2:
+ tst W, #1
+ ldrne&t TMP1, [SRC, TMP1]
+ strne&t TMP1, [DST]
+ /* cleanup helper macro */
+ .purgem scale_2_pixels
+ .unreq DST
+ .unreq SRC
+ .unreq W
+ .unreq VX
+ .unreq UNIT_X
+ .unreq TMP1
+ .unreq TMP2
+ .unreq VXMASK
+ .unreq PF_OFFS
+ /* return */
+ pop {r4, r5, r6, r7}
+ bx lr
+.endfunc
+.endm
+
+generate_nearest_scanline_func \
+ pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32
+
+generate_nearest_scanline_func \
+ pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32