From f6ba83c963bf48dfa349ab8c54d1968c4c69f7b8 Mon Sep 17 00:00:00 2001 From: marha Date: Tue, 28 Sep 2010 08:37:27 +0000 Subject: xkeyboard-config pixman git update 28/9/2010 --- pixman/pixman/pixman-arm-neon-asm.S | 22 +- pixman/pixman/pixman-arm-neon.c | 4 +- pixman/pixman/pixman-arm-simd-asm.S | 660 ++++++++++++++-------------- pixman/pixman/pixman-arm-simd.c | 834 ++++++++++++++++++------------------ pixman/pixman/pixman-fast-path.c | 28 +- pixman/pixman/pixman-mmx.c | 28 +- pixman/pixman/pixman-sse2.c | 689 +---------------------------- pixman/test/lowlevel-blt-bench.c | 6 +- 8 files changed, 806 insertions(+), 1465 deletions(-) (limited to 'pixman') diff --git a/pixman/pixman/pixman-arm-neon-asm.S b/pixman/pixman/pixman-arm-neon-asm.S index fe128aa94..108abacd1 100644 --- a/pixman/pixman/pixman-arm-neon-asm.S +++ b/pixman/pixman/pixman-arm-neon-asm.S @@ -495,15 +495,15 @@ generate_composite_function \ /******************************************************************************/ -.macro pixman_composite_add_8000_8000_process_pixblock_head +.macro pixman_composite_add_8_8_process_pixblock_head vqadd.u8 q14, q0, q2 vqadd.u8 q15, q1, q3 .endm -.macro pixman_composite_add_8000_8000_process_pixblock_tail +.macro pixman_composite_add_8_8_process_pixblock_tail .endm -.macro pixman_composite_add_8000_8000_process_pixblock_tail_head +.macro pixman_composite_add_8_8_process_pixblock_tail_head vld1.8 {d0, d1, d2, d3}, [SRC]! PF add PF_X, PF_X, #32 PF tst PF_CTL, #0xF @@ -523,15 +523,15 @@ generate_composite_function \ .endm generate_composite_function \ - pixman_composite_add_8000_8000_asm_neon, 8, 0, 8, \ + pixman_composite_add_8_8_asm_neon, 8, 0, 8, \ FLAG_DST_READWRITE, \ 32, /* number of pixels, processed in a single block */ \ 10, /* prefetch distance */ \ default_init, \ default_cleanup, \ - pixman_composite_add_8000_8000_process_pixblock_head, \ - pixman_composite_add_8000_8000_process_pixblock_tail, \ - pixman_composite_add_8000_8000_process_pixblock_tail_head + pixman_composite_add_8_8_process_pixblock_head, \ + pixman_composite_add_8_8_process_pixblock_tail, \ + pixman_composite_add_8_8_process_pixblock_tail_head /******************************************************************************/ @@ -561,8 +561,8 @@ generate_composite_function \ 10, /* prefetch distance */ \ default_init, \ default_cleanup, \ - pixman_composite_add_8000_8000_process_pixblock_head, \ - pixman_composite_add_8000_8000_process_pixblock_tail, \ + pixman_composite_add_8_8_process_pixblock_head, \ + pixman_composite_add_8_8_process_pixblock_tail, \ pixman_composite_add_8888_8888_process_pixblock_tail_head generate_composite_function_single_scanline \ @@ -571,8 +571,8 @@ generate_composite_function_single_scanline \ 8, /* number of pixels, processed in a single block */ \ default_init, \ default_cleanup, \ - pixman_composite_add_8000_8000_process_pixblock_head, \ - pixman_composite_add_8000_8000_process_pixblock_tail, \ + pixman_composite_add_8_8_process_pixblock_head, \ + pixman_composite_add_8_8_process_pixblock_tail, \ pixman_composite_add_8888_8888_process_pixblock_tail_head /******************************************************************************/ diff --git a/pixman/pixman/pixman-arm-neon.c b/pixman/pixman/pixman-arm-neon.c index 28a66751a..231a183aa 100644 --- a/pixman/pixman/pixman-arm-neon.c +++ b/pixman/pixman/pixman-arm-neon.c @@ -52,7 +52,7 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_0565_rev, uint8_t, 3, uint16_t, 1) PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_pixbuf_8888, uint32_t, 1, uint32_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8000_8000, +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8_8, uint8_t, 1, uint8_t, 1) PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8888_8888, uint32_t, 1, uint32_t, 1) @@ -257,7 +257,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] = PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, neon_composite_add_n_8_8), PIXMAN_STD_FAST_PATH (ADD, a8, a8, a8, neon_composite_add_8_8_8), PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_add_8888_8888_8888), - PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, neon_composite_add_8000_8000), + PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, neon_composite_add_8_8), PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, neon_composite_add_8888_8888), PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, neon_composite_add_8888_8888), PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, neon_composite_over_reverse_n_8888), diff --git a/pixman/pixman/pixman-arm-simd-asm.S b/pixman/pixman/pixman-arm-simd-asm.S index 1a1a0d641..76647c6bc 100644 --- a/pixman/pixman/pixman-arm-simd-asm.S +++ b/pixman/pixman/pixman-arm-simd-asm.S @@ -1,330 +1,330 @@ -/* - * Copyright © 2008 Mozilla Corporation - * - * Permission to use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that - * copyright notice and this permission notice appear in supporting - * documentation, and that the name of Mozilla Corporation not be used in - * advertising or publicity pertaining to distribution of the software without - * specific, written prior permission. Mozilla Corporation makes no - * representations about the suitability of this software for any purpose. It - * is provided "as is" without express or implied warranty. - * - * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS - * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY - * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN - * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING - * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS - * SOFTWARE. - * - * Author: Jeff Muizelaar (jeff@infidigm.net) - * - */ - -/* Prevent the stack from becoming executable */ -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - - .text - .arch armv6 - .object_arch armv4 - .arm - .altmacro - -/* Supplementary macro for setting function attributes */ -.macro pixman_asm_function fname - .func fname - .global fname -#ifdef __ELF__ - .hidden fname - .type fname, %function -#endif -fname: -.endm - -/* - * The code below was generated by gcc 4.3.4 from the commented out - * functions in 'pixman-arm-simd.c' file with the following optimization - * options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer" - * - * TODO: replace gcc generated code with hand tuned versions because - * the code quality is not very good, introduce symbolic register - * aliases for better readability and maintainability. - */ - -pixman_asm_function pixman_composite_add_8000_8000_asm_armv6 - push {r4, r5, r6, r7, r8, r9, r10, r11} - mov r10, r1 - sub sp, sp, #4 - subs r10, r10, #1 - mov r11, r0 - mov r8, r2 - str r3, [sp] - ldr r7, [sp, #36] - bcc 0f -6: cmp r11, #0 - beq 1f - orr r3, r8, r7 - tst r3, #3 - beq 2f - mov r1, r8 - mov r0, r7 - mov r12, r11 - b 3f -5: tst r3, #3 - beq 4f -3: ldrb r2, [r0], #1 - subs r12, r12, #1 - ldrb r3, [r1] - uqadd8 r3, r2, r3 - strb r3, [r1], #1 - orr r3, r1, r0 - bne 5b -1: ldr r3, [sp] - add r8, r8, r3 - ldr r3, [sp, #40] - add r7, r7, r3 -10: subs r10, r10, #1 - bcs 6b -0: add sp, sp, #4 - pop {r4, r5, r6, r7, r8, r9, r10, r11} - bx lr -2: mov r12, r11 - mov r1, r8 - mov r0, r7 -4: cmp r12, #3 - subgt r6, r12, #4 - movgt r9, r12 - lsrgt r5, r6, #2 - addgt r3, r5, #1 - movgt r12, #0 - lslgt r4, r3, #2 - ble 7f -8: ldr r3, [r0, r12] - ldr r2, [r1, r12] - uqadd8 r3, r3, r2 - str r3, [r1, r12] - add r12, r12, #4 - cmp r12, r4 - bne 8b - sub r3, r9, #4 - bic r3, r3, #3 - add r3, r3, #4 - subs r12, r6, r5, lsl #2 - add r1, r1, r3 - add r0, r0, r3 - beq 1b -7: mov r4, #0 -9: ldrb r3, [r1, r4] - ldrb r2, [r0, r4] - uqadd8 r3, r2, r3 - strb r3, [r1, r4] - add r4, r4, #1 - cmp r4, r12 - bne 9b - ldr r3, [sp] - add r8, r8, r3 - ldr r3, [sp, #40] - add r7, r7, r3 - b 10b -.endfunc - -pixman_asm_function pixman_composite_over_8888_8888_asm_armv6 - push {r4, r5, r6, r7, r8, r9, r10, r11} - sub sp, sp, #20 - cmp r1, #0 - mov r12, r2 - str r1, [sp, #12] - str r0, [sp, #16] - ldr r2, [sp, #52] - beq 0f - lsl r3, r3, #2 - str r3, [sp] - ldr r3, [sp, #56] - mov r10, #0 - lsl r3, r3, #2 - str r3, [sp, #8] - mov r11, r3 - b 1f -6: ldr r11, [sp, #8] -1: ldr r9, [sp] - mov r0, r12 - add r12, r12, r9 - mov r1, r2 - str r12, [sp, #4] - add r2, r2, r11 - ldr r12, [sp, #16] - ldr r3, =0x00800080 - ldr r9, =0xff00ff00 - mov r11, #255 - cmp r12, #0 - beq 4f -5: ldr r5, [r1], #4 - ldr r4, [r0] - sub r8, r11, r5, lsr #24 - uxtb16 r6, r4 - uxtb16 r7, r4, ror #8 - mla r6, r6, r8, r3 - mla r7, r7, r8, r3 - uxtab16 r6, r6, r6, ror #8 - uxtab16 r7, r7, r7, ror #8 - and r7, r7, r9 - uxtab16 r6, r7, r6, ror #8 - uqadd8 r5, r6, r5 - str r5, [r0], #4 - subs r12, r12, #1 - bne 5b -4: ldr r3, [sp, #12] - add r10, r10, #1 - cmp r10, r3 - ldr r12, [sp, #4] - bne 6b -0: add sp, sp, #20 - pop {r4, r5, r6, r7, r8, r9, r10, r11} - bx lr -.endfunc - -pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6 - push {r4, r5, r6, r7, r8, r9, r10, r11} - sub sp, sp, #28 - cmp r1, #0 - str r1, [sp, #12] - ldrb r1, [sp, #71] - mov r12, r2 - str r0, [sp, #16] - ldr r2, [sp, #60] - str r1, [sp, #24] - beq 0f - lsl r3, r3, #2 - str r3, [sp, #20] - ldr r3, [sp, #64] - mov r10, #0 - lsl r3, r3, #2 - str r3, [sp, #8] - mov r11, r3 - b 1f -5: ldr r11, [sp, #8] -1: ldr r4, [sp, #20] - mov r0, r12 - mov r1, r2 - add r12, r12, r4 - add r2, r2, r11 - str r12, [sp] - str r2, [sp, #4] - ldr r12, [sp, #16] - ldr r2, =0x00800080 - ldr r3, [sp, #24] - mov r11, #255 - cmp r12, #0 - beq 3f -4: ldr r5, [r1], #4 - ldr r4, [r0] - uxtb16 r6, r5 - uxtb16 r7, r5, ror #8 - mla r6, r6, r3, r2 - mla r7, r7, r3, r2 - uxtab16 r6, r6, r6, ror #8 - uxtab16 r7, r7, r7, ror #8 - uxtb16 r6, r6, ror #8 - uxtb16 r7, r7, ror #8 - orr r5, r6, r7, lsl #8 - uxtb16 r6, r4 - uxtb16 r7, r4, ror #8 - sub r8, r11, r5, lsr #24 - mla r6, r6, r8, r2 - mla r7, r7, r8, r2 - uxtab16 r6, r6, r6, ror #8 - uxtab16 r7, r7, r7, ror #8 - uxtb16 r6, r6, ror #8 - uxtb16 r7, r7, ror #8 - orr r6, r6, r7, lsl #8 - uqadd8 r5, r6, r5 - str r5, [r0], #4 - subs r12, r12, #1 - bne 4b -3: ldr r1, [sp, #12] - add r10, r10, #1 - cmp r10, r1 - ldr r12, [sp] - ldr r2, [sp, #4] - bne 5b -0: add sp, sp, #28 - pop {r4, r5, r6, r7, r8, r9, r10, r11} - bx lr -.endfunc - -pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6 - push {r4, r5, r6, r7, r8, r9, r10, r11} - sub sp, sp, #28 - cmp r1, #0 - ldr r9, [sp, #60] - str r1, [sp, #12] - bic r1, r9, #-16777216 - str r1, [sp, #20] - mov r12, r2 - lsr r1, r9, #8 - ldr r2, [sp, #20] - bic r1, r1, #-16777216 - bic r2, r2, #65280 - bic r1, r1, #65280 - str r2, [sp, #20] - str r0, [sp, #16] - str r1, [sp, #4] - ldr r2, [sp, #68] - beq 0f - lsl r3, r3, #2 - str r3, [sp, #24] - mov r0, #0 - b 1f -5: ldr r3, [sp, #24] -1: ldr r4, [sp, #72] - mov r10, r12 - mov r1, r2 - add r12, r12, r3 - add r2, r2, r4 - str r12, [sp, #8] - str r2, [sp] - ldr r12, [sp, #16] - ldr r11, =0x00800080 - ldr r2, [sp, #4] - ldr r3, [sp, #20] - cmp r12, #0 - beq 3f -4: ldrb r5, [r1], #1 - ldr r4, [r10] - mla r6, r3, r5, r11 - mla r7, r2, r5, r11 - uxtab16 r6, r6, r6, ror #8 - uxtab16 r7, r7, r7, ror #8 - uxtb16 r6, r6, ror #8 - uxtb16 r7, r7, ror #8 - orr r5, r6, r7, lsl #8 - uxtb16 r6, r4 - uxtb16 r7, r4, ror #8 - mvn r8, r5 - lsr r8, r8, #24 - mla r6, r6, r8, r11 - mla r7, r7, r8, r11 - uxtab16 r6, r6, r6, ror #8 - uxtab16 r7, r7, r7, ror #8 - uxtb16 r6, r6, ror #8 - uxtb16 r7, r7, ror #8 - orr r6, r6, r7, lsl #8 - uqadd8 r5, r6, r5 - str r5, [r10], #4 - subs r12, r12, #1 - bne 4b -3: ldr r4, [sp, #12] - add r0, r0, #1 - cmp r0, r4 - ldr r12, [sp, #8] - ldr r2, [sp] - bne 5b -0: add sp, sp, #28 - pop {r4, r5, r6, r7, r8, r9, r10, r11} - bx lr -.endfunc +/* + * Copyright © 2008 Mozilla Corporation + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Mozilla Corporation not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. Mozilla Corporation makes no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + * Author: Jeff Muizelaar (jeff@infidigm.net) + * + */ + +/* Prevent the stack from becoming executable */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + + .text + .arch armv6 + .object_arch armv4 + .arm + .altmacro + +/* Supplementary macro for setting function attributes */ +.macro pixman_asm_function fname + .func fname + .global fname +#ifdef __ELF__ + .hidden fname + .type fname, %function +#endif +fname: +.endm + +/* + * The code below was generated by gcc 4.3.4 from the commented out + * functions in 'pixman-arm-simd.c' file with the following optimization + * options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer" + * + * TODO: replace gcc generated code with hand tuned versions because + * the code quality is not very good, introduce symbolic register + * aliases for better readability and maintainability. + */ + +pixman_asm_function pixman_composite_add_8_8_asm_armv6 + push {r4, r5, r6, r7, r8, r9, r10, r11} + mov r10, r1 + sub sp, sp, #4 + subs r10, r10, #1 + mov r11, r0 + mov r8, r2 + str r3, [sp] + ldr r7, [sp, #36] + bcc 0f +6: cmp r11, #0 + beq 1f + orr r3, r8, r7 + tst r3, #3 + beq 2f + mov r1, r8 + mov r0, r7 + mov r12, r11 + b 3f +5: tst r3, #3 + beq 4f +3: ldrb r2, [r0], #1 + subs r12, r12, #1 + ldrb r3, [r1] + uqadd8 r3, r2, r3 + strb r3, [r1], #1 + orr r3, r1, r0 + bne 5b +1: ldr r3, [sp] + add r8, r8, r3 + ldr r3, [sp, #40] + add r7, r7, r3 +10: subs r10, r10, #1 + bcs 6b +0: add sp, sp, #4 + pop {r4, r5, r6, r7, r8, r9, r10, r11} + bx lr +2: mov r12, r11 + mov r1, r8 + mov r0, r7 +4: cmp r12, #3 + subgt r6, r12, #4 + movgt r9, r12 + lsrgt r5, r6, #2 + addgt r3, r5, #1 + movgt r12, #0 + lslgt r4, r3, #2 + ble 7f +8: ldr r3, [r0, r12] + ldr r2, [r1, r12] + uqadd8 r3, r3, r2 + str r3, [r1, r12] + add r12, r12, #4 + cmp r12, r4 + bne 8b + sub r3, r9, #4 + bic r3, r3, #3 + add r3, r3, #4 + subs r12, r6, r5, lsl #2 + add r1, r1, r3 + add r0, r0, r3 + beq 1b +7: mov r4, #0 +9: ldrb r3, [r1, r4] + ldrb r2, [r0, r4] + uqadd8 r3, r2, r3 + strb r3, [r1, r4] + add r4, r4, #1 + cmp r4, r12 + bne 9b + ldr r3, [sp] + add r8, r8, r3 + ldr r3, [sp, #40] + add r7, r7, r3 + b 10b +.endfunc + +pixman_asm_function pixman_composite_over_8888_8888_asm_armv6 + push {r4, r5, r6, r7, r8, r9, r10, r11} + sub sp, sp, #20 + cmp r1, #0 + mov r12, r2 + str r1, [sp, #12] + str r0, [sp, #16] + ldr r2, [sp, #52] + beq 0f + lsl r3, r3, #2 + str r3, [sp] + ldr r3, [sp, #56] + mov r10, #0 + lsl r3, r3, #2 + str r3, [sp, #8] + mov r11, r3 + b 1f +6: ldr r11, [sp, #8] +1: ldr r9, [sp] + mov r0, r12 + add r12, r12, r9 + mov r1, r2 + str r12, [sp, #4] + add r2, r2, r11 + ldr r12, [sp, #16] + ldr r3, =0x00800080 + ldr r9, =0xff00ff00 + mov r11, #255 + cmp r12, #0 + beq 4f +5: ldr r5, [r1], #4 + ldr r4, [r0] + sub r8, r11, r5, lsr #24 + uxtb16 r6, r4 + uxtb16 r7, r4, ror #8 + mla r6, r6, r8, r3 + mla r7, r7, r8, r3 + uxtab16 r6, r6, r6, ror #8 + uxtab16 r7, r7, r7, ror #8 + and r7, r7, r9 + uxtab16 r6, r7, r6, ror #8 + uqadd8 r5, r6, r5 + str r5, [r0], #4 + subs r12, r12, #1 + bne 5b +4: ldr r3, [sp, #12] + add r10, r10, #1 + cmp r10, r3 + ldr r12, [sp, #4] + bne 6b +0: add sp, sp, #20 + pop {r4, r5, r6, r7, r8, r9, r10, r11} + bx lr +.endfunc + +pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6 + push {r4, r5, r6, r7, r8, r9, r10, r11} + sub sp, sp, #28 + cmp r1, #0 + str r1, [sp, #12] + ldrb r1, [sp, #71] + mov r12, r2 + str r0, [sp, #16] + ldr r2, [sp, #60] + str r1, [sp, #24] + beq 0f + lsl r3, r3, #2 + str r3, [sp, #20] + ldr r3, [sp, #64] + mov r10, #0 + lsl r3, r3, #2 + str r3, [sp, #8] + mov r11, r3 + b 1f +5: ldr r11, [sp, #8] +1: ldr r4, [sp, #20] + mov r0, r12 + mov r1, r2 + add r12, r12, r4 + add r2, r2, r11 + str r12, [sp] + str r2, [sp, #4] + ldr r12, [sp, #16] + ldr r2, =0x00800080 + ldr r3, [sp, #24] + mov r11, #255 + cmp r12, #0 + beq 3f +4: ldr r5, [r1], #4 + ldr r4, [r0] + uxtb16 r6, r5 + uxtb16 r7, r5, ror #8 + mla r6, r6, r3, r2 + mla r7, r7, r3, r2 + uxtab16 r6, r6, r6, ror #8 + uxtab16 r7, r7, r7, ror #8 + uxtb16 r6, r6, ror #8 + uxtb16 r7, r7, ror #8 + orr r5, r6, r7, lsl #8 + uxtb16 r6, r4 + uxtb16 r7, r4, ror #8 + sub r8, r11, r5, lsr #24 + mla r6, r6, r8, r2 + mla r7, r7, r8, r2 + uxtab16 r6, r6, r6, ror #8 + uxtab16 r7, r7, r7, ror #8 + uxtb16 r6, r6, ror #8 + uxtb16 r7, r7, ror #8 + orr r6, r6, r7, lsl #8 + uqadd8 r5, r6, r5 + str r5, [r0], #4 + subs r12, r12, #1 + bne 4b +3: ldr r1, [sp, #12] + add r10, r10, #1 + cmp r10, r1 + ldr r12, [sp] + ldr r2, [sp, #4] + bne 5b +0: add sp, sp, #28 + pop {r4, r5, r6, r7, r8, r9, r10, r11} + bx lr +.endfunc + +pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6 + push {r4, r5, r6, r7, r8, r9, r10, r11} + sub sp, sp, #28 + cmp r1, #0 + ldr r9, [sp, #60] + str r1, [sp, #12] + bic r1, r9, #-16777216 + str r1, [sp, #20] + mov r12, r2 + lsr r1, r9, #8 + ldr r2, [sp, #20] + bic r1, r1, #-16777216 + bic r2, r2, #65280 + bic r1, r1, #65280 + str r2, [sp, #20] + str r0, [sp, #16] + str r1, [sp, #4] + ldr r2, [sp, #68] + beq 0f + lsl r3, r3, #2 + str r3, [sp, #24] + mov r0, #0 + b 1f +5: ldr r3, [sp, #24] +1: ldr r4, [sp, #72] + mov r10, r12 + mov r1, r2 + add r12, r12, r3 + add r2, r2, r4 + str r12, [sp, #8] + str r2, [sp] + ldr r12, [sp, #16] + ldr r11, =0x00800080 + ldr r2, [sp, #4] + ldr r3, [sp, #20] + cmp r12, #0 + beq 3f +4: ldrb r5, [r1], #1 + ldr r4, [r10] + mla r6, r3, r5, r11 + mla r7, r2, r5, r11 + uxtab16 r6, r6, r6, ror #8 + uxtab16 r7, r7, r7, ror #8 + uxtb16 r6, r6, ror #8 + uxtb16 r7, r7, ror #8 + orr r5, r6, r7, lsl #8 + uxtb16 r6, r4 + uxtb16 r7, r4, ror #8 + mvn r8, r5 + lsr r8, r8, #24 + mla r6, r6, r8, r11 + mla r7, r7, r8, r11 + uxtab16 r6, r6, r6, ror #8 + uxtab16 r7, r7, r7, ror #8 + uxtb16 r6, r6, ror #8 + uxtb16 r7, r7, ror #8 + orr r6, r6, r7, lsl #8 + uqadd8 r5, r6, r5 + str r5, [r10], #4 + subs r12, r12, #1 + bne 4b +3: ldr r4, [sp, #12] + add r0, r0, #1 + cmp r0, r4 + ldr r12, [sp, #8] + ldr r2, [sp] + bne 5b +0: add sp, sp, #28 + pop {r4, r5, r6, r7, r8, r9, r10, r11} + bx lr +.endfunc diff --git a/pixman/pixman/pixman-arm-simd.c b/pixman/pixman/pixman-arm-simd.c index 389c9e01a..76a7ffeab 100644 --- a/pixman/pixman/pixman-arm-simd.c +++ b/pixman/pixman/pixman-arm-simd.c @@ -1,417 +1,417 @@ -/* - * Copyright © 2008 Mozilla Corporation - * - * Permission to use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that - * copyright notice and this permission notice appear in supporting - * documentation, and that the name of Mozilla Corporation not be used in - * advertising or publicity pertaining to distribution of the software without - * specific, written prior permission. Mozilla Corporation makes no - * representations about the suitability of this software for any purpose. It - * is provided "as is" without express or implied warranty. - * - * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS - * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY - * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN - * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING - * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS - * SOFTWARE. - * - * Author: Jeff Muizelaar (jeff@infidigm.net) - * - */ -#ifdef HAVE_CONFIG_H -#include -#endif - -#include "pixman-private.h" -#include "pixman-arm-common.h" - -#if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */ - -void -pixman_composite_add_8000_8000_asm_armv6 (int32_t width, - int32_t height, - uint8_t *dst_line, - int32_t dst_stride, - uint8_t *src_line, - int32_t src_stride) -{ - uint8_t *dst, *src; - int32_t w; - uint8_t s, d; - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - - /* ensure both src and dst are properly aligned before doing 32 bit reads - * we'll stay in this loop if src and dst have differing alignments - */ - while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3))) - { - s = *src; - d = *dst; - asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s)); - *dst = d; - - dst++; - src++; - w--; - } - - while (w >= 4) - { - asm ("uqadd8 %0, %1, %2" - : "=r" (*(uint32_t*)dst) - : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst)); - dst += 4; - src += 4; - w -= 4; - } - - while (w) - { - s = *src; - d = *dst; - asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s)); - *dst = d; - - dst++; - src++; - w--; - } - } - -} - -void -pixman_composite_over_8888_8888_asm_armv6 (int32_t width, - int32_t height, - uint32_t *dst_line, - int32_t dst_stride, - uint32_t *src_line, - int32_t src_stride) -{ - uint32_t *dst; - uint32_t *src; - int32_t w; - uint32_t component_half = 0x800080; - uint32_t upper_component_mask = 0xff00ff00; - uint32_t alpha_mask = 0xff; - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - -/* #define inner_branch */ - asm volatile ( - "cmp %[w], #0\n\t" - "beq 2f\n\t" - "1:\n\t" - /* load src */ - "ldr r5, [%[src]], #4\n\t" -#ifdef inner_branch - /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. - * The 0x0 case also allows us to avoid doing an unecessary data - * write which is more valuable so we only check for that - */ - "cmp r5, #0\n\t" - "beq 3f\n\t" - - /* = 255 - alpha */ - "sub r8, %[alpha_mask], r5, lsr #24\n\t" - - "ldr r4, [%[dest]] \n\t" - -#else - "ldr r4, [%[dest]] \n\t" - - /* = 255 - alpha */ - "sub r8, %[alpha_mask], r5, lsr #24\n\t" -#endif - "uxtb16 r6, r4\n\t" - "uxtb16 r7, r4, ror #8\n\t" - - /* multiply by 257 and divide by 65536 */ - "mla r6, r6, r8, %[component_half]\n\t" - "mla r7, r7, r8, %[component_half]\n\t" - - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" - - /* recombine the 0xff00ff00 bytes of r6 and r7 */ - "and r7, r7, %[upper_component_mask]\n\t" - "uxtab16 r6, r7, r6, ror #8\n\t" - - "uqadd8 r5, r6, r5\n\t" - -#ifdef inner_branch - "3:\n\t" - -#endif - "str r5, [%[dest]], #4\n\t" - /* increment counter and jmp to top */ - "subs %[w], %[w], #1\n\t" - "bne 1b\n\t" - "2:\n\t" - : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src) - : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask), - [alpha_mask] "r" (alpha_mask) - : "r4", "r5", "r6", "r7", "r8", "cc", "memory" - ); - } -} - -void -pixman_composite_over_8888_n_8888_asm_armv6 (int32_t width, - int32_t height, - uint32_t *dst_line, - int32_t dst_stride, - uint32_t *src_line, - int32_t src_stride, - uint32_t mask) -{ - uint32_t *dst; - uint32_t *src; - int32_t w; - uint32_t component_half = 0x800080; - uint32_t alpha_mask = 0xff; - - mask = (mask) >> 24; - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - -/* #define inner_branch */ - asm volatile ( - "cmp %[w], #0\n\t" - "beq 2f\n\t" - "1:\n\t" - /* load src */ - "ldr r5, [%[src]], #4\n\t" -#ifdef inner_branch - /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. - * The 0x0 case also allows us to avoid doing an unecessary data - * write which is more valuable so we only check for that - */ - "cmp r5, #0\n\t" - "beq 3f\n\t" - -#endif - "ldr r4, [%[dest]] \n\t" - - "uxtb16 r6, r5\n\t" - "uxtb16 r7, r5, ror #8\n\t" - - /* multiply by alpha (r8) then by 257 and divide by 65536 */ - "mla r6, r6, %[mask_alpha], %[component_half]\n\t" - "mla r7, r7, %[mask_alpha], %[component_half]\n\t" - - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" - - "uxtb16 r6, r6, ror #8\n\t" - "uxtb16 r7, r7, ror #8\n\t" - - /* recombine */ - "orr r5, r6, r7, lsl #8\n\t" - - "uxtb16 r6, r4\n\t" - "uxtb16 r7, r4, ror #8\n\t" - - /* 255 - alpha */ - "sub r8, %[alpha_mask], r5, lsr #24\n\t" - - /* multiply by alpha (r8) then by 257 and divide by 65536 */ - "mla r6, r6, r8, %[component_half]\n\t" - "mla r7, r7, r8, %[component_half]\n\t" - - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" - - "uxtb16 r6, r6, ror #8\n\t" - "uxtb16 r7, r7, ror #8\n\t" - - /* recombine */ - "orr r6, r6, r7, lsl #8\n\t" - - "uqadd8 r5, r6, r5\n\t" - -#ifdef inner_branch - "3:\n\t" - -#endif - "str r5, [%[dest]], #4\n\t" - /* increment counter and jmp to top */ - "subs %[w], %[w], #1\n\t" - "bne 1b\n\t" - "2:\n\t" - : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src) - : [component_half] "r" (component_half), [mask_alpha] "r" (mask), - [alpha_mask] "r" (alpha_mask) - : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory" - ); - } -} - -void -pixman_composite_over_n_8_8888_asm_armv6 (int32_t width, - int32_t height, - uint32_t *dst_line, - int32_t dst_stride, - uint32_t src, - int32_t unused, - uint8_t *mask_line, - int32_t mask_stride) -{ - uint32_t srca; - uint32_t *dst; - uint8_t *mask; - int32_t w; - - srca = src >> 24; - - uint32_t component_mask = 0xff00ff; - uint32_t component_half = 0x800080; - - uint32_t src_hi = (src >> 8) & component_mask; - uint32_t src_lo = src & component_mask; - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - w = width; - -/* #define inner_branch */ - asm volatile ( - "cmp %[w], #0\n\t" - "beq 2f\n\t" - "1:\n\t" - /* load mask */ - "ldrb r5, [%[mask]], #1\n\t" -#ifdef inner_branch - /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. - * The 0x0 case also allows us to avoid doing an unecessary data - * write which is more valuable so we only check for that - */ - "cmp r5, #0\n\t" - "beq 3f\n\t" - -#endif - "ldr r4, [%[dest]] \n\t" - - /* multiply by alpha (r8) then by 257 and divide by 65536 */ - "mla r6, %[src_lo], r5, %[component_half]\n\t" - "mla r7, %[src_hi], r5, %[component_half]\n\t" - - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" - - "uxtb16 r6, r6, ror #8\n\t" - "uxtb16 r7, r7, ror #8\n\t" - - /* recombine */ - "orr r5, r6, r7, lsl #8\n\t" - - "uxtb16 r6, r4\n\t" - "uxtb16 r7, r4, ror #8\n\t" - - /* we could simplify this to use 'sub' if we were - * willing to give up a register for alpha_mask - */ - "mvn r8, r5\n\t" - "mov r8, r8, lsr #24\n\t" - - /* multiply by alpha (r8) then by 257 and divide by 65536 */ - "mla r6, r6, r8, %[component_half]\n\t" - "mla r7, r7, r8, %[component_half]\n\t" - - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" - - "uxtb16 r6, r6, ror #8\n\t" - "uxtb16 r7, r7, ror #8\n\t" - - /* recombine */ - "orr r6, r6, r7, lsl #8\n\t" - - "uqadd8 r5, r6, r5\n\t" - -#ifdef inner_branch - "3:\n\t" - -#endif - "str r5, [%[dest]], #4\n\t" - /* increment counter and jmp to top */ - "subs %[w], %[w], #1\n\t" - "bne 1b\n\t" - "2:\n\t" - : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask) - : [component_half] "r" (component_half), - [src_hi] "r" (src_hi), [src_lo] "r" (src_lo) - : "r4", "r5", "r6", "r7", "r8", "cc", "memory"); - } -} - -#endif - -PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8000_8000, - uint8_t, 1, uint8_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888, - uint32_t, 1, uint32_t, 1) - -PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (armv6, over_8888_n_8888, - uint32_t, 1, uint32_t, 1) - -PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (armv6, over_n_8_8888, - uint8_t, 1, uint32_t, 1) - -static const pixman_fast_path_t arm_simd_fast_paths[] = -{ - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, armv6_composite_over_8888_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, armv6_composite_over_8888_n_8888), - PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, armv6_composite_over_8888_n_8888), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888), - PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888), - - PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8000_8000), - - PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, armv6_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, armv6_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888), - PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888), - - { PIXMAN_OP_NONE }, -}; - -pixman_implementation_t * -_pixman_implementation_create_arm_simd (void) -{ - pixman_implementation_t *general = _pixman_implementation_create_fast_path (); - pixman_implementation_t *imp = _pixman_implementation_create (general, arm_simd_fast_paths); - - return imp; -} +/* + * Copyright © 2008 Mozilla Corporation + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Mozilla Corporation not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. Mozilla Corporation makes no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + * Author: Jeff Muizelaar (jeff@infidigm.net) + * + */ +#ifdef HAVE_CONFIG_H +#include +#endif + +#include "pixman-private.h" +#include "pixman-arm-common.h" + +#if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */ + +void +pixman_composite_add_8_8_asm_armv6 (int32_t width, + int32_t height, + uint8_t *dst_line, + int32_t dst_stride, + uint8_t *src_line, + int32_t src_stride) +{ + uint8_t *dst, *src; + int32_t w; + uint8_t s, d; + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + /* ensure both src and dst are properly aligned before doing 32 bit reads + * we'll stay in this loop if src and dst have differing alignments + */ + while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3))) + { + s = *src; + d = *dst; + asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s)); + *dst = d; + + dst++; + src++; + w--; + } + + while (w >= 4) + { + asm ("uqadd8 %0, %1, %2" + : "=r" (*(uint32_t*)dst) + : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst)); + dst += 4; + src += 4; + w -= 4; + } + + while (w) + { + s = *src; + d = *dst; + asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s)); + *dst = d; + + dst++; + src++; + w--; + } + } + +} + +void +pixman_composite_over_8888_8888_asm_armv6 (int32_t width, + int32_t height, + uint32_t *dst_line, + int32_t dst_stride, + uint32_t *src_line, + int32_t src_stride) +{ + uint32_t *dst; + uint32_t *src; + int32_t w; + uint32_t component_half = 0x800080; + uint32_t upper_component_mask = 0xff00ff00; + uint32_t alpha_mask = 0xff; + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + +/* #define inner_branch */ + asm volatile ( + "cmp %[w], #0\n\t" + "beq 2f\n\t" + "1:\n\t" + /* load src */ + "ldr r5, [%[src]], #4\n\t" +#ifdef inner_branch + /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. + * The 0x0 case also allows us to avoid doing an unecessary data + * write which is more valuable so we only check for that + */ + "cmp r5, #0\n\t" + "beq 3f\n\t" + + /* = 255 - alpha */ + "sub r8, %[alpha_mask], r5, lsr #24\n\t" + + "ldr r4, [%[dest]] \n\t" + +#else + "ldr r4, [%[dest]] \n\t" + + /* = 255 - alpha */ + "sub r8, %[alpha_mask], r5, lsr #24\n\t" +#endif + "uxtb16 r6, r4\n\t" + "uxtb16 r7, r4, ror #8\n\t" + + /* multiply by 257 and divide by 65536 */ + "mla r6, r6, r8, %[component_half]\n\t" + "mla r7, r7, r8, %[component_half]\n\t" + + "uxtab16 r6, r6, r6, ror #8\n\t" + "uxtab16 r7, r7, r7, ror #8\n\t" + + /* recombine the 0xff00ff00 bytes of r6 and r7 */ + "and r7, r7, %[upper_component_mask]\n\t" + "uxtab16 r6, r7, r6, ror #8\n\t" + + "uqadd8 r5, r6, r5\n\t" + +#ifdef inner_branch + "3:\n\t" + +#endif + "str r5, [%[dest]], #4\n\t" + /* increment counter and jmp to top */ + "subs %[w], %[w], #1\n\t" + "bne 1b\n\t" + "2:\n\t" + : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src) + : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask), + [alpha_mask] "r" (alpha_mask) + : "r4", "r5", "r6", "r7", "r8", "cc", "memory" + ); + } +} + +void +pixman_composite_over_8888_n_8888_asm_armv6 (int32_t width, + int32_t height, + uint32_t *dst_line, + int32_t dst_stride, + uint32_t *src_line, + int32_t src_stride, + uint32_t mask) +{ + uint32_t *dst; + uint32_t *src; + int32_t w; + uint32_t component_half = 0x800080; + uint32_t alpha_mask = 0xff; + + mask = (mask) >> 24; + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + +/* #define inner_branch */ + asm volatile ( + "cmp %[w], #0\n\t" + "beq 2f\n\t" + "1:\n\t" + /* load src */ + "ldr r5, [%[src]], #4\n\t" +#ifdef inner_branch + /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. + * The 0x0 case also allows us to avoid doing an unecessary data + * write which is more valuable so we only check for that + */ + "cmp r5, #0\n\t" + "beq 3f\n\t" + +#endif + "ldr r4, [%[dest]] \n\t" + + "uxtb16 r6, r5\n\t" + "uxtb16 r7, r5, ror #8\n\t" + + /* multiply by alpha (r8) then by 257 and divide by 65536 */ + "mla r6, r6, %[mask_alpha], %[component_half]\n\t" + "mla r7, r7, %[mask_alpha], %[component_half]\n\t" + + "uxtab16 r6, r6, r6, ror #8\n\t" + "uxtab16 r7, r7, r7, ror #8\n\t" + + "uxtb16 r6, r6, ror #8\n\t" + "uxtb16 r7, r7, ror #8\n\t" + + /* recombine */ + "orr r5, r6, r7, lsl #8\n\t" + + "uxtb16 r6, r4\n\t" + "uxtb16 r7, r4, ror #8\n\t" + + /* 255 - alpha */ + "sub r8, %[alpha_mask], r5, lsr #24\n\t" + + /* multiply by alpha (r8) then by 257 and divide by 65536 */ + "mla r6, r6, r8, %[component_half]\n\t" + "mla r7, r7, r8, %[component_half]\n\t" + + "uxtab16 r6, r6, r6, ror #8\n\t" + "uxtab16 r7, r7, r7, ror #8\n\t" + + "uxtb16 r6, r6, ror #8\n\t" + "uxtb16 r7, r7, ror #8\n\t" + + /* recombine */ + "orr r6, r6, r7, lsl #8\n\t" + + "uqadd8 r5, r6, r5\n\t" + +#ifdef inner_branch + "3:\n\t" + +#endif + "str r5, [%[dest]], #4\n\t" + /* increment counter and jmp to top */ + "subs %[w], %[w], #1\n\t" + "bne 1b\n\t" + "2:\n\t" + : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src) + : [component_half] "r" (component_half), [mask_alpha] "r" (mask), + [alpha_mask] "r" (alpha_mask) + : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory" + ); + } +} + +void +pixman_composite_over_n_8_8888_asm_armv6 (int32_t width, + int32_t height, + uint32_t *dst_line, + int32_t dst_stride, + uint32_t src, + int32_t unused, + uint8_t *mask_line, + int32_t mask_stride) +{ + uint32_t srca; + uint32_t *dst; + uint8_t *mask; + int32_t w; + + srca = src >> 24; + + uint32_t component_mask = 0xff00ff; + uint32_t component_half = 0x800080; + + uint32_t src_hi = (src >> 8) & component_mask; + uint32_t src_lo = src & component_mask; + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + +/* #define inner_branch */ + asm volatile ( + "cmp %[w], #0\n\t" + "beq 2f\n\t" + "1:\n\t" + /* load mask */ + "ldrb r5, [%[mask]], #1\n\t" +#ifdef inner_branch + /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. + * The 0x0 case also allows us to avoid doing an unecessary data + * write which is more valuable so we only check for that + */ + "cmp r5, #0\n\t" + "beq 3f\n\t" + +#endif + "ldr r4, [%[dest]] \n\t" + + /* multiply by alpha (r8) then by 257 and divide by 65536 */ + "mla r6, %[src_lo], r5, %[component_half]\n\t" + "mla r7, %[src_hi], r5, %[component_half]\n\t" + + "uxtab16 r6, r6, r6, ror #8\n\t" + "uxtab16 r7, r7, r7, ror #8\n\t" + + "uxtb16 r6, r6, ror #8\n\t" + "uxtb16 r7, r7, ror #8\n\t" + + /* recombine */ + "orr r5, r6, r7, lsl #8\n\t" + + "uxtb16 r6, r4\n\t" + "uxtb16 r7, r4, ror #8\n\t" + + /* we could simplify this to use 'sub' if we were + * willing to give up a register for alpha_mask + */ + "mvn r8, r5\n\t" + "mov r8, r8, lsr #24\n\t" + + /* multiply by alpha (r8) then by 257 and divide by 65536 */ + "mla r6, r6, r8, %[component_half]\n\t" + "mla r7, r7, r8, %[component_half]\n\t" + + "uxtab16 r6, r6, r6, ror #8\n\t" + "uxtab16 r7, r7, r7, ror #8\n\t" + + "uxtb16 r6, r6, ror #8\n\t" + "uxtb16 r7, r7, ror #8\n\t" + + /* recombine */ + "orr r6, r6, r7, lsl #8\n\t" + + "uqadd8 r5, r6, r5\n\t" + +#ifdef inner_branch + "3:\n\t" + +#endif + "str r5, [%[dest]], #4\n\t" + /* increment counter and jmp to top */ + "subs %[w], %[w], #1\n\t" + "bne 1b\n\t" + "2:\n\t" + : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask) + : [component_half] "r" (component_half), + [src_hi] "r" (src_hi), [src_lo] "r" (src_lo) + : "r4", "r5", "r6", "r7", "r8", "cc", "memory"); + } +} + +#endif + +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8, + uint8_t, 1, uint8_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888, + uint32_t, 1, uint32_t, 1) + +PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (armv6, over_8888_n_8888, + uint32_t, 1, uint32_t, 1) + +PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (armv6, over_n_8_8888, + uint8_t, 1, uint32_t, 1) + +static const pixman_fast_path_t arm_simd_fast_paths[] = +{ + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, armv6_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, armv6_composite_over_8888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, armv6_composite_over_8888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888), + + PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8_8), + + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, armv6_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, armv6_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888), + + { PIXMAN_OP_NONE }, +}; + +pixman_implementation_t * +_pixman_implementation_create_arm_simd (void) +{ + pixman_implementation_t *general = _pixman_implementation_create_fast_path (); + pixman_implementation_t *imp = _pixman_implementation_create (general, arm_simd_fast_paths); + + return imp; +} diff --git a/pixman/pixman/pixman-fast-path.c b/pixman/pixman/pixman-fast-path.c index 0b8a2526e..25ef9243b 100644 --- a/pixman/pixman/pixman-fast-path.c +++ b/pixman/pixman/pixman-fast-path.c @@ -910,19 +910,19 @@ fast_composite_src_x888_0565 (pixman_implementation_t *imp, } static void -fast_composite_add_8000_8000 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) +fast_composite_add_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { uint8_t *dst_line, *dst; uint8_t *src_line, *src; @@ -1602,7 +1602,7 @@ static const pixman_fast_path_t c_fast_paths[] = PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565), PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888), PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888), - PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8000_8000), + PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8), PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1000_1000), PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca), PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8), diff --git a/pixman/pixman/pixman-mmx.c b/pixman/pixman/pixman-mmx.c index c0c53c1f5..8284fe480 100644 --- a/pixman/pixman/pixman-mmx.c +++ b/pixman/pixman/pixman-mmx.c @@ -2845,19 +2845,19 @@ mmx_composite_add_n_8_8 (pixman_implementation_t *imp, } static void -mmx_composite_add_8000_8000 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) +mmx_composite_add_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { uint8_t *dst_line, *dst; uint8_t *src_line, *src; @@ -3268,7 +3268,7 @@ static const pixman_fast_path_t mmx_fast_paths[] = PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mmx_composite_add_8888_8888 ), PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, mmx_composite_add_8888_8888 ), - PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8000_8000 ), + PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8_8 ), PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mmx_composite_add_n_8_8 ), PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, mmx_composite_src_n_8_8888 ), diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c index 8e175b78d..9d9b16543 100644 --- a/pixman/pixman/pixman-sse2.c +++ b/pixman/pixman/pixman-sse2.c @@ -357,34 +357,6 @@ in_over_2x128 (__m128i* src_lo, over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi); } -static force_inline void -cache_prefetch (__m128i* addr) -{ - _mm_prefetch ((void const*)addr, _MM_HINT_T0); -} - -static force_inline void -cache_prefetch_next (__m128i* addr) -{ - _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */ -} - -/* prefetching NULL is very slow on some systems. don't do that. */ - -static force_inline void -maybe_prefetch (__m128i* addr) -{ - if (addr) - cache_prefetch (addr); -} - -static force_inline void -maybe_prefetch_next (__m128i* addr) -{ - if (addr) - cache_prefetch_next (addr); -} - /* load 4 pixels from a 16-byte boundary aligned address */ static force_inline __m128i load_128_aligned (__m128i* src) @@ -649,11 +621,6 @@ core_combine_over_u_sse2 (uint32_t* pd, __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_alpha_lo, xmm_alpha_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - /* Align dst on a 16-byte boundary */ while (w && ((unsigned long)pd & 15)) { @@ -667,18 +634,8 @@ core_combine_over_u_sse2 (uint32_t* pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - maybe_prefetch_next ((__m128i*)pm); - /* I'm loading unaligned because I'm not sure about * the address alignment. */ @@ -740,11 +697,6 @@ core_combine_over_reverse_u_sse2 (uint32_t* pd, __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_alpha_lo, xmm_alpha_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - /* Align dst on a 16-byte boundary */ while (w && ((unsigned long)pd & 15)) @@ -759,18 +711,8 @@ core_combine_over_reverse_u_sse2 (uint32_t* pd, pm++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - maybe_prefetch_next ((__m128i*)pm); - /* I'm loading unaligned because I'm not sure * about the address alignment. */ @@ -842,11 +784,6 @@ core_combine_in_u_sse2 (uint32_t* pd, __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w && ((unsigned long) pd & 15)) { s = combine1 (ps, pm); @@ -859,18 +796,8 @@ core_combine_in_u_sse2 (uint32_t* pd, pm++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - maybe_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*) pd); xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm); @@ -916,11 +843,6 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd, __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w && ((unsigned long) pd & 15)) { s = combine1 (ps, pm); @@ -933,18 +855,8 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd, pm++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - maybe_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*) pd); xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); @@ -985,11 +897,6 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd, const uint32_t* pm, int w) { - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w && ((unsigned long) pd & 15)) { uint32_t s = combine1 (ps, pm); @@ -1006,21 +913,11 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w >= 4) { __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - maybe_prefetch_next ((__m128i*)pm); - xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*) pd); @@ -1067,11 +964,6 @@ core_combine_out_u_sse2 (uint32_t* pd, const uint32_t* pm, int w) { - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w && ((unsigned long) pd & 15)) { uint32_t s = combine1 (ps, pm); @@ -1087,21 +979,11 @@ core_combine_out_u_sse2 (uint32_t* pd, pm++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w >= 4) { __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - maybe_prefetch_next ((__m128i*)pm); - xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*) pd); @@ -1167,11 +1049,6 @@ core_combine_atop_u_sse2 (uint32_t* pd, __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w && ((unsigned long) pd & 15)) { s = combine1 (ps, pm); @@ -1184,18 +1061,8 @@ core_combine_atop_u_sse2 (uint32_t* pd, pm++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - maybe_prefetch_next ((__m128i*)pm); - xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*) pd); @@ -1264,11 +1131,6 @@ core_combine_reverse_atop_u_sse2 (uint32_t* pd, __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w && ((unsigned long) pd & 15)) { s = combine1 (ps, pm); @@ -1281,18 +1143,8 @@ core_combine_reverse_atop_u_sse2 (uint32_t* pd, pm++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - maybe_prefetch_next ((__m128i*)pm); - xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*) pd); @@ -1365,11 +1217,6 @@ core_combine_xor_u_sse2 (uint32_t* dst, __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w && ((unsigned long) pd & 15)) { s = combine1 (ps, pm); @@ -1382,18 +1229,8 @@ core_combine_xor_u_sse2 (uint32_t* dst, pm++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - maybe_prefetch_next ((__m128i*)pm); - xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm); xmm_dst = load_128_aligned ((__m128i*) pd); @@ -1450,11 +1287,6 @@ core_combine_add_u_sse2 (uint32_t* dst, const uint32_t* ps = src; const uint32_t* pm = mask; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = combine1 (ps, pm); @@ -1468,20 +1300,10 @@ core_combine_add_u_sse2 (uint32_t* dst, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w >= 4) { __m128i s; - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - maybe_prefetch_next ((__m128i*)pm); - s = combine4 ((__m128i*)ps, (__m128i*)pm); save_128_aligned ( @@ -1536,11 +1358,6 @@ core_combine_saturate_u_sse2 (uint32_t * pd, uint32_t pack_cmp; __m128i xmm_src, xmm_dst; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = combine1 (ps, pm); @@ -1553,18 +1370,8 @@ core_combine_saturate_u_sse2 (uint32_t * pd, pm++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - maybe_prefetch_next ((__m128i*)pm); - xmm_dst = load_128_aligned ((__m128i*)pd); xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm); @@ -1637,11 +1444,6 @@ core_combine_src_ca_sse2 (uint32_t* pd, __m128i xmm_mask_lo, xmm_mask_hi; __m128i xmm_dst_lo, xmm_dst_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -1651,18 +1453,8 @@ core_combine_src_ca_sse2 (uint32_t* pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -1718,11 +1510,6 @@ core_combine_over_ca_sse2 (uint32_t* pd, __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -1733,18 +1520,8 @@ core_combine_over_ca_sse2 (uint32_t* pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -1807,11 +1584,6 @@ core_combine_over_reverse_ca_sse2 (uint32_t* pd, __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -1822,18 +1594,8 @@ core_combine_over_reverse_ca_sse2 (uint32_t* pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -1885,11 +1647,6 @@ core_combine_in_ca_sse2 (uint32_t * pd, __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -1904,18 +1661,8 @@ core_combine_in_ca_sse2 (uint32_t * pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -1973,11 +1720,6 @@ core_combine_in_reverse_ca_sse2 (uint32_t * pd, __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -1992,18 +1734,8 @@ core_combine_in_reverse_ca_sse2 (uint32_t * pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -2059,11 +1791,6 @@ core_combine_out_ca_sse2 (uint32_t * pd, __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -2078,18 +1805,8 @@ core_combine_out_ca_sse2 (uint32_t * pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -2148,11 +1865,6 @@ core_combine_out_reverse_ca_sse2 (uint32_t * pd, __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -2168,18 +1880,8 @@ core_combine_out_reverse_ca_sse2 (uint32_t * pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -2258,11 +1960,6 @@ core_combine_atop_ca_sse2 (uint32_t * pd, __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -2273,18 +1970,8 @@ core_combine_atop_ca_sse2 (uint32_t * pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -2364,11 +2051,6 @@ core_combine_reverse_atop_ca_sse2 (uint32_t * pd, __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -2379,18 +2061,8 @@ core_combine_reverse_atop_ca_sse2 (uint32_t * pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -2473,11 +2145,6 @@ core_combine_xor_ca_sse2 (uint32_t * pd, __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -2488,18 +2155,8 @@ core_combine_xor_ca_sse2 (uint32_t * pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -2562,11 +2219,6 @@ core_combine_add_ca_sse2 (uint32_t * pd, __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -2580,18 +2232,8 @@ core_combine_add_ca_sse2 (uint32_t * pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*)pd); @@ -2971,9 +2613,6 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp, { dst = dst_line; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - dst_line += dst_stride; w = width; @@ -2986,13 +2625,8 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp, w--; } - cache_prefetch ((__m128i*)dst); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)dst); - xmm_dst = load_128_aligned ((__m128i*)dst); unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); @@ -3062,9 +2696,6 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp, { dst = dst_line; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - dst_line += dst_stride; w = width; @@ -3079,14 +2710,8 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - while (w >= 8) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)dst); - xmm_dst = load_128_aligned ((__m128i*)dst); unpack_565_128_4x128 (xmm_dst, @@ -3177,10 +2802,6 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, dst_line += dst_stride; mask_line += mask_stride; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { m = *pm++; @@ -3200,16 +2821,8 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_mask = load_128_unaligned ((__m128i*)pm); pack_cmp = @@ -3316,10 +2929,6 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, dst_line += dst_stride; mask_line += mask_stride; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { m = *pm++; @@ -3340,16 +2949,8 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_mask = load_128_unaligned ((__m128i*)pm); pack_cmp = @@ -3447,10 +3048,6 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, src_line += src_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - cache_prefetch ((__m128i*)src); - while (w && (unsigned long)dst & 15) { uint32_t s = *src++; @@ -3467,16 +3064,8 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - cache_prefetch ((__m128i*)src); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)dst); - cache_prefetch_next ((__m128i*)src); - xmm_src = load_128_unaligned ((__m128i*)src); xmm_dst = load_128_aligned ((__m128i*)dst); @@ -3556,25 +3145,16 @@ sse2_composite_src_x888_8888 (pixman_implementation_t *imp, src_line += src_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - while (w && (unsigned long)dst & 15) { *dst++ = *src++ | 0xff000000; w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - while (w >= 16) { __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4; - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)src); - xmm_src1 = load_128_unaligned ((__m128i*)src + 0); xmm_src2 = load_128_unaligned ((__m128i*)src + 1); xmm_src3 = load_128_unaligned ((__m128i*)src + 2); @@ -3646,10 +3226,6 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp, src_line += src_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - cache_prefetch ((__m128i*)src); - while (w && (unsigned long)dst & 15) { uint32_t s = (*src++) | 0xff000000; @@ -3666,16 +3242,8 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - cache_prefetch ((__m128i*)src); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)dst); - cache_prefetch_next ((__m128i*)src); - xmm_src = _mm_or_si128 ( load_128_unaligned ((__m128i*)src), mask_ff000000); xmm_dst = load_128_aligned ((__m128i*)dst); @@ -3815,10 +3383,6 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp, dst = dst_line; src = src_line; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - dst_line += dst_stride; src_line += src_stride; w = width; @@ -3834,17 +3398,9 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - /* It's a 8 pixel loop */ while (w >= 8) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)src); - cache_prefetch_next ((__m128i*)dst); - /* I'm loading unaligned because I'm not sure * about the address alignment. */ @@ -3954,10 +3510,6 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, mask_line += mask_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w && (unsigned long)dst & 15) { uint8_t m = *mask++; @@ -3978,16 +3530,8 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, dst++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)mask); - cache_prefetch_next ((__m128i*)dst); - m = *((uint32_t*)mask); if (srca == 0xff && m == 0xffffffff) @@ -4099,7 +3643,6 @@ pixman_fill_sse2 (uint32_t *bits, return FALSE; } - cache_prefetch ((__m128i*)byte_line); xmm_def = create_mask_2x32_128 (data, data); while (height--) @@ -4109,8 +3652,6 @@ pixman_fill_sse2 (uint32_t *bits, byte_line += stride; w = byte_width; - cache_prefetch_next ((__m128i*)d); - while (w >= 1 && ((unsigned long)d & 1)) { *(uint8_t *)d = data; @@ -4133,12 +3674,8 @@ pixman_fill_sse2 (uint32_t *bits, d += 4; } - cache_prefetch_next ((__m128i*)d); - while (w >= 128) { - cache_prefetch (((__m128i*)d) + 12); - save_128_aligned ((__m128i*)(d), xmm_def); save_128_aligned ((__m128i*)(d + 16), xmm_def); save_128_aligned ((__m128i*)(d + 32), xmm_def); @@ -4154,8 +3691,6 @@ pixman_fill_sse2 (uint32_t *bits, if (w >= 64) { - cache_prefetch (((__m128i*)d) + 8); - save_128_aligned ((__m128i*)(d), xmm_def); save_128_aligned ((__m128i*)(d + 16), xmm_def); save_128_aligned ((__m128i*)(d + 32), xmm_def); @@ -4165,8 +3700,6 @@ pixman_fill_sse2 (uint32_t *bits, w -= 64; } - cache_prefetch_next ((__m128i*)d); - if (w >= 32) { save_128_aligned ((__m128i*)(d), xmm_def); @@ -4184,8 +3717,6 @@ pixman_fill_sse2 (uint32_t *bits, w -= 16; } - cache_prefetch_next ((__m128i*)d); - while (w >= 4) { *(uint32_t *)d = data; @@ -4265,10 +3796,6 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, mask_line += mask_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w && (unsigned long)dst & 15) { uint8_t m = *mask++; @@ -4288,16 +3815,8 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, dst++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)mask); - cache_prefetch_next ((__m128i*)dst); - m = *((uint32_t*)mask); if (srca == 0xff && m == 0xffffffff) @@ -4410,10 +3929,6 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, mask_line += mask_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w && (unsigned long)dst & 15) { m = *mask++; @@ -4434,16 +3949,8 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, dst++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w >= 8) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)mask); - cache_prefetch_next ((__m128i*)dst); - xmm_dst = load_128_aligned ((__m128i*) dst); unpack_565_128_4x128 (xmm_dst, &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); @@ -4570,10 +4077,6 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, src_line += src_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - while (w && (unsigned long)dst & 15) { s = *src++; @@ -4587,16 +4090,8 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - while (w >= 8) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)src); - cache_prefetch_next ((__m128i*)dst); - /* First round */ xmm_src = load_128_unaligned ((__m128i*)src); xmm_dst = load_128_aligned ((__m128i*)dst); @@ -4715,10 +4210,6 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp, src_line += src_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - while (w && (unsigned long)dst & 15) { s = *src++; @@ -4731,16 +4222,8 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)src); - cache_prefetch_next ((__m128i*)dst); - xmm_src_hi = load_128_unaligned ((__m128i*)src); opaque = is_opaque (xmm_src_hi); @@ -4845,10 +4328,6 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, mask_line += mask_stride; dst_line += dst_stride; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w && ((unsigned long)dst & 15)) { m = *(uint32_t *) mask; @@ -4870,16 +4349,8 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, mask++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w >= 8) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)mask); - cache_prefetch_next ((__m128i*)dst); - /* First round */ xmm_mask = load_128_unaligned ((__m128i*)mask); xmm_dst = load_128_aligned ((__m128i*)dst); @@ -5001,10 +4472,6 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp, mask_line += mask_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w && ((unsigned long)dst & 15)) { m = (uint32_t) *mask++; @@ -5018,16 +4485,8 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w >= 16) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)mask); - cache_prefetch_next ((__m128i*)dst); - xmm_mask = load_128_unaligned ((__m128i*)mask); xmm_dst = load_128_aligned ((__m128i*)dst); @@ -5121,9 +4580,6 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp, dst_line += dst_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - while (w && ((unsigned long)dst & 15)) { d = (uint32_t) *dst; @@ -5135,14 +4591,8 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - while (w >= 16) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)dst); - xmm_dst = load_128_aligned ((__m128i*)dst); unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); @@ -5214,10 +4664,6 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp, src_line += src_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - while (w && ((unsigned long)dst & 15)) { s = (uint32_t) *src++; @@ -5229,16 +4675,8 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - while (w >= 16) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)src); - cache_prefetch_next ((__m128i*)dst); - xmm_src = load_128_unaligned ((__m128i*)src); xmm_dst = load_128_aligned ((__m128i*)dst); @@ -5321,10 +4759,6 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp, mask_line += mask_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w && ((unsigned long)dst & 15)) { m = (uint32_t) *mask++; @@ -5338,16 +4772,8 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w >= 16) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)mask); - cache_prefetch_next ((__m128i*)dst); - xmm_mask = load_128_unaligned ((__m128i*)mask); xmm_dst = load_128_aligned ((__m128i*)dst); @@ -5440,9 +4866,6 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp, dst_line += dst_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - while (w && ((unsigned long)dst & 15)) { *dst = (uint8_t)_mm_cvtsi64_si32 ( @@ -5454,14 +4877,8 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp, dst++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - while (w >= 16) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)dst); - save_128_aligned ( (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst))); @@ -5485,23 +4902,23 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp, } /* ---------------------------------------------------------------------- - * composite_add_8000_8000 + * composite_add_8_8 */ static void -sse2_composite_add_8000_8000 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) +sse2_composite_add_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { uint8_t *dst_line, *dst; uint8_t *src_line, *src; @@ -5519,10 +4936,6 @@ sse2_composite_add_8000_8000 (pixman_implementation_t *imp, dst = dst_line; src = src_line; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - dst_line += dst_stride; src_line += src_stride; w = width; @@ -5644,9 +5057,6 @@ pixman_blt_sse2 (uint32_t *src_bits, return FALSE; } - cache_prefetch ((__m128i*)src_bytes); - cache_prefetch ((__m128i*)dst_bytes); - while (height--) { int w; @@ -5656,9 +5066,6 @@ pixman_blt_sse2 (uint32_t *src_bits, dst_bytes += dst_stride; w = byte_width; - cache_prefetch_next ((__m128i*)s); - cache_prefetch_next ((__m128i*)d); - while (w >= 2 && ((unsigned long)d & 3)) { *(uint16_t *)d = *(uint16_t *)s; @@ -5676,17 +5083,10 @@ pixman_blt_sse2 (uint32_t *src_bits, d += 4; } - cache_prefetch_next ((__m128i*)s); - cache_prefetch_next ((__m128i*)d); - while (w >= 64) { __m128i xmm0, xmm1, xmm2, xmm3; - /* 128 bytes ahead */ - cache_prefetch (((__m128i*)s) + 8); - cache_prefetch (((__m128i*)d) + 8); - xmm0 = load_128_unaligned ((__m128i*)(s)); xmm1 = load_128_unaligned ((__m128i*)(s + 16)); xmm2 = load_128_unaligned ((__m128i*)(s + 32)); @@ -5702,9 +5102,6 @@ pixman_blt_sse2 (uint32_t *src_bits, w -= 64; } - cache_prefetch_next ((__m128i*)s); - cache_prefetch_next ((__m128i*)d); - while (w >= 16) { save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) ); @@ -5714,9 +5111,6 @@ pixman_blt_sse2 (uint32_t *src_bits, s += 16; } - cache_prefetch_next ((__m128i*)s); - cache_prefetch_next ((__m128i*)d); - while (w >= 4) { *(uint32_t *)d = *(uint32_t *)s; @@ -5809,11 +5203,6 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - cache_prefetch ((__m128i*)mask); - while (w && (unsigned long)dst & 15) { s = 0xff000000 | *src++; @@ -5833,18 +5222,8 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - cache_prefetch ((__m128i*)mask); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)src); - cache_prefetch_next ((__m128i*)dst); - cache_prefetch_next ((__m128i*)mask); - m = *(uint32_t*) mask; xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000); @@ -5955,11 +5334,6 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp, w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i *)src); - cache_prefetch ((__m128i *)dst); - cache_prefetch ((__m128i *)mask); - while (w && (unsigned long)dst & 15) { uint32_t sa; @@ -5994,18 +5368,8 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i *)src); - cache_prefetch ((__m128i *)dst); - cache_prefetch ((__m128i *)mask); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i *)src); - cache_prefetch_next ((__m128i *)dst); - cache_prefetch_next ((__m128i *)mask); - m = *(uint32_t *) mask; if (m) @@ -6117,9 +5481,6 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp, { dst = dst_line; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - dst_line += dst_stride; w = width; @@ -6135,15 +5496,10 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp, dst++; } - cache_prefetch ((__m128i*)dst); - while (w >= 4) { __m128i tmp_lo, tmp_hi; - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)(dst + 4)); - xmm_dst = load_128_aligned ((__m128i*)dst); unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); @@ -6224,11 +5580,6 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp, w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i *)src); - cache_prefetch ((__m128i *)dst); - cache_prefetch ((__m128i *)mask); - while (w && (unsigned long)dst & 15) { uint32_t sa; @@ -6263,18 +5614,8 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i *)src); - cache_prefetch ((__m128i *)dst); - cache_prefetch ((__m128i *)mask); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i *)src); - cache_prefetch_next ((__m128i *)dst); - cache_prefetch_next ((__m128i *)mask); - xmm_mask = load_128_unaligned ((__m128i*)mask); if (!is_transparent (xmm_mask)) @@ -6504,7 +5845,7 @@ static const pixman_fast_path_t sse2_fast_paths[] = /* PIXMAN_OP_ADD */ PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca), - PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8000_8000), + PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8), PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888), PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888), PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8), diff --git a/pixman/test/lowlevel-blt-bench.c b/pixman/test/lowlevel-blt-bench.c index d4df81507..343aafee9 100644 --- a/pixman/test/lowlevel-blt-bench.c +++ b/pixman/test/lowlevel-blt-bench.c @@ -544,7 +544,7 @@ struct tests_tbl[] = { { "add_8_8_8", PIXMAN_a8, 0, PIXMAN_OP_ADD, PIXMAN_a8, 0, PIXMAN_a8 }, - { "add_n_8_8000", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_a8, 0, PIXMAN_a8 }, + { "add_n_8_8", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_a8, 0, PIXMAN_a8 }, { "add_n_8_8888", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_a8, 0, PIXMAN_a8r8g8b8 }, { "add_n_8_x888", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_a8, 0, PIXMAN_x8r8g8b8 }, { "add_n_8_0565", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_a8, 0, PIXMAN_r5g6b5 }, @@ -553,7 +553,7 @@ tests_tbl[] = { "add_n_8_2222", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_a8, 0, PIXMAN_a2r2g2b2 }, { "add_n_8_2x10", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_a8, 0, PIXMAN_x2r10g10b10 }, { "add_n_8_2a10", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_a8, 0, PIXMAN_a2r10g10b10 }, - { "add_n_8000", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_a8 }, + { "add_n_8", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_a8 }, { "add_n_8888", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_a8r8g8b8 }, { "add_n_x888", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_x8r8g8b8 }, { "add_n_0565", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_r5g6b5 }, @@ -562,7 +562,7 @@ tests_tbl[] = { "add_n_2222", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_a2r2g2b2 }, { "add_n_2x10", PIXMAN_a2r10g10b10, 1, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_x2r10g10b10 }, { "add_n_2a10", PIXMAN_a2r10g10b10, 1, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_a2r10g10b10 }, - { "add_8000_8000", PIXMAN_a8, 0, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_a8 }, + { "add_8_8", PIXMAN_a8, 0, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_a8 }, { "add_x888_x888", PIXMAN_x8r8g8b8, 0, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_x8r8g8b8 }, { "add_8888_8888", PIXMAN_a8r8g8b8, 0, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_a8r8g8b8 }, { "add_8888_0565", PIXMAN_a8r8g8b8, 0, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_r5g6b5 }, -- cgit v1.2.3