diff options
Diffstat (limited to 'pixman')
-rw-r--r-- | pixman/Makefile.am | 4 | ||||
-rw-r--r-- | pixman/configure.ac | 2 | ||||
-rw-r--r-- | pixman/pixman/Makefile.am | 4 | ||||
-rw-r--r-- | pixman/pixman/pixman-arm-simd-asm-scaled.S | 165 | ||||
-rw-r--r-- | pixman/pixman/pixman-arm-simd-asm.S | 981 | ||||
-rw-r--r-- | pixman/pixman/pixman-arm-simd-asm.h | 908 | ||||
-rw-r--r-- | pixman/pixman/pixman-arm-simd.c | 505 | ||||
-rw-r--r-- | pixman/pixman/pixman-combine-float.c | 30 | ||||
-rw-r--r-- | pixman/test/lowlevel-blt-bench.c | 31 | ||||
-rw-r--r-- | pixman/test/stress-test.c | 123 |
10 files changed, 1928 insertions, 825 deletions
diff --git a/pixman/Makefile.am b/pixman/Makefile.am index 88ff897be..5137c9ea3 100644 --- a/pixman/Makefile.am +++ b/pixman/Makefile.am @@ -10,7 +10,7 @@ snapshot: test -d "$(srcdir)/.git" && distdir=$$distdir-`cd "$(srcdir)" && git rev-parse HEAD | cut -c 1-6`; \ $(MAKE) $(AM_MAKEFLAGS) distdir="$$distdir" dist -GPGKEY=6FF7C1A8 +GPGKEY=3892336E USERNAME=$$USER RELEASE_OR_SNAPSHOT = $$(if test "x$(PIXMAN_VERSION_MINOR)" = "x$$(echo "$(PIXMAN_VERSION_MINOR)/2*2" | bc)" ; then echo release; else echo snapshot; fi) RELEASE_CAIRO_HOST = $(USERNAME)@cairographics.org @@ -121,7 +121,7 @@ release-publish-message: $(HASHFILES) ensure-prev @echo "" @echo "GPG signature:" @echo " $(RELEASE_CAIRO_URL)/$(gpg_file)" - @echo " (signed by `git config --get user.name` <`git config --get user.email`>)" + @echo " (signed by`gpg --list-keys $(GPGKEY) | grep uid | cut -b4- | tr -s " "`)" @echo "" @echo "Git:" @echo " git://git.freedesktop.org/git/pixman" diff --git a/pixman/configure.ac b/pixman/configure.ac index a93e2905b..38f89b31e 100644 --- a/pixman/configure.ac +++ b/pixman/configure.ac @@ -54,7 +54,7 @@ AC_PREREQ([2.57]) m4_define([pixman_major], 0) m4_define([pixman_minor], 29) -m4_define([pixman_micro], 1) +m4_define([pixman_micro], 3) m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro]) diff --git a/pixman/pixman/Makefile.am b/pixman/pixman/Makefile.am index beebdd086..b9ea75424 100644 --- a/pixman/pixman/Makefile.am +++ b/pixman/pixman/Makefile.am @@ -58,7 +58,9 @@ noinst_LTLIBRARIES += libpixman-arm-simd.la libpixman_arm_simd_la_SOURCES = \ pixman-arm-simd.c \ pixman-arm-common.h \ - pixman-arm-simd-asm.S + pixman-arm-simd-asm.S \ + pixman-arm-simd-asm-scaled.S \ + pixman-arm-simd-asm.h libpixman_1_la_LIBADD += libpixman-arm-simd.la ASM_CFLAGS_arm_simd= diff --git a/pixman/pixman/pixman-arm-simd-asm-scaled.S b/pixman/pixman/pixman-arm-simd-asm-scaled.S new file mode 100644 index 000000000..711099548 --- /dev/null +++ b/pixman/pixman/pixman-arm-simd-asm-scaled.S @@ -0,0 +1,165 @@ +/* + * Copyright © 2008 Mozilla Corporation + * Copyright © 2010 Nokia Corporation + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Mozilla Corporation not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. Mozilla Corporation makes no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + * Author: Jeff Muizelaar (jeff@infidigm.net) + * + */ + +/* Prevent the stack from becoming executable */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + + .text + .arch armv6 + .object_arch armv4 + .arm + .altmacro + .p2align 2 + +/* Supplementary macro for setting function attributes */ +.macro pixman_asm_function fname + .func fname + .global fname +#ifdef __ELF__ + .hidden fname + .type fname, %function +#endif +fname: +.endm + +/* + * Note: This code is only using armv5te instructions (not even armv6), + * but is scheduled for ARM Cortex-A8 pipeline. So it might need to + * be split into a few variants, tuned for each microarchitecture. + * + * TODO: In order to get good performance on ARM9/ARM11 cores (which don't + * have efficient write combining), it needs to be changed to use 16-byte + * aligned writes using STM instruction. + * + * Nearest scanline scaler macro template uses the following arguments: + * fname - name of the function to generate + * bpp_shift - (1 << bpp_shift) is the size of pixel in bytes + * t - type suffix for LDR/STR instructions + * prefetch_distance - prefetch in the source image by that many + * pixels ahead + * prefetch_braking_distance - stop prefetching when that many pixels are + * remaining before the end of scanline + */ + +.macro generate_nearest_scanline_func fname, bpp_shift, t, \ + prefetch_distance, \ + prefetch_braking_distance + +pixman_asm_function fname + W .req r0 + DST .req r1 + SRC .req r2 + VX .req r3 + UNIT_X .req ip + TMP1 .req r4 + TMP2 .req r5 + VXMASK .req r6 + PF_OFFS .req r7 + SRC_WIDTH_FIXED .req r8 + + ldr UNIT_X, [sp] + push {r4, r5, r6, r7, r8, r10} + mvn VXMASK, #((1 << bpp_shift) - 1) + ldr SRC_WIDTH_FIXED, [sp, #28] + + /* define helper macro */ + .macro scale_2_pixels + ldr&t TMP1, [SRC, TMP1] + and TMP2, VXMASK, VX, asr #(16 - bpp_shift) + adds VX, VX, UNIT_X + str&t TMP1, [DST], #(1 << bpp_shift) +9: subpls VX, VX, SRC_WIDTH_FIXED + bpl 9b + + ldr&t TMP2, [SRC, TMP2] + and TMP1, VXMASK, VX, asr #(16 - bpp_shift) + adds VX, VX, UNIT_X + str&t TMP2, [DST], #(1 << bpp_shift) +9: subpls VX, VX, SRC_WIDTH_FIXED + bpl 9b + .endm + + /* now do the scaling */ + and TMP1, VXMASK, VX, asr #(16 - bpp_shift) + adds VX, VX, UNIT_X +9: subpls VX, VX, SRC_WIDTH_FIXED + bpl 9b + subs W, W, #(8 + prefetch_braking_distance) + blt 2f + /* calculate prefetch offset */ + mov PF_OFFS, #prefetch_distance + mla PF_OFFS, UNIT_X, PF_OFFS, VX +1: /* main loop, process 8 pixels per iteration with prefetch */ + pld [SRC, PF_OFFS, asr #(16 - bpp_shift)] + add PF_OFFS, UNIT_X, lsl #3 + scale_2_pixels + scale_2_pixels + scale_2_pixels + scale_2_pixels + subs W, W, #8 + bge 1b +2: + subs W, W, #(4 - 8 - prefetch_braking_distance) + blt 2f +1: /* process the remaining pixels */ + scale_2_pixels + scale_2_pixels + subs W, W, #4 + bge 1b +2: + tst W, #2 + beq 2f + scale_2_pixels +2: + tst W, #1 + ldrne&t TMP1, [SRC, TMP1] + strne&t TMP1, [DST] + /* cleanup helper macro */ + .purgem scale_2_pixels + .unreq DST + .unreq SRC + .unreq W + .unreq VX + .unreq UNIT_X + .unreq TMP1 + .unreq TMP2 + .unreq VXMASK + .unreq PF_OFFS + .unreq SRC_WIDTH_FIXED + /* return */ + pop {r4, r5, r6, r7, r8, r10} + bx lr +.endfunc +.endm + +generate_nearest_scanline_func \ + pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32 + +generate_nearest_scanline_func \ + pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32 diff --git a/pixman/pixman/pixman-arm-simd-asm.S b/pixman/pixman/pixman-arm-simd-asm.S index b438001d3..c20968879 100644 --- a/pixman/pixman/pixman-arm-simd-asm.S +++ b/pixman/pixman/pixman-arm-simd-asm.S @@ -1,14 +1,14 @@ /* - * Copyright © 2008 Mozilla Corporation - * Copyright © 2010 Nokia Corporation + * Copyright © 2012 Raspberry Pi Foundation + * Copyright © 2012 RISC OS Open Ltd * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that * the above copyright notice appear in all copies and that both that * copyright notice and this permission notice appear in supporting - * documentation, and that the name of Mozilla Corporation not be used in + * documentation, and that the name of the copyright holders not be used in * advertising or publicity pertaining to distribution of the software without - * specific, written prior permission. Mozilla Corporation makes no + * specific, written prior permission. The copyright holders make no * representations about the suitability of this software for any purpose. It * is provided "as is" without express or implied warranty. * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS * SOFTWARE. * - * Author: Jeff Muizelaar (jeff@infidigm.net) + * Author: Ben Avison (bavison@riscosopen.org) * */ @@ -37,412 +37,577 @@ .altmacro .p2align 2 -/* Supplementary macro for setting function attributes */ -.macro pixman_asm_function fname - .func fname - .global fname -#ifdef __ELF__ - .hidden fname - .type fname, %function -#endif -fname: -.endm +#include "pixman-arm-simd-asm.h" -/* - * The code below was generated by gcc 4.3.4 from the commented out - * functions in 'pixman-arm-simd.c' file with the following optimization - * options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer" - * - * TODO: replace gcc generated code with hand tuned versions because - * the code quality is not very good, introduce symbolic register - * aliases for better readability and maintainability. +/* A head macro should do all processing which results in an output of up to + * 16 bytes, as far as the final load instruction. The corresponding tail macro + * should complete the processing of the up-to-16 bytes. The calling macro will + * sometimes choose to insert a preload or a decrement of X between them. + * cond ARM condition code for code block + * numbytes Number of output bytes that should be generated this time + * firstreg First WK register in which to place output + * unaligned_src Whether to use non-wordaligned loads of source image + * unaligned_mask Whether to use non-wordaligned loads of mask image + * preload If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output */ -pixman_asm_function pixman_composite_add_8_8_asm_armv6 - push {r4, r5, r6, r7, r8, r9, r10, r11} - mov r10, r1 - sub sp, sp, #4 - subs r10, r10, #1 - mov r11, r0 - mov r8, r2 - str r3, [sp] - ldr r7, [sp, #36] - bcc 0f -6: cmp r11, #0 - beq 1f - orr r3, r8, r7 - tst r3, #3 - beq 2f - mov r1, r8 - mov r0, r7 - mov r12, r11 - b 3f -5: tst r3, #3 - beq 4f -3: ldrb r2, [r0], #1 - subs r12, r12, #1 - ldrb r3, [r1] - uqadd8 r3, r2, r3 - strb r3, [r1], #1 - orr r3, r1, r0 - bne 5b -1: ldr r3, [sp] - add r8, r8, r3 - ldr r3, [sp, #40] - add r7, r7, r3 -10: subs r10, r10, #1 - bcs 6b -0: add sp, sp, #4 - pop {r4, r5, r6, r7, r8, r9, r10, r11} - bx lr -2: mov r12, r11 - mov r1, r8 - mov r0, r7 -4: cmp r12, #3 - subgt r6, r12, #4 - movgt r9, r12 - lsrgt r5, r6, #2 - addgt r3, r5, #1 - movgt r12, #0 - lslgt r4, r3, #2 - ble 7f -8: ldr r3, [r0, r12] - ldr r2, [r1, r12] - uqadd8 r3, r3, r2 - str r3, [r1, r12] - add r12, r12, #4 - cmp r12, r4 - bne 8b - sub r3, r9, #4 - bic r3, r3, #3 - add r3, r3, #4 - subs r12, r6, r5, lsl #2 - add r1, r1, r3 - add r0, r0, r3 - beq 1b -7: mov r4, #0 -9: ldrb r3, [r1, r4] - ldrb r2, [r0, r4] - uqadd8 r3, r2, r3 - strb r3, [r1, r4] - add r4, r4, #1 - cmp r4, r12 - bne 9b - ldr r3, [sp] - add r8, r8, r3 - ldr r3, [sp, #40] - add r7, r7, r3 - b 10b -.endfunc - -pixman_asm_function pixman_composite_over_8888_8888_asm_armv6 - push {r4, r5, r6, r7, r8, r9, r10, r11} - sub sp, sp, #20 - cmp r1, #0 - mov r12, r2 - str r1, [sp, #12] - str r0, [sp, #16] - ldr r2, [sp, #52] - beq 0f - lsl r3, r3, #2 - str r3, [sp] - ldr r3, [sp, #56] - mov r10, #0 - lsl r3, r3, #2 - str r3, [sp, #8] - mov r11, r3 - b 1f -6: ldr r11, [sp, #8] -1: ldr r9, [sp] - mov r0, r12 - add r12, r12, r9 - mov r1, r2 - str r12, [sp, #4] - add r2, r2, r11 - ldr r12, [sp, #16] - ldr r3, =0x00800080 - ldr r9, =0xff00ff00 - mov r11, #255 - cmp r12, #0 - beq 4f -5: ldr r5, [r1], #4 - ldr r4, [r0] - sub r8, r11, r5, lsr #24 - uxtb16 r6, r4 - uxtb16 r7, r4, ror #8 - mla r6, r6, r8, r3 - mla r7, r7, r8, r3 - uxtab16 r6, r6, r6, ror #8 - uxtab16 r7, r7, r7, ror #8 - and r7, r7, r9 - uxtab16 r6, r7, r6, ror #8 - uqadd8 r5, r6, r5 - str r5, [r0], #4 - subs r12, r12, #1 - bne 5b -4: ldr r3, [sp, #12] - add r10, r10, #1 - cmp r10, r3 - ldr r12, [sp, #4] - bne 6b -0: add sp, sp, #20 - pop {r4, r5, r6, r7, r8, r9, r10, r11} - bx lr -.endfunc - -pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6 - push {r4, r5, r6, r7, r8, r9, r10, r11} - sub sp, sp, #28 - cmp r1, #0 - str r1, [sp, #12] - ldrb r1, [sp, #71] - mov r12, r2 - str r0, [sp, #16] - ldr r2, [sp, #60] - str r1, [sp, #24] - beq 0f - lsl r3, r3, #2 - str r3, [sp, #20] - ldr r3, [sp, #64] - mov r10, #0 - lsl r3, r3, #2 - str r3, [sp, #8] - mov r11, r3 - b 1f -5: ldr r11, [sp, #8] -1: ldr r4, [sp, #20] - mov r0, r12 - mov r1, r2 - add r12, r12, r4 - add r2, r2, r11 - str r12, [sp] - str r2, [sp, #4] - ldr r12, [sp, #16] - ldr r2, =0x00800080 - ldr r3, [sp, #24] - mov r11, #255 - cmp r12, #0 - beq 3f -4: ldr r5, [r1], #4 - ldr r4, [r0] - uxtb16 r6, r5 - uxtb16 r7, r5, ror #8 - mla r6, r6, r3, r2 - mla r7, r7, r3, r2 - uxtab16 r6, r6, r6, ror #8 - uxtab16 r7, r7, r7, ror #8 - uxtb16 r6, r6, ror #8 - uxtb16 r7, r7, ror #8 - orr r5, r6, r7, lsl #8 - uxtb16 r6, r4 - uxtb16 r7, r4, ror #8 - sub r8, r11, r5, lsr #24 - mla r6, r6, r8, r2 - mla r7, r7, r8, r2 - uxtab16 r6, r6, r6, ror #8 - uxtab16 r7, r7, r7, ror #8 - uxtb16 r6, r6, ror #8 - uxtb16 r7, r7, ror #8 - orr r6, r6, r7, lsl #8 - uqadd8 r5, r6, r5 - str r5, [r0], #4 - subs r12, r12, #1 - bne 4b -3: ldr r1, [sp, #12] - add r10, r10, #1 - cmp r10, r1 - ldr r12, [sp] - ldr r2, [sp, #4] - bne 5b -0: add sp, sp, #28 - pop {r4, r5, r6, r7, r8, r9, r10, r11} - bx lr -.endfunc - -pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6 - push {r4, r5, r6, r7, r8, r9, r10, r11} - sub sp, sp, #28 - cmp r1, #0 - ldr r9, [sp, #60] - str r1, [sp, #12] - bic r1, r9, #-16777216 - str r1, [sp, #20] - mov r12, r2 - lsr r1, r9, #8 - ldr r2, [sp, #20] - bic r1, r1, #-16777216 - bic r2, r2, #65280 - bic r1, r1, #65280 - str r2, [sp, #20] - str r0, [sp, #16] - str r1, [sp, #4] - ldr r2, [sp, #68] - beq 0f - lsl r3, r3, #2 - str r3, [sp, #24] - mov r0, #0 - b 1f -5: ldr r3, [sp, #24] -1: ldr r4, [sp, #72] - mov r10, r12 - mov r1, r2 - add r12, r12, r3 - add r2, r2, r4 - str r12, [sp, #8] - str r2, [sp] - ldr r12, [sp, #16] - ldr r11, =0x00800080 - ldr r2, [sp, #4] - ldr r3, [sp, #20] - cmp r12, #0 - beq 3f -4: ldrb r5, [r1], #1 - ldr r4, [r10] - mla r6, r3, r5, r11 - mla r7, r2, r5, r11 - uxtab16 r6, r6, r6, ror #8 - uxtab16 r7, r7, r7, ror #8 - uxtb16 r6, r6, ror #8 - uxtb16 r7, r7, ror #8 - orr r5, r6, r7, lsl #8 - uxtb16 r6, r4 - uxtb16 r7, r4, ror #8 - mvn r8, r5 - lsr r8, r8, #24 - mla r6, r6, r8, r11 - mla r7, r7, r8, r11 - uxtab16 r6, r6, r6, ror #8 - uxtab16 r7, r7, r7, ror #8 - uxtb16 r6, r6, ror #8 - uxtb16 r7, r7, ror #8 - orr r6, r6, r7, lsl #8 - uqadd8 r5, r6, r5 - str r5, [r10], #4 - subs r12, r12, #1 - bne 4b -3: ldr r4, [sp, #12] - add r0, r0, #1 - cmp r0, r4 - ldr r12, [sp, #8] - ldr r2, [sp] - bne 5b -0: add sp, sp, #28 - pop {r4, r5, r6, r7, r8, r9, r10, r11} - bx lr -.endfunc +.macro blit_init + line_saved_regs STRIDE_D, STRIDE_S +.endm -/* - * Note: This code is only using armv5te instructions (not even armv6), - * but is scheduled for ARM Cortex-A8 pipeline. So it might need to - * be split into a few variants, tuned for each microarchitecture. - * - * TODO: In order to get good performance on ARM9/ARM11 cores (which don't - * have efficient write combining), it needs to be changed to use 16-byte - * aligned writes using STM instruction. - * - * Nearest scanline scaler macro template uses the following arguments: - * fname - name of the function to generate - * bpp_shift - (1 << bpp_shift) is the size of pixel in bytes - * t - type suffix for LDR/STR instructions - * prefetch_distance - prefetch in the source image by that many - * pixels ahead - * prefetch_braking_distance - stop prefetching when that many pixels are - * remaining before the end of scanline +.macro blit_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload + pixld cond, numbytes, firstreg, SRC, unaligned_src +.endm + +.macro blit_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment + WK4 .req STRIDE_D + WK5 .req STRIDE_S + WK6 .req MASK + WK7 .req STRIDE_M +110: pixld , 16, 0, SRC, unaligned_src + pixld , 16, 4, SRC, unaligned_src + pld [SRC, SCRATCH] + pixst , 16, 0, DST + pixst , 16, 4, DST + subs X, X, #32*8/src_bpp + bhs 110b + .unreq WK4 + .unreq WK5 + .unreq WK6 + .unreq WK7 +.endm + +generate_composite_function \ + pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \ + 4, /* prefetch distance */ \ + blit_init, \ + nop_macro, /* newline */ \ + nop_macro, /* cleanup */ \ + blit_process_head, \ + nop_macro, /* process tail */ \ + blit_inner_loop + +generate_composite_function \ + pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \ + FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \ + 4, /* prefetch distance */ \ + blit_init, \ + nop_macro, /* newline */ \ + nop_macro, /* cleanup */ \ + blit_process_head, \ + nop_macro, /* process tail */ \ + blit_inner_loop + +generate_composite_function \ + pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \ + FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \ + 3, /* prefetch distance */ \ + blit_init, \ + nop_macro, /* newline */ \ + nop_macro, /* cleanup */ \ + blit_process_head, \ + nop_macro, /* process tail */ \ + blit_inner_loop + +/******************************************************************************/ + +.macro src_n_8888_init + ldr SRC, [sp, #ARGS_STACK_OFFSET] + mov STRIDE_S, SRC + mov MASK, SRC + mov STRIDE_M, SRC +.endm + +.macro src_n_0565_init + ldrh SRC, [sp, #ARGS_STACK_OFFSET] + orr SRC, SRC, lsl #16 + mov STRIDE_S, SRC + mov MASK, SRC + mov STRIDE_M, SRC +.endm + +.macro src_n_8_init + ldrb SRC, [sp, #ARGS_STACK_OFFSET] + orr SRC, SRC, lsl #8 + orr SRC, SRC, lsl #16 + mov STRIDE_S, SRC + mov MASK, SRC + mov STRIDE_M, SRC +.endm + +.macro fill_process_tail cond, numbytes, firstreg + WK4 .req SRC + WK5 .req STRIDE_S + WK6 .req MASK + WK7 .req STRIDE_M + pixst cond, numbytes, 4, DST + .unreq WK4 + .unreq WK5 + .unreq WK6 + .unreq WK7 +.endm + +generate_composite_function \ + pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \ + 0, /* prefetch distance doesn't apply */ \ + src_n_8888_init \ + nop_macro, /* newline */ \ + nop_macro /* cleanup */ \ + nop_macro /* process head */ \ + fill_process_tail + +generate_composite_function \ + pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \ + FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \ + 0, /* prefetch distance doesn't apply */ \ + src_n_0565_init \ + nop_macro, /* newline */ \ + nop_macro /* cleanup */ \ + nop_macro /* process head */ \ + fill_process_tail + +generate_composite_function \ + pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \ + FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \ + 0, /* prefetch distance doesn't apply */ \ + src_n_8_init \ + nop_macro, /* newline */ \ + nop_macro /* cleanup */ \ + nop_macro /* process head */ \ + fill_process_tail + +/******************************************************************************/ + +.macro src_x888_8888_pixel, cond, reg + orr&cond WK®, WK®, #0xFF000000 +.endm + +.macro pixman_composite_src_x888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload + pixld cond, numbytes, firstreg, SRC, unaligned_src +.endm + +.macro pixman_composite_src_x888_8888_process_tail cond, numbytes, firstreg + src_x888_8888_pixel cond, %(firstreg+0) + .if numbytes >= 8 + src_x888_8888_pixel cond, %(firstreg+1) + .if numbytes == 16 + src_x888_8888_pixel cond, %(firstreg+2) + src_x888_8888_pixel cond, %(firstreg+3) + .endif + .endif +.endm + +generate_composite_function \ + pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \ + 3, /* prefetch distance */ \ + nop_macro, /* init */ \ + nop_macro, /* newline */ \ + nop_macro, /* cleanup */ \ + pixman_composite_src_x888_8888_process_head, \ + pixman_composite_src_x888_8888_process_tail + +/******************************************************************************/ + +.macro src_0565_8888_init + /* Hold loop invariants in MASK and STRIDE_M */ + ldr MASK, =0x07E007E0 + mov STRIDE_M, #0xFF000000 + /* Set GE[3:0] to 1010 so SEL instructions do what we want */ + ldr SCRATCH, =0x80008000 + uadd8 SCRATCH, SCRATCH, SCRATCH +.endm + +.macro src_0565_8888_2pixels, reg1, reg2 + and SCRATCH, WK®1, MASK @ 00000GGGGGG0000000000gggggg00000 + bic WK®2, WK®1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb + orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg + mov WK®1, WK®2, lsl #16 @ rrrrr000000bbbbb0000000000000000 + mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG + bic WK®2, WK®2, WK®1, lsr #16 @ RRRRR000000BBBBB0000000000000000 + orr WK®1, WK®1, WK®1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000 + orr WK®2, WK®2, WK®2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000 + pkhtb WK®1, WK®1, WK®1, asr #5 @ rrrrrrrr--------bbbbbbbb-------- + sel WK®1, WK®1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb-------- + mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg + pkhtb WK®2, WK®2, WK®2, asr #5 @ RRRRRRRR--------BBBBBBBB-------- + sel WK®2, WK®2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB-------- + orr WK®1, STRIDE_M, WK®1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb + orr WK®2, STRIDE_M, WK®2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB +.endm + +/* This version doesn't need STRIDE_M, but is one instruction longer. + It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case? + and SCRATCH, WK®1, MASK @ 00000GGGGGG0000000000gggggg00000 + bic WK®1, WK®1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb + orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg + mov WK®2, WK®1, lsr #16 @ 0000000000000000RRRRR000000BBBBB + mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000 + bic WK®1, WK®1, WK®2, lsl #16 @ 0000000000000000rrrrr000000bbbbb + mov WK®2, WK®2, lsl #3 @ 0000000000000RRRRR000000BBBBB000 + mov WK®1, WK®1, lsl #3 @ 0000000000000rrrrr000000bbbbb000 + orr WK®2, WK®2, WK®2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB + orr WK®1, WK®1, WK®1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb + pkhbt WK®2, WK®2, WK®2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB + pkhbt WK®1, WK®1, WK®1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb + sel WK®2, SCRATCH, WK®2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB + sel WK®1, SCRATCH, WK®1 @ --------rrrrrrrrggggggggbbbbbbbb + orr WK®2, WK®2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB + orr WK®1, WK®1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb +*/ + +.macro src_0565_8888_1pixel, reg + bic SCRATCH, WK®, MASK @ 0000000000000000rrrrr000000bbbbb + and WK®, WK®, MASK @ 000000000000000000000gggggg00000 + mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000 + mov WK®, WK®, lsl #5 @ 0000000000000000gggggg0000000000 + orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb + orr WK®, WK®, WK®, lsr #6 @ 000000000000000gggggggggggg00000 + pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb + sel WK®, WK®, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb + orr WK®, WK®, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb +.endm + +.macro src_0565_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload + .if numbytes == 16 + pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src + .elseif numbytes == 8 + pixld , 4, firstreg, SRC, unaligned_src + .elseif numbytes == 4 + pixld , 2, firstreg, SRC, unaligned_src + .endif +.endm + +.macro src_0565_8888_process_tail cond, numbytes, firstreg + .if numbytes == 16 + src_0565_8888_2pixels firstreg, %(firstreg+1) + src_0565_8888_2pixels %(firstreg+2), %(firstreg+3) + .elseif numbytes == 8 + src_0565_8888_2pixels firstreg, %(firstreg+1) + .else + src_0565_8888_1pixel firstreg + .endif +.endm + +generate_composite_function \ + pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \ + 3, /* prefetch distance */ \ + src_0565_8888_init, \ + nop_macro, /* newline */ \ + nop_macro, /* cleanup */ \ + src_0565_8888_process_head, \ + src_0565_8888_process_tail + +/******************************************************************************/ + +.macro add_8_8_8pixels cond, dst1, dst2 + uqadd8&cond WK&dst1, WK&dst1, MASK + uqadd8&cond WK&dst2, WK&dst2, STRIDE_M +.endm + +.macro add_8_8_4pixels cond, dst + uqadd8&cond WK&dst, WK&dst, MASK +.endm + +.macro add_8_8_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload + WK4 .req MASK + WK5 .req STRIDE_M + .if numbytes == 16 + pixld cond, 8, 4, SRC, unaligned_src + pixld cond, 16, firstreg, DST, 0 + add_8_8_8pixels cond, firstreg, %(firstreg+1) + pixld cond, 8, 4, SRC, unaligned_src + .else + pixld cond, numbytes, 4, SRC, unaligned_src + pixld cond, numbytes, firstreg, DST, 0 + .endif + .unreq WK4 + .unreq WK5 +.endm + +.macro add_8_8_process_tail cond, numbytes, firstreg + .if numbytes == 16 + add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3) + .elseif numbytes == 8 + add_8_8_8pixels cond, firstreg, %(firstreg+1) + .else + add_8_8_4pixels cond, firstreg + .endif +.endm + +generate_composite_function \ + pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \ + FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \ + 2, /* prefetch distance */ \ + nop_macro, /* init */ \ + nop_macro, /* newline */ \ + nop_macro, /* cleanup */ \ + add_8_8_process_head, \ + add_8_8_process_tail + +/******************************************************************************/ + +.macro over_8888_8888_init + /* Hold loop invariant in MASK */ + ldr MASK, =0x00800080 + /* Set GE[3:0] to 0101 so SEL instructions do what we want */ + uadd8 SCRATCH, MASK, MASK + line_saved_regs STRIDE_D, STRIDE_S, ORIG_W +.endm + +.macro over_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload + WK4 .req STRIDE_D + WK5 .req STRIDE_S + WK6 .req STRIDE_M + WK7 .req ORIG_W + pixld , numbytes, %(4+firstreg), SRC, unaligned_src + pixld , numbytes, firstreg, DST, 0 + .unreq WK4 + .unreq WK5 + .unreq WK6 + .unreq WK7 +.endm + +.macro over_8888_8888_check_transparent numbytes, reg0, reg1, reg2, reg3 + /* Since these colours a premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */ + teq WK®0, #0 + .if numbytes > 4 + teqeq WK®1, #0 + .if numbytes > 8 + teqeq WK®2, #0 + teqeq WK®3, #0 + .endif + .endif +.endm + +.macro over_8888_8888_prepare next + mov WK&next, WK&next, lsr #24 +.endm + +.macro over_8888_8888_1pixel src, dst, offset, next + /* src = destination component multiplier */ + rsb WK&src, WK&src, #255 + /* Split even/odd bytes of dst into SCRATCH/dst */ + uxtb16 SCRATCH, WK&dst + uxtb16 WK&dst, WK&dst, ror #8 + /* Multiply through, adding 0.5 to the upper byte of result for rounding */ + mla SCRATCH, SCRATCH, WK&src, MASK + mla WK&dst, WK&dst, WK&src, MASK + /* Where we would have had a stall between the result of the first MLA and the shifter input, + * reload the complete source pixel */ + ldr WK&src, [SRC, #offset] + /* Multiply by 257/256 to approximate 256/255 */ + uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8 + /* In this stall, start processing the next pixel */ + .if offset < -4 + mov WK&next, WK&next, lsr #24 + .endif + uxtab16 WK&dst, WK&dst, WK&dst, ror #8 + /* Recombine even/odd bytes of multiplied destination */ + mov SCRATCH, SCRATCH, ror #8 + sel WK&dst, SCRATCH, WK&dst + /* Saturated add of source to multiplied destination */ + uqadd8 WK&dst, WK&dst, WK&src +.endm + +.macro over_8888_8888_process_tail cond, numbytes, firstreg + WK4 .req STRIDE_D + WK5 .req STRIDE_S + WK6 .req STRIDE_M + WK7 .req ORIG_W + over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg) + beq 10f + over_8888_8888_prepare %(4+firstreg) + .set PROCESS_REG, firstreg + .set PROCESS_OFF, -numbytes + .rept numbytes / 4 + over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG) + .set PROCESS_REG, PROCESS_REG+1 + .set PROCESS_OFF, PROCESS_OFF+4 + .endr + pixst , numbytes, firstreg, DST +10: + .unreq WK4 + .unreq WK5 + .unreq WK6 + .unreq WK7 +.endm + +generate_composite_function \ + pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32 \ + FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \ + 2, /* prefetch distance */ \ + over_8888_8888_init, \ + nop_macro, /* newline */ \ + nop_macro, /* cleanup */ \ + over_8888_8888_process_head, \ + over_8888_8888_process_tail + +/******************************************************************************/ + +/* Multiply each byte of a word by a byte. + * Useful when there aren't any obvious ways to fill the stalls with other instructions. + * word Register containing 4 bytes + * byte Register containing byte multiplier (bits 8-31 must be 0) + * tmp Scratch register + * half Register containing the constant 0x00800080 + * GE[3:0] bits must contain 0101 */ +.macro mul_8888_8 word, byte, tmp, half + /* Split even/odd bytes of word apart */ + uxtb16 tmp, word + uxtb16 word, word, ror #8 + /* Multiply bytes together with rounding, then by 257/256 */ + mla tmp, tmp, byte, half + mla word, word, byte, half /* 1 stall follows */ + uxtab16 tmp, tmp, tmp, ror #8 /* 1 stall follows */ + uxtab16 word, word, word, ror #8 + /* Recombine bytes */ + mov tmp, tmp, ror #8 + sel word, tmp, word +.endm + +/******************************************************************************/ + +.macro over_8888_n_8888_init + /* Mask is constant */ + ldr MASK, [sp, #ARGS_STACK_OFFSET+8] + /* Hold loop invariant in STRIDE_M */ + ldr STRIDE_M, =0x00800080 + /* We only want the alpha bits of the constant mask */ + mov MASK, MASK, lsr #24 + /* Set GE[3:0] to 0101 so SEL instructions do what we want */ + uadd8 SCRATCH, STRIDE_M, STRIDE_M + line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W +.endm + +.macro over_8888_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload + WK4 .req Y + WK5 .req STRIDE_D + WK6 .req STRIDE_S + WK7 .req ORIG_W + pixld , numbytes, %(4+(firstreg%2)), SRC, unaligned_src + pixld , numbytes, firstreg, DST, 0 + .unreq WK4 + .unreq WK5 + .unreq WK6 + .unreq WK7 +.endm + +.macro over_8888_n_8888_1pixel src, dst + mul_8888_8 WK&src, MASK, SCRATCH, STRIDE_M + sub WK7, WK6, WK&src, lsr #24 + mul_8888_8 WK&dst, WK7, SCRATCH, STRIDE_M + uqadd8 WK&dst, WK&dst, WK&src +.endm + +.macro over_8888_n_8888_process_tail cond, numbytes, firstreg + WK4 .req Y + WK5 .req STRIDE_D + WK6 .req STRIDE_S + WK7 .req ORIG_W + over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg) + beq 10f + mov WK6, #255 + .set PROCESS_REG, firstreg + .rept numbytes / 4 + .if numbytes == 16 && PROCESS_REG == 2 + /* We're using WK6 and WK7 as temporaries, so half way through + * 4 pixels, reload the second two source pixels but this time + * into WK4 and WK5 */ + ldmdb SRC, {WK4, WK5} + .endif + over_8888_n_8888_1pixel %(4+(PROCESS_REG%2)), %(PROCESS_REG) + .set PROCESS_REG, PROCESS_REG+1 + .endr + pixst , numbytes, firstreg, DST +10: + .unreq WK4 + .unreq WK5 + .unreq WK6 + .unreq WK7 +.endm + +generate_composite_function \ + pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32 \ + FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \ + 2, /* prefetch distance */ \ + over_8888_n_8888_init, \ + nop_macro, /* newline */ \ + nop_macro, /* cleanup */ \ + over_8888_n_8888_process_head, \ + over_8888_n_8888_process_tail + +/******************************************************************************/ + +.macro over_n_8_8888_init + /* Source is constant, but splitting it into even/odd bytes is a loop invariant */ + ldr SRC, [sp, #ARGS_STACK_OFFSET] + /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */ + ldr SCRATCH, =0x00800080 + uxtb16 STRIDE_S, SRC + uxtb16 SRC, SRC, ror #8 + /* Set GE[3:0] to 0101 so SEL instructions do what we want */ + uadd8 SCRATCH, SCRATCH, SCRATCH + line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W +.endm + +.macro over_n_8_8888_newline + ldr STRIDE_D, =0x00800080 + b 1f + .ltorg +1: +.endm + +.macro over_n_8_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload + WK4 .req STRIDE_M + pixld , numbytes/4, 4, MASK, unaligned_mask + pixld , numbytes, firstreg, DST, 0 + .unreq WK4 +.endm + +.macro over_n_8_8888_1pixel src, dst + uxtb Y, WK4, ror #src*8 + /* Trailing part of multiplication of source */ + mla SCRATCH, STRIDE_S, Y, STRIDE_D + mla Y, SRC, Y, STRIDE_D + mov ORIG_W, #255 + uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8 + uxtab16 Y, Y, Y, ror #8 + mov SCRATCH, SCRATCH, ror #8 + sub ORIG_W, ORIG_W, Y, lsr #24 + sel Y, SCRATCH, Y + /* Then multiply the destination */ + mul_8888_8 WK&dst, ORIG_W, SCRATCH, STRIDE_D + uqadd8 WK&dst, WK&dst, Y +.endm + +.macro over_n_8_8888_process_tail cond, numbytes, firstreg + WK4 .req STRIDE_M + teq WK4, #0 + beq 10f + .set PROCESS_REG, firstreg + .rept numbytes / 4 + over_n_8_8888_1pixel %(PROCESS_REG-firstreg), %(PROCESS_REG) + .set PROCESS_REG, PROCESS_REG+1 + .endr + pixst , numbytes, firstreg, DST +10: + .unreq WK4 +.endm + +generate_composite_function \ + pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \ + FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \ + 2, /* prefetch distance */ \ + over_n_8_8888_init, \ + over_n_8_8888_newline, \ + nop_macro, /* cleanup */ \ + over_n_8_8888_process_head, \ + over_n_8_8888_process_tail + +/******************************************************************************/ -.macro generate_nearest_scanline_func fname, bpp_shift, t, \ - prefetch_distance, \ - prefetch_braking_distance - -pixman_asm_function fname - W .req r0 - DST .req r1 - SRC .req r2 - VX .req r3 - UNIT_X .req ip - TMP1 .req r4 - TMP2 .req r5 - VXMASK .req r6 - PF_OFFS .req r7 - SRC_WIDTH_FIXED .req r8 - - ldr UNIT_X, [sp] - push {r4, r5, r6, r7, r8, r10} - mvn VXMASK, #((1 << bpp_shift) - 1) - ldr SRC_WIDTH_FIXED, [sp, #28] - - /* define helper macro */ - .macro scale_2_pixels - ldr&t TMP1, [SRC, TMP1] - and TMP2, VXMASK, VX, asr #(16 - bpp_shift) - adds VX, VX, UNIT_X - str&t TMP1, [DST], #(1 << bpp_shift) -9: subpls VX, VX, SRC_WIDTH_FIXED - bpl 9b - - ldr&t TMP2, [SRC, TMP2] - and TMP1, VXMASK, VX, asr #(16 - bpp_shift) - adds VX, VX, UNIT_X - str&t TMP2, [DST], #(1 << bpp_shift) -9: subpls VX, VX, SRC_WIDTH_FIXED - bpl 9b - .endm - - /* now do the scaling */ - and TMP1, VXMASK, VX, asr #(16 - bpp_shift) - adds VX, VX, UNIT_X -9: subpls VX, VX, SRC_WIDTH_FIXED - bpl 9b - subs W, W, #(8 + prefetch_braking_distance) - blt 2f - /* calculate prefetch offset */ - mov PF_OFFS, #prefetch_distance - mla PF_OFFS, UNIT_X, PF_OFFS, VX -1: /* main loop, process 8 pixels per iteration with prefetch */ - pld [SRC, PF_OFFS, asr #(16 - bpp_shift)] - add PF_OFFS, UNIT_X, lsl #3 - scale_2_pixels - scale_2_pixels - scale_2_pixels - scale_2_pixels - subs W, W, #8 - bge 1b -2: - subs W, W, #(4 - 8 - prefetch_braking_distance) - blt 2f -1: /* process the remaining pixels */ - scale_2_pixels - scale_2_pixels - subs W, W, #4 - bge 1b -2: - tst W, #2 - beq 2f - scale_2_pixels -2: - tst W, #1 - ldrne&t TMP1, [SRC, TMP1] - strne&t TMP1, [DST] - /* cleanup helper macro */ - .purgem scale_2_pixels - .unreq DST - .unreq SRC - .unreq W - .unreq VX - .unreq UNIT_X - .unreq TMP1 - .unreq TMP2 - .unreq VXMASK - .unreq PF_OFFS - .unreq SRC_WIDTH_FIXED - /* return */ - pop {r4, r5, r6, r7, r8, r10} - bx lr -.endfunc -.endm - -generate_nearest_scanline_func \ - pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32 - -generate_nearest_scanline_func \ - pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32 diff --git a/pixman/pixman/pixman-arm-simd-asm.h b/pixman/pixman/pixman-arm-simd-asm.h new file mode 100644 index 000000000..65436062b --- /dev/null +++ b/pixman/pixman/pixman-arm-simd-asm.h @@ -0,0 +1,908 @@ +/* + * Copyright © 2012 Raspberry Pi Foundation + * Copyright © 2012 RISC OS Open Ltd + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of the copyright holders not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. The copyright holders make no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + * Author: Ben Avison (bavison@riscosopen.org) + * + */ + +/* + * Because the alignment of pixel data to cachelines, and even the number of + * cachelines per row can vary from row to row, and because of the need to + * preload each scanline once and only once, this prefetch strategy treats + * each row of pixels independently. When a pixel row is long enough, there + * are three distinct phases of prefetch: + * * an inner loop section, where each time a cacheline of data is + * processed, another cacheline is preloaded (the exact distance ahead is + * determined empirically using profiling results from lowlevel-blt-bench) + * * a leading section, where enough cachelines are preloaded to ensure no + * cachelines escape being preloaded when the inner loop starts + * * a trailing section, where a limited number (0 or more) of cachelines + * are preloaded to deal with data (if any) that hangs off the end of the + * last iteration of the inner loop, plus any trailing bytes that were not + * enough to make up one whole iteration of the inner loop + * + * There are (in general) three distinct code paths, selected between + * depending upon how long the pixel row is. If it is long enough that there + * is at least one iteration of the inner loop (as described above) then + * this is described as the "wide" case. If it is shorter than that, but + * there are still enough bytes output that there is at least one 16-byte- + * long, 16-byte-aligned write to the destination (the optimum type of + * write), then this is the "medium" case. If it is not even this long, then + * this is the "narrow" case, and there is no attempt to align writes to + * 16-byte boundaries. In the "medium" and "narrow" cases, all the + * cachelines containing data from the pixel row are prefetched up-front. + */ + +/* + * Determine whether we put the arguments on the stack for debugging. + */ +#undef DEBUG_PARAMS + +/* + * Bit flags for 'generate_composite_function' macro which are used + * to tune generated functions behavior. + */ +.set FLAG_DST_WRITEONLY, 0 +.set FLAG_DST_READWRITE, 1 +.set FLAG_COND_EXEC, 0 +.set FLAG_BRANCH_OVER, 2 +.set FLAG_PROCESS_PRESERVES_PSR, 0 +.set FLAG_PROCESS_CORRUPTS_PSR, 4 +.set FLAG_PROCESS_DOESNT_STORE, 0 +.set FLAG_PROCESS_DOES_STORE, 8 /* usually because it needs to conditionally skip it */ +.set FLAG_NO_SPILL_LINE_VARS, 0 +.set FLAG_SPILL_LINE_VARS_WIDE, 16 +.set FLAG_SPILL_LINE_VARS_NON_WIDE, 32 +.set FLAG_SPILL_LINE_VARS, 48 +.set FLAG_PROCESS_CORRUPTS_SCRATCH, 0 +.set FLAG_PROCESS_PRESERVES_SCRATCH, 64 + +/* + * Offset into stack where mask and source pointer/stride can be accessed. + */ +#ifdef DEBUG_PARAMS +.set ARGS_STACK_OFFSET, (9*4+9*4) +#else +.set ARGS_STACK_OFFSET, (9*4) +#endif + +/* + * Constants for selecting preferable prefetch type. + */ +.set PREFETCH_TYPE_NONE, 0 +.set PREFETCH_TYPE_STANDARD, 1 + +/* + * Definitions of macros for load/store of pixel data. + */ + +.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0 + .if numbytes == 16 + .if unaligned == 1 + op&r&cond WK®0, [base], #4 + op&r&cond WK®1, [base], #4 + op&r&cond WK®2, [base], #4 + op&r&cond WK®3, [base], #4 + .else + op&m&cond&ia base!, {WK®0,WK®1,WK®2,WK®3} + .endif + .elseif numbytes == 8 + .if unaligned == 1 + op&r&cond WK®0, [base], #4 + op&r&cond WK®1, [base], #4 + .else + op&m&cond&ia base!, {WK®0,WK®1} + .endif + .elseif numbytes == 4 + op&r&cond WK®0, [base], #4 + .elseif numbytes == 2 + op&r&cond&h WK®0, [base], #2 + .elseif numbytes == 1 + op&r&cond&b WK®0, [base], #1 + .else + .error "unsupported size: numbytes" + .endif +.endm + +.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base + .if numbytes == 16 + stm&cond&db base, {WK®0,WK®1,WK®2,WK®3} + .elseif numbytes == 8 + stm&cond&db base, {WK®0,WK®1} + .elseif numbytes == 4 + str&cond WK®0, [base, #-4] + .elseif numbytes == 2 + str&cond&h WK®0, [base, #-2] + .elseif numbytes == 1 + str&cond&b WK®0, [base, #-1] + .else + .error "unsupported size: numbytes" + .endif +.endm + +.macro pixld cond, numbytes, firstreg, base, unaligned + pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned +.endm + +.macro pixst cond, numbytes, firstreg, base + .if (flags) & FLAG_DST_READWRITE + pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base + .else + pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base + .endif +.endm + +.macro PF a, x:vararg + .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD) + a x + .endif +.endm + + +.macro preload_leading_step1 bpp, ptr, base +/* If the destination is already 16-byte aligned, then we need to preload + * between 0 and prefetch_distance (inclusive) cache lines ahead so there + * are no gaps when the inner loop starts. + */ + .if bpp > 0 + PF bic, ptr, base, #31 + .set OFFSET, 0 + .rept prefetch_distance+1 + PF pld, [ptr, #OFFSET] + .set OFFSET, OFFSET+32 + .endr + .endif +.endm + +.macro preload_leading_step2 bpp, bpp_shift, ptr, base +/* However, if the destination is not 16-byte aligned, we may need to + * preload more cache lines than that. The question we need to ask is: + * are the bytes corresponding to the leading pixels more than the amount + * by which the source pointer will be rounded down for preloading, and if + * so, by how many cache lines? Effectively, we want to calculate + * leading_bytes = ((-dst)&15)*src_bpp/dst_bpp + * inner_loop_offset = (src+leading_bytes)&31 + * extra_needed = leading_bytes - inner_loop_offset + * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only + * possible when there are 4 src bytes for every 1 dst byte). + */ + .if bpp > 0 + .ifc base,DST + /* The test can be simplified further when preloading the destination */ + PF tst, base, #16 + PF beq, 61f + .else + .if bpp/dst_w_bpp == 4 + PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift + PF and, SCRATCH, SCRATCH, #31 + PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift + PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */ + PF movs, SCRATCH, SCRATCH, #32-6 /* so this sets NC / nc / Nc */ + PF bcs, 61f + PF bpl, 60f + PF pld, [ptr, #32*(prefetch_distance+2)] + .else + PF mov, SCRATCH, base, lsl #32-5 + PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift + PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift + PF bls, 61f + .endif + .endif +60: PF pld, [ptr, #32*(prefetch_distance+1)] +61: + .endif +.endm + +#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2)) +.macro preload_middle bpp, base, scratch_holds_offset + .if bpp > 0 + /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */ + .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp) + .if scratch_holds_offset + PF pld, [base, SCRATCH] + .else + PF bic, SCRATCH, base, #31 + PF pld, [SCRATCH, #32*prefetch_distance] + .endif + .endif + .endif +.endm + +.macro preload_trailing bpp, bpp_shift, base + .if bpp > 0 + .if bpp*pix_per_block > 256 + /* Calculations are more complex if more than one fetch per block */ + PF and, WK1, base, #31 + PF add, WK1, WK1, WK0, lsl #bpp_shift + PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1) + PF bic, SCRATCH, base, #31 +80: PF pld, [SCRATCH, #32*(prefetch_distance+1)] + PF add, SCRATCH, SCRATCH, #32 + PF subs, WK1, WK1, #32 + PF bhi, 80b + .else + /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */ + PF mov, SCRATCH, base, lsl #32-5 + PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift + PF adceqs, SCRATCH, SCRATCH, #0 + /* The instruction above has two effects: ensures Z is only + * set if C was clear (so Z indicates that both shifted quantities + * were 0), and clears C if Z was set (so C indicates that the sum + * of the shifted quantities was greater and not equal to 32) */ + PF beq, 82f + PF bic, SCRATCH, base, #31 + PF bcc, 81f + PF pld, [SCRATCH, #32*(prefetch_distance+2)] +81: PF pld, [SCRATCH, #32*(prefetch_distance+1)] +82: + .endif + .endif +.endm + + +.macro preload_line narrow_case, bpp, bpp_shift, base +/* "narrow_case" - just means that the macro was invoked from the "narrow" + * code path rather than the "medium" one - because in the narrow case, + * the row of pixels is known to output no more than 30 bytes, then + * (assuming the source pixels are no wider than the the destination + * pixels) they cannot possibly straddle more than 2 32-byte cachelines, + * meaning there's no need for a loop. + * "bpp" - number of bits per pixel in the channel (source, mask or + * destination) that's being preloaded, or 0 if this channel is not used + * for reading + * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course) + * "base" - base address register of channel to preload (SRC, MASK or DST) + */ + .if bpp > 0 + .if narrow_case && (bpp <= dst_w_bpp) + /* In these cases, each line for each channel is in either 1 or 2 cache lines */ + PF bic, WK0, base, #31 + PF pld, [WK0] + PF add, WK1, base, X, LSL #bpp_shift + PF sub, WK1, WK1, #1 + PF bic, WK1, WK1, #31 + PF cmp, WK1, WK0 + PF beq, 90f + PF pld, [WK1] +90: + .else + PF bic, WK0, base, #31 + PF pld, [WK0] + PF add, WK1, base, X, lsl #bpp_shift + PF sub, WK1, WK1, #1 + PF bic, WK1, WK1, #31 + PF cmp, WK1, WK0 + PF beq, 92f +91: PF add, WK0, WK0, #32 + PF cmp, WK0, WK1 + PF pld, [WK0] + PF bne, 91b +92: + .endif + .endif +.endm + + +.macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx + process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0 + .if decrementx + sub&cond X, X, #8*numbytes/dst_w_bpp + .endif + process_tail cond, numbytes, firstreg + .if !((flags) & FLAG_PROCESS_DOES_STORE) + pixst cond, numbytes, firstreg, DST + .endif +.endm + +.macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx + .if (flags) & FLAG_BRANCH_OVER + .ifc cond,mi + bpl 100f + .endif + .ifc cond,cs + bcc 100f + .endif + .ifc cond,ne + beq 100f + .endif + conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx +100: + .else + conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx + .endif +.endm + +.macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx + .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE) + /* Can't interleave reads and writes */ + test + conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx + .if (flags) & FLAG_PROCESS_CORRUPTS_PSR + test + .endif + conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx + .else + /* Can interleave reads and writes for better scheduling */ + test + process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0 + process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0 + .if decrementx + sub&cond1 X, X, #8*numbytes1/dst_w_bpp + sub&cond2 X, X, #8*numbytes2/dst_w_bpp + .endif + process_tail cond1, numbytes1, firstreg1 + process_tail cond2, numbytes2, firstreg2 + pixst cond1, numbytes1, firstreg1, DST + pixst cond2, numbytes2, firstreg2, DST + .endif +.endm + + +.macro test_bits_1_0_ptr + movs SCRATCH, WK0, lsl #32-1 /* C,N = bits 1,0 of DST */ +.endm + +.macro test_bits_3_2_ptr + movs SCRATCH, WK0, lsl #32-3 /* C,N = bits 3, 2 of DST */ +.endm + +.macro leading_15bytes process_head, process_tail + /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */ + /* Use unaligned loads in all cases for simplicity */ + .if dst_w_bpp == 8 + conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, 1 + .elseif dst_w_bpp == 16 + test_bits_1_0_ptr + conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, 1 + .endif + conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, 1 +.endm + +.macro test_bits_3_2_pix + movs SCRATCH, X, lsl #dst_bpp_shift+32-3 +.endm + +.macro test_bits_1_0_pix + .if dst_w_bpp == 8 + movs SCRATCH, X, lsl #dst_bpp_shift+32-1 + .else + movs SCRATCH, X, lsr #1 + .endif +.endm + +.macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask + conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0 + .if dst_w_bpp == 16 + test_bits_1_0_pix + conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0 + .elseif dst_w_bpp == 8 + conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0 + .endif +.endm + + +.macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment +110: + .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */ + .rept pix_per_block*dst_w_bpp/128 + process_head , 16, 0, unaligned_src, unaligned_mask, 1 + .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) + preload_middle src_bpp, SRC, 1 + .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) + preload_middle mask_bpp, MASK, 1 + .else + preload_middle src_bpp, SRC, 0 + preload_middle mask_bpp, MASK, 0 + .endif + .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) + /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that + * destination prefetches are 32-byte aligned. It's also the easiest channel to offset + * preloads for, to achieve staggered prefetches for multiple channels, because there are + * always two STMs per prefetch, so there is always an opposite STM on which to put the + * preload. Note, no need to BIC the base register here */ + PF pld, [DST, #32*prefetch_distance - dst_alignment] + .endif + process_tail , 16, 0 + .if !((flags) & FLAG_PROCESS_DOES_STORE) + pixst , 16, 0, DST + .endif + .set SUBBLOCK, SUBBLOCK+1 + .endr + subs X, X, #pix_per_block + bhs 110b +.endm + +.macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask + /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */ + .if dst_r_bpp > 0 + tst DST, #16 + bne 111f + process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16 + b 112f +111: + .endif + process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0 +112: + /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */ + .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256) + PF and, WK0, X, #pix_per_block-1 + .endif + preload_trailing src_bpp, src_bpp_shift, SRC + preload_trailing mask_bpp, mask_bpp_shift, MASK + preload_trailing dst_r_bpp, dst_bpp_shift, DST + add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp + /* The remainder of the line is handled identically to the medium case */ + medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask +.endm + +.macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask +120: + process_head , 16, 0, unaligned_src, unaligned_mask, 0 + process_tail , 16, 0 + .if !((flags) & FLAG_PROCESS_DOES_STORE) + pixst , 16, 0, DST + .endif + subs X, X, #128/dst_w_bpp + bhs 120b + /* Trailing pixels */ + tst X, #128/dst_w_bpp - 1 + beq exit_label + trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask +.endm + +.macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask + tst X, #16*8/dst_w_bpp + conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0 + /* Trailing pixels */ + /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */ + trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask +.endm + +.macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label + /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */ + .if mask_bpp == 8 || mask_bpp == 16 + tst MASK, #3 + bne 141f + .endif + .if src_bpp == 8 || src_bpp == 16 + tst SRC, #3 + bne 140f + .endif + action process_head, process_tail, process_inner_loop, exit_label, 0, 0 + .if src_bpp == 8 || src_bpp == 16 + b exit_label +140: + action process_head, process_tail, process_inner_loop, exit_label, 1, 0 + .endif + .if mask_bpp == 8 || mask_bpp == 16 + b exit_label +141: + .if src_bpp == 8 || src_bpp == 16 + tst SRC, #3 + bne 142f + .endif + action process_head, process_tail, process_inner_loop, exit_label, 0, 1 + .if src_bpp == 8 || src_bpp == 16 + b exit_label +142: + action process_head, process_tail, process_inner_loop, exit_label, 1, 1 + .endif + .endif +.endm + + +.macro end_of_line restore_x, vars_spilled, loop_label, last_one + .if vars_spilled + /* Sadly, GAS doesn't seem have an equivalent of the DCI directive? */ + /* This is ldmia sp,{} */ + .word 0xE89D0000 | LINE_SAVED_REGS + .endif + subs Y, Y, #1 + .if vars_spilled + .if (LINE_SAVED_REGS) & (1<<1) + str Y, [sp] + .endif + .endif + add DST, DST, STRIDE_D + .if src_bpp > 0 + add SRC, SRC, STRIDE_S + .endif + .if mask_bpp > 0 + add MASK, MASK, STRIDE_M + .endif + .if restore_x + mov X, ORIG_W + .endif + bhs loop_label + .ifc "last_one","" + .if vars_spilled + b 197f + .else + b 198f + .endif + .else + .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS) + b 198f + .endif + .endif +.endm + + +.macro generate_composite_function fname, \ + src_bpp_, \ + mask_bpp_, \ + dst_w_bpp_, \ + flags_, \ + prefetch_distance_, \ + init, \ + newline, \ + cleanup, \ + process_head, \ + process_tail, \ + process_inner_loop + + .func fname + .global fname + /* For ELF format also set function visibility to hidden */ +#ifdef __ELF__ + .hidden fname + .type fname, %function +#endif + +/* + * Make some macro arguments globally visible and accessible + * from other macros + */ + .set src_bpp, src_bpp_ + .set mask_bpp, mask_bpp_ + .set dst_w_bpp, dst_w_bpp_ + .set flags, flags_ + .set prefetch_distance, prefetch_distance_ + +/* + * Select prefetch type for this function. + */ + .if prefetch_distance == 0 + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE + .else + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD + .endif + + .if src_bpp == 32 + .set src_bpp_shift, 2 + .elseif src_bpp == 24 + .set src_bpp_shift, 0 + .elseif src_bpp == 16 + .set src_bpp_shift, 1 + .elseif src_bpp == 8 + .set src_bpp_shift, 0 + .elseif src_bpp == 0 + .set src_bpp_shift, -1 + .else + .error "requested src bpp (src_bpp) is not supported" + .endif + + .if mask_bpp == 32 + .set mask_bpp_shift, 2 + .elseif mask_bpp == 24 + .set mask_bpp_shift, 0 + .elseif mask_bpp == 8 + .set mask_bpp_shift, 0 + .elseif mask_bpp == 0 + .set mask_bpp_shift, -1 + .else + .error "requested mask bpp (mask_bpp) is not supported" + .endif + + .if dst_w_bpp == 32 + .set dst_bpp_shift, 2 + .elseif dst_w_bpp == 24 + .set dst_bpp_shift, 0 + .elseif dst_w_bpp == 16 + .set dst_bpp_shift, 1 + .elseif dst_w_bpp == 8 + .set dst_bpp_shift, 0 + .else + .error "requested dst bpp (dst_w_bpp) is not supported" + .endif + + .if (((flags) & FLAG_DST_READWRITE) != 0) + .set dst_r_bpp, dst_w_bpp + .else + .set dst_r_bpp, 0 + .endif + + .set pix_per_block, 16*8/dst_w_bpp + .if src_bpp != 0 + .if 32*8/src_bpp > pix_per_block + .set pix_per_block, 32*8/src_bpp + .endif + .endif + .if mask_bpp != 0 + .if 32*8/mask_bpp > pix_per_block + .set pix_per_block, 32*8/mask_bpp + .endif + .endif + .if dst_r_bpp != 0 + .if 32*8/dst_r_bpp > pix_per_block + .set pix_per_block, 32*8/dst_r_bpp + .endif + .endif + +/* The standard entry conditions set up by pixman-arm-common.h are: + * r0 = width (pixels) + * r1 = height (rows) + * r2 = pointer to top-left pixel of destination + * r3 = destination stride (pixels) + * [sp] = source pixel value, or pointer to top-left pixel of source + * [sp,#4] = 0 or source stride (pixels) + * The following arguments are unused for non-mask operations + * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask + * [sp,#12] = 0 or mask stride (pixels) + */ + +/* + * Assign symbolic names to registers + */ + X .req r0 /* pixels to go on this line */ + Y .req r1 /* lines to go */ + DST .req r2 /* destination pixel pointer */ + STRIDE_D .req r3 /* destination stride (bytes, minus width) */ + SRC .req r4 /* source pixel pointer */ + STRIDE_S .req r5 /* source stride (bytes, minus width) */ + MASK .req r6 /* mask pixel pointer (if applicable) */ + STRIDE_M .req r7 /* mask stride (bytes, minus width) */ + WK0 .req r8 /* pixel data registers */ + WK1 .req r9 + WK2 .req r10 + WK3 .req r11 + SCRATCH .req r12 + ORIG_W .req r14 /* width (pixels) */ + +fname: + push {r4-r11, lr} /* save all registers */ + + subs Y, Y, #1 + blo 199f + +#ifdef DEBUG_PARAMS + sub sp, sp, #9*4 +#endif + + .if src_bpp > 0 + ldr SRC, [sp, #ARGS_STACK_OFFSET] + ldr STRIDE_S, [sp, #ARGS_STACK_OFFSET+4] + .endif + .if mask_bpp > 0 + ldr MASK, [sp, #ARGS_STACK_OFFSET+8] + ldr STRIDE_M, [sp, #ARGS_STACK_OFFSET+12] + .endif + +#ifdef DEBUG_PARAMS + add Y, Y, #1 + stmia sp, {r0-r7,pc} + sub Y, Y, #1 +#endif + + init + + lsl STRIDE_D, #dst_bpp_shift /* stride in bytes */ + sub STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift + .if src_bpp > 0 + lsl STRIDE_S, #src_bpp_shift + sub STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift + .endif + .if mask_bpp > 0 + lsl STRIDE_M, #mask_bpp_shift + sub STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift + .endif + + /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */ + cmp X, #2*16*8/dst_w_bpp - 1 + blo 170f + .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */ + /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */ + cmp X, #(prefetch_distance+3)*pix_per_block - 1 + blo 160f + + /* Wide case */ + /* Adjust X so that the decrement instruction can also test for + * inner loop termination. We want it to stop when there are + * (prefetch_distance+1) complete blocks to go. */ + sub X, X, #(prefetch_distance+2)*pix_per_block + mov ORIG_W, X + .if (flags) & FLAG_SPILL_LINE_VARS_WIDE + /* This is stmdb sp!,{} */ + .word 0xE92D0000 | LINE_SAVED_REGS + .endif +151: /* New line */ + newline + preload_leading_step1 src_bpp, WK1, SRC + preload_leading_step1 mask_bpp, WK2, MASK + preload_leading_step1 dst_r_bpp, WK3, DST + + tst DST, #15 + beq 154f + rsb WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */ + .if (src_bpp != 0 && src_bpp != 2*dst_w_bpp) || (mask_bpp != 0 && mask_bpp != 2*dst_w_bpp) + PF and, WK0, WK0, #15 + .endif + + preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC + preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK + preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST + + leading_15bytes process_head, process_tail + +154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */ + .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) + and SCRATCH, SRC, #31 + rsb SCRATCH, SCRATCH, #32*prefetch_distance + .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) + and SCRATCH, MASK, #31 + rsb SCRATCH, SCRATCH, #32*prefetch_distance + .endif + .ifc "process_inner_loop","" + switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f + .else + switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f + .endif + +157: /* Check for another line */ + end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b + .endif + + .ltorg + +160: /* Medium case */ + mov ORIG_W, X + .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE + /* This is stmdb sp!,{} */ + .word 0xE92D0000 | LINE_SAVED_REGS + .endif +161: /* New line */ + newline + preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ + preload_line 0, mask_bpp, mask_bpp_shift, MASK + preload_line 0, dst_r_bpp, dst_bpp_shift, DST + + sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */ + tst DST, #15 + beq 164f + rsb WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */ + + leading_15bytes process_head, process_tail + +164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */ + switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f + +167: /* Check for another line */ + end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b + + .ltorg + +170: /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */ + .if dst_w_bpp < 32 + mov ORIG_W, X + .endif + .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE + /* This is stmdb sp!,{} */ + .word 0xE92D0000 | LINE_SAVED_REGS + .endif +171: /* New line */ + newline + preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ + preload_line 1, mask_bpp, mask_bpp_shift, MASK + preload_line 1, dst_r_bpp, dst_bpp_shift, DST + + .if dst_w_bpp == 8 + tst DST, #3 + beq 174f +172: subs X, X, #1 + blo 177f + process_head , 1, 0, 1, 1, 0 + process_tail , 1, 0 + .if !((flags) & FLAG_PROCESS_DOES_STORE) + pixst , 1, 0, DST + .endif + tst DST, #3 + bne 172b + .elseif dst_w_bpp == 16 + tst DST, #2 + beq 174f + subs X, X, #1 + blo 177f + process_head , 2, 0, 1, 1, 0 + process_tail , 2, 0 + .if !((flags) & FLAG_PROCESS_DOES_STORE) + pixst , 2, 0, DST + .endif + .endif + +174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */ + switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f + +177: /* Check for another line */ + end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one + +197: + .if (flags) & FLAG_SPILL_LINE_VARS + add sp, sp, #LINE_SAVED_REG_COUNT*4 + .endif +198: + cleanup + +#ifdef DEBUG_PARAMS + add sp, sp, #9*4 /* junk the debug copy of arguments */ +#endif +199: + pop {r4-r11, pc} /* exit */ + + .ltorg + + .unreq X + .unreq Y + .unreq DST + .unreq STRIDE_D + .unreq SRC + .unreq STRIDE_S + .unreq MASK + .unreq STRIDE_M + .unreq WK0 + .unreq WK1 + .unreq WK2 + .unreq WK3 + .unreq SCRATCH + .unreq ORIG_W + .endfunc +.endm + +.macro line_saved_regs x:vararg + .set LINE_SAVED_REGS, 0 + .set LINE_SAVED_REG_COUNT, 0 + .irp SAVED_REG,x + .ifc "SAVED_REG","Y" + .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1) + .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 + .endif + .ifc "SAVED_REG","STRIDE_D" + .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3) + .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 + .endif + .ifc "SAVED_REG","STRIDE_S" + .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5) + .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 + .endif + .ifc "SAVED_REG","STRIDE_M" + .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7) + .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 + .endif + .ifc "SAVED_REG","ORIG_W" + .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14) + .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 + .endif + .endr +.endm + +.macro nop_macro x:vararg +.endm diff --git a/pixman/pixman/pixman-arm-simd.c b/pixman/pixman/pixman-arm-simd.c index 94f9a0caf..af062e19d 100644 --- a/pixman/pixman/pixman-arm-simd.c +++ b/pixman/pixman/pixman-arm-simd.c @@ -31,369 +31,191 @@ #include "pixman-arm-common.h" #include "pixman-inlines.h" -#if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */ - -void -pixman_composite_add_8_8_asm_armv6 (int32_t width, - int32_t height, - uint8_t *dst_line, - int32_t dst_stride, - uint8_t *src_line, - int32_t src_stride) -{ - uint8_t *dst, *src; - int32_t w; - uint8_t s, d; - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - - /* ensure both src and dst are properly aligned before doing 32 bit reads - * we'll stay in this loop if src and dst have differing alignments - */ - while (w && (((uintptr_t)dst & 3) || ((uintptr_t)src & 3))) - { - s = *src; - d = *dst; - asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s)); - *dst = d; - - dst++; - src++; - w--; - } +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8888_8888, + uint32_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_x888_8888, + uint32_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_0565_0565, + uint16_t, 1, uint16_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8_8, + uint8_t, 1, uint8_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_0565_8888, + uint16_t, 1, uint32_t, 1) - while (w >= 4) - { - asm ("uqadd8 %0, %1, %2" - : "=r" (*(uint32_t*)dst) - : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst)); - dst += 4; - src += 4; - w -= 4; - } +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8, + uint8_t, 1, uint8_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888, + uint32_t, 1, uint32_t, 1) - while (w) - { - s = *src; - d = *dst; - asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s)); - *dst = d; +PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888, + uint32_t, 1, uint32_t, 1) - dst++; - src++; - w--; - } - } +PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888, + uint8_t, 1, uint32_t, 1) -} +PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC, + uint16_t, uint16_t) +PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC, + uint32_t, uint32_t) void -pixman_composite_over_8888_8888_asm_armv6 (int32_t width, - int32_t height, - uint32_t *dst_line, - int32_t dst_stride, - uint32_t *src_line, - int32_t src_stride) -{ - uint32_t *dst; - uint32_t *src; - int32_t w; - uint32_t component_half = 0x800080; - uint32_t upper_component_mask = 0xff00ff00; - uint32_t alpha_mask = 0xff; - - while (height--) - { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; +pixman_composite_src_n_8888_asm_armv6 (int32_t w, + int32_t h, + uint32_t *dst, + int32_t dst_stride, + uint32_t src); -/* #define inner_branch */ - asm volatile ( - "cmp %[w], #0\n\t" - "beq 2f\n\t" - "1:\n\t" - /* load src */ - "ldr r5, [%[src]], #4\n\t" -#ifdef inner_branch - /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. - * The 0x0 case also allows us to avoid doing an unecessary data - * write which is more valuable so we only check for that - */ - "cmp r5, #0\n\t" - "beq 3f\n\t" - - /* = 255 - alpha */ - "sub r8, %[alpha_mask], r5, lsr #24\n\t" - - "ldr r4, [%[dest]] \n\t" - -#else - "ldr r4, [%[dest]] \n\t" - - /* = 255 - alpha */ - "sub r8, %[alpha_mask], r5, lsr #24\n\t" -#endif - "uxtb16 r6, r4\n\t" - "uxtb16 r7, r4, ror #8\n\t" - - /* multiply by 257 and divide by 65536 */ - "mla r6, r6, r8, %[component_half]\n\t" - "mla r7, r7, r8, %[component_half]\n\t" - - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" - - /* recombine the 0xff00ff00 bytes of r6 and r7 */ - "and r7, r7, %[upper_component_mask]\n\t" - "uxtab16 r6, r7, r6, ror #8\n\t" - - "uqadd8 r5, r6, r5\n\t" - -#ifdef inner_branch - "3:\n\t" - -#endif - "str r5, [%[dest]], #4\n\t" - /* increment counter and jmp to top */ - "subs %[w], %[w], #1\n\t" - "bne 1b\n\t" - "2:\n\t" - : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src) - : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask), - [alpha_mask] "r" (alpha_mask) - : "r4", "r5", "r6", "r7", "r8", "cc", "memory" - ); - } -} +void +pixman_composite_src_n_0565_asm_armv6 (int32_t w, + int32_t h, + uint16_t *dst, + int32_t dst_stride, + uint16_t src); void -pixman_composite_over_8888_n_8888_asm_armv6 (int32_t width, - int32_t height, - uint32_t *dst_line, - int32_t dst_stride, - uint32_t *src_line, - int32_t src_stride, - uint32_t mask) +pixman_composite_src_n_8_asm_armv6 (int32_t w, + int32_t h, + uint8_t *dst, + int32_t dst_stride, + uint8_t src); + +static pixman_bool_t +arm_simd_fill (pixman_implementation_t *imp, + uint32_t * bits, + int stride, /* in 32-bit words */ + int bpp, + int x, + int y, + int width, + int height, + uint32_t _xor) { - uint32_t *dst; - uint32_t *src; - int32_t w; - uint32_t component_half = 0x800080; - uint32_t alpha_mask = 0xff; - - mask = (mask) >> 24; + /* stride is always multiple of 32bit units in pixman */ + uint32_t byte_stride = stride * sizeof(uint32_t); - while (height--) + switch (bpp) { - dst = dst_line; - dst_line += dst_stride; - src = src_line; - src_line += src_stride; - w = width; - -/* #define inner_branch */ - asm volatile ( - "cmp %[w], #0\n\t" - "beq 2f\n\t" - "1:\n\t" - /* load src */ - "ldr r5, [%[src]], #4\n\t" -#ifdef inner_branch - /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. - * The 0x0 case also allows us to avoid doing an unecessary data - * write which is more valuable so we only check for that - */ - "cmp r5, #0\n\t" - "beq 3f\n\t" - -#endif - "ldr r4, [%[dest]] \n\t" - - "uxtb16 r6, r5\n\t" - "uxtb16 r7, r5, ror #8\n\t" - - /* multiply by alpha (r8) then by 257 and divide by 65536 */ - "mla r6, r6, %[mask_alpha], %[component_half]\n\t" - "mla r7, r7, %[mask_alpha], %[component_half]\n\t" - - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" - - "uxtb16 r6, r6, ror #8\n\t" - "uxtb16 r7, r7, ror #8\n\t" - - /* recombine */ - "orr r5, r6, r7, lsl #8\n\t" - - "uxtb16 r6, r4\n\t" - "uxtb16 r7, r4, ror #8\n\t" - - /* 255 - alpha */ - "sub r8, %[alpha_mask], r5, lsr #24\n\t" - - /* multiply by alpha (r8) then by 257 and divide by 65536 */ - "mla r6, r6, r8, %[component_half]\n\t" - "mla r7, r7, r8, %[component_half]\n\t" - - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" - - "uxtb16 r6, r6, ror #8\n\t" - "uxtb16 r7, r7, ror #8\n\t" - - /* recombine */ - "orr r6, r6, r7, lsl #8\n\t" - - "uqadd8 r5, r6, r5\n\t" - -#ifdef inner_branch - "3:\n\t" - -#endif - "str r5, [%[dest]], #4\n\t" - /* increment counter and jmp to top */ - "subs %[w], %[w], #1\n\t" - "bne 1b\n\t" - "2:\n\t" - : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src) - : [component_half] "r" (component_half), [mask_alpha] "r" (mask), - [alpha_mask] "r" (alpha_mask) - : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory" - ); + case 8: + pixman_composite_src_n_8_asm_armv6 ( + width, + height, + (uint8_t *)(((char *) bits) + y * byte_stride + x), + byte_stride, + _xor & 0xff); + return TRUE; + case 16: + pixman_composite_src_n_0565_asm_armv6 ( + width, + height, + (uint16_t *)(((char *) bits) + y * byte_stride + x * 2), + byte_stride / 2, + _xor & 0xffff); + return TRUE; + case 32: + pixman_composite_src_n_8888_asm_armv6 ( + width, + height, + (uint32_t *)(((char *) bits) + y * byte_stride + x * 4), + byte_stride / 4, + _xor); + return TRUE; + default: + return FALSE; } } -void -pixman_composite_over_n_8_8888_asm_armv6 (int32_t width, - int32_t height, - uint32_t *dst_line, - int32_t dst_stride, - uint32_t src, - int32_t unused, - uint8_t *mask_line, - int32_t mask_stride) +static pixman_bool_t +arm_simd_blt (pixman_implementation_t *imp, + uint32_t * src_bits, + uint32_t * dst_bits, + int src_stride, /* in 32-bit words */ + int dst_stride, /* in 32-bit words */ + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dest_x, + int dest_y, + int width, + int height) { - uint32_t srca; - uint32_t *dst; - uint8_t *mask; - int32_t w; - - srca = src >> 24; - - uint32_t component_mask = 0xff00ff; - uint32_t component_half = 0x800080; - - uint32_t src_hi = (src >> 8) & component_mask; - uint32_t src_lo = src & component_mask; + if (src_bpp != dst_bpp) + return FALSE; - while (height--) + switch (src_bpp) { - dst = dst_line; - dst_line += dst_stride; - mask = mask_line; - mask_line += mask_stride; - w = width; - -/* #define inner_branch */ - asm volatile ( - "cmp %[w], #0\n\t" - "beq 2f\n\t" - "1:\n\t" - /* load mask */ - "ldrb r5, [%[mask]], #1\n\t" -#ifdef inner_branch - /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. - * The 0x0 case also allows us to avoid doing an unecessary data - * write which is more valuable so we only check for that - */ - "cmp r5, #0\n\t" - "beq 3f\n\t" - -#endif - "ldr r4, [%[dest]] \n\t" - - /* multiply by alpha (r8) then by 257 and divide by 65536 */ - "mla r6, %[src_lo], r5, %[component_half]\n\t" - "mla r7, %[src_hi], r5, %[component_half]\n\t" - - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" - - "uxtb16 r6, r6, ror #8\n\t" - "uxtb16 r7, r7, ror #8\n\t" - - /* recombine */ - "orr r5, r6, r7, lsl #8\n\t" - - "uxtb16 r6, r4\n\t" - "uxtb16 r7, r4, ror #8\n\t" - - /* we could simplify this to use 'sub' if we were - * willing to give up a register for alpha_mask - */ - "mvn r8, r5\n\t" - "mov r8, r8, lsr #24\n\t" - - /* multiply by alpha (r8) then by 257 and divide by 65536 */ - "mla r6, r6, r8, %[component_half]\n\t" - "mla r7, r7, r8, %[component_half]\n\t" - - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" - - "uxtb16 r6, r6, ror #8\n\t" - "uxtb16 r7, r7, ror #8\n\t" - - /* recombine */ - "orr r6, r6, r7, lsl #8\n\t" - - "uqadd8 r5, r6, r5\n\t" - -#ifdef inner_branch - "3:\n\t" - -#endif - "str r5, [%[dest]], #4\n\t" - /* increment counter and jmp to top */ - "subs %[w], %[w], #1\n\t" - "bne 1b\n\t" - "2:\n\t" - : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask) - : [component_half] "r" (component_half), - [src_hi] "r" (src_hi), [src_lo] "r" (src_lo) - : "r4", "r5", "r6", "r7", "r8", "cc", "memory"); + case 8: + pixman_composite_src_8_8_asm_armv6 ( + width, height, + (uint8_t *)(((char *) dst_bits) + + dest_y * dst_stride * 4 + dest_x * 1), dst_stride * 4, + (uint8_t *)(((char *) src_bits) + + src_y * src_stride * 4 + src_x * 1), src_stride * 4); + return TRUE; + case 16: + pixman_composite_src_0565_0565_asm_armv6 ( + width, height, + (uint16_t *)(((char *) dst_bits) + + dest_y * dst_stride * 4 + dest_x * 2), dst_stride * 2, + (uint16_t *)(((char *) src_bits) + + src_y * src_stride * 4 + src_x * 2), src_stride * 2); + return TRUE; + case 32: + pixman_composite_src_8888_8888_asm_armv6 ( + width, height, + (uint32_t *)(((char *) dst_bits) + + dest_y * dst_stride * 4 + dest_x * 4), dst_stride, + (uint32_t *)(((char *) src_bits) + + src_y * src_stride * 4 + src_x * 4), src_stride); + return TRUE; + default: + return FALSE; } } -#endif - -PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8, - uint8_t, 1, uint8_t, 1) -PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888, - uint32_t, 1, uint32_t, 1) - -PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888, - uint32_t, 1, uint32_t, 1) - -PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888, - uint8_t, 1, uint32_t, 1) - -PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC, - uint16_t, uint16_t) -PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC, - uint32_t, uint32_t) - static const pixman_fast_path_t arm_simd_fast_paths[] = { + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, armv6_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, armv6_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, armv6_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, armv6_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, armv6_composite_src_8888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, armv6_composite_src_8888_8888), + + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, armv6_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, armv6_composite_src_x888_8888), + + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, armv6_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, armv6_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, a1r5g5b5, armv6_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, a1b5g5r5, null, a1b5g5r5, armv6_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, armv6_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, a1b5g5r5, null, x1b5g5r5, armv6_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, armv6_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, x1b5g5r5, null, x1b5g5r5, armv6_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, a4r4g4b4, null, a4r4g4b4, armv6_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, a4b4g4r4, null, a4b4g4r4, armv6_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, a4r4g4b4, null, x4r4g4b4, armv6_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, a4b4g4r4, null, x4b4g4r4, armv6_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, x4r4g4b4, null, x4r4g4b4, armv6_composite_src_0565_0565), + PIXMAN_STD_FAST_PATH (SRC, x4b4g4r4, null, x4b4g4r4, armv6_composite_src_0565_0565), + + PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, armv6_composite_src_8_8), + PIXMAN_STD_FAST_PATH (SRC, r3g3b2, null, r3g3b2, armv6_composite_src_8_8), + PIXMAN_STD_FAST_PATH (SRC, b2g3r3, null, b2g3r3, armv6_composite_src_8_8), + PIXMAN_STD_FAST_PATH (SRC, a2r2g2b2, null, a2r2g2b2, armv6_composite_src_8_8), + PIXMAN_STD_FAST_PATH (SRC, a2b2g2r2, null, a2b2g2r2, armv6_composite_src_8_8), + PIXMAN_STD_FAST_PATH (SRC, c8, null, c8, armv6_composite_src_8_8), + PIXMAN_STD_FAST_PATH (SRC, g8, null, g8, armv6_composite_src_8_8), + PIXMAN_STD_FAST_PATH (SRC, x4a4, null, x4a4, armv6_composite_src_8_8), + PIXMAN_STD_FAST_PATH (SRC, x4c4, null, x4c4, armv6_composite_src_8_8), + PIXMAN_STD_FAST_PATH (SRC, x4g4, null, x4g4, armv6_composite_src_8_8), + + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, a8r8g8b8, armv6_composite_src_0565_8888), + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, x8r8g8b8, armv6_composite_src_0565_8888), + PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, a8b8g8r8, armv6_composite_src_0565_8888), + PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, x8b8g8r8, armv6_composite_src_0565_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888), PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888), PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888), @@ -428,5 +250,8 @@ _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback) { pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths); + imp->blt = arm_simd_blt; + imp->fill = arm_simd_fill; + return imp; } diff --git a/pixman/pixman/pixman-combine-float.c b/pixman/pixman/pixman-combine-float.c index c916df8a3..06ce2037e 100644 --- a/pixman/pixman/pixman-combine-float.c +++ b/pixman/pixman/pixman-combine-float.c @@ -653,10 +653,12 @@ clip_color (rgb_t *color, float a) float l = get_lum (color); float n = channel_min (color); float x = channel_max (color); + float t; if (n < 0.0f) { - if ((l - n) < 4 * FLT_EPSILON) + t = l - n; + if (IS_ZERO (t)) { color->r = 0.0f; color->g = 0.0f; @@ -664,14 +666,15 @@ clip_color (rgb_t *color, float a) } else { - color->r = l + (((color->r - l) * l) / (l - n)); - color->g = l + (((color->g - l) * l) / (l - n)); - color->b = l + (((color->b - l) * l) / (l - n)); + color->r = l + (((color->r - l) * l) / t); + color->g = l + (((color->g - l) * l) / t); + color->b = l + (((color->b - l) * l) / t); } } if (x > a) { - if ((x - l) < 4 * FLT_EPSILON) + t = x - l; + if (IS_ZERO (t)) { color->r = a; color->g = a; @@ -679,9 +682,9 @@ clip_color (rgb_t *color, float a) } else { - color->r = l + (((color->r - l) * (a - l) / (x - l))); - color->g = l + (((color->g - l) * (a - l) / (x - l))); - color->b = l + (((color->b - l) * (a - l) / (x - l))); + color->r = l + (((color->r - l) * (a - l) / t)); + color->g = l + (((color->g - l) * (a - l) / t)); + color->b = l + (((color->b - l) * (a - l) / t)); } } } @@ -702,6 +705,7 @@ static void set_sat (rgb_t *src, float sat) { float *max, *mid, *min; + float t; if (src->r > src->g) { @@ -752,14 +756,16 @@ set_sat (rgb_t *src, float sat) } } - if (*max > *min) + t = *max - *min; + + if (IS_ZERO (t)) { - *mid = (((*mid - *min) * sat) / (*max - *min)); - *max = sat; + *mid = *max = 0.0f; } else { - *mid = *max = 0.0f; + *mid = ((*mid - *min) * sat) / t; + *max = sat; } *min = 0.0f; diff --git a/pixman/test/lowlevel-blt-bench.c b/pixman/test/lowlevel-blt-bench.c index 7336fa0d5..8e80b4280 100644 --- a/pixman/test/lowlevel-blt-bench.c +++ b/pixman/test/lowlevel-blt-bench.c @@ -33,6 +33,14 @@ #define L1CACHE_SIZE (8 * 1024) #define L2CACHE_SIZE (128 * 1024) +/* This is applied to both L1 and L2 tests - alternatively, you could + * parameterise bench_L or split it into two functions. It could be + * read at runtime on some architectures, but it only really matters + * that it's a number that's an integer divisor of both cacheline + * lengths, and further, it only really matters for caches that don't + * do allocate0on-write. */ +#define CACHELINE_LENGTH (32) /* bytes */ + #define WIDTH 1920 #define HEIGHT 1080 #define BUFSIZE (WIDTH * HEIGHT * 4) @@ -168,18 +176,29 @@ bench_L (pixman_op_t op, int width, int lines_count) { - int64_t i, j; + int64_t i, j, k; int x = 0; int q = 0; volatile int qx; for (i = 0; i < n; i++) { - /* touch destination buffer to fetch it into L1 cache */ - for (j = 0; j < width + 64; j += 16) { - q += dst[j]; - q += src[j]; - } + /* For caches without allocate-on-write, we need to force the + * destination buffer back into the cache on each iteration, + * otherwise if they are evicted during the test, they remain + * uncached. This doesn't matter for tests which read the + * destination buffer, or for caches that do allocate-on-write, + * but in those cases this loop just adds constant time, which + * should be successfully cancelled out. + */ + for (j = 0; j < lines_count; j++) + { + for (k = 0; k < width + 62; k += CACHELINE_LENGTH / sizeof *dst) + { + q += dst[j * WIDTH + k]; + } + q += dst[j * WIDTH + width + 62]; + } if (++x >= 64) x = 0; call_func (func, op, src_img, mask_img, dst_img, x, 0, x, 0, 63 - x, 0, width, lines_count); diff --git a/pixman/test/stress-test.c b/pixman/test/stress-test.c index 9d949afad..1f03c7543 100644 --- a/pixman/test/stress-test.c +++ b/pixman/test/stress-test.c @@ -205,13 +205,21 @@ rand_y (pixman_image_t *image) return log_rand (); } +typedef enum +{ + DONT_CARE, + PREFER_ALPHA, + REQUIRE_ALPHA +} alpha_preference_t; + static pixman_format_code_t -random_format (pixman_bool_t prefer_alpha) +random_format (alpha_preference_t alpha) { pixman_format_code_t format; int n = prng_rand_n (ARRAY_LENGTH (image_formats)); - if (prefer_alpha && prng_rand_n (4) != 0) + if (alpha >= PREFER_ALPHA && + (alpha == REQUIRE_ALPHA || prng_rand_n (4) != 0)) { do { @@ -227,7 +235,7 @@ random_format (pixman_bool_t prefer_alpha) } static pixman_image_t * -create_random_bits_image (pixman_bool_t prefer_alpha) +create_random_bits_image (alpha_preference_t alpha_preference) { pixman_format_code_t format; pixman_indexed_t *indexed; @@ -241,7 +249,7 @@ create_random_bits_image (pixman_bool_t prefer_alpha) int n_coefficients = 0; /* format */ - format = random_format (prefer_alpha); + format = random_format (alpha_preference); indexed = NULL; if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_COLOR) @@ -410,7 +418,7 @@ set_general_properties (pixman_image_t *image, pixman_bool_t allow_alpha_map) pixman_image_t *alpha_map; int16_t x, y; - alpha_map = create_random_bits_image (FALSE); + alpha_map = create_random_bits_image (DONT_CARE); if (alpha_map) { @@ -493,7 +501,7 @@ set_general_properties (pixman_image_t *image, pixman_bool_t allow_alpha_map) width = INT32_MAX - x; if (height + y < y) height = INT32_MAX - y; - + pixman_region32_union_rect ( ®ion, ®ion, x, y, width, height); } @@ -716,7 +724,7 @@ create_random_image (void) { default: case 0: - result = create_random_bits_image (FALSE); + result = create_random_bits_image (DONT_CARE); break; case 1: @@ -848,62 +856,66 @@ run_test (uint32_t seed, pixman_bool_t verbose, uint32_t mod) } source = mask = dest = NULL; - + prng_srand (seed); if (prng_rand_n (8) == 0) { int n_traps; pixman_trapezoid_t *trapezoids; + int p = prng_rand_n (3); - dest = create_random_bits_image (TRUE); + if (p == 0) + dest = create_random_bits_image (DONT_CARE); + else + dest = create_random_bits_image (REQUIRE_ALPHA); - if (dest) - { - set_general_properties (dest, TRUE); + if (!dest) + goto out; + + set_general_properties (dest, TRUE); + + if (!(trapezoids = create_random_trapezoids ( + &n_traps, dest->bits.width, dest->bits.height))) + { + goto out; + } - trapezoids = create_random_trapezoids ( - &n_traps, dest->bits.width, dest->bits.height); - - if (trapezoids) - { - switch (prng_rand_n (3)) - { - case 0: - pixman_rasterize_trapezoid ( - dest, &trapezoids[prng_rand_n (n_traps)], - rand_x (dest), rand_y (dest)); - break; - - case 1: - source = create_random_image (); - - if (source) - { - op = op_list [prng_rand_n (ARRAY_LENGTH (op_list))]; - - pixman_composite_trapezoids ( - op, source, dest, - random_format (TRUE), - rand_x (source), rand_y (source), - rand_x (dest), rand_y (dest), - n_traps, trapezoids); - } - break; - - case 2: - pixman_add_trapezoids ( - dest, rand_x (dest), rand_y (dest), n_traps, trapezoids); - break; - } - - free (trapezoids); - } + switch (p) + { + case 0: + source = create_random_image (); + + if (source) + { + op = op_list [prng_rand_n (ARRAY_LENGTH (op_list))]; + + pixman_composite_trapezoids ( + op, source, dest, + random_format (REQUIRE_ALPHA), + rand_x (source), rand_y (source), + rand_x (dest), rand_y (dest), + n_traps, trapezoids); + } + break; + + case 1: + pixman_rasterize_trapezoid ( + dest, &trapezoids[prng_rand_n (n_traps)], + rand_x (dest), rand_y (dest)); + break; + + case 2: + pixman_add_trapezoids ( + dest, rand_x (dest), rand_y (dest), n_traps, trapezoids); + break; } + + free (trapezoids); } else { - dest = create_random_bits_image (FALSE); + dest = create_random_bits_image (DONT_CARE); source = create_random_image (); mask = create_random_image (); @@ -917,16 +929,17 @@ run_test (uint32_t seed, pixman_bool_t verbose, uint32_t mod) source, mask, dest, rand_x (source), rand_y (source), rand_x (mask), rand_y (mask), - 0, 0, + 0, 0, dest->bits.width, dest->bits.height); } } - if (source) - pixman_image_unref (source); - if (mask) - pixman_image_unref (mask); +out: + if (source) + pixman_image_unref (source); + if (mask) + pixman_image_unref (mask); if (dest) pixman_image_unref (dest); } |