/*
 * Copyright © 2008 Mozilla Corporation
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Mozilla Corporation not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  Mozilla Corporation makes no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Jeff Muizelaar (jeff@infidigm.net)
 *
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include "pixman-private.h"
#include "pixman-arm-common.h"
#include "pixman-fast-path.h"

#if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */

void
pixman_composite_add_8_8_asm_armv6 (int32_t  width,
                                    int32_t  height,
                                    uint8_t *dst_line,
                                    int32_t  dst_stride,
                                    uint8_t *src_line,
                                    int32_t  src_stride)
{
    uint8_t *dst, *src;
    int32_t w;
    uint8_t s, d;

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        /* Ensure both src and dst are properly aligned before doing 32-bit
         * reads; we'll stay in this loop while src and dst have differing
         * alignments.
         */
        while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
        {
            s = *src;
            d = *dst;
            asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
            *dst = d;

            dst++;
            src++;
            w--;
        }

        while (w >= 4)
        {
            asm ("uqadd8 %0, %1, %2"
                 : "=r" (*(uint32_t*)dst)
                 : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
            dst += 4;
            src += 4;
            w -= 4;
        }

        while (w)
        {
            s = *src;
            d = *dst;
            asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
            *dst = d;

            dst++;
            src++;
            w--;
        }
    }
}
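/* For reference, a minimal plain-C sketch (not part of the original code) of
 * what each lane of the uqadd8 instructions above computes: an unsigned
 * saturating byte add that clamps at 255 instead of wrapping.  The helper
 * name is illustrative only.
 */
static inline uint8_t
saturating_add_u8 (uint8_t a, uint8_t b)
{
    uint16_t t = (uint16_t)a + (uint16_t)b; /* widen so the sum can't wrap */
    return (t > 0xff) ? 0xff : (uint8_t)t;  /* clamp to the byte maximum   */
}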
void
pixman_composite_over_8888_8888_asm_armv6 (int32_t   width,
                                           int32_t   height,
                                           uint32_t *dst_line,
                                           int32_t   dst_stride,
                                           uint32_t *src_line,
                                           int32_t   src_stride)
{
    uint32_t *dst;
    uint32_t *src;
    int32_t w;
    uint32_t component_half = 0x800080;
    uint32_t upper_component_mask = 0xff00ff00;
    uint32_t alpha_mask = 0xff;

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

/* #define inner_branch */
        asm volatile (
            "cmp %[w], #0\n\t"
            "beq 2f\n\t"
            "1:\n\t"
            /* load src */
            "ldr r5, [%[src]], #4\n\t"
#ifdef inner_branch
            /* We can avoid doing the multiplication in two cases: 0x0 or
             * 0xff.  The 0x0 case also allows us to avoid an unnecessary
             * data write, which is more valuable, so we only check for that.
             */
            "cmp r5, #0\n\t"
            "beq 3f\n\t"

            /* = 255 - alpha */
            "sub r8, %[alpha_mask], r5, lsr #24\n\t"

            "ldr r4, [%[dest]] \n\t"
#else
            "ldr r4, [%[dest]] \n\t"

            /* = 255 - alpha */
            "sub r8, %[alpha_mask], r5, lsr #24\n\t"
#endif
            "uxtb16 r6, r4\n\t"
            "uxtb16 r7, r4, ror #8\n\t"

            /* multiply by 257 and divide by 65536 */
            "mla r6, r6, r8, %[component_half]\n\t"
            "mla r7, r7, r8, %[component_half]\n\t"

            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"

            /* recombine the 0xff00ff00 bytes of r6 and r7 */
            "and r7, r7, %[upper_component_mask]\n\t"
            "uxtab16 r6, r7, r6, ror #8\n\t"

            "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
            "3:\n\t"
#endif
            "str r5, [%[dest]], #4\n\t"
            /* decrement counter and jump to top */
            "subs %[w], %[w], #1\n\t"
            "bne 1b\n\t"
            "2:\n\t"
            : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
            : [component_half] "r" (component_half),
              [upper_component_mask] "r" (upper_component_mask),
              [alpha_mask] "r" (alpha_mask)
            : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
            );
    }
}

void
pixman_composite_over_8888_n_8888_asm_armv6 (int32_t   width,
                                             int32_t   height,
                                             uint32_t *dst_line,
                                             int32_t   dst_stride,
                                             uint32_t *src_line,
                                             int32_t   src_stride,
                                             uint32_t  mask)
{
    uint32_t *dst;
    uint32_t *src;
    int32_t w;
    uint32_t component_half = 0x800080;
    uint32_t alpha_mask = 0xff;

    mask = (mask) >> 24;

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

/* #define inner_branch */
        asm volatile (
            "cmp %[w], #0\n\t"
            "beq 2f\n\t"
            "1:\n\t"
            /* load src */
            "ldr r5, [%[src]], #4\n\t"
#ifdef inner_branch
            /* We can avoid doing the multiplication in two cases: 0x0 or
             * 0xff.  The 0x0 case also allows us to avoid an unnecessary
             * data write, which is more valuable, so we only check for that.
             */
            "cmp r5, #0\n\t"
            "beq 3f\n\t"
#endif
            "ldr r4, [%[dest]] \n\t"

            "uxtb16 r6, r5\n\t"
            "uxtb16 r7, r5, ror #8\n\t"

            /* multiply by alpha (%[mask_alpha]) then by 257 and divide by 65536 */
            "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
            "mla r7, r7, %[mask_alpha], %[component_half]\n\t"

            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"

            "uxtb16 r6, r6, ror #8\n\t"
            "uxtb16 r7, r7, ror #8\n\t"

            /* recombine */
            "orr r5, r6, r7, lsl #8\n\t"

            "uxtb16 r6, r4\n\t"
            "uxtb16 r7, r4, ror #8\n\t"

            /* 255 - alpha */
            "sub r8, %[alpha_mask], r5, lsr #24\n\t"

            /* multiply by alpha (r8) then by 257 and divide by 65536 */
            "mla r6, r6, r8, %[component_half]\n\t"
            "mla r7, r7, r8, %[component_half]\n\t"

            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"

            "uxtb16 r6, r6, ror #8\n\t"
            "uxtb16 r7, r7, ror #8\n\t"

            /* recombine */
            "orr r6, r6, r7, lsl #8\n\t"

            "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
            "3:\n\t"
#endif
            "str r5, [%[dest]], #4\n\t"
            /* decrement counter and jump to top */
            "subs %[w], %[w], #1\n\t"
            "bne 1b\n\t"
            "2:\n\t"
            : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
            : [component_half] "r" (component_half),
              [mask_alpha] "r" (mask),
              [alpha_mask] "r" (alpha_mask)
            : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
            );
    }
}
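/* For reference, a minimal plain-C sketch (not part of the original code) of
 * the "multiply by 257 and divide by 65536" step that the mla/uxtab16/uxtb16
 * sequences above perform on two colour components at a time.  With
 * t = x * a + 0x80, the expression (t + (t >> 8)) >> 8 equals x * a / 255
 * rounded to nearest for 8-bit x and a.  The helper name is illustrative
 * only.
 */
static inline uint8_t
mul_div_255 (uint8_t x, uint8_t a)
{
    uint32_t t = (uint32_t)x * a + 0x80;   /* bias by half for rounding      */
    return (uint8_t)((t + (t >> 8)) >> 8); /* t * 257 / 65536, i.e. ~t / 255 */
}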
void
pixman_composite_over_n_8_8888_asm_armv6 (int32_t   width,
                                          int32_t   height,
                                          uint32_t *dst_line,
                                          int32_t   dst_stride,
                                          uint32_t  src,
                                          int32_t   unused,
                                          uint8_t  *mask_line,
                                          int32_t   mask_stride)
{
    uint32_t srca;
    uint32_t *dst;
    uint8_t *mask;
    int32_t w;

    srca = src >> 24;

    uint32_t component_mask = 0xff00ff;
    uint32_t component_half = 0x800080;

    uint32_t src_hi = (src >> 8) & component_mask;
    uint32_t src_lo = src & component_mask;

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

/* #define inner_branch */
        asm volatile (
            "cmp %[w], #0\n\t"
            "beq 2f\n\t"
            "1:\n\t"
            /* load mask */
            "ldrb r5, [%[mask]], #1\n\t"
#ifdef inner_branch
            /* We can avoid doing the multiplication in two cases: 0x0 or
             * 0xff.  The 0x0 case also allows us to avoid an unnecessary
             * data write, which is more valuable, so we only check for that.
             */
            "cmp r5, #0\n\t"
            "beq 3f\n\t"
#endif
            "ldr r4, [%[dest]] \n\t"

            /* multiply by alpha (r5) then by 257 and divide by 65536 */
            "mla r6, %[src_lo], r5, %[component_half]\n\t"
            "mla r7, %[src_hi], r5, %[component_half]\n\t"

            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"

            "uxtb16 r6, r6, ror #8\n\t"
            "uxtb16 r7, r7, ror #8\n\t"

            /* recombine */
            "orr r5, r6, r7, lsl #8\n\t"

            "uxtb16 r6, r4\n\t"
            "uxtb16 r7, r4, ror #8\n\t"

            /* we could simplify this to use 'sub' if we were
             * willing to give up a register for alpha_mask
             */
            "mvn r8, r5\n\t"
            "mov r8, r8, lsr #24\n\t"

            /* multiply by alpha (r8) then by 257 and divide by 65536 */
            "mla r6, r6, r8, %[component_half]\n\t"
            "mla r7, r7, r8, %[component_half]\n\t"

            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"

            "uxtb16 r6, r6, ror #8\n\t"
            "uxtb16 r7, r7, ror #8\n\t"

            /* recombine */
            "orr r6, r6, r7, lsl #8\n\t"

            "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
            "3:\n\t"
#endif
            "str r5, [%[dest]], #4\n\t"
            /* decrement counter and jump to top */
            "subs %[w], %[w], #1\n\t"
            "bne 1b\n\t"
            "2:\n\t"
            : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)
            : [component_half] "r" (component_half),
              [src_hi] "r" (src_hi),
              [src_lo] "r" (src_lo)
            : "r4", "r5", "r6", "r7", "r8", "cc", "memory");
    }
}

#endif

PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
                                   uint8_t, 1, uint8_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
                                   uint32_t, 1, uint32_t, 1)

PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888,
                                     uint32_t, 1, uint32_t, 1)

PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888,
                                      uint8_t, 1, uint32_t, 1)

PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
                                        uint16_t, uint16_t)
PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
                                        uint32_t, uint32_t)
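/* Roughly speaking, the PIXMAN_ARM_BIND_* macros above (from
 * pixman-arm-common.h) expand to C wrappers such as armv6_composite_add_8_8
 * that unpack pixman's composite-info arguments into the raw line-pointer
 * and stride parameters expected by the pixman_composite_*_asm_armv6
 * routines in pixman-arm-simd-asm.S.  The SKIP_ZERO_MASK / SKIP_ZERO_SRC
 * flags let a wrapper return early when the solid mask or source is fully
 * transparent.  The fast path table below maps operator/format combinations
 * onto these wrappers.
 */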
static const pixman_fast_path_t arm_simd_fast_paths[] =
{
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, armv6_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, armv6_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, armv6_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888),

    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8_8),

    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, armv6_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, armv6_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),

    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),

    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, armv6_8888_8888),
    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, armv6_8888_8888),
    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, armv6_8888_8888),
    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, armv6_8888_8888),
    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, armv6_8888_8888),
    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, armv6_8888_8888),

    { PIXMAN_OP_NONE },
};

pixman_implementation_t *
_pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp =
        _pixman_implementation_create (fallback, arm_simd_fast_paths);

    return imp;
}
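/* _pixman_implementation_create() chains this implementation onto the given
 * fallback: composite operations matching an entry in arm_simd_fast_paths
 * are handled here, and everything else is delegated down the fallback
 * chain of implementations.
 */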