1 files changed, 486 insertions, 0 deletions
diff --git a/pixman/pixman/pixman-arm-simd.c b/pixman/pixman/pixman-arm-simd.c
new file mode 100644
index 000000000..fb7bf3da8
--- /dev/null
+++ b/pixman/pixman/pixman-arm-simd.c
@@ -0,0 +1,486 @@
+/*
+ * Copyright © 2008 Mozilla Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Jeff Muizelaar (jeff@infidigm.net)
+ *
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "pixman-private.h"
+
+static void
+arm_composite_add_8000_8000 (pixman_implementation_t * impl,
+    pixman_op_t               op,
+    pixman_image_t *          src_image,
+    pixman_image_t *          mask_image,
+    pixman_image_t *          dst_image,
+    int32_t                   src_x,
+    int32_t                   src_y,
+    int32_t                   mask_x,
+    int32_t                   mask_y,
+    int32_t                   dest_x,
+    int32_t                   dest_y,
+    int32_t                   width,
+    int32_t                   height)
+{
+    uint8_t     *dst_line, *dst;
+    uint8_t     *src_line, *src;
+    int dst_stride, src_stride;
+    uint16_t w;
+    uint8_t s, d;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	/* ensure both src and dst are properly aligned before doing 32 bit reads
+	 * we'll stay in this loop if src and dst have differing alignments
+	 */
+	while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
+	{
+	    s = *src;
+	    d = *dst;
+	    asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
+	    *dst = d;
+
+	    dst++;
+	    src++;
+	    w--;
+	}
+
+	while (w >= 4)
+	{
+	    asm ("uqadd8 %0, %1, %2"
+		 : "=r" (*(uint32_t*)dst)
+		 : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
+	    dst += 4;
+	    src += 4;
+	    w -= 4;
+	}
+
+	while (w)
+	{
+	    s = *src;
+	    d = *dst;
+	    asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
+	    *dst = d;
+
+	    dst++;
+	    src++;
+	    w--;
+	}
+    }
+
+}
+
+static void
+arm_composite_over_8888_8888 (pixman_implementation_t * impl,
+    pixman_op_t               op,
+    pixman_image_t *          src_image,
+    pixman_image_t *          mask_image,
+    pixman_image_t *          dst_image,
+    int32_t                   src_x,
+    int32_t                   src_y,
+    int32_t                   mask_x,
+    int32_t                   mask_y,
+    int32_t                   dest_x,
+    int32_t                   dest_y,
+    int32_t                   width,
+    int32_t                   height)
+{
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+    uint16_t w;
+    uint32_t component_half = 0x800080;
+    uint32_t upper_component_mask = 0xff00ff00;
+    uint32_t alpha_mask = 0xff;
+
+    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+/* #define inner_branch */
+	asm volatile (
+	    "cmp %[w], #0\n\t"
+	    "beq 2f\n\t"
+	    "1:\n\t"
+	    /* load src */
+	    "ldr r5, [%[src]], #4\n\t"
+#ifdef inner_branch
+	    /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
+	     * The 0x0 case also allows us to avoid doing an unecessary data
+	     * write which is more valuable so we only check for that
+	     */
+	    "cmp r5, #0\n\t"
+	    "beq 3f\n\t"
+
+	    /* = 255 - alpha */
+	    "sub r8, %[alpha_mask], r5, lsr #24\n\t"
+
+	    "ldr r4, [%[dest]] \n\t"
+
+#else
+	    "ldr r4, [%[dest]] \n\t"
+
+	    /* = 255 - alpha */
+	    "sub r8, %[alpha_mask], r5, lsr #24\n\t"
+#endif
+	    "uxtb16 r6, r4\n\t"
+	    "uxtb16 r7, r4, ror #8\n\t"
+
+	    /* multiply by 257 and divide by 65536 */
+	    "mla r6, r6, r8, %[component_half]\n\t"
+	    "mla r7, r7, r8, %[component_half]\n\t"
+
+	    "uxtab16 r6, r6, r6, ror #8\n\t"
+	    "uxtab16 r7, r7, r7, ror #8\n\t"
+
+	    /* recombine the 0xff00ff00 bytes of r6 and r7 */
+	    "and r7, r7, %[upper_component_mask]\n\t"
+	    "uxtab16 r6, r7, r6, ror #8\n\t"
+
+	    "uqadd8 r5, r6, r5\n\t"
+
+#ifdef inner_branch
+	    "3:\n\t"
+
+#endif
+	    "str r5, [%[dest]], #4\n\t"
+	    /* increment counter and jmp to top */
+	    "subs	%[w], %[w], #1\n\t"
+	    "bne	1b\n\t"
+	    "2:\n\t"
+	    : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
+	    : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
+	    [alpha_mask] "r" (alpha_mask)
+	    : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
+	    );
+    }
+}
+
+static void
+arm_composite_over_8888_n_8888 (
+    pixman_implementation_t * impl,
+    pixman_op_t               op,
+    pixman_image_t *          src_image,
+    pixman_image_t *          mask_image,
+    pixman_image_t *          dst_image,
+    int32_t                   src_x,
+    int32_t                   src_y,
+    int32_t                   mask_x,
+    int32_t                   mask_y,
+    int32_t                   dest_x,
+    int32_t                   dest_y,
+    int32_t                   width,
+    int32_t                   height)
+{
+    uint32_t *dst_line, *dst;
+    uint32_t *src_line, *src;
+    uint32_t mask;
+    int dst_stride, src_stride;
+    uint16_t w;
+    uint32_t component_half = 0x800080;
+    uint32_t alpha_mask = 0xff;
+
+    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
+    mask = (mask) >> 24;
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+/* #define inner_branch */
+	asm volatile (
+	    "cmp %[w], #0\n\t"
+	    "beq 2f\n\t"
+	    "1:\n\t"
+	    /* load src */
+	    "ldr r5, [%[src]], #4\n\t"
+#ifdef inner_branch
+	    /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
+	     * The 0x0 case also allows us to avoid doing an unecessary data
+	     * write which is more valuable so we only check for that
+	     */
+	    "cmp r5, #0\n\t"
+	    "beq 3f\n\t"
+
+#endif
+	    "ldr r4, [%[dest]] \n\t"
+
+	    "uxtb16 r6, r5\n\t"
+	    "uxtb16 r7, r5, ror #8\n\t"
+
+	    /* multiply by alpha (r8) then by 257 and divide by 65536 */
+	    "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
+	    "mla r7, r7, %[mask_alpha], %[component_half]\n\t"
+
+	    "uxtab16 r6, r6, r6, ror #8\n\t"
+	    "uxtab16 r7, r7, r7, ror #8\n\t"
+
+	    "uxtb16 r6, r6, ror #8\n\t"
+	    "uxtb16 r7, r7, ror #8\n\t"
+
+	    /* recombine */
+	    "orr r5, r6, r7, lsl #8\n\t"
+
+	    "uxtb16 r6, r4\n\t"
+	    "uxtb16 r7, r4, ror #8\n\t"
+
+	    /* 255 - alpha */
+	    "sub r8, %[alpha_mask], r5, lsr #24\n\t"
+
+	    /* multiply by alpha (r8) then by 257 and divide by 65536 */
+	    "mla r6, r6, r8, %[component_half]\n\t"
+	    "mla r7, r7, r8, %[component_half]\n\t"
+
+	    "uxtab16 r6, r6, r6, ror #8\n\t"
+	    "uxtab16 r7, r7, r7, ror #8\n\t"
+
+	    "uxtb16 r6, r6, ror #8\n\t"
+	    "uxtb16 r7, r7, ror #8\n\t"
+
+	    /* recombine */
+	    "orr r6, r6, r7, lsl #8\n\t"
+
+	    "uqadd8 r5, r6, r5\n\t"
+
+#ifdef inner_branch
+	    "3:\n\t"
+
+#endif
+	    "str r5, [%[dest]], #4\n\t"
+	    /* increment counter and jmp to top */
+	    "subs	%[w], %[w], #1\n\t"
+	    "bne	1b\n\t"
+	    "2:\n\t"
+	    : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
+	    : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
+	    [alpha_mask] "r" (alpha_mask)
+	    : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
+	    );
+    }
+}
+
+static void
+arm_composite_over_n_8_8888 (pixman_implementation_t * impl,
+			     pixman_op_t               op,
+			     pixman_image_t *          src_image,
+			     pixman_image_t *          mask_image,
+			     pixman_image_t *          dst_image,
+			     int32_t                   src_x,
+			     int32_t                   src_y,
+			     int32_t                   mask_x,
+			     int32_t                   mask_y,
+			     int32_t                   dest_x,
+			     int32_t                   dest_y,
+			     int32_t                   width,
+			     int32_t                   height)
+{
+    uint32_t src, srca;
+    uint32_t *dst_line, *dst;
+    uint8_t  *mask_line, *mask;
+    int dst_stride, mask_stride;
+    uint16_t w;
+
+    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+
+    /* bail out if fully transparent */
+    srca = src >> 24;
+    if (src == 0)
+	return;
+
+    uint32_t component_mask = 0xff00ff;
+    uint32_t component_half = 0x800080;
+
+    uint32_t src_hi = (src >> 8) & component_mask;
+    uint32_t src_lo = src & component_mask;
+
+    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+/* #define inner_branch */
+	asm volatile (
+	    "cmp %[w], #0\n\t"
+	    "beq 2f\n\t"
+	    "1:\n\t"
+	    /* load mask */
+	    "ldrb r5, [%[mask]], #1\n\t"
+#ifdef inner_branch
+	    /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
+	     * The 0x0 case also allows us to avoid doing an unecessary data
+	     * write which is more valuable so we only check for that
+	     */
+	    "cmp r5, #0\n\t"
+	    "beq 3f\n\t"
+
+#endif
+	    "ldr r4, [%[dest]] \n\t"
+
+	    /* multiply by alpha (r8) then by 257 and divide by 65536 */
+	    "mla r6, %[src_lo], r5, %[component_half]\n\t"
+	    "mla r7, %[src_hi], r5, %[component_half]\n\t"
+
+	    "uxtab16 r6, r6, r6, ror #8\n\t"
+	    "uxtab16 r7, r7, r7, ror #8\n\t"
+
+	    "uxtb16 r6, r6, ror #8\n\t"
+	    "uxtb16 r7, r7, ror #8\n\t"
+
+	    /* recombine */
+	    "orr r5, r6, r7, lsl #8\n\t"
+
+	    "uxtb16 r6, r4\n\t"
+	    "uxtb16 r7, r4, ror #8\n\t"
+
+	    /* we could simplify this to use 'sub' if we were
+	    * willing to give up a register for alpha_mask */
+	    "mvn r8, r5\n\t"
+	    "mov r8, r8, lsr #24\n\t"
+
+	    /* multiply by alpha (r8) then by 257 and divide by 65536 */
+	    "mla r6, r6, r8, %[component_half]\n\t"
+	    "mla r7, r7, r8, %[component_half]\n\t"
+
+	    "uxtab16 r6, r6, r6, ror #8\n\t"
+	    "uxtab16 r7, r7, r7, ror #8\n\t"
+
+	    "uxtb16 r6, r6, ror #8\n\t"
+	    "uxtb16 r7, r7, ror #8\n\t"
+
+	    /* recombine */
+	    "orr r6, r6, r7, lsl #8\n\t"
+
+	    "uqadd8 r5, r6, r5\n\t"
+
+#ifdef inner_branch
+	    "3:\n\t"
+
+#endif
+	    "str r5, [%[dest]], #4\n\t"
+	    /* increment counter and jmp to top */
+	    "subs	%[w], %[w], #1\n\t"
+	    "bne	1b\n\t"
+	    "2:\n\t"
+	    : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)
+	    : [component_half] "r" (component_half),
+	      [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
+	    : "r4", "r5", "r6", "r7", "r8", "cc", "memory");
+    }
+}
+
+static const pixman_fast_path_t arm_simd_fast_path_array[] =
+{
+    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, arm_composite_over_8888_8888,    0 },
+    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, arm_composite_over_8888_8888,    0 },
+    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, arm_composite_over_8888_8888,    0 },
+    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, arm_composite_over_8888_8888,    0 },
+    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, arm_composite_over_8888_n_8888,  NEED_SOLID_MASK },
+    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, arm_composite_over_8888_n_8888,  NEED_SOLID_MASK },
+
+    { PIXMAN_OP_ADD, PIXMAN_a8,        PIXMAN_null,     PIXMAN_a8,       arm_composite_add_8000_8000,     0 },
+
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, arm_composite_over_n_8_8888,     0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, arm_composite_over_n_8_8888,     0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, arm_composite_over_n_8_8888,     0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, arm_composite_over_n_8_8888,     0 },
+
+    { PIXMAN_OP_NONE },
+};
+
+const pixman_fast_path_t *const arm_simd_fast_paths = arm_simd_fast_path_array;
+
+static void
+arm_simd_composite (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    pixman_image_t *         src,
+                    pixman_image_t *         mask,
+                    pixman_image_t *         dest,
+                    int32_t                  src_x,
+                    int32_t                  src_y,
+                    int32_t                  mask_x,
+                    int32_t                  mask_y,
+                    int32_t                  dest_x,
+                    int32_t                  dest_y,
+                    int32_t                  width,
+                    int32_t                  height)
+{
+    if (_pixman_run_fast_path (arm_simd_fast_paths, imp,
+                               op, src, mask, dest,
+                               src_x, src_y,
+                               mask_x, mask_y,
+                               dest_x, dest_y,
+                               width, height))
+    {
+	return;
+    }
+
+    _pixman_implementation_composite (imp->delegate, op,
+                                      src, mask, dest,
+                                      src_x, src_y,
+                                      mask_x, mask_y,
+                                      dest_x, dest_y,
+                                      width, height);
+}
+
+pixman_implementation_t *
+_pixman_implementation_create_arm_simd (void)
+{
+    pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
+    pixman_implementation_t *imp = _pixman_implementation_create (general);
+
+    imp->composite = arm_simd_composite;
+
+    return imp;
+}
+