From 01df5d59e56a1b060568f8cad2e89f7eea22fc70 Mon Sep 17 00:00:00 2001
From: marha <marha@users.sourceforge.net>
Date: Mon, 29 Aug 2011 08:51:20 +0200
Subject: xwininfo libX11 libXmu libxcb mesa xserver xkeyboard-config git
 update 29 aug 2011

---
 pixman/pixman/Makefile.am          |  258 +--
 pixman/pixman/pixman-arm-common.h  |    2 +-
 pixman/pixman/pixman-arm-simd.c    |  864 +++++-----
 pixman/pixman/pixman-bits-image.c  | 3044 +++++++++++++++++-------------------
 pixman/pixman/pixman-fast-path.c   |    2 +-
 pixman/pixman/pixman-fast-path.h   | 1189 --------------
 pixman/pixman/pixman-inlines.h     | 1280 +++++++++++++++
 pixman/pixman/pixman-noop.c        |    2 +-
 pixman/pixman/pixman-private.h     |    7 +-
 pixman/pixman/pixman-region.c      |   64 +-
 pixman/pixman/pixman-sse2.c        |    2 +-
 pixman/pixman/pixman-utils.c       |   12 +-
 pixman/test/Makefile.am            |    2 +
 pixman/test/Makefile.win32         |    5 +-
 pixman/test/lowlevel-blt-bench.c   |    7 +-
 pixman/test/region-contains-test.c |  169 ++
 pixman/test/scaling-helpers-test.c |    2 +-
 pixman/test/utils.c                |   11 +-
 pixman/test/utils.h                |   16 +-
 19 files changed, 3572 insertions(+), 3366 deletions(-)
 delete mode 100644 pixman/pixman/pixman-fast-path.h
 create mode 100644 pixman/pixman/pixman-inlines.h
 create mode 100644 pixman/test/region-contains-test.c

(limited to 'pixman')

diff --git a/pixman/pixman/Makefile.am b/pixman/pixman/Makefile.am
index acc273145..44e6b17e4 100644
--- a/pixman/pixman/Makefile.am
+++ b/pixman/pixman/Makefile.am
@@ -1,129 +1,129 @@
-lib_LTLIBRARIES = libpixman-1.la
-libpixman_1_la_LDFLAGS = -version-info $(LT_VERSION_INFO) -no-undefined @PTHREAD_LDFLAGS@ 
-libpixman_1_la_LIBADD = @PTHREAD_LIBS@ @DEP_LIBS@ -lm
-libpixman_1_la_SOURCES =			\
-	pixman.h				\
-	pixman-accessor.h			\
-	pixman-access.c				\
-	pixman-access-accessors.c		\
-	pixman-cpu.c				\
-	pixman-gradient-walker.c		\
-	pixman-region16.c			\
-	pixman-region32.c			\
-	pixman-compiler.h			\
-	pixman-private.h			\
-	pixman-image.c				\
-	pixman-implementation.c			\
-	pixman-combine32.c			\
-	pixman-combine32.h			\
-	pixman-combine64.c			\
-	pixman-combine64.h			\
-	pixman-general.c			\
-	pixman.c				\
-	pixman-noop.c				\
-	pixman-fast-path.c			\
-	pixman-fast-path.h			\
-	pixman-solid-fill.c			\
-	pixman-conical-gradient.c		\
-	pixman-linear-gradient.c		\
-	pixman-radial-gradient.c		\
-	pixman-bits-image.c			\
-	pixman-utils.c				\
-	pixman-edge.c				\
-	pixman-edge-accessors.c			\
-	pixman-edge-imp.h			\
-	pixman-trap.c				\
-	pixman-timer.c				\
-	pixman-matrix.c
-
-libpixmanincludedir = $(includedir)/pixman-1
-libpixmaninclude_HEADERS = pixman.h pixman-version.h
-noinst_LTLIBRARIES = 
-
-BUILT_SOURCES = pixman-combine32.h pixman-combine32.c pixman-combine64.h pixman-combine64.c
-
-pixman-combine32.c : pixman-combine.c.template pixman-combine32.h make-combine.pl
-	$(PERL) $(srcdir)/make-combine.pl 8 < $(srcdir)/pixman-combine.c.template > $@ || ($(RM) $@; exit 1)
-pixman-combine32.h : pixman-combine.h.template make-combine.pl
-	$(PERL) $(srcdir)/make-combine.pl 8 < $(srcdir)/pixman-combine.h.template > $@ || ($(RM) $@; exit 1)
-
-pixman-combine64.c : pixman-combine.c.template pixman-combine64.h make-combine.pl
-	$(PERL) $(srcdir)/make-combine.pl 16 < $(srcdir)/pixman-combine.c.template > $@ || ($(RM) $@; exit 1)
-pixman-combine64.h : pixman-combine.h.template make-combine.pl
-	$(PERL) $(srcdir)/make-combine.pl 16 < $(srcdir)/pixman-combine.h.template > $@ || ($(RM) $@; exit 1)
-
-EXTRA_DIST = Makefile.win32 pixman-combine.c.template make-combine.pl pixman-region.c \
-	pixman-combine.h.template solaris-hwcap.mapfile
-CLEANFILES = pixman-combine32.c pixman-combine64.c pixman-combine32.h pixman-combine64.h
-
-# mmx code
-if USE_MMX
-noinst_LTLIBRARIES += libpixman-mmx.la
-libpixman_mmx_la_SOURCES = \
-	pixman-mmx.c
-libpixman_mmx_la_CFLAGS = $(DEP_CFLAGS) $(MMX_CFLAGS)
-libpixman_mmx_la_LIBADD = $(DEP_LIBS)
-libpixman_1_la_LDFLAGS += $(MMX_LDFLAGS)
-libpixman_1_la_LIBADD += libpixman-mmx.la
-
-ASM_CFLAGS_mmx=$(MMX_CFLAGS)
-endif
-
-# vmx code
-if USE_VMX
-noinst_LTLIBRARIES += libpixman-vmx.la
-libpixman_vmx_la_SOURCES = \
-	pixman-vmx.c \
-	pixman-combine32.h
-libpixman_vmx_la_CFLAGS = $(DEP_CFLAGS) $(VMX_CFLAGS)
-libpixman_vmx_la_LIBADD = $(DEP_LIBS)
-libpixman_1_la_LIBADD += libpixman-vmx.la
-
-ASM_CFLAGS_vmx=$(VMX_CFLAGS)
-endif
-
-# sse2 code
-if USE_SSE2
-noinst_LTLIBRARIES += libpixman-sse2.la
-libpixman_sse2_la_SOURCES = \
-	pixman-sse2.c
-libpixman_sse2_la_CFLAGS = $(DEP_CFLAGS) $(SSE2_CFLAGS)
-libpixman_sse2_la_LIBADD = $(DEP_LIBS)
-libpixman_1_la_LDFLAGS += $(SSE2_LDFLAGS)
-libpixman_1_la_LIBADD += libpixman-sse2.la
-
-ASM_CFLAGS_sse2=$(SSE2_CFLAGS)
-endif
-
-# arm simd code
-if USE_ARM_SIMD
-noinst_LTLIBRARIES += libpixman-arm-simd.la
-libpixman_arm_simd_la_SOURCES = \
-	pixman-arm-simd.c	\
-	pixman-arm-common.h	\
-	pixman-arm-simd-asm.S
-libpixman_arm_simd_la_CFLAGS = $(DEP_CFLAGS)
-libpixman_arm_simd_la_LIBADD = $(DEP_LIBS)
-libpixman_1_la_LIBADD += libpixman-arm-simd.la
-
-ASM_CFLAGS_arm_simd=
-endif
-
-# arm neon code
-if USE_ARM_NEON
-noinst_LTLIBRARIES += libpixman-arm-neon.la
-libpixman_arm_neon_la_SOURCES = \
-        pixman-arm-neon.c	\
-        pixman-arm-common.h	\
-        pixman-arm-neon-asm.S	\
-		pixman-arm-neon-asm-bilinear.S \
-        pixman-arm-neon-asm.h
-libpixman_arm_neon_la_CFLAGS = $(DEP_CFLAGS)
-libpixman_arm_neon_la_LIBADD = $(DEP_LIBS)
-libpixman_1_la_LIBADD += libpixman-arm-neon.la
-
-ASM_CFLAGS_arm_neon=
-endif
-
-.c.s : $(libpixmaninclude_HEADERS) $(BUILT_SOURCES)
-	$(CC) $(CFLAGS) $(ASM_CFLAGS_$(@:pixman-%.s=%)) $(ASM_CFLAGS_$(@:pixman-arm-%.s=arm_%)) -DHAVE_CONFIG_H -I$(srcdir) -I$(builddir) -I$(top_builddir) -S -o $@ $<
+lib_LTLIBRARIES = libpixman-1.la
+libpixman_1_la_LDFLAGS = -version-info $(LT_VERSION_INFO) -no-undefined @PTHREAD_LDFLAGS@ 
+libpixman_1_la_LIBADD = @PTHREAD_LIBS@ @DEP_LIBS@ -lm
+libpixman_1_la_SOURCES =			\
+	pixman.h				\
+	pixman-accessor.h			\
+	pixman-access.c				\
+	pixman-access-accessors.c		\
+	pixman-cpu.c				\
+	pixman-gradient-walker.c		\
+	pixman-region16.c			\
+	pixman-region32.c			\
+	pixman-compiler.h			\
+	pixman-private.h			\
+	pixman-image.c				\
+	pixman-implementation.c			\
+	pixman-combine32.c			\
+	pixman-combine32.h			\
+	pixman-combine64.c			\
+	pixman-combine64.h			\
+	pixman-general.c			\
+	pixman.c				\
+	pixman-noop.c				\
+	pixman-fast-path.c			\
+	pixman-solid-fill.c			\
+	pixman-conical-gradient.c		\
+	pixman-linear-gradient.c		\
+	pixman-radial-gradient.c		\
+	pixman-bits-image.c			\
+	pixman-utils.c				\
+	pixman-edge.c				\
+	pixman-edge-accessors.c			\
+	pixman-edge-imp.h			\
+	pixman-inlines.h			\
+	pixman-trap.c				\
+	pixman-timer.c				\
+	pixman-matrix.c
+
+libpixmanincludedir = $(includedir)/pixman-1
+libpixmaninclude_HEADERS = pixman.h pixman-version.h
+noinst_LTLIBRARIES = 
+
+BUILT_SOURCES = pixman-combine32.h pixman-combine32.c pixman-combine64.h pixman-combine64.c
+
+pixman-combine32.c : pixman-combine.c.template pixman-combine32.h make-combine.pl
+	$(PERL) $(srcdir)/make-combine.pl 8 < $(srcdir)/pixman-combine.c.template > $@ || ($(RM) $@; exit 1)
+pixman-combine32.h : pixman-combine.h.template make-combine.pl
+	$(PERL) $(srcdir)/make-combine.pl 8 < $(srcdir)/pixman-combine.h.template > $@ || ($(RM) $@; exit 1)
+
+pixman-combine64.c : pixman-combine.c.template pixman-combine64.h make-combine.pl
+	$(PERL) $(srcdir)/make-combine.pl 16 < $(srcdir)/pixman-combine.c.template > $@ || ($(RM) $@; exit 1)
+pixman-combine64.h : pixman-combine.h.template make-combine.pl
+	$(PERL) $(srcdir)/make-combine.pl 16 < $(srcdir)/pixman-combine.h.template > $@ || ($(RM) $@; exit 1)
+
+EXTRA_DIST = Makefile.win32 pixman-combine.c.template make-combine.pl pixman-region.c \
+	pixman-combine.h.template solaris-hwcap.mapfile
+CLEANFILES = pixman-combine32.c pixman-combine64.c pixman-combine32.h pixman-combine64.h
+
+# mmx code
+if USE_MMX
+noinst_LTLIBRARIES += libpixman-mmx.la
+libpixman_mmx_la_SOURCES = \
+	pixman-mmx.c
+libpixman_mmx_la_CFLAGS = $(DEP_CFLAGS) $(MMX_CFLAGS)
+libpixman_mmx_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LDFLAGS += $(MMX_LDFLAGS)
+libpixman_1_la_LIBADD += libpixman-mmx.la
+
+ASM_CFLAGS_mmx=$(MMX_CFLAGS)
+endif
+
+# vmx code
+if USE_VMX
+noinst_LTLIBRARIES += libpixman-vmx.la
+libpixman_vmx_la_SOURCES = \
+	pixman-vmx.c \
+	pixman-combine32.h
+libpixman_vmx_la_CFLAGS = $(DEP_CFLAGS) $(VMX_CFLAGS)
+libpixman_vmx_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LIBADD += libpixman-vmx.la
+
+ASM_CFLAGS_vmx=$(VMX_CFLAGS)
+endif
+
+# sse2 code
+if USE_SSE2
+noinst_LTLIBRARIES += libpixman-sse2.la
+libpixman_sse2_la_SOURCES = \
+	pixman-sse2.c
+libpixman_sse2_la_CFLAGS = $(DEP_CFLAGS) $(SSE2_CFLAGS)
+libpixman_sse2_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LDFLAGS += $(SSE2_LDFLAGS)
+libpixman_1_la_LIBADD += libpixman-sse2.la
+
+ASM_CFLAGS_sse2=$(SSE2_CFLAGS)
+endif
+
+# arm simd code
+if USE_ARM_SIMD
+noinst_LTLIBRARIES += libpixman-arm-simd.la
+libpixman_arm_simd_la_SOURCES = \
+	pixman-arm-simd.c	\
+	pixman-arm-common.h	\
+	pixman-arm-simd-asm.S
+libpixman_arm_simd_la_CFLAGS = $(DEP_CFLAGS)
+libpixman_arm_simd_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LIBADD += libpixman-arm-simd.la
+
+ASM_CFLAGS_arm_simd=
+endif
+
+# arm neon code
+if USE_ARM_NEON
+noinst_LTLIBRARIES += libpixman-arm-neon.la
+libpixman_arm_neon_la_SOURCES = \
+        pixman-arm-neon.c	\
+        pixman-arm-common.h	\
+        pixman-arm-neon-asm.S	\
+		pixman-arm-neon-asm-bilinear.S \
+        pixman-arm-neon-asm.h
+libpixman_arm_neon_la_CFLAGS = $(DEP_CFLAGS)
+libpixman_arm_neon_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LIBADD += libpixman-arm-neon.la
+
+ASM_CFLAGS_arm_neon=
+endif
+
+.c.s : $(libpixmaninclude_HEADERS) $(BUILT_SOURCES)
+	$(CC) $(CFLAGS) $(ASM_CFLAGS_$(@:pixman-%.s=%)) $(ASM_CFLAGS_$(@:pixman-arm-%.s=arm_%)) -DHAVE_CONFIG_H -I$(srcdir) -I$(builddir) -I$(top_builddir) -S -o $@ $<
diff --git a/pixman/pixman/pixman-arm-common.h b/pixman/pixman/pixman-arm-common.h
index f1d212c84..f56264e8c 100644
--- a/pixman/pixman/pixman-arm-common.h
+++ b/pixman/pixman/pixman-arm-common.h
@@ -26,7 +26,7 @@
 #ifndef PIXMAN_ARM_COMMON_H
 #define PIXMAN_ARM_COMMON_H
 
-#include "pixman-fast-path.h"
+#include "pixman-inlines.h"
 
 /* Define some macros which can expand into proxy functions between
  * ARM assembly optimized functions and the rest of pixman fast path API.
diff --git a/pixman/pixman/pixman-arm-simd.c b/pixman/pixman/pixman-arm-simd.c
index 45981a6b0..3d19bfac1 100644
--- a/pixman/pixman/pixman-arm-simd.c
+++ b/pixman/pixman/pixman-arm-simd.c
@@ -1,432 +1,432 @@
-/*
- * Copyright © 2008 Mozilla Corporation
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Mozilla Corporation not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission.  Mozilla Corporation makes no
- * representations about the suitability of this software for any purpose.  It
- * is provided "as is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- *
- * Author:  Jeff Muizelaar (jeff@infidigm.net)
- *
- */
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#include "pixman-private.h"
-#include "pixman-arm-common.h"
-#include "pixman-fast-path.h"
-
-#if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */
-
-void
-pixman_composite_add_8_8_asm_armv6 (int32_t  width,
-				    int32_t  height,
-				    uint8_t *dst_line,
-				    int32_t  dst_stride,
-				    uint8_t *src_line,
-				    int32_t  src_stride)
-{
-    uint8_t *dst, *src;
-    int32_t w;
-    uint8_t s, d;
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
-
-	/* ensure both src and dst are properly aligned before doing 32 bit reads
-	 * we'll stay in this loop if src and dst have differing alignments
-	 */
-	while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
-	{
-	    s = *src;
-	    d = *dst;
-	    asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
-	    *dst = d;
-
-	    dst++;
-	    src++;
-	    w--;
-	}
-
-	while (w >= 4)
-	{
-	    asm ("uqadd8 %0, %1, %2"
-		 : "=r" (*(uint32_t*)dst)
-		 : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
-	    dst += 4;
-	    src += 4;
-	    w -= 4;
-	}
-
-	while (w)
-	{
-	    s = *src;
-	    d = *dst;
-	    asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
-	    *dst = d;
-
-	    dst++;
-	    src++;
-	    w--;
-	}
-    }
-
-}
-
-void
-pixman_composite_over_8888_8888_asm_armv6 (int32_t   width,
-                                           int32_t   height,
-                                           uint32_t *dst_line,
-                                           int32_t   dst_stride,
-                                           uint32_t *src_line,
-                                           int32_t   src_stride)
-{
-    uint32_t    *dst;
-    uint32_t    *src;
-    int32_t w;
-    uint32_t component_half = 0x800080;
-    uint32_t upper_component_mask = 0xff00ff00;
-    uint32_t alpha_mask = 0xff;
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
-
-/* #define inner_branch */
-	asm volatile (
-	    "cmp %[w], #0\n\t"
-	    "beq 2f\n\t"
-	    "1:\n\t"
-	    /* load src */
-	    "ldr r5, [%[src]], #4\n\t"
-#ifdef inner_branch
-	    /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
-	     * The 0x0 case also allows us to avoid doing an unecessary data
-	     * write which is more valuable so we only check for that
-	     */
-	    "cmp r5, #0\n\t"
-	    "beq 3f\n\t"
-
-	    /* = 255 - alpha */
-	    "sub r8, %[alpha_mask], r5, lsr #24\n\t"
-
-	    "ldr r4, [%[dest]] \n\t"
-
-#else
-	    "ldr r4, [%[dest]] \n\t"
-
-	    /* = 255 - alpha */
-	    "sub r8, %[alpha_mask], r5, lsr #24\n\t"
-#endif
-	    "uxtb16 r6, r4\n\t"
-	    "uxtb16 r7, r4, ror #8\n\t"
-
-	    /* multiply by 257 and divide by 65536 */
-	    "mla r6, r6, r8, %[component_half]\n\t"
-	    "mla r7, r7, r8, %[component_half]\n\t"
-
-	    "uxtab16 r6, r6, r6, ror #8\n\t"
-	    "uxtab16 r7, r7, r7, ror #8\n\t"
-
-	    /* recombine the 0xff00ff00 bytes of r6 and r7 */
-	    "and r7, r7, %[upper_component_mask]\n\t"
-	    "uxtab16 r6, r7, r6, ror #8\n\t"
-
-	    "uqadd8 r5, r6, r5\n\t"
-
-#ifdef inner_branch
-	    "3:\n\t"
-
-#endif
-	    "str r5, [%[dest]], #4\n\t"
-	    /* increment counter and jmp to top */
-	    "subs	%[w], %[w], #1\n\t"
-	    "bne	1b\n\t"
-	    "2:\n\t"
-	    : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
-	    : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
-	      [alpha_mask] "r" (alpha_mask)
-	    : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
-	    );
-    }
-}
-
-void
-pixman_composite_over_8888_n_8888_asm_armv6 (int32_t   width,
-                                             int32_t   height,
-                                             uint32_t *dst_line,
-                                             int32_t   dst_stride,
-                                             uint32_t *src_line,
-                                             int32_t   src_stride,
-                                             uint32_t  mask)
-{
-    uint32_t *dst;
-    uint32_t *src;
-    int32_t w;
-    uint32_t component_half = 0x800080;
-    uint32_t alpha_mask = 0xff;
-
-    mask = (mask) >> 24;
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
-
-/* #define inner_branch */
-	asm volatile (
-	    "cmp %[w], #0\n\t"
-	    "beq 2f\n\t"
-	    "1:\n\t"
-	    /* load src */
-	    "ldr r5, [%[src]], #4\n\t"
-#ifdef inner_branch
-	    /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
-	     * The 0x0 case also allows us to avoid doing an unecessary data
-	     * write which is more valuable so we only check for that
-	     */
-	    "cmp r5, #0\n\t"
-	    "beq 3f\n\t"
-
-#endif
-	    "ldr r4, [%[dest]] \n\t"
-
-	    "uxtb16 r6, r5\n\t"
-	    "uxtb16 r7, r5, ror #8\n\t"
-
-	    /* multiply by alpha (r8) then by 257 and divide by 65536 */
-	    "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
-	    "mla r7, r7, %[mask_alpha], %[component_half]\n\t"
-
-	    "uxtab16 r6, r6, r6, ror #8\n\t"
-	    "uxtab16 r7, r7, r7, ror #8\n\t"
-
-	    "uxtb16 r6, r6, ror #8\n\t"
-	    "uxtb16 r7, r7, ror #8\n\t"
-
-	    /* recombine */
-	    "orr r5, r6, r7, lsl #8\n\t"
-
-	    "uxtb16 r6, r4\n\t"
-	    "uxtb16 r7, r4, ror #8\n\t"
-
-	    /* 255 - alpha */
-	    "sub r8, %[alpha_mask], r5, lsr #24\n\t"
-
-	    /* multiply by alpha (r8) then by 257 and divide by 65536 */
-	    "mla r6, r6, r8, %[component_half]\n\t"
-	    "mla r7, r7, r8, %[component_half]\n\t"
-
-	    "uxtab16 r6, r6, r6, ror #8\n\t"
-	    "uxtab16 r7, r7, r7, ror #8\n\t"
-
-	    "uxtb16 r6, r6, ror #8\n\t"
-	    "uxtb16 r7, r7, ror #8\n\t"
-
-	    /* recombine */
-	    "orr r6, r6, r7, lsl #8\n\t"
-
-	    "uqadd8 r5, r6, r5\n\t"
-
-#ifdef inner_branch
-	    "3:\n\t"
-
-#endif
-	    "str r5, [%[dest]], #4\n\t"
-	    /* increment counter and jmp to top */
-	    "subs	%[w], %[w], #1\n\t"
-	    "bne	1b\n\t"
-	    "2:\n\t"
-	    : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
-	    : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
-	      [alpha_mask] "r" (alpha_mask)
-	    : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
-	    );
-    }
-}
-
-void
-pixman_composite_over_n_8_8888_asm_armv6 (int32_t   width,
-                                          int32_t   height,
-                                          uint32_t *dst_line,
-                                          int32_t   dst_stride,
-                                          uint32_t  src,
-                                          int32_t   unused,
-                                          uint8_t  *mask_line,
-                                          int32_t   mask_stride)
-{
-    uint32_t  srca;
-    uint32_t *dst;
-    uint8_t  *mask;
-    int32_t w;
-
-    srca = src >> 24;
-
-    uint32_t component_mask = 0xff00ff;
-    uint32_t component_half = 0x800080;
-
-    uint32_t src_hi = (src >> 8) & component_mask;
-    uint32_t src_lo = src & component_mask;
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	mask = mask_line;
-	mask_line += mask_stride;
-	w = width;
-
-/* #define inner_branch */
-	asm volatile (
-	    "cmp %[w], #0\n\t"
-	    "beq 2f\n\t"
-	    "1:\n\t"
-	    /* load mask */
-	    "ldrb r5, [%[mask]], #1\n\t"
-#ifdef inner_branch
-	    /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
-	     * The 0x0 case also allows us to avoid doing an unecessary data
-	     * write which is more valuable so we only check for that
-	     */
-	    "cmp r5, #0\n\t"
-	    "beq 3f\n\t"
-
-#endif
-	    "ldr r4, [%[dest]] \n\t"
-
-	    /* multiply by alpha (r8) then by 257 and divide by 65536 */
-	    "mla r6, %[src_lo], r5, %[component_half]\n\t"
-	    "mla r7, %[src_hi], r5, %[component_half]\n\t"
-
-	    "uxtab16 r6, r6, r6, ror #8\n\t"
-	    "uxtab16 r7, r7, r7, ror #8\n\t"
-
-	    "uxtb16 r6, r6, ror #8\n\t"
-	    "uxtb16 r7, r7, ror #8\n\t"
-
-	    /* recombine */
-	    "orr r5, r6, r7, lsl #8\n\t"
-
-	    "uxtb16 r6, r4\n\t"
-	    "uxtb16 r7, r4, ror #8\n\t"
-
-	    /* we could simplify this to use 'sub' if we were
-	     * willing to give up a register for alpha_mask
-	     */
-	    "mvn r8, r5\n\t"
-	    "mov r8, r8, lsr #24\n\t"
-
-	    /* multiply by alpha (r8) then by 257 and divide by 65536 */
-	    "mla r6, r6, r8, %[component_half]\n\t"
-	    "mla r7, r7, r8, %[component_half]\n\t"
-
-	    "uxtab16 r6, r6, r6, ror #8\n\t"
-	    "uxtab16 r7, r7, r7, ror #8\n\t"
-
-	    "uxtb16 r6, r6, ror #8\n\t"
-	    "uxtb16 r7, r7, ror #8\n\t"
-
-	    /* recombine */
-	    "orr r6, r6, r7, lsl #8\n\t"
-
-	    "uqadd8 r5, r6, r5\n\t"
-
-#ifdef inner_branch
-	    "3:\n\t"
-
-#endif
-	    "str r5, [%[dest]], #4\n\t"
-	    /* increment counter and jmp to top */
-	    "subs	%[w], %[w], #1\n\t"
-	    "bne	1b\n\t"
-	    "2:\n\t"
-	    : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)
-	    : [component_half] "r" (component_half),
-	      [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
-	    : "r4", "r5", "r6", "r7", "r8", "cc", "memory");
-    }
-}
-
-#endif
-
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
-                                   uint8_t, 1, uint8_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
-                                   uint32_t, 1, uint32_t, 1)
-
-PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888,
-                                     uint32_t, 1, uint32_t, 1)
-
-PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888,
-                                      uint8_t, 1, uint32_t, 1)
-
-PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
-                                        uint16_t, uint16_t)
-PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
-                                        uint32_t, uint32_t)
-
-static const pixman_fast_path_t arm_simd_fast_paths[] =
-{
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, armv6_composite_over_8888_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, armv6_composite_over_8888_n_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, armv6_composite_over_8888_n_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888),
-    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888),
-
-    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8_8),
-
-    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, armv6_composite_over_n_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, armv6_composite_over_n_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
-    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
-
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),
-
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, armv6_8888_8888),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, armv6_8888_8888),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, armv6_8888_8888),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, armv6_8888_8888),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, armv6_8888_8888),
-    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, armv6_8888_8888),
-
-    { PIXMAN_OP_NONE },
-};
-
-pixman_implementation_t *
-_pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
-{
-    pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths);
-
-    return imp;
-}
+/*
+ * Copyright © 2008 Mozilla Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Jeff Muizelaar (jeff@infidigm.net)
+ *
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "pixman-private.h"
+#include "pixman-arm-common.h"
+#include "pixman-inlines.h"
+
+#if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */
+
+void
+pixman_composite_add_8_8_asm_armv6 (int32_t  width,
+				    int32_t  height,
+				    uint8_t *dst_line,
+				    int32_t  dst_stride,
+				    uint8_t *src_line,
+				    int32_t  src_stride)
+{
+    uint8_t *dst, *src;
+    int32_t w;
+    uint8_t s, d;
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	/* ensure both src and dst are properly aligned before doing 32 bit reads
+	 * we'll stay in this loop if src and dst have differing alignments
+	 */
+	while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
+	{
+	    s = *src;
+	    d = *dst;
+	    asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
+	    *dst = d;
+
+	    dst++;
+	    src++;
+	    w--;
+	}
+
+	while (w >= 4)
+	{
+	    asm ("uqadd8 %0, %1, %2"
+		 : "=r" (*(uint32_t*)dst)
+		 : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
+	    dst += 4;
+	    src += 4;
+	    w -= 4;
+	}
+
+	while (w)
+	{
+	    s = *src;
+	    d = *dst;
+	    asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
+	    *dst = d;
+
+	    dst++;
+	    src++;
+	    w--;
+	}
+    }
+
+}
+
+void
+pixman_composite_over_8888_8888_asm_armv6 (int32_t   width,
+                                           int32_t   height,
+                                           uint32_t *dst_line,
+                                           int32_t   dst_stride,
+                                           uint32_t *src_line,
+                                           int32_t   src_stride)
+{
+    uint32_t    *dst;
+    uint32_t    *src;
+    int32_t w;
+    uint32_t component_half = 0x800080;
+    uint32_t upper_component_mask = 0xff00ff00;
+    uint32_t alpha_mask = 0xff;
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+/* #define inner_branch */
+	asm volatile (
+	    "cmp %[w], #0\n\t"
+	    "beq 2f\n\t"
+	    "1:\n\t"
+	    /* load src */
+	    "ldr r5, [%[src]], #4\n\t"
+#ifdef inner_branch
+	    /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
+	     * The 0x0 case also allows us to avoid doing an unecessary data
+	     * write which is more valuable so we only check for that
+	     */
+	    "cmp r5, #0\n\t"
+	    "beq 3f\n\t"
+
+	    /* = 255 - alpha */
+	    "sub r8, %[alpha_mask], r5, lsr #24\n\t"
+
+	    "ldr r4, [%[dest]] \n\t"
+
+#else
+	    "ldr r4, [%[dest]] \n\t"
+
+	    /* = 255 - alpha */
+	    "sub r8, %[alpha_mask], r5, lsr #24\n\t"
+#endif
+	    "uxtb16 r6, r4\n\t"
+	    "uxtb16 r7, r4, ror #8\n\t"
+
+	    /* multiply by 257 and divide by 65536 */
+	    "mla r6, r6, r8, %[component_half]\n\t"
+	    "mla r7, r7, r8, %[component_half]\n\t"
+
+	    "uxtab16 r6, r6, r6, ror #8\n\t"
+	    "uxtab16 r7, r7, r7, ror #8\n\t"
+
+	    /* recombine the 0xff00ff00 bytes of r6 and r7 */
+	    "and r7, r7, %[upper_component_mask]\n\t"
+	    "uxtab16 r6, r7, r6, ror #8\n\t"
+
+	    "uqadd8 r5, r6, r5\n\t"
+
+#ifdef inner_branch
+	    "3:\n\t"
+
+#endif
+	    "str r5, [%[dest]], #4\n\t"
+	    /* increment counter and jmp to top */
+	    "subs	%[w], %[w], #1\n\t"
+	    "bne	1b\n\t"
+	    "2:\n\t"
+	    : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
+	    : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
+	      [alpha_mask] "r" (alpha_mask)
+	    : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
+	    );
+    }
+}
+
+void
+pixman_composite_over_8888_n_8888_asm_armv6 (int32_t   width,
+                                             int32_t   height,
+                                             uint32_t *dst_line,
+                                             int32_t   dst_stride,
+                                             uint32_t *src_line,
+                                             int32_t   src_stride,
+                                             uint32_t  mask)
+{
+    uint32_t *dst;
+    uint32_t *src;
+    int32_t w;
+    uint32_t component_half = 0x800080;
+    uint32_t alpha_mask = 0xff;
+
+    mask = (mask) >> 24;
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+/* #define inner_branch */
+	asm volatile (
+	    "cmp %[w], #0\n\t"
+	    "beq 2f\n\t"
+	    "1:\n\t"
+	    /* load src */
+	    "ldr r5, [%[src]], #4\n\t"
+#ifdef inner_branch
+	    /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
+	     * The 0x0 case also allows us to avoid doing an unecessary data
+	     * write which is more valuable so we only check for that
+	     */
+	    "cmp r5, #0\n\t"
+	    "beq 3f\n\t"
+
+#endif
+	    "ldr r4, [%[dest]] \n\t"
+
+	    "uxtb16 r6, r5\n\t"
+	    "uxtb16 r7, r5, ror #8\n\t"
+
+	    /* multiply by alpha (r8) then by 257 and divide by 65536 */
+	    "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
+	    "mla r7, r7, %[mask_alpha], %[component_half]\n\t"
+
+	    "uxtab16 r6, r6, r6, ror #8\n\t"
+	    "uxtab16 r7, r7, r7, ror #8\n\t"
+
+	    "uxtb16 r6, r6, ror #8\n\t"
+	    "uxtb16 r7, r7, ror #8\n\t"
+
+	    /* recombine */
+	    "orr r5, r6, r7, lsl #8\n\t"
+
+	    "uxtb16 r6, r4\n\t"
+	    "uxtb16 r7, r4, ror #8\n\t"
+
+	    /* 255 - alpha */
+	    "sub r8, %[alpha_mask], r5, lsr #24\n\t"
+
+	    /* multiply by alpha (r8) then by 257 and divide by 65536 */
+	    "mla r6, r6, r8, %[component_half]\n\t"
+	    "mla r7, r7, r8, %[component_half]\n\t"
+
+	    "uxtab16 r6, r6, r6, ror #8\n\t"
+	    "uxtab16 r7, r7, r7, ror #8\n\t"
+
+	    "uxtb16 r6, r6, ror #8\n\t"
+	    "uxtb16 r7, r7, ror #8\n\t"
+
+	    /* recombine */
+	    "orr r6, r6, r7, lsl #8\n\t"
+
+	    "uqadd8 r5, r6, r5\n\t"
+
+#ifdef inner_branch
+	    "3:\n\t"
+
+#endif
+	    "str r5, [%[dest]], #4\n\t"
+	    /* increment counter and jmp to top */
+	    "subs	%[w], %[w], #1\n\t"
+	    "bne	1b\n\t"
+	    "2:\n\t"
+	    : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
+	    : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
+	      [alpha_mask] "r" (alpha_mask)
+	    : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
+	    );
+    }
+}
+
+void
+pixman_composite_over_n_8_8888_asm_armv6 (int32_t   width,
+                                          int32_t   height,
+                                          uint32_t *dst_line,
+                                          int32_t   dst_stride,
+                                          uint32_t  src,
+                                          int32_t   unused,
+                                          uint8_t  *mask_line,
+                                          int32_t   mask_stride)
+{
+    uint32_t  srca;
+    uint32_t *dst;
+    uint8_t  *mask;
+    int32_t w;
+
+    srca = src >> 24;
+
+    uint32_t component_mask = 0xff00ff;
+    uint32_t component_half = 0x800080;
+
+    uint32_t src_hi = (src >> 8) & component_mask;
+    uint32_t src_lo = src & component_mask;
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+/* #define inner_branch */
+	asm volatile (
+	    "cmp %[w], #0\n\t"
+	    "beq 2f\n\t"
+	    "1:\n\t"
+	    /* load mask */
+	    "ldrb r5, [%[mask]], #1\n\t"
+#ifdef inner_branch
+	    /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
+	     * The 0x0 case also allows us to avoid doing an unecessary data
+	     * write which is more valuable so we only check for that
+	     */
+	    "cmp r5, #0\n\t"
+	    "beq 3f\n\t"
+
+#endif
+	    "ldr r4, [%[dest]] \n\t"
+
+	    /* multiply by alpha (r8) then by 257 and divide by 65536 */
+	    "mla r6, %[src_lo], r5, %[component_half]\n\t"
+	    "mla r7, %[src_hi], r5, %[component_half]\n\t"
+
+	    "uxtab16 r6, r6, r6, ror #8\n\t"
+	    "uxtab16 r7, r7, r7, ror #8\n\t"
+
+	    "uxtb16 r6, r6, ror #8\n\t"
+	    "uxtb16 r7, r7, ror #8\n\t"
+
+	    /* recombine */
+	    "orr r5, r6, r7, lsl #8\n\t"
+
+	    "uxtb16 r6, r4\n\t"
+	    "uxtb16 r7, r4, ror #8\n\t"
+
+	    /* we could simplify this to use 'sub' if we were
+	     * willing to give up a register for alpha_mask
+	     */
+	    "mvn r8, r5\n\t"
+	    "mov r8, r8, lsr #24\n\t"
+
+	    /* multiply by alpha (r8) then by 257 and divide by 65536 */
+	    "mla r6, r6, r8, %[component_half]\n\t"
+	    "mla r7, r7, r8, %[component_half]\n\t"
+
+	    "uxtab16 r6, r6, r6, ror #8\n\t"
+	    "uxtab16 r7, r7, r7, ror #8\n\t"
+
+	    "uxtb16 r6, r6, ror #8\n\t"
+	    "uxtb16 r7, r7, ror #8\n\t"
+
+	    /* recombine */
+	    "orr r6, r6, r7, lsl #8\n\t"
+
+	    "uqadd8 r5, r6, r5\n\t"
+
+#ifdef inner_branch
+	    "3:\n\t"
+
+#endif
+	    "str r5, [%[dest]], #4\n\t"
+	    /* increment counter and jmp to top */
+	    "subs	%[w], %[w], #1\n\t"
+	    "bne	1b\n\t"
+	    "2:\n\t"
+	    : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)
+	    : [component_half] "r" (component_half),
+	      [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
+	    : "r4", "r5", "r6", "r7", "r8", "cc", "memory");
+    }
+}
+
+#endif
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
+                                   uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888,
+                                     uint32_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888,
+                                      uint8_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
+                                        uint16_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
+                                        uint32_t, uint32_t)
+
+static const pixman_fast_path_t arm_simd_fast_paths[] =
+{
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, armv6_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, armv6_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, armv6_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888),
+
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8_8),
+
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, armv6_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, armv6_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, armv6_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, armv6_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, armv6_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, armv6_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, armv6_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, armv6_8888_8888),
+
+    { PIXMAN_OP_NONE },
+};
+
+pixman_implementation_t *
+_pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths);
+
+    return imp;
+}
diff --git a/pixman/pixman/pixman-bits-image.c b/pixman/pixman/pixman-bits-image.c
index 40ccaa8bd..f540c76e1 100644
--- a/pixman/pixman/pixman-bits-image.c
+++ b/pixman/pixman/pixman-bits-image.c
@@ -1,1581 +1,1463 @@
-/*
- * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
- *             2005 Lars Knoll & Zack Rusin, Trolltech
- *             2008 Aaron Plattner, NVIDIA Corporation
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007, 2009 Red Hat, Inc.
- * Copyright © 2008 André Tupinambá <andrelrt@gmail.com>
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Keith Packard not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission.  Keith Packard makes no
- * representations about the suitability of this software for any purpose.  It
- * is provided "as is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "pixman-private.h"
-#include "pixman-combine32.h"
-
-/*
- * By default, just evaluate the image at 32bpp and expand.  Individual image
- * types can plug in a better scanline getter if they want to. For example
- * we  could produce smoother gradients by evaluating them at higher color
- * depth, but that's a project for the future.
- */
-static void
-_pixman_image_get_scanline_generic_64 (pixman_image_t * image,
-                                       int              x,
-                                       int              y,
-                                       int              width,
-                                       uint32_t *       buffer,
-                                       const uint32_t * mask)
-{
-    uint32_t *mask8 = NULL;
-
-    /* Contract the mask image, if one exists, so that the 32-bit fetch
-     * function can use it.
-     */
-    if (mask)
-    {
-	mask8 = pixman_malloc_ab (width, sizeof(uint32_t));
-	if (!mask8)
-	    return;
-
-	pixman_contract (mask8, (uint64_t *)mask, width);
-    }
-
-    /* Fetch the source image into the first half of buffer. */
-    image->bits.get_scanline_32 (image, x, y, width, (uint32_t*)buffer, mask8);
-
-    /* Expand from 32bpp to 64bpp in place. */
-    pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, width);
-
-    free (mask8);
-}
-
-/* Fetch functions */
-
-static force_inline uint32_t
-fetch_pixel_no_alpha (bits_image_t *image,
-		      int x, int y, pixman_bool_t check_bounds)
-{
-    if (check_bounds &&
-	(x < 0 || x >= image->width || y < 0 || y >= image->height))
-    {
-	return 0;
-    }
-
-    return image->fetch_pixel_32 (image, x, y);
-}
-
-typedef uint32_t (* get_pixel_t) (bits_image_t *image,
-				  int x, int y, pixman_bool_t check_bounds);
-
-static force_inline void
-repeat (pixman_repeat_t repeat, int size, int *coord)
-{
-    switch (repeat)
-    {
-    case PIXMAN_REPEAT_NORMAL:
-	*coord = MOD (*coord, size);
-	break;
-
-    case PIXMAN_REPEAT_PAD:
-	*coord = CLIP (*coord, 0, size - 1);
-	break;
-
-    case PIXMAN_REPEAT_REFLECT:
-	*coord = MOD (*coord, size * 2);
-
-	if (*coord >= size)
-	    *coord = size * 2 - *coord - 1;
-	break;
-
-    case PIXMAN_REPEAT_NONE:
-	break;
-
-    default:
-        break;
-    }
-}
-
-static force_inline uint32_t
-bits_image_fetch_pixel_nearest (bits_image_t   *image,
-				pixman_fixed_t  x,
-				pixman_fixed_t  y,
-				get_pixel_t	get_pixel)
-{
-    int x0 = pixman_fixed_to_int (x - pixman_fixed_e);
-    int y0 = pixman_fixed_to_int (y - pixman_fixed_e);
-
-    if (image->common.repeat != PIXMAN_REPEAT_NONE)
-    {
-	repeat (image->common.repeat, image->width, &x0);
-	repeat (image->common.repeat, image->height, &y0);
-
-	return get_pixel (image, x0, y0, FALSE);
-    }
-    else
-    {
-	return get_pixel (image, x0, y0, TRUE);
-    }
-}
-
-#if SIZEOF_LONG > 4
-
-static force_inline uint32_t
-bilinear_interpolation (uint32_t tl, uint32_t tr,
-			uint32_t bl, uint32_t br,
-			int distx, int disty)
-{
-    uint64_t distxy, distxiy, distixy, distixiy;
-    uint64_t tl64, tr64, bl64, br64;
-    uint64_t f, r;
-
-    distxy = distx * disty;
-    distxiy = distx * (256 - disty);
-    distixy = (256 - distx) * disty;
-    distixiy = (256 - distx) * (256 - disty);
-
-    /* Alpha and Blue */
-    tl64 = tl & 0xff0000ff;
-    tr64 = tr & 0xff0000ff;
-    bl64 = bl & 0xff0000ff;
-    br64 = br & 0xff0000ff;
-
-    f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
-    r = f & 0x0000ff0000ff0000ull;
-
-    /* Red and Green */
-    tl64 = tl;
-    tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull);
-
-    tr64 = tr;
-    tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull);
-
-    bl64 = bl;
-    bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull);
-
-    br64 = br;
-    br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull);
-
-    f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
-    r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull);
-
-    return (uint32_t)(r >> 16);
-}
-
-#else
-
-static force_inline uint32_t
-bilinear_interpolation (uint32_t tl, uint32_t tr,
-			uint32_t bl, uint32_t br,
-			int distx, int disty)
-{
-    int distxy, distxiy, distixy, distixiy;
-    uint32_t f, r;
-
-    distxy = distx * disty;
-    distxiy = (distx << 8) - distxy;	/* distx * (256 - disty) */
-    distixy = (disty << 8) - distxy;	/* disty * (256 - distx) */
-    distixiy =
-	256 * 256 - (disty << 8) -
-	(distx << 8) + distxy;		/* (256 - distx) * (256 - disty) */
-
-    /* Blue */
-    r = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
-      + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy;
-
-    /* Green */
-    f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
-      + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy;
-    r |= f & 0xff000000;
-
-    tl >>= 16;
-    tr >>= 16;
-    bl >>= 16;
-    br >>= 16;
-    r >>= 16;
-
-    /* Red */
-    f = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
-      + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy;
-    r |= f & 0x00ff0000;
-
-    /* Alpha */
-    f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
-      + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy;
-    r |= f & 0xff000000;
-
-    return r;
-}
-
-#endif
-
-static force_inline uint32_t
-bits_image_fetch_pixel_bilinear (bits_image_t   *image,
-				 pixman_fixed_t  x,
-				 pixman_fixed_t  y,
-				 get_pixel_t	 get_pixel)
-{
-    pixman_repeat_t repeat_mode = image->common.repeat;
-    int width = image->width;
-    int height = image->height;
-    int x1, y1, x2, y2;
-    uint32_t tl, tr, bl, br;
-    int32_t distx, disty;
-
-    x1 = x - pixman_fixed_1 / 2;
-    y1 = y - pixman_fixed_1 / 2;
-
-    distx = (x1 >> 8) & 0xff;
-    disty = (y1 >> 8) & 0xff;
-
-    x1 = pixman_fixed_to_int (x1);
-    y1 = pixman_fixed_to_int (y1);
-    x2 = x1 + 1;
-    y2 = y1 + 1;
-
-    if (repeat_mode != PIXMAN_REPEAT_NONE)
-    {
-	repeat (repeat_mode, width, &x1);
-	repeat (repeat_mode, height, &y1);
-	repeat (repeat_mode, width, &x2);
-	repeat (repeat_mode, height, &y2);
-
-	tl = get_pixel (image, x1, y1, FALSE);
-	bl = get_pixel (image, x1, y2, FALSE);
-	tr = get_pixel (image, x2, y1, FALSE);
-	br = get_pixel (image, x2, y2, FALSE);
-    }
-    else
-    {
-	tl = get_pixel (image, x1, y1, TRUE);
-	tr = get_pixel (image, x2, y1, TRUE);
-	bl = get_pixel (image, x1, y2, TRUE);
-	br = get_pixel (image, x2, y2, TRUE);
-    }
-
-    return bilinear_interpolation (tl, tr, bl, br, distx, disty);
-}
-
-static void
-bits_image_fetch_bilinear_no_repeat_8888 (pixman_image_t * ima,
-					  int              offset,
-					  int              line,
-					  int              width,
-					  uint32_t *       buffer,
-					  const uint32_t * mask)
-{
-    bits_image_t *bits = &ima->bits;
-    pixman_fixed_t x_top, x_bottom, x;
-    pixman_fixed_t ux_top, ux_bottom, ux;
-    pixman_vector_t v;
-    uint32_t top_mask, bottom_mask;
-    uint32_t *top_row;
-    uint32_t *bottom_row;
-    uint32_t *end;
-    uint32_t zero[2] = { 0, 0 };
-    uint32_t one = 1;
-    int y, y1, y2;
-    int disty;
-    int mask_inc;
-    int w;
-
-    /* reference point is the center of the pixel */
-    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    if (!pixman_transform_point_3d (bits->common.transform, &v))
-	return;
-
-    ux = ux_top = ux_bottom = bits->common.transform->matrix[0][0];
-    x = x_top = x_bottom = v.vector[0] - pixman_fixed_1/2;
-
-    y = v.vector[1] - pixman_fixed_1/2;
-    disty = (y >> 8) & 0xff;
-
-    /* Load the pointers to the first and second lines from the source
-     * image that bilinear code must read.
-     *
-     * The main trick in this code is about the check if any line are
-     * outside of the image;
-     *
-     * When I realize that a line (any one) is outside, I change
-     * the pointer to a dummy area with zeros. Once I change this, I
-     * must be sure the pointer will not change, so I set the
-     * variables to each pointer increments inside the loop.
-     */
-    y1 = pixman_fixed_to_int (y);
-    y2 = y1 + 1;
-
-    if (y1 < 0 || y1 >= bits->height)
-    {
-	top_row = zero;
-	x_top = 0;
-	ux_top = 0;
-    }
-    else
-    {
-	top_row = bits->bits + y1 * bits->rowstride;
-	x_top = x;
-	ux_top = ux;
-    }
-
-    if (y2 < 0 || y2 >= bits->height)
-    {
-	bottom_row = zero;
-	x_bottom = 0;
-	ux_bottom = 0;
-    }
-    else
-    {
-	bottom_row = bits->bits + y2 * bits->rowstride;
-	x_bottom = x;
-	ux_bottom = ux;
-    }
-
-    /* Instead of checking whether the operation uses the mast in
-     * each loop iteration, verify this only once and prepare the
-     * variables to make the code smaller inside the loop.
-     */
-    if (!mask)
-    {
-        mask_inc = 0;
-        mask = &one;
-    }
-    else
-    {
-        /* If have a mask, prepare the variables to check it */
-        mask_inc = 1;
-    }
-
-    /* If both are zero, then the whole thing is zero */
-    if (top_row == zero && bottom_row == zero)
-    {
-	memset (buffer, 0, width * sizeof (uint32_t));
-	return;
-    }
-    else if (bits->format == PIXMAN_x8r8g8b8)
-    {
-	if (top_row == zero)
-	{
-	    top_mask = 0;
-	    bottom_mask = 0xff000000;
-	}
-	else if (bottom_row == zero)
-	{
-	    top_mask = 0xff000000;
-	    bottom_mask = 0;
-	}
-	else
-	{
-	    top_mask = 0xff000000;
-	    bottom_mask = 0xff000000;
-	}
-    }
-    else
-    {
-	top_mask = 0;
-	bottom_mask = 0;
-    }
-
-    end = buffer + width;
-
-    /* Zero fill to the left of the image */
-    while (buffer < end && x < pixman_fixed_minus_1)
-    {
-	*buffer++ = 0;
-	x += ux;
-	x_top += ux_top;
-	x_bottom += ux_bottom;
-	mask += mask_inc;
-    }
-
-    /* Left edge
-     */
-    while (buffer < end && x < 0)
-    {
-	uint32_t tr, br;
-	int32_t distx;
-
-	tr = top_row[pixman_fixed_to_int (x_top) + 1] | top_mask;
-	br = bottom_row[pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
-
-	distx = (x >> 8) & 0xff;
-
-	*buffer++ = bilinear_interpolation (0, tr, 0, br, distx, disty);
-
-	x += ux;
-	x_top += ux_top;
-	x_bottom += ux_bottom;
-	mask += mask_inc;
-    }
-
-    /* Main part */
-    w = pixman_int_to_fixed (bits->width - 1);
-
-    while (buffer < end  &&  x < w)
-    {
-	if (*mask)
-	{
-	    uint32_t tl, tr, bl, br;
-	    int32_t distx;
-
-	    tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
-	    tr = top_row [pixman_fixed_to_int (x_top) + 1] | top_mask;
-	    bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
-	    br = bottom_row [pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
-
-	    distx = (x >> 8) & 0xff;
-
-	    *buffer = bilinear_interpolation (tl, tr, bl, br, distx, disty);
-	}
-
-	buffer++;
-	x += ux;
-	x_top += ux_top;
-	x_bottom += ux_bottom;
-	mask += mask_inc;
-    }
-
-    /* Right Edge */
-    w = pixman_int_to_fixed (bits->width);
-    while (buffer < end  &&  x < w)
-    {
-	if (*mask)
-	{
-	    uint32_t tl, bl;
-	    int32_t distx;
-
-	    tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
-	    bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
-
-	    distx = (x >> 8) & 0xff;
-
-	    *buffer = bilinear_interpolation (tl, 0, bl, 0, distx, disty);
-	}
-
-	buffer++;
-	x += ux;
-	x_top += ux_top;
-	x_bottom += ux_bottom;
-	mask += mask_inc;
-    }
-
-    /* Zero fill to the left of the image */
-    while (buffer < end)
-	*buffer++ = 0;
-}
-
-static force_inline uint32_t
-bits_image_fetch_pixel_convolution (bits_image_t   *image,
-				    pixman_fixed_t  x,
-				    pixman_fixed_t  y,
-				    get_pixel_t     get_pixel)
-{
-    pixman_fixed_t *params = image->common.filter_params;
-    int x_off = (params[0] - pixman_fixed_1) >> 1;
-    int y_off = (params[1] - pixman_fixed_1) >> 1;
-    int32_t cwidth = pixman_fixed_to_int (params[0]);
-    int32_t cheight = pixman_fixed_to_int (params[1]);
-    int32_t srtot, sgtot, sbtot, satot;
-    int32_t i, j, x1, x2, y1, y2;
-    pixman_repeat_t repeat_mode = image->common.repeat;
-    int width = image->width;
-    int height = image->height;
-
-    params += 2;
-
-    x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off);
-    y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off);
-    x2 = x1 + cwidth;
-    y2 = y1 + cheight;
-
-    srtot = sgtot = sbtot = satot = 0;
-
-    for (i = y1; i < y2; ++i)
-    {
-	for (j = x1; j < x2; ++j)
-	{
-	    int rx = j;
-	    int ry = i;
-
-	    pixman_fixed_t f = *params;
-
-	    if (f)
-	    {
-		uint32_t pixel;
-
-		if (repeat_mode != PIXMAN_REPEAT_NONE)
-		{
-		    repeat (repeat_mode, width, &rx);
-		    repeat (repeat_mode, height, &ry);
-
-		    pixel = get_pixel (image, rx, ry, FALSE);
-		}
-		else
-		{
-		    pixel = get_pixel (image, rx, ry, TRUE);
-		}
-
-		srtot += RED_8 (pixel) * f;
-		sgtot += GREEN_8 (pixel) * f;
-		sbtot += BLUE_8 (pixel) * f;
-		satot += ALPHA_8 (pixel) * f;
-	    }
-
-	    params++;
-	}
-    }
-
-    satot >>= 16;
-    srtot >>= 16;
-    sgtot >>= 16;
-    sbtot >>= 16;
-
-    satot = CLIP (satot, 0, 0xff);
-    srtot = CLIP (srtot, 0, 0xff);
-    sgtot = CLIP (sgtot, 0, 0xff);
-    sbtot = CLIP (sbtot, 0, 0xff);
-
-    return ((satot << 24) | (srtot << 16) | (sgtot <<  8) | (sbtot));
-}
-
-static force_inline uint32_t
-bits_image_fetch_pixel_filtered (bits_image_t *image,
-				 pixman_fixed_t x,
-				 pixman_fixed_t y,
-				 get_pixel_t    get_pixel)
-{
-    switch (image->common.filter)
-    {
-    case PIXMAN_FILTER_NEAREST:
-    case PIXMAN_FILTER_FAST:
-	return bits_image_fetch_pixel_nearest (image, x, y, get_pixel);
-	break;
-
-    case PIXMAN_FILTER_BILINEAR:
-    case PIXMAN_FILTER_GOOD:
-    case PIXMAN_FILTER_BEST:
-	return bits_image_fetch_pixel_bilinear (image, x, y, get_pixel);
-	break;
-
-    case PIXMAN_FILTER_CONVOLUTION:
-	return bits_image_fetch_pixel_convolution (image, x, y, get_pixel);
-	break;
-
-    default:
-        break;
-    }
-
-    return 0;
-}
-
-static void
-bits_image_fetch_affine_no_alpha (pixman_image_t * image,
-				  int              offset,
-				  int              line,
-				  int              width,
-				  uint32_t *       buffer,
-				  const uint32_t * mask)
-{
-    pixman_fixed_t x, y;
-    pixman_fixed_t ux, uy;
-    pixman_vector_t v;
-    int i;
-
-    /* reference point is the center of the pixel */
-    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    if (image->common.transform)
-    {
-	if (!pixman_transform_point_3d (image->common.transform, &v))
-	    return;
-
-	ux = image->common.transform->matrix[0][0];
-	uy = image->common.transform->matrix[1][0];
-    }
-    else
-    {
-	ux = pixman_fixed_1;
-	uy = 0;
-    }
-
-    x = v.vector[0];
-    y = v.vector[1];
-
-    for (i = 0; i < width; ++i)
-    {
-	if (!mask || mask[i])
-	{
-	    buffer[i] = bits_image_fetch_pixel_filtered (
-		&image->bits, x, y, fetch_pixel_no_alpha);
-	}
-
-	x += ux;
-	y += uy;
-    }
-}
-
-/* General fetcher */
-static force_inline uint32_t
-fetch_pixel_general (bits_image_t *image, int x, int y, pixman_bool_t check_bounds)
-{
-    uint32_t pixel;
-
-    if (check_bounds &&
-	(x < 0 || x >= image->width || y < 0 || y >= image->height))
-    {
-	return 0;
-    }
-
-    pixel = image->fetch_pixel_32 (image, x, y);
-
-    if (image->common.alpha_map)
-    {
-	uint32_t pixel_a;
-
-	x -= image->common.alpha_origin_x;
-	y -= image->common.alpha_origin_y;
-
-	if (x < 0 || x >= image->common.alpha_map->width ||
-	    y < 0 || y >= image->common.alpha_map->height)
-	{
-	    pixel_a = 0;
-	}
-	else
-	{
-	    pixel_a = image->common.alpha_map->fetch_pixel_32 (
-		image->common.alpha_map, x, y);
-
-	    pixel_a = ALPHA_8 (pixel_a);
-	}
-
-	pixel &= 0x00ffffff;
-	pixel |= (pixel_a << 24);
-    }
-
-    return pixel;
-}
-
-static void
-bits_image_fetch_general (pixman_image_t * image,
-			  int              offset,
-			  int              line,
-			  int              width,
-			  uint32_t *       buffer,
-			  const uint32_t * mask)
-{
-    pixman_fixed_t x, y, w;
-    pixman_fixed_t ux, uy, uw;
-    pixman_vector_t v;
-    int i;
-
-    /* reference point is the center of the pixel */
-    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    if (image->common.transform)
-    {
-	if (!pixman_transform_point_3d (image->common.transform, &v))
-	    return;
-
-	ux = image->common.transform->matrix[0][0];
-	uy = image->common.transform->matrix[1][0];
-	uw = image->common.transform->matrix[2][0];
-    }
-    else
-    {
-	ux = pixman_fixed_1;
-	uy = 0;
-	uw = 0;
-    }
-
-    x = v.vector[0];
-    y = v.vector[1];
-    w = v.vector[2];
-
-    for (i = 0; i < width; ++i)
-    {
-	pixman_fixed_t x0, y0;
-
-	if (!mask || mask[i])
-	{
-	    if (w != 0)
-	    {
-		x0 = ((pixman_fixed_48_16_t)x << 16) / w;
-		y0 = ((pixman_fixed_48_16_t)y << 16) / w;
-	    }
-	    else
-	    {
-		x0 = 0;
-		y0 = 0;
-	    }
-
-	    buffer[i] = bits_image_fetch_pixel_filtered (
-		&image->bits, x0, y0, fetch_pixel_general);
-	}
-
-	x += ux;
-	y += uy;
-	w += uw;
-    }
-}
-
-static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
-
-typedef uint32_t (* convert_pixel_t) (const uint8_t *row, int x);
-
-static force_inline void
-bits_image_fetch_bilinear_affine (pixman_image_t * image,
-				  int              offset,
-				  int              line,
-				  int              width,
-				  uint32_t *       buffer,
-				  const uint32_t * mask,
-
-				  convert_pixel_t	convert_pixel,
-				  pixman_format_code_t	format,
-				  pixman_repeat_t	repeat_mode)
-{
-    pixman_fixed_t x, y;
-    pixman_fixed_t ux, uy;
-    pixman_vector_t v;
-    bits_image_t *bits = &image->bits;
-    int i;
-
-    /* reference point is the center of the pixel */
-    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    if (!pixman_transform_point_3d (image->common.transform, &v))
-	return;
-
-    ux = image->common.transform->matrix[0][0];
-    uy = image->common.transform->matrix[1][0];
-
-    x = v.vector[0];
-    y = v.vector[1];
-
-    for (i = 0; i < width; ++i)
-    {
-	int x1, y1, x2, y2;
-	uint32_t tl, tr, bl, br;
-	int32_t distx, disty;
-	int width = image->bits.width;
-	int height = image->bits.height;
-	const uint8_t *row1;
-	const uint8_t *row2;
-
-	if (mask && !mask[i])
-	    goto next;
-
-	x1 = x - pixman_fixed_1 / 2;
-	y1 = y - pixman_fixed_1 / 2;
-
-	distx = (x1 >> 8) & 0xff;
-	disty = (y1 >> 8) & 0xff;
-
-	y1 = pixman_fixed_to_int (y1);
-	y2 = y1 + 1;
-	x1 = pixman_fixed_to_int (x1);
-	x2 = x1 + 1;
-
-	if (repeat_mode != PIXMAN_REPEAT_NONE)
-	{
-	    uint32_t mask;
-
-	    mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
-
-	    repeat (repeat_mode, width, &x1);
-	    repeat (repeat_mode, height, &y1);
-	    repeat (repeat_mode, width, &x2);
-	    repeat (repeat_mode, height, &y2);
-
-	    row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
-	    row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
-
-	    tl = convert_pixel (row1, x1) | mask;
-	    tr = convert_pixel (row1, x2) | mask;
-	    bl = convert_pixel (row2, x1) | mask;
-	    br = convert_pixel (row2, x2) | mask;
-	}
-	else
-	{
-	    uint32_t mask1, mask2;
-	    int bpp;
-
-	    /* Note: PIXMAN_FORMAT_BPP() returns an unsigned value,
-	     * which means if you use it in expressions, those
-	     * expressions become unsigned themselves. Since
-	     * the variables below can be negative in some cases,
-	     * that will lead to crashes on 64 bit architectures.
-	     *
-	     * So this line makes sure bpp is signed
-	     */
-	    bpp = PIXMAN_FORMAT_BPP (format);
-
-	    if (x1 >= width || x2 < 0 || y1 >= height || y2 < 0)
-	    {
-		buffer[i] = 0;
-		goto next;
-	    }
-
-	    if (y2 == 0)
-	    {
-		row1 = zero;
-		mask1 = 0;
-	    }
-	    else
-	    {
-		row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
-		row1 += bpp / 8 * x1;
-
-		mask1 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
-	    }
-
-	    if (y1 == height - 1)
-	    {
-		row2 = zero;
-		mask2 = 0;
-	    }
-	    else
-	    {
-		row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
-		row2 += bpp / 8 * x1;
-
-		mask2 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
-	    }
-
-	    if (x2 == 0)
-	    {
-		tl = 0;
-		bl = 0;
-	    }
-	    else
-	    {
-		tl = convert_pixel (row1, 0) | mask1;
-		bl = convert_pixel (row2, 0) | mask2;
-	    }
-
-	    if (x1 == width - 1)
-	    {
-		tr = 0;
-		br = 0;
-	    }
-	    else
-	    {
-		tr = convert_pixel (row1, 1) | mask1;
-		br = convert_pixel (row2, 1) | mask2;
-	    }
-	}
-
-	buffer[i] = bilinear_interpolation (
-	    tl, tr, bl, br, distx, disty);
-
-    next:
-	x += ux;
-	y += uy;
-    }
-}
-
-static force_inline void
-bits_image_fetch_nearest_affine (pixman_image_t * image,
-				 int              offset,
-				 int              line,
-				 int              width,
-				 uint32_t *       buffer,
-				 const uint32_t * mask,
-				 
-				 convert_pixel_t	convert_pixel,
-				 pixman_format_code_t	format,
-				 pixman_repeat_t	repeat_mode)
-{
-    pixman_fixed_t x, y;
-    pixman_fixed_t ux, uy;
-    pixman_vector_t v;
-    bits_image_t *bits = &image->bits;
-    int i;
-
-    /* reference point is the center of the pixel */
-    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    if (!pixman_transform_point_3d (image->common.transform, &v))
-	return;
-
-    ux = image->common.transform->matrix[0][0];
-    uy = image->common.transform->matrix[1][0];
-
-    x = v.vector[0];
-    y = v.vector[1];
-
-    for (i = 0; i < width; ++i)
-    {
-	int width, height, x0, y0;
-	const uint8_t *row;
-
-	if (mask && !mask[i])
-	    goto next;
-	
-	width = image->bits.width;
-	height = image->bits.height;
-	x0 = pixman_fixed_to_int (x - pixman_fixed_e);
-	y0 = pixman_fixed_to_int (y - pixman_fixed_e);
-
-	if (repeat_mode == PIXMAN_REPEAT_NONE &&
-	    (y0 < 0 || y0 >= height || x0 < 0 || x0 >= width))
-	{
-	    buffer[i] = 0;
-	}
-	else
-	{
-	    uint32_t mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
-
-	    if (repeat_mode != PIXMAN_REPEAT_NONE)
-	    {
-		repeat (repeat_mode, width, &x0);
-		repeat (repeat_mode, height, &y0);
-	    }
-
-	    row = (uint8_t *)bits->bits + bits->rowstride * 4 * y0;
-
-	    buffer[i] = convert_pixel (row, x0) | mask;
-	}
-
-    next:
-	x += ux;
-	y += uy;
-    }
-}
-
-static force_inline uint32_t
-convert_a8r8g8b8 (const uint8_t *row, int x)
-{
-    return *(((uint32_t *)row) + x);
-}
-
-static force_inline uint32_t
-convert_x8r8g8b8 (const uint8_t *row, int x)
-{
-    return *(((uint32_t *)row) + x);
-}
-
-static force_inline uint32_t
-convert_a8 (const uint8_t *row, int x)
-{
-    return *(row + x) << 24;
-}
-
-static force_inline uint32_t
-convert_r5g6b5 (const uint8_t *row, int x)
-{
-    return CONVERT_0565_TO_0888 (*((uint16_t *)row + x));
-}
-
-#define MAKE_BILINEAR_FETCHER(name, format, repeat_mode)		\
-    static void								\
-    bits_image_fetch_bilinear_affine_ ## name (pixman_image_t *image,	\
-					       int              offset,	\
-					       int              line,	\
-					       int              width,	\
-					       uint32_t *       buffer,	\
-					       const uint32_t * mask)	\
-    {									\
-	bits_image_fetch_bilinear_affine (image, offset, line,		\
-					  width, buffer, mask,		\
-					  convert_ ## format,		\
-					  PIXMAN_ ## format,		\
-					  repeat_mode);			\
-    }
-
-#define MAKE_NEAREST_FETCHER(name, format, repeat_mode)			\
-    static void								\
-    bits_image_fetch_nearest_affine_ ## name (pixman_image_t *image,	\
-					      int              offset,	\
-					      int              line,	\
-					      int              width,	\
-					      uint32_t *       buffer,	\
-					      const uint32_t * mask)	\
-    {									\
-	bits_image_fetch_nearest_affine (image, offset, line,		\
-					 width, buffer, mask,		\
-					 convert_ ## format,		\
-					 PIXMAN_ ## format,		\
-					 repeat_mode);			\
-    }
-
-#define MAKE_FETCHERS(name, format, repeat_mode)			\
-    MAKE_NEAREST_FETCHER (name, format, repeat_mode)			\
-    MAKE_BILINEAR_FETCHER (name, format, repeat_mode)
-
-MAKE_FETCHERS (pad_a8r8g8b8,     a8r8g8b8, PIXMAN_REPEAT_PAD)
-MAKE_FETCHERS (none_a8r8g8b8,    a8r8g8b8, PIXMAN_REPEAT_NONE)
-MAKE_FETCHERS (reflect_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_REFLECT)
-MAKE_FETCHERS (normal_a8r8g8b8,  a8r8g8b8, PIXMAN_REPEAT_NORMAL)
-MAKE_FETCHERS (pad_x8r8g8b8,     x8r8g8b8, PIXMAN_REPEAT_PAD)
-MAKE_FETCHERS (none_x8r8g8b8,    x8r8g8b8, PIXMAN_REPEAT_NONE)
-MAKE_FETCHERS (reflect_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_REFLECT)
-MAKE_FETCHERS (normal_x8r8g8b8,  x8r8g8b8, PIXMAN_REPEAT_NORMAL)
-MAKE_FETCHERS (pad_a8,           a8,       PIXMAN_REPEAT_PAD)
-MAKE_FETCHERS (none_a8,          a8,       PIXMAN_REPEAT_NONE)
-MAKE_FETCHERS (reflect_a8,	 a8,       PIXMAN_REPEAT_REFLECT)
-MAKE_FETCHERS (normal_a8,	 a8,       PIXMAN_REPEAT_NORMAL)
-MAKE_FETCHERS (pad_r5g6b5,       r5g6b5,   PIXMAN_REPEAT_PAD)
-MAKE_FETCHERS (none_r5g6b5,      r5g6b5,   PIXMAN_REPEAT_NONE)
-MAKE_FETCHERS (reflect_r5g6b5,   r5g6b5,   PIXMAN_REPEAT_REFLECT)
-MAKE_FETCHERS (normal_r5g6b5,    r5g6b5,   PIXMAN_REPEAT_NORMAL)
-
-static void
-bits_image_fetch_solid_32 (pixman_image_t * image,
-                           int              x,
-                           int              y,
-                           int              width,
-                           uint32_t *       buffer,
-                           const uint32_t * mask)
-{
-    uint32_t color;
-    uint32_t *end;
-
-    color = image->bits.fetch_pixel_32 (&image->bits, 0, 0);
-
-    end = buffer + width;
-    while (buffer < end)
-	*(buffer++) = color;
-}
-
-static void
-bits_image_fetch_solid_64 (pixman_image_t * image,
-                           int              x,
-                           int              y,
-                           int              width,
-                           uint32_t *       b,
-                           const uint32_t * unused)
-{
-    uint64_t color;
-    uint64_t *buffer = (uint64_t *)b;
-    uint64_t *end;
-
-    color = image->bits.fetch_pixel_64 (&image->bits, 0, 0);
-
-    end = buffer + width;
-    while (buffer < end)
-	*(buffer++) = color;
-}
-
-static void
-bits_image_fetch_untransformed_repeat_none (bits_image_t *image,
-                                            pixman_bool_t wide,
-                                            int           x,
-                                            int           y,
-                                            int           width,
-                                            uint32_t *    buffer)
-{
-    uint32_t w;
-
-    if (y < 0 || y >= image->height)
-    {
-	memset (buffer, 0, width * (wide? 8 : 4));
-	return;
-    }
-
-    if (x < 0)
-    {
-	w = MIN (width, -x);
-
-	memset (buffer, 0, w * (wide ? 8 : 4));
-
-	width -= w;
-	buffer += w * (wide? 2 : 1);
-	x += w;
-    }
-
-    if (x < image->width)
-    {
-	w = MIN (width, image->width - x);
-
-	if (wide)
-	    image->fetch_scanline_64 ((pixman_image_t *)image, x, y, w, buffer, NULL);
-	else
-	    image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL);
-
-	width -= w;
-	buffer += w * (wide? 2 : 1);
-	x += w;
-    }
-
-    memset (buffer, 0, width * (wide ? 8 : 4));
-}
-
-static void
-bits_image_fetch_untransformed_repeat_normal (bits_image_t *image,
-                                              pixman_bool_t wide,
-                                              int           x,
-                                              int           y,
-                                              int           width,
-                                              uint32_t *    buffer)
-{
-    uint32_t w;
-
-    while (y < 0)
-	y += image->height;
-
-    while (y >= image->height)
-	y -= image->height;
-
-    while (width)
-    {
-	while (x < 0)
-	    x += image->width;
-	while (x >= image->width)
-	    x -= image->width;
-
-	w = MIN (width, image->width - x);
-
-	if (wide)
-	    image->fetch_scanline_64 ((pixman_image_t *)image, x, y, w, buffer, NULL);
-	else
-	    image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL);
-
-	buffer += w * (wide? 2 : 1);
-	x += w;
-	width -= w;
-    }
-}
-
-static void
-bits_image_fetch_untransformed_32 (pixman_image_t * image,
-                                   int              x,
-                                   int              y,
-                                   int              width,
-                                   uint32_t *       buffer,
-                                   const uint32_t * mask)
-{
-    if (image->common.repeat == PIXMAN_REPEAT_NONE)
-    {
-	bits_image_fetch_untransformed_repeat_none (
-	    &image->bits, FALSE, x, y, width, buffer);
-    }
-    else
-    {
-	bits_image_fetch_untransformed_repeat_normal (
-	    &image->bits, FALSE, x, y, width, buffer);
-    }
-}
-
-static void
-bits_image_fetch_untransformed_64 (pixman_image_t * image,
-                                   int              x,
-                                   int              y,
-                                   int              width,
-                                   uint32_t *       buffer,
-                                   const uint32_t * unused)
-{
-    if (image->common.repeat == PIXMAN_REPEAT_NONE)
-    {
-	bits_image_fetch_untransformed_repeat_none (
-	    &image->bits, TRUE, x, y, width, buffer);
-    }
-    else
-    {
-	bits_image_fetch_untransformed_repeat_normal (
-	    &image->bits, TRUE, x, y, width, buffer);
-    }
-}
-
-typedef struct
-{
-    pixman_format_code_t	format;
-    uint32_t			flags;
-    fetch_scanline_t		fetch_32;
-    fetch_scanline_t		fetch_64;
-} fetcher_info_t;
-
-static const fetcher_info_t fetcher_info[] =
-{
-    { PIXMAN_solid,
-      FAST_PATH_NO_ALPHA_MAP,
-      bits_image_fetch_solid_32,
-      bits_image_fetch_solid_64
-    },
-
-    { PIXMAN_any,
-      (FAST_PATH_NO_ALPHA_MAP			|
-       FAST_PATH_ID_TRANSFORM			|
-       FAST_PATH_NO_CONVOLUTION_FILTER		|
-       FAST_PATH_NO_PAD_REPEAT			|
-       FAST_PATH_NO_REFLECT_REPEAT),
-      bits_image_fetch_untransformed_32,
-      bits_image_fetch_untransformed_64
-    },
-
-#define FAST_BILINEAR_FLAGS						\
-    (FAST_PATH_NO_ALPHA_MAP		|				\
-     FAST_PATH_NO_ACCESSORS		|				\
-     FAST_PATH_HAS_TRANSFORM		|				\
-     FAST_PATH_AFFINE_TRANSFORM		|				\
-     FAST_PATH_X_UNIT_POSITIVE		|				\
-     FAST_PATH_Y_UNIT_ZERO		|				\
-     FAST_PATH_NONE_REPEAT		|				\
-     FAST_PATH_BILINEAR_FILTER)
-
-    { PIXMAN_a8r8g8b8,
-      FAST_BILINEAR_FLAGS,
-      bits_image_fetch_bilinear_no_repeat_8888,
-      _pixman_image_get_scanline_generic_64
-    },
-
-    { PIXMAN_x8r8g8b8,
-      FAST_BILINEAR_FLAGS,
-      bits_image_fetch_bilinear_no_repeat_8888,
-      _pixman_image_get_scanline_generic_64
-    },
-
-#define GENERAL_BILINEAR_FLAGS						\
-    (FAST_PATH_NO_ALPHA_MAP		|				\
-     FAST_PATH_NO_ACCESSORS		|				\
-     FAST_PATH_HAS_TRANSFORM		|				\
-     FAST_PATH_AFFINE_TRANSFORM		|				\
-     FAST_PATH_BILINEAR_FILTER)
-
-#define GENERAL_NEAREST_FLAGS						\
-    (FAST_PATH_NO_ALPHA_MAP		|				\
-     FAST_PATH_NO_ACCESSORS		|				\
-     FAST_PATH_HAS_TRANSFORM		|				\
-     FAST_PATH_AFFINE_TRANSFORM		|				\
-     FAST_PATH_NEAREST_FILTER)
-
-#define BILINEAR_AFFINE_FAST_PATH(name, format, repeat)			\
-    { PIXMAN_ ## format,						\
-      GENERAL_BILINEAR_FLAGS | FAST_PATH_ ## repeat ## _REPEAT,		\
-      bits_image_fetch_bilinear_affine_ ## name,			\
-      _pixman_image_get_scanline_generic_64				\
-    },
-
-#define NEAREST_AFFINE_FAST_PATH(name, format, repeat)			\
-    { PIXMAN_ ## format,						\
-      GENERAL_NEAREST_FLAGS | FAST_PATH_ ## repeat ## _REPEAT,		\
-      bits_image_fetch_nearest_affine_ ## name,			\
-      _pixman_image_get_scanline_generic_64				\
-    },
-
-#define AFFINE_FAST_PATHS(name, format, repeat)				\
-    BILINEAR_AFFINE_FAST_PATH(name, format, repeat)			\
-    NEAREST_AFFINE_FAST_PATH(name, format, repeat)
-    
-    AFFINE_FAST_PATHS (pad_a8r8g8b8, a8r8g8b8, PAD)
-    AFFINE_FAST_PATHS (none_a8r8g8b8, a8r8g8b8, NONE)
-    AFFINE_FAST_PATHS (reflect_a8r8g8b8, a8r8g8b8, REFLECT)
-    AFFINE_FAST_PATHS (normal_a8r8g8b8, a8r8g8b8, NORMAL)
-    AFFINE_FAST_PATHS (pad_x8r8g8b8, x8r8g8b8, PAD)
-    AFFINE_FAST_PATHS (none_x8r8g8b8, x8r8g8b8, NONE)
-    AFFINE_FAST_PATHS (reflect_x8r8g8b8, x8r8g8b8, REFLECT)
-    AFFINE_FAST_PATHS (normal_x8r8g8b8, x8r8g8b8, NORMAL)
-    AFFINE_FAST_PATHS (pad_a8, a8, PAD)
-    AFFINE_FAST_PATHS (none_a8, a8, NONE)
-    AFFINE_FAST_PATHS (reflect_a8, a8, REFLECT)
-    AFFINE_FAST_PATHS (normal_a8, a8, NORMAL)
-    AFFINE_FAST_PATHS (pad_r5g6b5, r5g6b5, PAD)
-    AFFINE_FAST_PATHS (none_r5g6b5, r5g6b5, NONE)
-    AFFINE_FAST_PATHS (reflect_r5g6b5, r5g6b5, REFLECT)
-    AFFINE_FAST_PATHS (normal_r5g6b5, r5g6b5, NORMAL)
-
-    /* Affine, no alpha */
-    { PIXMAN_any,
-      (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_HAS_TRANSFORM | FAST_PATH_AFFINE_TRANSFORM),
-      bits_image_fetch_affine_no_alpha,
-      _pixman_image_get_scanline_generic_64
-    },
-
-    /* General */
-    { PIXMAN_any, 0, bits_image_fetch_general, _pixman_image_get_scanline_generic_64 },
-
-    { PIXMAN_null },
-};
-
-static void
-bits_image_property_changed (pixman_image_t *image)
-{
-    uint32_t flags = image->common.flags;
-    pixman_format_code_t format = image->common.extended_format_code;
-    const fetcher_info_t *info;
-
-    _pixman_bits_image_setup_accessors (&image->bits);
-
-    info = fetcher_info;
-    while (info->format != PIXMAN_null)
-    {
-	if ((info->format == format || info->format == PIXMAN_any)	&&
-	    (info->flags & flags) == info->flags)
-	{
-	    image->bits.get_scanline_32 = info->fetch_32;
-	    image->bits.get_scanline_64 = info->fetch_64;
-	    break;
-	}
-
-	info++;
-    }
-}
-
-static uint32_t *
-src_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
-{
-    iter->image->bits.get_scanline_32 (
-	iter->image, iter->x, iter->y++, iter->width, iter->buffer, mask);
-
-    return iter->buffer;
-}
-
-static uint32_t *
-src_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
-{
-    iter->image->bits.get_scanline_64 (
-	iter->image, iter->x, iter->y++, iter->width, iter->buffer, mask);
-
-    return iter->buffer;
-}
-
-void
-_pixman_bits_image_src_iter_init (pixman_image_t *image, pixman_iter_t *iter)
-{
-    if (iter->flags & ITER_NARROW)
-	iter->get_scanline = src_get_scanline_narrow;
-    else
-	iter->get_scanline = src_get_scanline_wide;
-}
-
-static uint32_t *
-dest_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
-{
-    pixman_image_t *image  = iter->image;
-    int             x      = iter->x;
-    int             y      = iter->y;
-    int             width  = iter->width;
-    uint32_t *	    buffer = iter->buffer;
-
-    image->bits.fetch_scanline_32 (image, x, y, width, buffer, mask);
-    if (image->common.alpha_map)
-    {
-	x -= image->common.alpha_origin_x;
-	y -= image->common.alpha_origin_y;
-
-	image->common.alpha_map->fetch_scanline_32 (
-	    (pixman_image_t *)image->common.alpha_map,
-	    x, y, width, buffer, mask);
-    }
-
-    return iter->buffer;
-}
-
-static uint32_t *
-dest_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
-{
-    bits_image_t *  image  = &iter->image->bits;
-    int             x      = iter->x;
-    int             y      = iter->y;
-    int             width  = iter->width;
-    uint32_t *	    buffer = iter->buffer;
-
-    image->fetch_scanline_64 (
-	(pixman_image_t *)image, x, y, width, buffer, mask);
-    if (image->common.alpha_map)
-    {
-	x -= image->common.alpha_origin_x;
-	y -= image->common.alpha_origin_y;
-
-	image->common.alpha_map->fetch_scanline_64 (
-	    (pixman_image_t *)image->common.alpha_map, x, y, width, buffer, mask);
-    }
-
-    return iter->buffer;
-}
-
-static void
-dest_write_back_narrow (pixman_iter_t *iter)
-{
-    bits_image_t *  image  = &iter->image->bits;
-    int             x      = iter->x;
-    int             y      = iter->y;
-    int             width  = iter->width;
-    const uint32_t *buffer = iter->buffer;
-
-    image->store_scanline_32 (image, x, y, width, buffer);
-
-    if (image->common.alpha_map)
-    {
-	x -= image->common.alpha_origin_x;
-	y -= image->common.alpha_origin_y;
-
-	image->common.alpha_map->store_scanline_32 (
-	    image->common.alpha_map, x, y, width, buffer);
-    }
-
-    iter->y++;
-}
-
-static void
-dest_write_back_wide (pixman_iter_t *iter)
-{
-    bits_image_t *  image  = &iter->image->bits;
-    int             x      = iter->x;
-    int             y      = iter->y;
-    int             width  = iter->width;
-    const uint32_t *buffer = iter->buffer;
-
-    image->store_scanline_64 (image, x, y, width, buffer);
-
-    if (image->common.alpha_map)
-    {
-	x -= image->common.alpha_origin_x;
-	y -= image->common.alpha_origin_y;
-
-	image->common.alpha_map->store_scanline_64 (
-	    image->common.alpha_map, x, y, width, buffer);
-    }
-
-    iter->y++;
-}
-
-void
-_pixman_bits_image_dest_iter_init (pixman_image_t *image, pixman_iter_t *iter)
-{
-    if (iter->flags & ITER_NARROW)
-    {
-	if ((iter->flags & (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) ==
-	    (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA))
-	{
-	    iter->get_scanline = _pixman_iter_get_scanline_noop;
-	}
-	else
-	{
-	    iter->get_scanline = dest_get_scanline_narrow;
-	}
-	
-	iter->write_back = dest_write_back_narrow;
-    }
-    else
-    {
-	iter->get_scanline = dest_get_scanline_wide;
-	iter->write_back = dest_write_back_wide;
-    }
-}
-
-static uint32_t *
-create_bits (pixman_format_code_t format,
-             int                  width,
-             int                  height,
-             int *                rowstride_bytes)
-{
-    int stride;
-    int buf_size;
-    int bpp;
-
-    /* what follows is a long-winded way, avoiding any possibility of integer
-     * overflows, of saying:
-     * stride = ((width * bpp + 0x1f) >> 5) * sizeof (uint32_t);
-     */
-
-    bpp = PIXMAN_FORMAT_BPP (format);
-    if (pixman_multiply_overflows_int (width, bpp))
-	return NULL;
-
-    stride = width * bpp;
-    if (pixman_addition_overflows_int (stride, 0x1f))
-	return NULL;
-
-    stride += 0x1f;
-    stride >>= 5;
-
-    stride *= sizeof (uint32_t);
-
-    if (pixman_multiply_overflows_int (height, stride))
-	return NULL;
-
-    buf_size = height * stride;
-
-    if (rowstride_bytes)
-	*rowstride_bytes = stride;
-
-    return calloc (buf_size, 1);
-}
-
-PIXMAN_EXPORT pixman_image_t *
-pixman_image_create_bits (pixman_format_code_t format,
-                          int                  width,
-                          int                  height,
-                          uint32_t *           bits,
-                          int                  rowstride_bytes)
-{
-    pixman_image_t *image;
-    uint32_t *free_me = NULL;
-
-    /* must be a whole number of uint32_t's
-     */
-    return_val_if_fail (
-	bits == NULL || (rowstride_bytes % sizeof (uint32_t)) == 0, NULL);
-
-    return_val_if_fail (PIXMAN_FORMAT_BPP (format) >= PIXMAN_FORMAT_DEPTH (format), NULL);
-
-    if (!bits && width && height)
-    {
-	free_me = bits = create_bits (format, width, height, &rowstride_bytes);
-	if (!bits)
-	    return NULL;
-    }
-
-    image = _pixman_image_allocate ();
-
-    if (!image)
-    {
-	if (free_me)
-	    free (free_me);
-
-	return NULL;
-    }
-
-    image->type = BITS;
-    image->bits.format = format;
-    image->bits.width = width;
-    image->bits.height = height;
-    image->bits.bits = bits;
-    image->bits.free_me = free_me;
-    image->bits.read_func = NULL;
-    image->bits.write_func = NULL;
-
-    /* The rowstride is stored in number of uint32_t */
-    image->bits.rowstride = rowstride_bytes / (int) sizeof (uint32_t);
-
-    image->bits.indexed = NULL;
-
-    image->common.property_changed = bits_image_property_changed;
-
-    _pixman_image_reset_clip_region (image);
-
-    return image;
-}
+/*
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ *             2008 Aaron Plattner, NVIDIA Corporation
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007, 2009 Red Hat, Inc.
+ * Copyright © 2008 André Tupinambá <andrelrt@gmail.com>
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include "pixman-inlines.h"
+
+/*
+ * By default, just evaluate the image at 32bpp and expand.  Individual image
+ * types can plug in a better scanline getter if they want to. For example
+ * we  could produce smoother gradients by evaluating them at higher color
+ * depth, but that's a project for the future.
+ */
+static void
+_pixman_image_get_scanline_generic_64 (pixman_image_t * image,
+                                       int              x,
+                                       int              y,
+                                       int              width,
+                                       uint32_t *       buffer,
+                                       const uint32_t * mask)
+{
+    uint32_t *mask8 = NULL;
+
+    /* Contract the mask image, if one exists, so that the 32-bit fetch
+     * function can use it.
+     */
+    if (mask)
+    {
+	mask8 = pixman_malloc_ab (width, sizeof(uint32_t));
+	if (!mask8)
+	    return;
+
+	pixman_contract (mask8, (uint64_t *)mask, width);
+    }
+
+    /* Fetch the source image into the first half of buffer. */
+    image->bits.get_scanline_32 (image, x, y, width, (uint32_t*)buffer, mask8);
+
+    /* Expand from 32bpp to 64bpp in place. */
+    pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, width);
+
+    free (mask8);
+}
+
+/* Fetch functions */
+
+static force_inline uint32_t
+fetch_pixel_no_alpha (bits_image_t *image,
+		      int x, int y, pixman_bool_t check_bounds)
+{
+    if (check_bounds &&
+	(x < 0 || x >= image->width || y < 0 || y >= image->height))
+    {
+	return 0;
+    }
+
+    return image->fetch_pixel_32 (image, x, y);
+}
+
+typedef uint32_t (* get_pixel_t) (bits_image_t *image,
+				  int x, int y, pixman_bool_t check_bounds);
+
+static force_inline uint32_t
+bits_image_fetch_pixel_nearest (bits_image_t   *image,
+				pixman_fixed_t  x,
+				pixman_fixed_t  y,
+				get_pixel_t	get_pixel)
+{
+    int x0 = pixman_fixed_to_int (x - pixman_fixed_e);
+    int y0 = pixman_fixed_to_int (y - pixman_fixed_e);
+
+    if (image->common.repeat != PIXMAN_REPEAT_NONE)
+    {
+	repeat (image->common.repeat, &x0, image->width);
+	repeat (image->common.repeat, &y0, image->height);
+
+	return get_pixel (image, x0, y0, FALSE);
+    }
+    else
+    {
+	return get_pixel (image, x0, y0, TRUE);
+    }
+}
+
+static force_inline uint32_t
+bits_image_fetch_pixel_bilinear (bits_image_t   *image,
+				 pixman_fixed_t  x,
+				 pixman_fixed_t  y,
+				 get_pixel_t	 get_pixel)
+{
+    pixman_repeat_t repeat_mode = image->common.repeat;
+    int width = image->width;
+    int height = image->height;
+    int x1, y1, x2, y2;
+    uint32_t tl, tr, bl, br;
+    int32_t distx, disty;
+
+    x1 = x - pixman_fixed_1 / 2;
+    y1 = y - pixman_fixed_1 / 2;
+
+    distx = (x1 >> 8) & 0xff;
+    disty = (y1 >> 8) & 0xff;
+
+    x1 = pixman_fixed_to_int (x1);
+    y1 = pixman_fixed_to_int (y1);
+    x2 = x1 + 1;
+    y2 = y1 + 1;
+
+    if (repeat_mode != PIXMAN_REPEAT_NONE)
+    {
+	repeat (repeat_mode, &x1, width);
+	repeat (repeat_mode, &y1, height);
+	repeat (repeat_mode, &x2, width);
+	repeat (repeat_mode, &y2, height);
+
+	tl = get_pixel (image, x1, y1, FALSE);
+	bl = get_pixel (image, x1, y2, FALSE);
+	tr = get_pixel (image, x2, y1, FALSE);
+	br = get_pixel (image, x2, y2, FALSE);
+    }
+    else
+    {
+	tl = get_pixel (image, x1, y1, TRUE);
+	tr = get_pixel (image, x2, y1, TRUE);
+	bl = get_pixel (image, x1, y2, TRUE);
+	br = get_pixel (image, x2, y2, TRUE);
+    }
+
+    return bilinear_interpolation (tl, tr, bl, br, distx, disty);
+}
+
+static void
+bits_image_fetch_bilinear_no_repeat_8888 (pixman_image_t * ima,
+					  int              offset,
+					  int              line,
+					  int              width,
+					  uint32_t *       buffer,
+					  const uint32_t * mask)
+{
+    bits_image_t *bits = &ima->bits;
+    pixman_fixed_t x_top, x_bottom, x;
+    pixman_fixed_t ux_top, ux_bottom, ux;
+    pixman_vector_t v;
+    uint32_t top_mask, bottom_mask;
+    uint32_t *top_row;
+    uint32_t *bottom_row;
+    uint32_t *end;
+    uint32_t zero[2] = { 0, 0 };
+    uint32_t one = 1;
+    int y, y1, y2;
+    int disty;
+    int mask_inc;
+    int w;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (bits->common.transform, &v))
+	return;
+
+    ux = ux_top = ux_bottom = bits->common.transform->matrix[0][0];
+    x = x_top = x_bottom = v.vector[0] - pixman_fixed_1/2;
+
+    y = v.vector[1] - pixman_fixed_1/2;
+    disty = (y >> 8) & 0xff;
+
+    /* Load the pointers to the first and second lines from the source
+     * image that bilinear code must read.
+     *
+     * The main trick in this code is about the check if any line are
+     * outside of the image;
+     *
+     * When I realize that a line (any one) is outside, I change
+     * the pointer to a dummy area with zeros. Once I change this, I
+     * must be sure the pointer will not change, so I set the
+     * variables to each pointer increments inside the loop.
+     */
+    y1 = pixman_fixed_to_int (y);
+    y2 = y1 + 1;
+
+    if (y1 < 0 || y1 >= bits->height)
+    {
+	top_row = zero;
+	x_top = 0;
+	ux_top = 0;
+    }
+    else
+    {
+	top_row = bits->bits + y1 * bits->rowstride;
+	x_top = x;
+	ux_top = ux;
+    }
+
+    if (y2 < 0 || y2 >= bits->height)
+    {
+	bottom_row = zero;
+	x_bottom = 0;
+	ux_bottom = 0;
+    }
+    else
+    {
+	bottom_row = bits->bits + y2 * bits->rowstride;
+	x_bottom = x;
+	ux_bottom = ux;
+    }
+
+    /* Instead of checking whether the operation uses the mast in
+     * each loop iteration, verify this only once and prepare the
+     * variables to make the code smaller inside the loop.
+     */
+    if (!mask)
+    {
+        mask_inc = 0;
+        mask = &one;
+    }
+    else
+    {
+        /* If have a mask, prepare the variables to check it */
+        mask_inc = 1;
+    }
+
+    /* If both are zero, then the whole thing is zero */
+    if (top_row == zero && bottom_row == zero)
+    {
+	memset (buffer, 0, width * sizeof (uint32_t));
+	return;
+    }
+    else if (bits->format == PIXMAN_x8r8g8b8)
+    {
+	if (top_row == zero)
+	{
+	    top_mask = 0;
+	    bottom_mask = 0xff000000;
+	}
+	else if (bottom_row == zero)
+	{
+	    top_mask = 0xff000000;
+	    bottom_mask = 0;
+	}
+	else
+	{
+	    top_mask = 0xff000000;
+	    bottom_mask = 0xff000000;
+	}
+    }
+    else
+    {
+	top_mask = 0;
+	bottom_mask = 0;
+    }
+
+    end = buffer + width;
+
+    /* Zero fill to the left of the image */
+    while (buffer < end && x < pixman_fixed_minus_1)
+    {
+	*buffer++ = 0;
+	x += ux;
+	x_top += ux_top;
+	x_bottom += ux_bottom;
+	mask += mask_inc;
+    }
+
+    /* Left edge
+     */
+    while (buffer < end && x < 0)
+    {
+	uint32_t tr, br;
+	int32_t distx;
+
+	tr = top_row[pixman_fixed_to_int (x_top) + 1] | top_mask;
+	br = bottom_row[pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
+
+	distx = (x >> 8) & 0xff;
+
+	*buffer++ = bilinear_interpolation (0, tr, 0, br, distx, disty);
+
+	x += ux;
+	x_top += ux_top;
+	x_bottom += ux_bottom;
+	mask += mask_inc;
+    }
+
+    /* Main part */
+    w = pixman_int_to_fixed (bits->width - 1);
+
+    while (buffer < end  &&  x < w)
+    {
+	if (*mask)
+	{
+	    uint32_t tl, tr, bl, br;
+	    int32_t distx;
+
+	    tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
+	    tr = top_row [pixman_fixed_to_int (x_top) + 1] | top_mask;
+	    bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
+	    br = bottom_row [pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
+
+	    distx = (x >> 8) & 0xff;
+
+	    *buffer = bilinear_interpolation (tl, tr, bl, br, distx, disty);
+	}
+
+	buffer++;
+	x += ux;
+	x_top += ux_top;
+	x_bottom += ux_bottom;
+	mask += mask_inc;
+    }
+
+    /* Right Edge */
+    w = pixman_int_to_fixed (bits->width);
+    while (buffer < end  &&  x < w)
+    {
+	if (*mask)
+	{
+	    uint32_t tl, bl;
+	    int32_t distx;
+
+	    tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
+	    bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
+
+	    distx = (x >> 8) & 0xff;
+
+	    *buffer = bilinear_interpolation (tl, 0, bl, 0, distx, disty);
+	}
+
+	buffer++;
+	x += ux;
+	x_top += ux_top;
+	x_bottom += ux_bottom;
+	mask += mask_inc;
+    }
+
+    /* Zero fill to the left of the image */
+    while (buffer < end)
+	*buffer++ = 0;
+}
+
+static force_inline uint32_t
+bits_image_fetch_pixel_convolution (bits_image_t   *image,
+				    pixman_fixed_t  x,
+				    pixman_fixed_t  y,
+				    get_pixel_t     get_pixel)
+{
+    pixman_fixed_t *params = image->common.filter_params;
+    int x_off = (params[0] - pixman_fixed_1) >> 1;
+    int y_off = (params[1] - pixman_fixed_1) >> 1;
+    int32_t cwidth = pixman_fixed_to_int (params[0]);
+    int32_t cheight = pixman_fixed_to_int (params[1]);
+    int32_t srtot, sgtot, sbtot, satot;
+    int32_t i, j, x1, x2, y1, y2;
+    pixman_repeat_t repeat_mode = image->common.repeat;
+    int width = image->width;
+    int height = image->height;
+
+    params += 2;
+
+    x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off);
+    y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off);
+    x2 = x1 + cwidth;
+    y2 = y1 + cheight;
+
+    srtot = sgtot = sbtot = satot = 0;
+
+    for (i = y1; i < y2; ++i)
+    {
+	for (j = x1; j < x2; ++j)
+	{
+	    int rx = j;
+	    int ry = i;
+
+	    pixman_fixed_t f = *params;
+
+	    if (f)
+	    {
+		uint32_t pixel;
+
+		if (repeat_mode != PIXMAN_REPEAT_NONE)
+		{
+		    repeat (repeat_mode, &rx, width);
+		    repeat (repeat_mode, &ry, height);
+
+		    pixel = get_pixel (image, rx, ry, FALSE);
+		}
+		else
+		{
+		    pixel = get_pixel (image, rx, ry, TRUE);
+		}
+
+		srtot += RED_8 (pixel) * f;
+		sgtot += GREEN_8 (pixel) * f;
+		sbtot += BLUE_8 (pixel) * f;
+		satot += ALPHA_8 (pixel) * f;
+	    }
+
+	    params++;
+	}
+    }
+
+    satot >>= 16;
+    srtot >>= 16;
+    sgtot >>= 16;
+    sbtot >>= 16;
+
+    satot = CLIP (satot, 0, 0xff);
+    srtot = CLIP (srtot, 0, 0xff);
+    sgtot = CLIP (sgtot, 0, 0xff);
+    sbtot = CLIP (sbtot, 0, 0xff);
+
+    return ((satot << 24) | (srtot << 16) | (sgtot <<  8) | (sbtot));
+}
+
+static force_inline uint32_t
+bits_image_fetch_pixel_filtered (bits_image_t *image,
+				 pixman_fixed_t x,
+				 pixman_fixed_t y,
+				 get_pixel_t    get_pixel)
+{
+    switch (image->common.filter)
+    {
+    case PIXMAN_FILTER_NEAREST:
+    case PIXMAN_FILTER_FAST:
+	return bits_image_fetch_pixel_nearest (image, x, y, get_pixel);
+	break;
+
+    case PIXMAN_FILTER_BILINEAR:
+    case PIXMAN_FILTER_GOOD:
+    case PIXMAN_FILTER_BEST:
+	return bits_image_fetch_pixel_bilinear (image, x, y, get_pixel);
+	break;
+
+    case PIXMAN_FILTER_CONVOLUTION:
+	return bits_image_fetch_pixel_convolution (image, x, y, get_pixel);
+	break;
+
+    default:
+        break;
+    }
+
+    return 0;
+}
+
+static void
+bits_image_fetch_affine_no_alpha (pixman_image_t * image,
+				  int              offset,
+				  int              line,
+				  int              width,
+				  uint32_t *       buffer,
+				  const uint32_t * mask)
+{
+    pixman_fixed_t x, y;
+    pixman_fixed_t ux, uy;
+    pixman_vector_t v;
+    int i;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (image->common.transform)
+    {
+	if (!pixman_transform_point_3d (image->common.transform, &v))
+	    return;
+
+	ux = image->common.transform->matrix[0][0];
+	uy = image->common.transform->matrix[1][0];
+    }
+    else
+    {
+	ux = pixman_fixed_1;
+	uy = 0;
+    }
+
+    x = v.vector[0];
+    y = v.vector[1];
+
+    for (i = 0; i < width; ++i)
+    {
+	if (!mask || mask[i])
+	{
+	    buffer[i] = bits_image_fetch_pixel_filtered (
+		&image->bits, x, y, fetch_pixel_no_alpha);
+	}
+
+	x += ux;
+	y += uy;
+    }
+}
+
+/* General fetcher */
+static force_inline uint32_t
+fetch_pixel_general (bits_image_t *image, int x, int y, pixman_bool_t check_bounds)
+{
+    uint32_t pixel;
+
+    if (check_bounds &&
+	(x < 0 || x >= image->width || y < 0 || y >= image->height))
+    {
+	return 0;
+    }
+
+    pixel = image->fetch_pixel_32 (image, x, y);
+
+    if (image->common.alpha_map)
+    {
+	uint32_t pixel_a;
+
+	x -= image->common.alpha_origin_x;
+	y -= image->common.alpha_origin_y;
+
+	if (x < 0 || x >= image->common.alpha_map->width ||
+	    y < 0 || y >= image->common.alpha_map->height)
+	{
+	    pixel_a = 0;
+	}
+	else
+	{
+	    pixel_a = image->common.alpha_map->fetch_pixel_32 (
+		image->common.alpha_map, x, y);
+
+	    pixel_a = ALPHA_8 (pixel_a);
+	}
+
+	pixel &= 0x00ffffff;
+	pixel |= (pixel_a << 24);
+    }
+
+    return pixel;
+}
+
+static void
+bits_image_fetch_general (pixman_image_t * image,
+			  int              offset,
+			  int              line,
+			  int              width,
+			  uint32_t *       buffer,
+			  const uint32_t * mask)
+{
+    pixman_fixed_t x, y, w;
+    pixman_fixed_t ux, uy, uw;
+    pixman_vector_t v;
+    int i;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (image->common.transform)
+    {
+	if (!pixman_transform_point_3d (image->common.transform, &v))
+	    return;
+
+	ux = image->common.transform->matrix[0][0];
+	uy = image->common.transform->matrix[1][0];
+	uw = image->common.transform->matrix[2][0];
+    }
+    else
+    {
+	ux = pixman_fixed_1;
+	uy = 0;
+	uw = 0;
+    }
+
+    x = v.vector[0];
+    y = v.vector[1];
+    w = v.vector[2];
+
+    for (i = 0; i < width; ++i)
+    {
+	pixman_fixed_t x0, y0;
+
+	if (!mask || mask[i])
+	{
+	    if (w != 0)
+	    {
+		x0 = ((pixman_fixed_48_16_t)x << 16) / w;
+		y0 = ((pixman_fixed_48_16_t)y << 16) / w;
+	    }
+	    else
+	    {
+		x0 = 0;
+		y0 = 0;
+	    }
+
+	    buffer[i] = bits_image_fetch_pixel_filtered (
+		&image->bits, x0, y0, fetch_pixel_general);
+	}
+
+	x += ux;
+	y += uy;
+	w += uw;
+    }
+}
+
+static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+typedef uint32_t (* convert_pixel_t) (const uint8_t *row, int x);
+
+static force_inline void
+bits_image_fetch_bilinear_affine (pixman_image_t * image,
+				  int              offset,
+				  int              line,
+				  int              width,
+				  uint32_t *       buffer,
+				  const uint32_t * mask,
+
+				  convert_pixel_t	convert_pixel,
+				  pixman_format_code_t	format,
+				  pixman_repeat_t	repeat_mode)
+{
+    pixman_fixed_t x, y;
+    pixman_fixed_t ux, uy;
+    pixman_vector_t v;
+    bits_image_t *bits = &image->bits;
+    int i;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (image->common.transform, &v))
+	return;
+
+    ux = image->common.transform->matrix[0][0];
+    uy = image->common.transform->matrix[1][0];
+
+    x = v.vector[0];
+    y = v.vector[1];
+
+    for (i = 0; i < width; ++i)
+    {
+	int x1, y1, x2, y2;
+	uint32_t tl, tr, bl, br;
+	int32_t distx, disty;
+	int width = image->bits.width;
+	int height = image->bits.height;
+	const uint8_t *row1;
+	const uint8_t *row2;
+
+	if (mask && !mask[i])
+	    goto next;
+
+	x1 = x - pixman_fixed_1 / 2;
+	y1 = y - pixman_fixed_1 / 2;
+
+	distx = (x1 >> 8) & 0xff;
+	disty = (y1 >> 8) & 0xff;
+
+	y1 = pixman_fixed_to_int (y1);
+	y2 = y1 + 1;
+	x1 = pixman_fixed_to_int (x1);
+	x2 = x1 + 1;
+
+	if (repeat_mode != PIXMAN_REPEAT_NONE)
+	{
+	    uint32_t mask;
+
+	    mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+
+	    repeat (repeat_mode, &x1, width);
+	    repeat (repeat_mode, &y1, height);
+	    repeat (repeat_mode, &x2, width);
+	    repeat (repeat_mode, &y2, height);
+
+	    row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
+	    row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
+
+	    tl = convert_pixel (row1, x1) | mask;
+	    tr = convert_pixel (row1, x2) | mask;
+	    bl = convert_pixel (row2, x1) | mask;
+	    br = convert_pixel (row2, x2) | mask;
+	}
+	else
+	{
+	    uint32_t mask1, mask2;
+	    int bpp;
+
+	    /* Note: PIXMAN_FORMAT_BPP() returns an unsigned value,
+	     * which means if you use it in expressions, those
+	     * expressions become unsigned themselves. Since
+	     * the variables below can be negative in some cases,
+	     * that will lead to crashes on 64 bit architectures.
+	     *
+	     * So this line makes sure bpp is signed
+	     */
+	    bpp = PIXMAN_FORMAT_BPP (format);
+
+	    if (x1 >= width || x2 < 0 || y1 >= height || y2 < 0)
+	    {
+		buffer[i] = 0;
+		goto next;
+	    }
+
+	    if (y2 == 0)
+	    {
+		row1 = zero;
+		mask1 = 0;
+	    }
+	    else
+	    {
+		row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
+		row1 += bpp / 8 * x1;
+
+		mask1 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+	    }
+
+	    if (y1 == height - 1)
+	    {
+		row2 = zero;
+		mask2 = 0;
+	    }
+	    else
+	    {
+		row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
+		row2 += bpp / 8 * x1;
+
+		mask2 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+	    }
+
+	    if (x2 == 0)
+	    {
+		tl = 0;
+		bl = 0;
+	    }
+	    else
+	    {
+		tl = convert_pixel (row1, 0) | mask1;
+		bl = convert_pixel (row2, 0) | mask2;
+	    }
+
+	    if (x1 == width - 1)
+	    {
+		tr = 0;
+		br = 0;
+	    }
+	    else
+	    {
+		tr = convert_pixel (row1, 1) | mask1;
+		br = convert_pixel (row2, 1) | mask2;
+	    }
+	}
+
+	buffer[i] = bilinear_interpolation (
+	    tl, tr, bl, br, distx, disty);
+
+    next:
+	x += ux;
+	y += uy;
+    }
+}
+
+static force_inline void
+bits_image_fetch_nearest_affine (pixman_image_t * image,
+				 int              offset,
+				 int              line,
+				 int              width,
+				 uint32_t *       buffer,
+				 const uint32_t * mask,
+				 
+				 convert_pixel_t	convert_pixel,
+				 pixman_format_code_t	format,
+				 pixman_repeat_t	repeat_mode)
+{
+    pixman_fixed_t x, y;
+    pixman_fixed_t ux, uy;
+    pixman_vector_t v;
+    bits_image_t *bits = &image->bits;
+    int i;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (image->common.transform, &v))
+	return;
+
+    ux = image->common.transform->matrix[0][0];
+    uy = image->common.transform->matrix[1][0];
+
+    x = v.vector[0];
+    y = v.vector[1];
+
+    for (i = 0; i < width; ++i)
+    {
+	int width, height, x0, y0;
+	const uint8_t *row;
+
+	if (mask && !mask[i])
+	    goto next;
+	
+	width = image->bits.width;
+	height = image->bits.height;
+	x0 = pixman_fixed_to_int (x - pixman_fixed_e);
+	y0 = pixman_fixed_to_int (y - pixman_fixed_e);
+
+	if (repeat_mode == PIXMAN_REPEAT_NONE &&
+	    (y0 < 0 || y0 >= height || x0 < 0 || x0 >= width))
+	{
+	    buffer[i] = 0;
+	}
+	else
+	{
+	    uint32_t mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+
+	    if (repeat_mode != PIXMAN_REPEAT_NONE)
+	    {
+		repeat (repeat_mode, &x0, width);
+		repeat (repeat_mode, &y0, height);
+	    }
+
+	    row = (uint8_t *)bits->bits + bits->rowstride * 4 * y0;
+
+	    buffer[i] = convert_pixel (row, x0) | mask;
+	}
+
+    next:
+	x += ux;
+	y += uy;
+    }
+}
+
+static force_inline uint32_t
+convert_a8r8g8b8 (const uint8_t *row, int x)
+{
+    return *(((uint32_t *)row) + x);
+}
+
+static force_inline uint32_t
+convert_x8r8g8b8 (const uint8_t *row, int x)
+{
+    return *(((uint32_t *)row) + x);
+}
+
+static force_inline uint32_t
+convert_a8 (const uint8_t *row, int x)
+{
+    return *(row + x) << 24;
+}
+
+static force_inline uint32_t
+convert_r5g6b5 (const uint8_t *row, int x)
+{
+    return CONVERT_0565_TO_0888 (*((uint16_t *)row + x));
+}
+
+#define MAKE_BILINEAR_FETCHER(name, format, repeat_mode)		\
+    static void								\
+    bits_image_fetch_bilinear_affine_ ## name (pixman_image_t *image,	\
+					       int              offset,	\
+					       int              line,	\
+					       int              width,	\
+					       uint32_t *       buffer,	\
+					       const uint32_t * mask)	\
+    {									\
+	bits_image_fetch_bilinear_affine (image, offset, line,		\
+					  width, buffer, mask,		\
+					  convert_ ## format,		\
+					  PIXMAN_ ## format,		\
+					  repeat_mode);			\
+    }
+
+#define MAKE_NEAREST_FETCHER(name, format, repeat_mode)			\
+    static void								\
+    bits_image_fetch_nearest_affine_ ## name (pixman_image_t *image,	\
+					      int              offset,	\
+					      int              line,	\
+					      int              width,	\
+					      uint32_t *       buffer,	\
+					      const uint32_t * mask)	\
+    {									\
+	bits_image_fetch_nearest_affine (image, offset, line,		\
+					 width, buffer, mask,		\
+					 convert_ ## format,		\
+					 PIXMAN_ ## format,		\
+					 repeat_mode);			\
+    }
+
+#define MAKE_FETCHERS(name, format, repeat_mode)			\
+    MAKE_NEAREST_FETCHER (name, format, repeat_mode)			\
+    MAKE_BILINEAR_FETCHER (name, format, repeat_mode)
+
+MAKE_FETCHERS (pad_a8r8g8b8,     a8r8g8b8, PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_a8r8g8b8,    a8r8g8b8, PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_a8r8g8b8,  a8r8g8b8, PIXMAN_REPEAT_NORMAL)
+MAKE_FETCHERS (pad_x8r8g8b8,     x8r8g8b8, PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_x8r8g8b8,    x8r8g8b8, PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_x8r8g8b8,  x8r8g8b8, PIXMAN_REPEAT_NORMAL)
+MAKE_FETCHERS (pad_a8,           a8,       PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_a8,          a8,       PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_a8,	 a8,       PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_a8,	 a8,       PIXMAN_REPEAT_NORMAL)
+MAKE_FETCHERS (pad_r5g6b5,       r5g6b5,   PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_r5g6b5,      r5g6b5,   PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_r5g6b5,   r5g6b5,   PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_r5g6b5,    r5g6b5,   PIXMAN_REPEAT_NORMAL)
+
+static void
+bits_image_fetch_solid_32 (pixman_image_t * image,
+                           int              x,
+                           int              y,
+                           int              width,
+                           uint32_t *       buffer,
+                           const uint32_t * mask)
+{
+    uint32_t color;
+    uint32_t *end;
+
+    color = image->bits.fetch_pixel_32 (&image->bits, 0, 0);
+
+    end = buffer + width;
+    while (buffer < end)
+	*(buffer++) = color;
+}
+
+static void
+bits_image_fetch_solid_64 (pixman_image_t * image,
+                           int              x,
+                           int              y,
+                           int              width,
+                           uint32_t *       b,
+                           const uint32_t * unused)
+{
+    uint64_t color;
+    uint64_t *buffer = (uint64_t *)b;
+    uint64_t *end;
+
+    color = image->bits.fetch_pixel_64 (&image->bits, 0, 0);
+
+    end = buffer + width;
+    while (buffer < end)
+	*(buffer++) = color;
+}
+
+static void
+bits_image_fetch_untransformed_repeat_none (bits_image_t *image,
+                                            pixman_bool_t wide,
+                                            int           x,
+                                            int           y,
+                                            int           width,
+                                            uint32_t *    buffer)
+{
+    uint32_t w;
+
+    if (y < 0 || y >= image->height)
+    {
+	memset (buffer, 0, width * (wide? 8 : 4));
+	return;
+    }
+
+    if (x < 0)
+    {
+	w = MIN (width, -x);
+
+	memset (buffer, 0, w * (wide ? 8 : 4));
+
+	width -= w;
+	buffer += w * (wide? 2 : 1);
+	x += w;
+    }
+
+    if (x < image->width)
+    {
+	w = MIN (width, image->width - x);
+
+	if (wide)
+	    image->fetch_scanline_64 ((pixman_image_t *)image, x, y, w, buffer, NULL);
+	else
+	    image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL);
+
+	width -= w;
+	buffer += w * (wide? 2 : 1);
+	x += w;
+    }
+
+    memset (buffer, 0, width * (wide ? 8 : 4));
+}
+
+static void
+bits_image_fetch_untransformed_repeat_normal (bits_image_t *image,
+                                              pixman_bool_t wide,
+                                              int           x,
+                                              int           y,
+                                              int           width,
+                                              uint32_t *    buffer)
+{
+    uint32_t w;
+
+    while (y < 0)
+	y += image->height;
+
+    while (y >= image->height)
+	y -= image->height;
+
+    while (width)
+    {
+	while (x < 0)
+	    x += image->width;
+	while (x >= image->width)
+	    x -= image->width;
+
+	w = MIN (width, image->width - x);
+
+	if (wide)
+	    image->fetch_scanline_64 ((pixman_image_t *)image, x, y, w, buffer, NULL);
+	else
+	    image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL);
+
+	buffer += w * (wide? 2 : 1);
+	x += w;
+	width -= w;
+    }
+}
+
+static void
+bits_image_fetch_untransformed_32 (pixman_image_t * image,
+                                   int              x,
+                                   int              y,
+                                   int              width,
+                                   uint32_t *       buffer,
+                                   const uint32_t * mask)
+{
+    if (image->common.repeat == PIXMAN_REPEAT_NONE)
+    {
+	bits_image_fetch_untransformed_repeat_none (
+	    &image->bits, FALSE, x, y, width, buffer);
+    }
+    else
+    {
+	bits_image_fetch_untransformed_repeat_normal (
+	    &image->bits, FALSE, x, y, width, buffer);
+    }
+}
+
+static void
+bits_image_fetch_untransformed_64 (pixman_image_t * image,
+                                   int              x,
+                                   int              y,
+                                   int              width,
+                                   uint32_t *       buffer,
+                                   const uint32_t * unused)
+{
+    if (image->common.repeat == PIXMAN_REPEAT_NONE)
+    {
+	bits_image_fetch_untransformed_repeat_none (
+	    &image->bits, TRUE, x, y, width, buffer);
+    }
+    else
+    {
+	bits_image_fetch_untransformed_repeat_normal (
+	    &image->bits, TRUE, x, y, width, buffer);
+    }
+}
+
+typedef struct
+{
+    pixman_format_code_t	format;
+    uint32_t			flags;
+    fetch_scanline_t		fetch_32;
+    fetch_scanline_t		fetch_64;
+} fetcher_info_t;
+
+static const fetcher_info_t fetcher_info[] =
+{
+    { PIXMAN_solid,
+      FAST_PATH_NO_ALPHA_MAP,
+      bits_image_fetch_solid_32,
+      bits_image_fetch_solid_64
+    },
+
+    { PIXMAN_any,
+      (FAST_PATH_NO_ALPHA_MAP			|
+       FAST_PATH_ID_TRANSFORM			|
+       FAST_PATH_NO_CONVOLUTION_FILTER		|
+       FAST_PATH_NO_PAD_REPEAT			|
+       FAST_PATH_NO_REFLECT_REPEAT),
+      bits_image_fetch_untransformed_32,
+      bits_image_fetch_untransformed_64
+    },
+
+#define FAST_BILINEAR_FLAGS						\
+    (FAST_PATH_NO_ALPHA_MAP		|				\
+     FAST_PATH_NO_ACCESSORS		|				\
+     FAST_PATH_HAS_TRANSFORM		|				\
+     FAST_PATH_AFFINE_TRANSFORM		|				\
+     FAST_PATH_X_UNIT_POSITIVE		|				\
+     FAST_PATH_Y_UNIT_ZERO		|				\
+     FAST_PATH_NONE_REPEAT		|				\
+     FAST_PATH_BILINEAR_FILTER)
+
+    { PIXMAN_a8r8g8b8,
+      FAST_BILINEAR_FLAGS,
+      bits_image_fetch_bilinear_no_repeat_8888,
+      _pixman_image_get_scanline_generic_64
+    },
+
+    { PIXMAN_x8r8g8b8,
+      FAST_BILINEAR_FLAGS,
+      bits_image_fetch_bilinear_no_repeat_8888,
+      _pixman_image_get_scanline_generic_64
+    },
+
+#define GENERAL_BILINEAR_FLAGS						\
+    (FAST_PATH_NO_ALPHA_MAP		|				\
+     FAST_PATH_NO_ACCESSORS		|				\
+     FAST_PATH_HAS_TRANSFORM		|				\
+     FAST_PATH_AFFINE_TRANSFORM		|				\
+     FAST_PATH_BILINEAR_FILTER)
+
+#define GENERAL_NEAREST_FLAGS						\
+    (FAST_PATH_NO_ALPHA_MAP		|				\
+     FAST_PATH_NO_ACCESSORS		|				\
+     FAST_PATH_HAS_TRANSFORM		|				\
+     FAST_PATH_AFFINE_TRANSFORM		|				\
+     FAST_PATH_NEAREST_FILTER)
+
+#define BILINEAR_AFFINE_FAST_PATH(name, format, repeat)			\
+    { PIXMAN_ ## format,						\
+      GENERAL_BILINEAR_FLAGS | FAST_PATH_ ## repeat ## _REPEAT,		\
+      bits_image_fetch_bilinear_affine_ ## name,			\
+      _pixman_image_get_scanline_generic_64				\
+    },
+
+#define NEAREST_AFFINE_FAST_PATH(name, format, repeat)			\
+    { PIXMAN_ ## format,						\
+      GENERAL_NEAREST_FLAGS | FAST_PATH_ ## repeat ## _REPEAT,		\
+      bits_image_fetch_nearest_affine_ ## name,			\
+      _pixman_image_get_scanline_generic_64				\
+    },
+
+#define AFFINE_FAST_PATHS(name, format, repeat)				\
+    BILINEAR_AFFINE_FAST_PATH(name, format, repeat)			\
+    NEAREST_AFFINE_FAST_PATH(name, format, repeat)
+    
+    AFFINE_FAST_PATHS (pad_a8r8g8b8, a8r8g8b8, PAD)
+    AFFINE_FAST_PATHS (none_a8r8g8b8, a8r8g8b8, NONE)
+    AFFINE_FAST_PATHS (reflect_a8r8g8b8, a8r8g8b8, REFLECT)
+    AFFINE_FAST_PATHS (normal_a8r8g8b8, a8r8g8b8, NORMAL)
+    AFFINE_FAST_PATHS (pad_x8r8g8b8, x8r8g8b8, PAD)
+    AFFINE_FAST_PATHS (none_x8r8g8b8, x8r8g8b8, NONE)
+    AFFINE_FAST_PATHS (reflect_x8r8g8b8, x8r8g8b8, REFLECT)
+    AFFINE_FAST_PATHS (normal_x8r8g8b8, x8r8g8b8, NORMAL)
+    AFFINE_FAST_PATHS (pad_a8, a8, PAD)
+    AFFINE_FAST_PATHS (none_a8, a8, NONE)
+    AFFINE_FAST_PATHS (reflect_a8, a8, REFLECT)
+    AFFINE_FAST_PATHS (normal_a8, a8, NORMAL)
+    AFFINE_FAST_PATHS (pad_r5g6b5, r5g6b5, PAD)
+    AFFINE_FAST_PATHS (none_r5g6b5, r5g6b5, NONE)
+    AFFINE_FAST_PATHS (reflect_r5g6b5, r5g6b5, REFLECT)
+    AFFINE_FAST_PATHS (normal_r5g6b5, r5g6b5, NORMAL)
+
+    /* Affine, no alpha */
+    { PIXMAN_any,
+      (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_HAS_TRANSFORM | FAST_PATH_AFFINE_TRANSFORM),
+      bits_image_fetch_affine_no_alpha,
+      _pixman_image_get_scanline_generic_64
+    },
+
+    /* General */
+    { PIXMAN_any, 0, bits_image_fetch_general, _pixman_image_get_scanline_generic_64 },
+
+    { PIXMAN_null },
+};
+
+static void
+bits_image_property_changed (pixman_image_t *image)
+{
+    uint32_t flags = image->common.flags;
+    pixman_format_code_t format = image->common.extended_format_code;
+    const fetcher_info_t *info;
+
+    _pixman_bits_image_setup_accessors (&image->bits);
+
+    info = fetcher_info;
+    while (info->format != PIXMAN_null)
+    {
+	if ((info->format == format || info->format == PIXMAN_any)	&&
+	    (info->flags & flags) == info->flags)
+	{
+	    image->bits.get_scanline_32 = info->fetch_32;
+	    image->bits.get_scanline_64 = info->fetch_64;
+	    break;
+	}
+
+	info++;
+    }
+}
+
+static uint32_t *
+src_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
+{
+    iter->image->bits.get_scanline_32 (
+	iter->image, iter->x, iter->y++, iter->width, iter->buffer, mask);
+
+    return iter->buffer;
+}
+
+static uint32_t *
+src_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+    iter->image->bits.get_scanline_64 (
+	iter->image, iter->x, iter->y++, iter->width, iter->buffer, mask);
+
+    return iter->buffer;
+}
+
+void
+_pixman_bits_image_src_iter_init (pixman_image_t *image, pixman_iter_t *iter)
+{
+    if (iter->flags & ITER_NARROW)
+	iter->get_scanline = src_get_scanline_narrow;
+    else
+	iter->get_scanline = src_get_scanline_wide;
+}
+
+static uint32_t *
+dest_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
+{
+    pixman_image_t *image  = iter->image;
+    int             x      = iter->x;
+    int             y      = iter->y;
+    int             width  = iter->width;
+    uint32_t *	    buffer = iter->buffer;
+
+    image->bits.fetch_scanline_32 (image, x, y, width, buffer, mask);
+    if (image->common.alpha_map)
+    {
+	x -= image->common.alpha_origin_x;
+	y -= image->common.alpha_origin_y;
+
+	image->common.alpha_map->fetch_scanline_32 (
+	    (pixman_image_t *)image->common.alpha_map,
+	    x, y, width, buffer, mask);
+    }
+
+    return iter->buffer;
+}
+
+static uint32_t *
+dest_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+    bits_image_t *  image  = &iter->image->bits;
+    int             x      = iter->x;
+    int             y      = iter->y;
+    int             width  = iter->width;
+    uint32_t *	    buffer = iter->buffer;
+
+    image->fetch_scanline_64 (
+	(pixman_image_t *)image, x, y, width, buffer, mask);
+    if (image->common.alpha_map)
+    {
+	x -= image->common.alpha_origin_x;
+	y -= image->common.alpha_origin_y;
+
+	image->common.alpha_map->fetch_scanline_64 (
+	    (pixman_image_t *)image->common.alpha_map, x, y, width, buffer, mask);
+    }
+
+    return iter->buffer;
+}
+
+static void
+dest_write_back_narrow (pixman_iter_t *iter)
+{
+    bits_image_t *  image  = &iter->image->bits;
+    int             x      = iter->x;
+    int             y      = iter->y;
+    int             width  = iter->width;
+    const uint32_t *buffer = iter->buffer;
+
+    image->store_scanline_32 (image, x, y, width, buffer);
+
+    if (image->common.alpha_map)
+    {
+	x -= image->common.alpha_origin_x;
+	y -= image->common.alpha_origin_y;
+
+	image->common.alpha_map->store_scanline_32 (
+	    image->common.alpha_map, x, y, width, buffer);
+    }
+
+    iter->y++;
+}
+
+static void
+dest_write_back_wide (pixman_iter_t *iter)
+{
+    bits_image_t *  image  = &iter->image->bits;
+    int             x      = iter->x;
+    int             y      = iter->y;
+    int             width  = iter->width;
+    const uint32_t *buffer = iter->buffer;
+
+    image->store_scanline_64 (image, x, y, width, buffer);
+
+    if (image->common.alpha_map)
+    {
+	x -= image->common.alpha_origin_x;
+	y -= image->common.alpha_origin_y;
+
+	image->common.alpha_map->store_scanline_64 (
+	    image->common.alpha_map, x, y, width, buffer);
+    }
+
+    iter->y++;
+}
+
+void
+_pixman_bits_image_dest_iter_init (pixman_image_t *image, pixman_iter_t *iter)
+{
+    if (iter->flags & ITER_NARROW)
+    {
+	if ((iter->flags & (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) ==
+	    (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA))
+	{
+	    iter->get_scanline = _pixman_iter_get_scanline_noop;
+	}
+	else
+	{
+	    iter->get_scanline = dest_get_scanline_narrow;
+	}
+	
+	iter->write_back = dest_write_back_narrow;
+    }
+    else
+    {
+	iter->get_scanline = dest_get_scanline_wide;
+	iter->write_back = dest_write_back_wide;
+    }
+}
+
+static uint32_t *
+create_bits (pixman_format_code_t format,
+             int                  width,
+             int                  height,
+             int *		  rowstride_bytes)
+{
+    int stride;
+    size_t buf_size;
+    int bpp;
+
+    /* what follows is a long-winded way, avoiding any possibility of integer
+     * overflows, of saying:
+     * stride = ((width * bpp + 0x1f) >> 5) * sizeof (uint32_t);
+     */
+
+    bpp = PIXMAN_FORMAT_BPP (format);
+    if (_pixman_multiply_overflows_int (width, bpp))
+	return NULL;
+
+    stride = width * bpp;
+    if (_pixman_addition_overflows_int (stride, 0x1f))
+	return NULL;
+
+    stride += 0x1f;
+    stride >>= 5;
+
+    stride *= sizeof (uint32_t);
+
+    if (_pixman_multiply_overflows_size (height, stride))
+	return NULL;
+
+    buf_size = height * stride;
+
+    if (rowstride_bytes)
+	*rowstride_bytes = stride;
+
+    return calloc (buf_size, 1);
+}
+
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_bits (pixman_format_code_t format,
+                          int                  width,
+                          int                  height,
+                          uint32_t *           bits,
+                          int                  rowstride_bytes)
+{
+    pixman_image_t *image;
+    uint32_t *free_me = NULL;
+
+    /* must be a whole number of uint32_t's
+     */
+    return_val_if_fail (
+	bits == NULL || (rowstride_bytes % sizeof (uint32_t)) == 0, NULL);
+
+    return_val_if_fail (PIXMAN_FORMAT_BPP (format) >= PIXMAN_FORMAT_DEPTH (format), NULL);
+
+    if (!bits && width && height)
+    {
+	free_me = bits = create_bits (format, width, height, &rowstride_bytes);
+	if (!bits)
+	    return NULL;
+    }
+
+    image = _pixman_image_allocate ();
+
+    if (!image)
+    {
+	if (free_me)
+	    free (free_me);
+
+	return NULL;
+    }
+
+    image->type = BITS;
+    image->bits.format = format;
+    image->bits.width = width;
+    image->bits.height = height;
+    image->bits.bits = bits;
+    image->bits.free_me = free_me;
+    image->bits.read_func = NULL;
+    image->bits.write_func = NULL;
+
+    /* The rowstride is stored in number of uint32_t */
+    image->bits.rowstride = rowstride_bytes / (int) sizeof (uint32_t);
+
+    image->bits.indexed = NULL;
+
+    image->common.property_changed = bits_image_property_changed;
+
+    _pixman_image_reset_clip_region (image);
+
+    return image;
+}
diff --git a/pixman/pixman/pixman-fast-path.c b/pixman/pixman/pixman-fast-path.c
index 9a6919e85..bbdc8e8b0 100644
--- a/pixman/pixman/pixman-fast-path.c
+++ b/pixman/pixman/pixman-fast-path.c
@@ -30,7 +30,7 @@
 #include <stdlib.h>
 #include "pixman-private.h"
 #include "pixman-combine32.h"
-#include "pixman-fast-path.h"
+#include "pixman-inlines.h"
 
 static force_inline uint32_t
 fetch_24 (uint8_t *a)
diff --git a/pixman/pixman/pixman-fast-path.h b/pixman/pixman/pixman-fast-path.h
deleted file mode 100644
index 2b55c2e74..000000000
--- a/pixman/pixman/pixman-fast-path.h
+++ /dev/null
@@ -1,1189 +0,0 @@
-/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
-/*
- * Copyright © 2000 SuSE, Inc.
- * Copyright © 2007 Red Hat, Inc.
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of SuSE not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission.  SuSE makes no representations about the
- * suitability of this software for any purpose.  It is provided "as is"
- * without express or implied warranty.
- *
- * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
- * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
- * Author:  Keith Packard, SuSE, Inc.
- */
-
-#ifndef PIXMAN_FAST_PATH_H__
-#define PIXMAN_FAST_PATH_H__
-
-#include "pixman-private.h"
-
-#define PIXMAN_REPEAT_COVER -1
-
-/* Flags describing input parameters to fast path macro template.
- * Turning on some flag values may indicate that
- * "some property X is available so template can use this" or
- * "some property X should be handled by template".
- *
- * FLAG_HAVE_SOLID_MASK
- *  Input mask is solid so template should handle this.
- *
- * FLAG_HAVE_NON_SOLID_MASK
- *  Input mask is bits mask so template should handle this.
- *
- * FLAG_HAVE_SOLID_MASK and FLAG_HAVE_NON_SOLID_MASK are mutually
- * exclusive. (It's not allowed to turn both flags on)
- */
-#define FLAG_NONE				(0)
-#define FLAG_HAVE_SOLID_MASK			(1 <<   1)
-#define FLAG_HAVE_NON_SOLID_MASK		(1 <<   2)
-
-/* To avoid too short repeated scanline function calls, extend source
- * scanlines having width less than below constant value.
- */
-#define REPEAT_NORMAL_MIN_WIDTH			64
-
-static force_inline pixman_bool_t
-repeat (pixman_repeat_t repeat, int *c, int size)
-{
-    if (repeat == PIXMAN_REPEAT_NONE)
-    {
-	if (*c < 0 || *c >= size)
-	    return FALSE;
-    }
-    else if (repeat == PIXMAN_REPEAT_NORMAL)
-    {
-	while (*c >= size)
-	    *c -= size;
-	while (*c < 0)
-	    *c += size;
-    }
-    else if (repeat == PIXMAN_REPEAT_PAD)
-    {
-	*c = CLIP (*c, 0, size - 1);
-    }
-    else /* REFLECT */
-    {
-	*c = MOD (*c, size * 2);
-	if (*c >= size)
-	    *c = size * 2 - *c - 1;
-    }
-    return TRUE;
-}
-
-/*
- * For each scanline fetched from source image with PAD repeat:
- * - calculate how many pixels need to be padded on the left side
- * - calculate how many pixels need to be padded on the right side
- * - update width to only count pixels which are fetched from the image
- * All this information is returned via 'width', 'left_pad', 'right_pad'
- * arguments. The code is assuming that 'unit_x' is positive.
- *
- * Note: 64-bit math is used in order to avoid potential overflows, which
- *       is probably excessive in many cases. This particular function
- *       may need its own correctness test and performance tuning.
- */
-static force_inline void
-pad_repeat_get_scanline_bounds (int32_t         source_image_width,
-				pixman_fixed_t  vx,
-				pixman_fixed_t  unit_x,
-				int32_t *       width,
-				int32_t *       left_pad,
-				int32_t *       right_pad)
-{
-    int64_t max_vx = (int64_t) source_image_width << 16;
-    int64_t tmp;
-    if (vx < 0)
-    {
-	tmp = ((int64_t) unit_x - 1 - vx) / unit_x;
-	if (tmp > *width)
-	{
-	    *left_pad = *width;
-	    *width = 0;
-	}
-	else
-	{
-	    *left_pad = (int32_t) tmp;
-	    *width -= (int32_t) tmp;
-	}
-    }
-    else
-    {
-	*left_pad = 0;
-    }
-    tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad;
-    if (tmp < 0)
-    {
-	*right_pad = *width;
-	*width = 0;
-    }
-    else if (tmp >= *width)
-    {
-	*right_pad = 0;
-    }
-    else
-    {
-	*right_pad = *width - (int32_t) tmp;
-	*width = (int32_t) tmp;
-    }
-}
-
-/* A macroified version of specialized nearest scalers for some
- * common 8888 and 565 formats. It supports SRC and OVER ops.
- *
- * There are two repeat versions, one that handles repeat normal,
- * and one without repeat handling that only works if the src region
- * used is completely covered by the pre-repeated source samples.
- *
- * The loops are unrolled to process two pixels per iteration for better
- * performance on most CPU architectures (superscalar processors
- * can issue several operations simultaneously, other processors can hide
- * instructions latencies by pipelining operations). Unrolling more
- * does not make much sense because the compiler will start running out
- * of spare registers soon.
- */
-
-#define GET_8888_ALPHA(s) ((s) >> 24)
- /* This is not actually used since we don't have an OVER with
-    565 source, but it is needed to build. */
-#define GET_0565_ALPHA(s) 0xff
-#define GET_x888_ALPHA(s) 0xff
-
-#define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT,			\
-			      src_type_t, dst_type_t, OP, repeat_mode)				\
-static force_inline void									\
-scanline_func_name (dst_type_t       *dst,							\
-		    const src_type_t *src,							\
-		    int32_t           w,							\
-		    pixman_fixed_t    vx,							\
-		    pixman_fixed_t    unit_x,							\
-		    pixman_fixed_t    max_vx,							\
-		    pixman_bool_t     fully_transparent_src)					\
-{												\
-	uint32_t   d;										\
-	src_type_t s1, s2;									\
-	uint8_t    a1, a2;									\
-	int        x1, x2;									\
-												\
-	if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER && fully_transparent_src)			\
-	    return;										\
-												\
-	if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER)		\
-	    abort();										\
-												\
-	while ((w -= 2) >= 0)									\
-	{											\
-	    x1 = vx >> 16;									\
-	    vx += unit_x;									\
-	    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
-	    {											\
-		/* This works because we know that unit_x is positive */			\
-		while (vx >= max_vx)								\
-		    vx -= max_vx;								\
-	    }											\
-	    s1 = src[x1];									\
-												\
-	    x2 = vx >> 16;									\
-	    vx += unit_x;									\
-	    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
-	    {											\
-		/* This works because we know that unit_x is positive */			\
-		while (vx >= max_vx)								\
-		    vx -= max_vx;								\
-	    }											\
-	    s2 = src[x2];									\
-												\
-	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
-	    {											\
-		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\
-		a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2);						\
-												\
-		if (a1 == 0xff)									\
-		{										\
-		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
-		}										\
-		else if (s1)									\
-		{										\
-		    d = CONVERT_ ## DST_FORMAT ## _TO_8888 (*dst);				\
-		    s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1);				\
-		    a1 ^= 0xff;									\
-		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\
-		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
-		}										\
-		dst++;										\
-												\
-		if (a2 == 0xff)									\
-		{										\
-		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2);			\
-		}										\
-		else if (s2)									\
-		{										\
-		    d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);				\
-		    s2 = CONVERT_## SRC_FORMAT ## _TO_8888 (s2);				\
-		    a2 ^= 0xff;									\
-		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2);					\
-		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
-		}										\
-		dst++;										\
-	    }											\
-	    else /* PIXMAN_OP_SRC */								\
-	    {											\
-		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
-		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2);			\
-	    }											\
-	}											\
-												\
-	if (w & 1)										\
-	{											\
-	    x1 = vx >> 16;									\
-	    s1 = src[x1];									\
-												\
-	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
-	    {											\
-		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\
-												\
-		if (a1 == 0xff)									\
-		{										\
-		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
-		}										\
-		else if (s1)									\
-		{										\
-		    d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);				\
-		    s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1);				\
-		    a1 ^= 0xff;									\
-		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\
-		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
-		}										\
-		dst++;										\
-	    }											\
-	    else /* PIXMAN_OP_SRC */								\
-	    {											\
-		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
-	    }											\
-	}											\
-}
-
-#define FAST_NEAREST_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
-				  dst_type_t, repeat_mode, have_mask, mask_is_solid)		\
-static void											\
-fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,		\
-						   pixman_composite_info_t *info)               \
-{												\
-    PIXMAN_COMPOSITE_ARGS (info);					                        \
-    dst_type_t *dst_line;						                        \
-    mask_type_t *mask_line;									\
-    src_type_t *src_first_line;									\
-    int       y;										\
-    pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */		\
-    pixman_fixed_t max_vy;									\
-    pixman_vector_t v;										\
-    pixman_fixed_t vx, vy;									\
-    pixman_fixed_t unit_x, unit_y;								\
-    int32_t left_pad, right_pad;								\
-												\
-    src_type_t *src;										\
-    dst_type_t *dst;										\
-    mask_type_t solid_mask;									\
-    const mask_type_t *mask = &solid_mask;							\
-    int src_stride, mask_stride, dst_stride;							\
-												\
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1);	\
-    if (have_mask)										\
-    {												\
-	if (mask_is_solid)									\
-	    solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);	\
-	else											\
-	    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t,			\
-				   mask_stride, mask_line, 1);					\
-    }												\
-    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
-     * transformed from destination space to source space */					\
-    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\
-												\
-    /* reference point is the center of the pixel */						\
-    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\
-    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\
-    v.vector[2] = pixman_fixed_1;								\
-												\
-    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\
-	return;											\
-												\
-    unit_x = src_image->common.transform->matrix[0][0];						\
-    unit_y = src_image->common.transform->matrix[1][1];						\
-												\
-    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */			\
-    v.vector[0] -= pixman_fixed_e;								\
-    v.vector[1] -= pixman_fixed_e;								\
-												\
-    vx = v.vector[0];										\
-    vy = v.vector[1];										\
-												\
-    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)					\
-    {												\
-	/* Clamp repeating positions inside the actual samples */				\
-	max_vx = src_image->bits.width << 16;							\
-	max_vy = src_image->bits.height << 16;							\
-												\
-	repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx);						\
-	repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
-    }												\
-												\
-    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD ||					\
-	PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)					\
-    {												\
-	pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x,			\
-					&width, &left_pad, &right_pad);				\
-	vx += left_pad * unit_x;								\
-    }												\
-												\
-    while (--height >= 0)									\
-    {												\
-	dst = dst_line;										\
-	dst_line += dst_stride;									\
-	if (have_mask && !mask_is_solid)							\
-	{											\
-	    mask = mask_line;									\
-	    mask_line += mask_stride;								\
-	}											\
-												\
-	y = vy >> 16;										\
-	vy += unit_y;										\
-	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
-	    repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
-	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
-	{											\
-	    repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height);				\
-	    src = src_first_line + src_stride * y;						\
-	    if (left_pad > 0)									\
-	    {											\
-		scanline_func (mask, dst, src, left_pad, 0, 0, 0, FALSE);			\
-	    }											\
-	    if (width > 0)									\
-	    {											\
-		scanline_func (mask + (mask_is_solid ? 0 : left_pad),				\
-			       dst + left_pad, src, width, vx, unit_x, 0, FALSE);		\
-	    }											\
-	    if (right_pad > 0)									\
-	    {											\
-		scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),			\
-			       dst + left_pad + width, src + src_image->bits.width - 1,		\
-			       right_pad, 0, 0, 0, FALSE);					\
-	    }											\
-	}											\
-	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)				\
-	{											\
-	    static const src_type_t zero[1] = { 0 };						\
-	    if (y < 0 || y >= src_image->bits.height)						\
-	    {											\
-		scanline_func (mask, dst, zero, left_pad + width + right_pad, 0, 0, 0, TRUE);	\
-		continue;									\
-	    }											\
-	    src = src_first_line + src_stride * y;						\
-	    if (left_pad > 0)									\
-	    {											\
-		scanline_func (mask, dst, zero, left_pad, 0, 0, 0, TRUE);			\
-	    }											\
-	    if (width > 0)									\
-	    {											\
-		scanline_func (mask + (mask_is_solid ? 0 : left_pad),				\
-			       dst + left_pad, src, width, vx, unit_x, 0, FALSE);		\
-	    }											\
-	    if (right_pad > 0)									\
-	    {											\
-		scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),			\
-			       dst + left_pad + width, zero, right_pad, 0, 0, 0, TRUE);		\
-	    }											\
-	}											\
-	else											\
-	{											\
-	    src = src_first_line + src_stride * y;						\
-	    scanline_func (mask, dst, src, width, vx, unit_x, max_vx, FALSE);			\
-	}											\
-    }												\
-}
-
-/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */
-#define FAST_NEAREST_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
-				  dst_type_t, repeat_mode, have_mask, mask_is_solid)		\
-	FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,	\
-				  dst_type_t, repeat_mode, have_mask, mask_is_solid)
-
-#define FAST_NEAREST_MAINLOOP_NOMASK(scale_func_name, scanline_func, src_type_t, dst_type_t,	\
-			      repeat_mode)							\
-    static force_inline void									\
-    scanline_func##scale_func_name##_wrapper (							\
-		    const uint8_t    *mask,							\
-		    dst_type_t       *dst,							\
-		    const src_type_t *src,							\
-		    int32_t          w,								\
-		    pixman_fixed_t   vx,							\
-		    pixman_fixed_t   unit_x,							\
-		    pixman_fixed_t   max_vx,							\
-		    pixman_bool_t    fully_transparent_src)					\
-    {												\
-	scanline_func (dst, src, w, vx, unit_x, max_vx, fully_transparent_src);			\
-    }												\
-    FAST_NEAREST_MAINLOOP_INT (scale_func_name, scanline_func##scale_func_name##_wrapper,	\
-			       src_type_t, uint8_t, dst_type_t, repeat_mode, FALSE, FALSE)
-
-#define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func, src_type_t, dst_type_t,		\
-			      repeat_mode)							\
-	FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name, scanline_func, src_type_t,		\
-			      dst_type_t, repeat_mode)
-
-#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT,				\
-		     src_type_t, dst_type_t, OP, repeat_mode)				\
-    FAST_NEAREST_SCANLINE(scaled_nearest_scanline_ ## scale_func_name ## _ ## OP,	\
-			  SRC_FORMAT, DST_FORMAT, src_type_t, dst_type_t,		\
-			  OP, repeat_mode)						\
-    FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name ## _ ## OP,			\
-			  scaled_nearest_scanline_ ## scale_func_name ## _ ## OP,	\
-			  src_type_t, dst_type_t, repeat_mode)
-
-
-#define SCALED_NEAREST_FLAGS						\
-    (FAST_PATH_SCALE_TRANSFORM	|					\
-     FAST_PATH_NO_ALPHA_MAP	|					\
-     FAST_PATH_NEAREST_FILTER	|					\
-     FAST_PATH_NO_ACCESSORS	|					\
-     FAST_PATH_NARROW_FORMAT)
-
-#define SIMPLE_NEAREST_FAST_PATH_NORMAL(op,s,d,func)			\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_NEAREST_FLAGS		|				\
-	 FAST_PATH_NORMAL_REPEAT	|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_null, 0,							\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
-    }
-
-#define SIMPLE_NEAREST_FAST_PATH_PAD(op,s,d,func)			\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_NEAREST_FLAGS		|				\
-	 FAST_PATH_PAD_REPEAT		|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_null, 0,							\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,	\
-    }
-
-#define SIMPLE_NEAREST_FAST_PATH_NONE(op,s,d,func)			\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_NEAREST_FLAGS		|				\
-	 FAST_PATH_NONE_REPEAT		|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_null, 0,							\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\
-    }
-
-#define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func)			\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
-	PIXMAN_null, 0,							\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,	\
-    }
-
-#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_NEAREST_FLAGS		|				\
-	 FAST_PATH_NORMAL_REPEAT	|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
-    }
-
-#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_NEAREST_FLAGS		|				\
-	 FAST_PATH_PAD_REPEAT		|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,	\
-    }
-
-#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_NEAREST_FLAGS		|				\
-	 FAST_PATH_NONE_REPEAT		|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\
-    }
-
-#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
-	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,	\
-    }
-
-#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_NEAREST_FLAGS		|				\
-	 FAST_PATH_NORMAL_REPEAT	|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
-    }
-
-#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_NEAREST_FLAGS		|				\
-	 FAST_PATH_PAD_REPEAT		|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,	\
-    }
-
-#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_NEAREST_FLAGS		|				\
-	 FAST_PATH_NONE_REPEAT		|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\
-    }
-
-#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
-	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,	\
-    }
-
-/* Prefer the use of 'cover' variant, because it is faster */
-#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func)				\
-    SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),			\
-    SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),			\
-    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func),				\
-    SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
-
-#define SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func)			\
-    SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func),		\
-    SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func),		\
-    SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
-
-#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH(op,s,d,func)		\
-    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER (op,s,d,func),		\
-    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE (op,s,d,func),		\
-    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD (op,s,d,func)
-
-/*****************************************************************************/
-
-/*
- * Identify 5 zones in each scanline for bilinear scaling. Depending on
- * whether 2 pixels to be interpolated are fetched from the image itself,
- * from the padding area around it or from both image and padding area.
- */
-static force_inline void
-bilinear_pad_repeat_get_scanline_bounds (int32_t         source_image_width,
-					 pixman_fixed_t  vx,
-					 pixman_fixed_t  unit_x,
-					 int32_t *       left_pad,
-					 int32_t *       left_tz,
-					 int32_t *       width,
-					 int32_t *       right_tz,
-					 int32_t *       right_pad)
-{
-	int width1 = *width, left_pad1, right_pad1;
-	int width2 = *width, left_pad2, right_pad2;
-
-	pad_repeat_get_scanline_bounds (source_image_width, vx, unit_x,
-					&width1, &left_pad1, &right_pad1);
-	pad_repeat_get_scanline_bounds (source_image_width, vx + pixman_fixed_1,
-					unit_x, &width2, &left_pad2, &right_pad2);
-
-	*left_pad = left_pad2;
-	*left_tz = left_pad1 - left_pad2;
-	*right_tz = right_pad2 - right_pad1;
-	*right_pad = right_pad1;
-	*width -= *left_pad + *left_tz + *right_tz + *right_pad;
-}
-
-/*
- * Main loop template for single pass bilinear scaling. It needs to be
- * provided with 'scanline_func' which should do the compositing operation.
- * The needed function has the following prototype:
- *
- *	scanline_func (dst_type_t *       dst,
- *		       const mask_type_ * mask,
- *		       const src_type_t * src_top,
- *		       const src_type_t * src_bottom,
- *		       int32_t            width,
- *		       int                weight_top,
- *		       int                weight_bottom,
- *		       pixman_fixed_t     vx,
- *		       pixman_fixed_t     unit_x,
- *		       pixman_fixed_t     max_vx,
- *		       pixman_bool_t      zero_src)
- *
- * Where:
- *  dst                 - destination scanline buffer for storing results
- *  mask                - mask buffer (or single value for solid mask)
- *  src_top, src_bottom - two source scanlines
- *  width               - number of pixels to process
- *  weight_top          - weight of the top row for interpolation
- *  weight_bottom       - weight of the bottom row for interpolation
- *  vx                  - initial position for fetching the first pair of
- *                        pixels from the source buffer
- *  unit_x              - position increment needed to move to the next pair
- *                        of pixels
- *  max_vx              - image size as a fixed point value, can be used for
- *                        implementing NORMAL repeat (when it is supported)
- *  zero_src            - boolean hint variable, which is set to TRUE when
- *                        all source pixels are fetched from zero padding
- *                        zone for NONE repeat
- *
- * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to 256,
- *       but sometimes it may be less than that for NONE repeat when handling
- *       fuzzy antialiased top or bottom image edges. Also both top and
- *       bottom weight variables are guaranteed to have value in 0-255
- *       range and can fit into unsigned byte or be used with 8-bit SIMD
- *       multiplication instructions.
- */
-#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
-				  dst_type_t, repeat_mode, flags)				\
-static void											\
-fast_composite_scaled_bilinear ## scale_func_name (pixman_implementation_t *imp,		\
-						   pixman_composite_info_t *info)		\
-{												\
-    PIXMAN_COMPOSITE_ARGS (info);								\
-    dst_type_t *dst_line;									\
-    mask_type_t *mask_line;									\
-    src_type_t *src_first_line;									\
-    int       y1, y2;										\
-    pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */		\
-    pixman_vector_t v;										\
-    pixman_fixed_t vx, vy;									\
-    pixman_fixed_t unit_x, unit_y;								\
-    int32_t left_pad, left_tz, right_tz, right_pad;						\
-												\
-    dst_type_t *dst;										\
-    mask_type_t solid_mask;									\
-    const mask_type_t *mask = &solid_mask;							\
-    int src_stride, mask_stride, dst_stride;							\
-												\
-    int src_width;										\
-    pixman_fixed_t src_width_fixed;								\
-    int max_x;											\
-    pixman_bool_t need_src_extension;								\
-												\
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1);	\
-    if (flags & FLAG_HAVE_SOLID_MASK)								\
-    {												\
-	solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);	\
-	mask_stride = 0;									\
-    }												\
-    else if (flags & FLAG_HAVE_NON_SOLID_MASK)							\
-    {												\
-	PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t,				\
-			       mask_stride, mask_line, 1);					\
-    }												\
-												\
-    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
-     * transformed from destination space to source space */					\
-    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\
-												\
-    /* reference point is the center of the pixel */						\
-    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\
-    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\
-    v.vector[2] = pixman_fixed_1;								\
-												\
-    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\
-	return;											\
-												\
-    unit_x = src_image->common.transform->matrix[0][0];						\
-    unit_y = src_image->common.transform->matrix[1][1];						\
-												\
-    v.vector[0] -= pixman_fixed_1 / 2;								\
-    v.vector[1] -= pixman_fixed_1 / 2;								\
-												\
-    vy = v.vector[1];										\
-												\
-    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD ||					\
-	PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)					\
-    {												\
-	bilinear_pad_repeat_get_scanline_bounds (src_image->bits.width, v.vector[0], unit_x,	\
-					&left_pad, &left_tz, &width, &right_tz, &right_pad);	\
-	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
-	{											\
-	    /* PAD repeat does not need special handling for 'transition zones' and */		\
-	    /* they can be combined with 'padding zones' safely */				\
-	    left_pad += left_tz;								\
-	    right_pad += right_tz;								\
-	    left_tz = right_tz = 0;								\
-	}											\
-	v.vector[0] += left_pad * unit_x;							\
-    }												\
-												\
-    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)					\
-    {												\
-	vx = v.vector[0];									\
-	repeat (PIXMAN_REPEAT_NORMAL, &vx, pixman_int_to_fixed(src_image->bits.width));		\
-	max_x = pixman_fixed_to_int (vx + (width - 1) * unit_x) + 1;				\
-												\
-	if (src_image->bits.width < REPEAT_NORMAL_MIN_WIDTH)					\
-	{											\
-	    src_width = 0;									\
-												\
-	    while (src_width < REPEAT_NORMAL_MIN_WIDTH && src_width <= max_x)			\
-		src_width += src_image->bits.width;						\
-												\
-	    need_src_extension = TRUE;								\
-	}											\
-	else											\
-	{											\
-	    src_width = src_image->bits.width;							\
-	    need_src_extension = FALSE;								\
-	}											\
-												\
-	src_width_fixed = pixman_int_to_fixed (src_width);					\
-    }												\
-												\
-    while (--height >= 0)									\
-    {												\
-	int weight1, weight2;									\
-	dst = dst_line;										\
-	dst_line += dst_stride;									\
-	vx = v.vector[0];									\
-	if (flags & FLAG_HAVE_NON_SOLID_MASK)							\
-	{											\
-	    mask = mask_line;									\
-	    mask_line += mask_stride;								\
-	}											\
-												\
-	y1 = pixman_fixed_to_int (vy);								\
-	weight2 = (vy >> 8) & 0xff;								\
-	if (weight2)										\
-	{											\
-	    /* normal case, both row weights are in 0-255 range and fit unsigned byte */	\
-	    y2 = y1 + 1;									\
-	    weight1 = 256 - weight2;								\
-	}											\
-	else											\
-	{											\
-	    /* set both top and bottom row to the same scanline, and weights to 128+128 */	\
-	    y2 = y1;										\
-	    weight1 = weight2 = 128;								\
-	}											\
-	vy += unit_y;										\
-	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
-	{											\
-	    src_type_t *src1, *src2;								\
-	    src_type_t buf1[2];									\
-	    src_type_t buf2[2];									\
-	    repeat (PIXMAN_REPEAT_PAD, &y1, src_image->bits.height);				\
-	    repeat (PIXMAN_REPEAT_PAD, &y2, src_image->bits.height);				\
-	    src1 = src_first_line + src_stride * y1;						\
-	    src2 = src_first_line + src_stride * y2;						\
-												\
-	    if (left_pad > 0)									\
-	    {											\
-		buf1[0] = buf1[1] = src1[0];							\
-		buf2[0] = buf2[1] = src2[0];							\
-		scanline_func (dst, mask,							\
-			       buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, FALSE);		\
-		dst += left_pad;								\
-		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
-		    mask += left_pad;								\
-	    }											\
-	    if (width > 0)									\
-	    {											\
-		scanline_func (dst, mask,							\
-			       src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);	\
-		dst += width;									\
-		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
-		    mask += width;								\
-	    }											\
-	    if (right_pad > 0)									\
-	    {											\
-		buf1[0] = buf1[1] = src1[src_image->bits.width - 1];				\
-		buf2[0] = buf2[1] = src2[src_image->bits.width - 1];				\
-		scanline_func (dst, mask,							\
-			       buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, FALSE);	\
-	    }											\
-	}											\
-	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)				\
-	{											\
-	    src_type_t *src1, *src2;								\
-	    src_type_t buf1[2];									\
-	    src_type_t buf2[2];									\
-	    /* handle top/bottom zero padding by just setting weights to 0 if needed */		\
-	    if (y1 < 0)										\
-	    {											\
-		weight1 = 0;									\
-		y1 = 0;										\
-	    }											\
-	    if (y1 >= src_image->bits.height)							\
-	    {											\
-		weight1 = 0;									\
-		y1 = src_image->bits.height - 1;						\
-	    }											\
-	    if (y2 < 0)										\
-	    {											\
-		weight2 = 0;									\
-		y2 = 0;										\
-	    }											\
-	    if (y2 >= src_image->bits.height)							\
-	    {											\
-		weight2 = 0;									\
-		y2 = src_image->bits.height - 1;						\
-	    }											\
-	    src1 = src_first_line + src_stride * y1;						\
-	    src2 = src_first_line + src_stride * y2;						\
-												\
-	    if (left_pad > 0)									\
-	    {											\
-		buf1[0] = buf1[1] = 0;								\
-		buf2[0] = buf2[1] = 0;								\
-		scanline_func (dst, mask,							\
-			       buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, TRUE);		\
-		dst += left_pad;								\
-		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
-		    mask += left_pad;								\
-	    }											\
-	    if (left_tz > 0)									\
-	    {											\
-		buf1[0] = 0;									\
-		buf1[1] = src1[0];								\
-		buf2[0] = 0;									\
-		buf2[1] = src2[0];								\
-		scanline_func (dst, mask,							\
-			       buf1, buf2, left_tz, weight1, weight2,				\
-			       pixman_fixed_frac (vx), unit_x, 0, FALSE);			\
-		dst += left_tz;									\
-		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
-		    mask += left_tz;								\
-		vx += left_tz * unit_x;								\
-	    }											\
-	    if (width > 0)									\
-	    {											\
-		scanline_func (dst, mask,							\
-			       src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);	\
-		dst += width;									\
-		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
-		    mask += width;								\
-		vx += width * unit_x;								\
-	    }											\
-	    if (right_tz > 0)									\
-	    {											\
-		buf1[0] = src1[src_image->bits.width - 1];					\
-		buf1[1] = 0;									\
-		buf2[0] = src2[src_image->bits.width - 1];					\
-		buf2[1] = 0;									\
-		scanline_func (dst, mask,							\
-			       buf1, buf2, right_tz, weight1, weight2,				\
-			       pixman_fixed_frac (vx), unit_x, 0, FALSE);			\
-		dst += right_tz;								\
-		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
-		    mask += right_tz;								\
-	    }											\
-	    if (right_pad > 0)									\
-	    {											\
-		buf1[0] = buf1[1] = 0;								\
-		buf2[0] = buf2[1] = 0;								\
-		scanline_func (dst, mask,							\
-			       buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, TRUE);		\
-	    }											\
-	}											\
-	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
-	{											\
-	    int32_t	    num_pixels;								\
-	    int32_t	    width_remain;							\
-	    src_type_t *    src_line_top;							\
-	    src_type_t *    src_line_bottom;							\
-	    src_type_t	    buf1[2];								\
-	    src_type_t	    buf2[2];								\
-	    src_type_t	    extended_src_line0[REPEAT_NORMAL_MIN_WIDTH*2];			\
-	    src_type_t	    extended_src_line1[REPEAT_NORMAL_MIN_WIDTH*2];			\
-	    int		    i, j;								\
-												\
-	    repeat (PIXMAN_REPEAT_NORMAL, &y1, src_image->bits.height);				\
-	    repeat (PIXMAN_REPEAT_NORMAL, &y2, src_image->bits.height);				\
-	    src_line_top = src_first_line + src_stride * y1;					\
-	    src_line_bottom = src_first_line + src_stride * y2;					\
-												\
-	    if (need_src_extension)								\
-	    {											\
-		for (i=0; i<src_width;)								\
-		{										\
-		    for (j=0; j<src_image->bits.width; j++, i++)				\
-		    {										\
-			extended_src_line0[i] = src_line_top[j];				\
-			extended_src_line1[i] = src_line_bottom[j];				\
-		    }										\
-		}										\
-												\
-		src_line_top = &extended_src_line0[0];						\
-		src_line_bottom = &extended_src_line1[0];					\
-	    }											\
-												\
-	    /* Top & Bottom wrap around buffer */						\
-	    buf1[0] = src_line_top[src_width - 1];						\
-	    buf1[1] = src_line_top[0];								\
-	    buf2[0] = src_line_bottom[src_width - 1];						\
-	    buf2[1] = src_line_bottom[0];							\
-												\
-	    width_remain = width;								\
-												\
-	    while (width_remain > 0)								\
-	    {											\
-		/* We use src_width_fixed because it can make vx in original source range */	\
-		repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);				\
-												\
-		/* Wrap around part */								\
-		if (pixman_fixed_to_int (vx) == src_width - 1)					\
-		{										\
-		    /* for positive unit_x							\
-		     * num_pixels = max(n) + 1, where vx + n*unit_x < src_width_fixed		\
-		     *										\
-		     * vx is in range [0, src_width_fixed - pixman_fixed_e]			\
-		     * So we are safe from overflow.						\
-		     */										\
-		    num_pixels = ((src_width_fixed - vx - pixman_fixed_e) / unit_x) + 1;	\
-												\
-		    if (num_pixels > width_remain)						\
-			num_pixels = width_remain;						\
-												\
-		    scanline_func (dst, mask, buf1, buf2, num_pixels,				\
-				   weight1, weight2, pixman_fixed_frac(vx),			\
-				   unit_x, src_width_fixed, FALSE);				\
-												\
-		    width_remain -= num_pixels;							\
-		    vx += num_pixels * unit_x;							\
-		    dst += num_pixels;								\
-												\
-		    if (flags & FLAG_HAVE_NON_SOLID_MASK)					\
-			mask += num_pixels;							\
-												\
-		    repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);			\
-		}										\
-												\
-		/* Normal scanline composite */							\
-		if (pixman_fixed_to_int (vx) != src_width - 1 && width_remain > 0)		\
-		{										\
-		    /* for positive unit_x							\
-		     * num_pixels = max(n) + 1, where vx + n*unit_x < (src_width_fixed - 1)	\
-		     *										\
-		     * vx is in range [0, src_width_fixed - pixman_fixed_e]			\
-		     * So we are safe from overflow here.					\
-		     */										\
-		    num_pixels = ((src_width_fixed - pixman_fixed_1 - vx - pixman_fixed_e)	\
-				  / unit_x) + 1;						\
-												\
-		    if (num_pixels > width_remain)						\
-			num_pixels = width_remain;						\
-												\
-		    scanline_func (dst, mask, src_line_top, src_line_bottom, num_pixels,	\
-				   weight1, weight2, vx, unit_x, src_width_fixed, FALSE);	\
-												\
-		    width_remain -= num_pixels;							\
-		    vx += num_pixels * unit_x;							\
-		    dst += num_pixels;								\
-												\
-		    if (flags & FLAG_HAVE_NON_SOLID_MASK)					\
-		        mask += num_pixels;							\
-		}										\
-	    }											\
-	}											\
-	else											\
-	{											\
-	    scanline_func (dst, mask, src_first_line + src_stride * y1,				\
-			   src_first_line + src_stride * y2, width,				\
-			   weight1, weight2, vx, unit_x, max_vx, FALSE);			\
-	}											\
-    }												\
-}
-
-/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */
-#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
-				  dst_type_t, repeat_mode, flags)				\
-	FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,\
-				  dst_type_t, repeat_mode, flags)
-
-#define SCALED_BILINEAR_FLAGS						\
-    (FAST_PATH_SCALE_TRANSFORM	|					\
-     FAST_PATH_NO_ALPHA_MAP	|					\
-     FAST_PATH_BILINEAR_FILTER	|					\
-     FAST_PATH_NO_ACCESSORS	|					\
-     FAST_PATH_NARROW_FORMAT)
-
-#define SIMPLE_BILINEAR_FAST_PATH_PAD(op,s,d,func)			\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_BILINEAR_FLAGS		|				\
-	 FAST_PATH_PAD_REPEAT		|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_null, 0,							\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,	\
-    }
-
-#define SIMPLE_BILINEAR_FAST_PATH_NONE(op,s,d,func)			\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_BILINEAR_FLAGS		|				\
-	 FAST_PATH_NONE_REPEAT		|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_null, 0,							\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,	\
-    }
-
-#define SIMPLE_BILINEAR_FAST_PATH_COVER(op,s,d,func)			\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
-	PIXMAN_null, 0,							\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,	\
-    }
-
-#define SIMPLE_BILINEAR_FAST_PATH_NORMAL(op,s,d,func)			\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_BILINEAR_FLAGS		|				\
-	 FAST_PATH_NORMAL_REPEAT	|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_null, 0,							\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op,	\
-    }
-
-#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_BILINEAR_FLAGS		|				\
-	 FAST_PATH_PAD_REPEAT		|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,	\
-    }
-
-#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_BILINEAR_FLAGS		|				\
-	 FAST_PATH_NONE_REPEAT		|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,	\
-    }
-
-#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
-	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,	\
-    }
-
-#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NORMAL(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_BILINEAR_FLAGS		|				\
-	 FAST_PATH_NORMAL_REPEAT	|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op,	\
-    }
-
-#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_BILINEAR_FLAGS		|				\
-	 FAST_PATH_PAD_REPEAT		|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,	\
-    }
-
-#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_BILINEAR_FLAGS		|				\
-	 FAST_PATH_NONE_REPEAT		|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,	\
-    }
-
-#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER(op,s,d,func)		\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
-	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,	\
-    }
-
-#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NORMAL(op,s,d,func)	\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_BILINEAR_FLAGS		|				\
-	 FAST_PATH_NORMAL_REPEAT	|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op,	\
-    }
-
-/* Prefer the use of 'cover' variant, because it is faster */
-#define SIMPLE_BILINEAR_FAST_PATH(op,s,d,func)				\
-    SIMPLE_BILINEAR_FAST_PATH_COVER (op,s,d,func),			\
-    SIMPLE_BILINEAR_FAST_PATH_NONE (op,s,d,func),			\
-    SIMPLE_BILINEAR_FAST_PATH_PAD (op,s,d,func),			\
-    SIMPLE_BILINEAR_FAST_PATH_NORMAL (op,s,d,func)
-
-#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH(op,s,d,func)			\
-    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER (op,s,d,func),		\
-    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE (op,s,d,func),		\
-    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD (op,s,d,func),		\
-    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NORMAL (op,s,d,func)
-
-#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH(op,s,d,func)		\
-    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER (op,s,d,func),		\
-    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE (op,s,d,func),		\
-    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD (op,s,d,func),		\
-    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NORMAL (op,s,d,func)
-
-#endif
diff --git a/pixman/pixman/pixman-inlines.h b/pixman/pixman/pixman-inlines.h
new file mode 100644
index 000000000..f1e0cbd77
--- /dev/null
+++ b/pixman/pixman/pixman-inlines.h
@@ -0,0 +1,1280 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author:  Keith Packard, SuSE, Inc.
+ */
+
+#ifndef PIXMAN_FAST_PATH_H__
+#define PIXMAN_FAST_PATH_H__
+
+#include "pixman-private.h"
+
+#define PIXMAN_REPEAT_COVER -1
+
+/* Flags describing input parameters to fast path macro template.
+ * Turning on some flag values may indicate that
+ * "some property X is available so template can use this" or
+ * "some property X should be handled by template".
+ *
+ * FLAG_HAVE_SOLID_MASK
+ *  Input mask is solid so template should handle this.
+ *
+ * FLAG_HAVE_NON_SOLID_MASK
+ *  Input mask is bits mask so template should handle this.
+ *
+ * FLAG_HAVE_SOLID_MASK and FLAG_HAVE_NON_SOLID_MASK are mutually
+ * exclusive. (It's not allowed to turn both flags on)
+ */
+#define FLAG_NONE				(0)
+#define FLAG_HAVE_SOLID_MASK			(1 <<   1)
+#define FLAG_HAVE_NON_SOLID_MASK		(1 <<   2)
+
+/* To avoid too short repeated scanline function calls, extend source
+ * scanlines having width less than below constant value.
+ */
+#define REPEAT_NORMAL_MIN_WIDTH			64
+
+static force_inline pixman_bool_t
+repeat (pixman_repeat_t repeat, int *c, int size)
+{
+    if (repeat == PIXMAN_REPEAT_NONE)
+    {
+	if (*c < 0 || *c >= size)
+	    return FALSE;
+    }
+    else if (repeat == PIXMAN_REPEAT_NORMAL)
+    {
+	while (*c >= size)
+	    *c -= size;
+	while (*c < 0)
+	    *c += size;
+    }
+    else if (repeat == PIXMAN_REPEAT_PAD)
+    {
+	*c = CLIP (*c, 0, size - 1);
+    }
+    else /* REFLECT */
+    {
+	*c = MOD (*c, size * 2);
+	if (*c >= size)
+	    *c = size * 2 - *c - 1;
+    }
+    return TRUE;
+}
+
+#if SIZEOF_LONG > 4
+
+static force_inline uint32_t
+bilinear_interpolation (uint32_t tl, uint32_t tr,
+			uint32_t bl, uint32_t br,
+			int distx, int disty)
+{
+    uint64_t distxy, distxiy, distixy, distixiy;
+    uint64_t tl64, tr64, bl64, br64;
+    uint64_t f, r;
+
+    distxy = distx * disty;
+    distxiy = distx * (256 - disty);
+    distixy = (256 - distx) * disty;
+    distixiy = (256 - distx) * (256 - disty);
+
+    /* Alpha and Blue */
+    tl64 = tl & 0xff0000ff;
+    tr64 = tr & 0xff0000ff;
+    bl64 = bl & 0xff0000ff;
+    br64 = br & 0xff0000ff;
+
+    f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
+    r = f & 0x0000ff0000ff0000ull;
+
+    /* Red and Green */
+    tl64 = tl;
+    tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull);
+
+    tr64 = tr;
+    tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull);
+
+    bl64 = bl;
+    bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull);
+
+    br64 = br;
+    br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull);
+
+    f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
+    r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull);
+
+    return (uint32_t)(r >> 16);
+}
+
+#else
+
+static force_inline uint32_t
+bilinear_interpolation (uint32_t tl, uint32_t tr,
+			uint32_t bl, uint32_t br,
+			int distx, int disty)
+{
+    int distxy, distxiy, distixy, distixiy;
+    uint32_t f, r;
+
+    distxy = distx * disty;
+    distxiy = (distx << 8) - distxy;	/* distx * (256 - disty) */
+    distixy = (disty << 8) - distxy;	/* disty * (256 - distx) */
+    distixiy =
+	256 * 256 - (disty << 8) -
+	(distx << 8) + distxy;		/* (256 - distx) * (256 - disty) */
+
+    /* Blue */
+    r = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
+      + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy;
+
+    /* Green */
+    f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
+      + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy;
+    r |= f & 0xff000000;
+
+    tl >>= 16;
+    tr >>= 16;
+    bl >>= 16;
+    br >>= 16;
+    r >>= 16;
+
+    /* Red */
+    f = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
+      + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy;
+    r |= f & 0x00ff0000;
+
+    /* Alpha */
+    f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
+      + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy;
+    r |= f & 0xff000000;
+
+    return r;
+}
+
+#endif
+
+/*
+ * For each scanline fetched from source image with PAD repeat:
+ * - calculate how many pixels need to be padded on the left side
+ * - calculate how many pixels need to be padded on the right side
+ * - update width to only count pixels which are fetched from the image
+ * All this information is returned via 'width', 'left_pad', 'right_pad'
+ * arguments. The code is assuming that 'unit_x' is positive.
+ *
+ * Note: 64-bit math is used in order to avoid potential overflows, which
+ *       is probably excessive in many cases. This particular function
+ *       may need its own correctness test and performance tuning.
+ */
+static force_inline void
+pad_repeat_get_scanline_bounds (int32_t         source_image_width,
+				pixman_fixed_t  vx,
+				pixman_fixed_t  unit_x,
+				int32_t *       width,
+				int32_t *       left_pad,
+				int32_t *       right_pad)
+{
+    int64_t max_vx = (int64_t) source_image_width << 16;
+    int64_t tmp;
+    if (vx < 0)
+    {
+	tmp = ((int64_t) unit_x - 1 - vx) / unit_x;
+	if (tmp > *width)
+	{
+	    *left_pad = *width;
+	    *width = 0;
+	}
+	else
+	{
+	    *left_pad = (int32_t) tmp;
+	    *width -= (int32_t) tmp;
+	}
+    }
+    else
+    {
+	*left_pad = 0;
+    }
+    tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad;
+    if (tmp < 0)
+    {
+	*right_pad = *width;
+	*width = 0;
+    }
+    else if (tmp >= *width)
+    {
+	*right_pad = 0;
+    }
+    else
+    {
+	*right_pad = *width - (int32_t) tmp;
+	*width = (int32_t) tmp;
+    }
+}
+
+/* A macroified version of specialized nearest scalers for some
+ * common 8888 and 565 formats. It supports SRC and OVER ops.
+ *
+ * There are two repeat versions, one that handles repeat normal,
+ * and one without repeat handling that only works if the src region
+ * used is completely covered by the pre-repeated source samples.
+ *
+ * The loops are unrolled to process two pixels per iteration for better
+ * performance on most CPU architectures (superscalar processors
+ * can issue several operations simultaneously, other processors can hide
+ * instructions latencies by pipelining operations). Unrolling more
+ * does not make much sense because the compiler will start running out
+ * of spare registers soon.
+ */
+
+#define GET_8888_ALPHA(s) ((s) >> 24)
+ /* This is not actually used since we don't have an OVER with
+    565 source, but it is needed to build. */
+#define GET_0565_ALPHA(s) 0xff
+#define GET_x888_ALPHA(s) 0xff
+
+#define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT,			\
+			      src_type_t, dst_type_t, OP, repeat_mode)				\
+static force_inline void									\
+scanline_func_name (dst_type_t       *dst,							\
+		    const src_type_t *src,							\
+		    int32_t           w,							\
+		    pixman_fixed_t    vx,							\
+		    pixman_fixed_t    unit_x,							\
+		    pixman_fixed_t    max_vx,							\
+		    pixman_bool_t     fully_transparent_src)					\
+{												\
+	uint32_t   d;										\
+	src_type_t s1, s2;									\
+	uint8_t    a1, a2;									\
+	int        x1, x2;									\
+												\
+	if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER && fully_transparent_src)			\
+	    return;										\
+												\
+	if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER)		\
+	    abort();										\
+												\
+	while ((w -= 2) >= 0)									\
+	{											\
+	    x1 = vx >> 16;									\
+	    vx += unit_x;									\
+	    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
+	    {											\
+		/* This works because we know that unit_x is positive */			\
+		while (vx >= max_vx)								\
+		    vx -= max_vx;								\
+	    }											\
+	    s1 = src[x1];									\
+												\
+	    x2 = vx >> 16;									\
+	    vx += unit_x;									\
+	    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
+	    {											\
+		/* This works because we know that unit_x is positive */			\
+		while (vx >= max_vx)								\
+		    vx -= max_vx;								\
+	    }											\
+	    s2 = src[x2];									\
+												\
+	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
+	    {											\
+		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\
+		a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2);						\
+												\
+		if (a1 == 0xff)									\
+		{										\
+		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+		}										\
+		else if (s1)									\
+		{										\
+		    d = CONVERT_ ## DST_FORMAT ## _TO_8888 (*dst);				\
+		    s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1);				\
+		    a1 ^= 0xff;									\
+		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\
+		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
+		}										\
+		dst++;										\
+												\
+		if (a2 == 0xff)									\
+		{										\
+		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2);			\
+		}										\
+		else if (s2)									\
+		{										\
+		    d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);				\
+		    s2 = CONVERT_## SRC_FORMAT ## _TO_8888 (s2);				\
+		    a2 ^= 0xff;									\
+		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2);					\
+		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
+		}										\
+		dst++;										\
+	    }											\
+	    else /* PIXMAN_OP_SRC */								\
+	    {											\
+		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2);			\
+	    }											\
+	}											\
+												\
+	if (w & 1)										\
+	{											\
+	    x1 = vx >> 16;									\
+	    s1 = src[x1];									\
+												\
+	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
+	    {											\
+		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\
+												\
+		if (a1 == 0xff)									\
+		{										\
+		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+		}										\
+		else if (s1)									\
+		{										\
+		    d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);				\
+		    s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1);				\
+		    a1 ^= 0xff;									\
+		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\
+		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
+		}										\
+		dst++;										\
+	    }											\
+	    else /* PIXMAN_OP_SRC */								\
+	    {											\
+		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+	    }											\
+	}											\
+}
+
+#define FAST_NEAREST_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
+				  dst_type_t, repeat_mode, have_mask, mask_is_solid)		\
+static void											\
+fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,		\
+						   pixman_composite_info_t *info)               \
+{												\
+    PIXMAN_COMPOSITE_ARGS (info);					                        \
+    dst_type_t *dst_line;						                        \
+    mask_type_t *mask_line;									\
+    src_type_t *src_first_line;									\
+    int       y;										\
+    pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */		\
+    pixman_fixed_t max_vy;									\
+    pixman_vector_t v;										\
+    pixman_fixed_t vx, vy;									\
+    pixman_fixed_t unit_x, unit_y;								\
+    int32_t left_pad, right_pad;								\
+												\
+    src_type_t *src;										\
+    dst_type_t *dst;										\
+    mask_type_t solid_mask;									\
+    const mask_type_t *mask = &solid_mask;							\
+    int src_stride, mask_stride, dst_stride;							\
+												\
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1);	\
+    if (have_mask)										\
+    {												\
+	if (mask_is_solid)									\
+	    solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);	\
+	else											\
+	    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t,			\
+				   mask_stride, mask_line, 1);					\
+    }												\
+    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
+     * transformed from destination space to source space */					\
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\
+												\
+    /* reference point is the center of the pixel */						\
+    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\
+    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\
+    v.vector[2] = pixman_fixed_1;								\
+												\
+    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\
+	return;											\
+												\
+    unit_x = src_image->common.transform->matrix[0][0];						\
+    unit_y = src_image->common.transform->matrix[1][1];						\
+												\
+    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */			\
+    v.vector[0] -= pixman_fixed_e;								\
+    v.vector[1] -= pixman_fixed_e;								\
+												\
+    vx = v.vector[0];										\
+    vy = v.vector[1];										\
+												\
+    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)					\
+    {												\
+	/* Clamp repeating positions inside the actual samples */				\
+	max_vx = src_image->bits.width << 16;							\
+	max_vy = src_image->bits.height << 16;							\
+												\
+	repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx);						\
+	repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
+    }												\
+												\
+    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD ||					\
+	PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)					\
+    {												\
+	pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x,			\
+					&width, &left_pad, &right_pad);				\
+	vx += left_pad * unit_x;								\
+    }												\
+												\
+    while (--height >= 0)									\
+    {												\
+	dst = dst_line;										\
+	dst_line += dst_stride;									\
+	if (have_mask && !mask_is_solid)							\
+	{											\
+	    mask = mask_line;									\
+	    mask_line += mask_stride;								\
+	}											\
+												\
+	y = vy >> 16;										\
+	vy += unit_y;										\
+	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
+	    repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
+	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
+	{											\
+	    repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height);				\
+	    src = src_first_line + src_stride * y;						\
+	    if (left_pad > 0)									\
+	    {											\
+		scanline_func (mask, dst, src, left_pad, 0, 0, 0, FALSE);			\
+	    }											\
+	    if (width > 0)									\
+	    {											\
+		scanline_func (mask + (mask_is_solid ? 0 : left_pad),				\
+			       dst + left_pad, src, width, vx, unit_x, 0, FALSE);		\
+	    }											\
+	    if (right_pad > 0)									\
+	    {											\
+		scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),			\
+			       dst + left_pad + width, src + src_image->bits.width - 1,		\
+			       right_pad, 0, 0, 0, FALSE);					\
+	    }											\
+	}											\
+	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)				\
+	{											\
+	    static const src_type_t zero[1] = { 0 };						\
+	    if (y < 0 || y >= src_image->bits.height)						\
+	    {											\
+		scanline_func (mask, dst, zero, left_pad + width + right_pad, 0, 0, 0, TRUE);	\
+		continue;									\
+	    }											\
+	    src = src_first_line + src_stride * y;						\
+	    if (left_pad > 0)									\
+	    {											\
+		scanline_func (mask, dst, zero, left_pad, 0, 0, 0, TRUE);			\
+	    }											\
+	    if (width > 0)									\
+	    {											\
+		scanline_func (mask + (mask_is_solid ? 0 : left_pad),				\
+			       dst + left_pad, src, width, vx, unit_x, 0, FALSE);		\
+	    }											\
+	    if (right_pad > 0)									\
+	    {											\
+		scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),			\
+			       dst + left_pad + width, zero, right_pad, 0, 0, 0, TRUE);		\
+	    }											\
+	}											\
+	else											\
+	{											\
+	    src = src_first_line + src_stride * y;						\
+	    scanline_func (mask, dst, src, width, vx, unit_x, max_vx, FALSE);			\
+	}											\
+    }												\
+}
+
+/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */
+#define FAST_NEAREST_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
+				  dst_type_t, repeat_mode, have_mask, mask_is_solid)		\
+	FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,	\
+				  dst_type_t, repeat_mode, have_mask, mask_is_solid)
+
+#define FAST_NEAREST_MAINLOOP_NOMASK(scale_func_name, scanline_func, src_type_t, dst_type_t,	\
+			      repeat_mode)							\
+    static force_inline void									\
+    scanline_func##scale_func_name##_wrapper (							\
+		    const uint8_t    *mask,							\
+		    dst_type_t       *dst,							\
+		    const src_type_t *src,							\
+		    int32_t          w,								\
+		    pixman_fixed_t   vx,							\
+		    pixman_fixed_t   unit_x,							\
+		    pixman_fixed_t   max_vx,							\
+		    pixman_bool_t    fully_transparent_src)					\
+    {												\
+	scanline_func (dst, src, w, vx, unit_x, max_vx, fully_transparent_src);			\
+    }												\
+    FAST_NEAREST_MAINLOOP_INT (scale_func_name, scanline_func##scale_func_name##_wrapper,	\
+			       src_type_t, uint8_t, dst_type_t, repeat_mode, FALSE, FALSE)
+
+#define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func, src_type_t, dst_type_t,		\
+			      repeat_mode)							\
+	FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name, scanline_func, src_type_t,		\
+			      dst_type_t, repeat_mode)
+
+#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT,				\
+		     src_type_t, dst_type_t, OP, repeat_mode)				\
+    FAST_NEAREST_SCANLINE(scaled_nearest_scanline_ ## scale_func_name ## _ ## OP,	\
+			  SRC_FORMAT, DST_FORMAT, src_type_t, dst_type_t,		\
+			  OP, repeat_mode)						\
+    FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name ## _ ## OP,			\
+			  scaled_nearest_scanline_ ## scale_func_name ## _ ## OP,	\
+			  src_type_t, dst_type_t, repeat_mode)
+
+
+#define SCALED_NEAREST_FLAGS						\
+    (FAST_PATH_SCALE_TRANSFORM	|					\
+     FAST_PATH_NO_ALPHA_MAP	|					\
+     FAST_PATH_NEAREST_FILTER	|					\
+     FAST_PATH_NO_ACCESSORS	|					\
+     FAST_PATH_NARROW_FORMAT)
+
+#define SIMPLE_NEAREST_FAST_PATH_NORMAL(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_NORMAL_REPEAT	|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_FAST_PATH_PAD(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_PAD_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_FAST_PATH_NONE(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_NONE_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_NORMAL_REPEAT	|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_PAD_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_NONE_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_NORMAL_REPEAT	|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_PAD_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_NONE_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,	\
+    }
+
+/* Prefer the use of 'cover' variant, because it is faster */
+#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func)				\
+    SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),			\
+    SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),			\
+    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func),				\
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func)			\
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func),		\
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func),		\
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH(op,s,d,func)		\
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER (op,s,d,func),		\
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE (op,s,d,func),		\
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD (op,s,d,func)
+
+/*****************************************************************************/
+
+/*
+ * Identify 5 zones in each scanline for bilinear scaling. Depending on
+ * whether 2 pixels to be interpolated are fetched from the image itself,
+ * from the padding area around it or from both image and padding area.
+ */
+static force_inline void
+bilinear_pad_repeat_get_scanline_bounds (int32_t         source_image_width,
+					 pixman_fixed_t  vx,
+					 pixman_fixed_t  unit_x,
+					 int32_t *       left_pad,
+					 int32_t *       left_tz,
+					 int32_t *       width,
+					 int32_t *       right_tz,
+					 int32_t *       right_pad)
+{
+	int width1 = *width, left_pad1, right_pad1;
+	int width2 = *width, left_pad2, right_pad2;
+
+	pad_repeat_get_scanline_bounds (source_image_width, vx, unit_x,
+					&width1, &left_pad1, &right_pad1);
+	pad_repeat_get_scanline_bounds (source_image_width, vx + pixman_fixed_1,
+					unit_x, &width2, &left_pad2, &right_pad2);
+
+	*left_pad = left_pad2;
+	*left_tz = left_pad1 - left_pad2;
+	*right_tz = right_pad2 - right_pad1;
+	*right_pad = right_pad1;
+	*width -= *left_pad + *left_tz + *right_tz + *right_pad;
+}
+
+/*
+ * Main loop template for single pass bilinear scaling. It needs to be
+ * provided with 'scanline_func' which should do the compositing operation.
+ * The needed function has the following prototype:
+ *
+ *	scanline_func (dst_type_t *       dst,
+ *		       const mask_type_ * mask,
+ *		       const src_type_t * src_top,
+ *		       const src_type_t * src_bottom,
+ *		       int32_t            width,
+ *		       int                weight_top,
+ *		       int                weight_bottom,
+ *		       pixman_fixed_t     vx,
+ *		       pixman_fixed_t     unit_x,
+ *		       pixman_fixed_t     max_vx,
+ *		       pixman_bool_t      zero_src)
+ *
+ * Where:
+ *  dst                 - destination scanline buffer for storing results
+ *  mask                - mask buffer (or single value for solid mask)
+ *  src_top, src_bottom - two source scanlines
+ *  width               - number of pixels to process
+ *  weight_top          - weight of the top row for interpolation
+ *  weight_bottom       - weight of the bottom row for interpolation
+ *  vx                  - initial position for fetching the first pair of
+ *                        pixels from the source buffer
+ *  unit_x              - position increment needed to move to the next pair
+ *                        of pixels
+ *  max_vx              - image size as a fixed point value, can be used for
+ *                        implementing NORMAL repeat (when it is supported)
+ *  zero_src            - boolean hint variable, which is set to TRUE when
+ *                        all source pixels are fetched from zero padding
+ *                        zone for NONE repeat
+ *
+ * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to 256,
+ *       but sometimes it may be less than that for NONE repeat when handling
+ *       fuzzy antialiased top or bottom image edges. Also both top and
+ *       bottom weight variables are guaranteed to have value in 0-255
+ *       range and can fit into unsigned byte or be used with 8-bit SIMD
+ *       multiplication instructions.
+ */
+#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
+				  dst_type_t, repeat_mode, flags)				\
+static void											\
+fast_composite_scaled_bilinear ## scale_func_name (pixman_implementation_t *imp,		\
+						   pixman_composite_info_t *info)		\
+{												\
+    PIXMAN_COMPOSITE_ARGS (info);								\
+    dst_type_t *dst_line;									\
+    mask_type_t *mask_line;									\
+    src_type_t *src_first_line;									\
+    int       y1, y2;										\
+    pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */		\
+    pixman_vector_t v;										\
+    pixman_fixed_t vx, vy;									\
+    pixman_fixed_t unit_x, unit_y;								\
+    int32_t left_pad, left_tz, right_tz, right_pad;						\
+												\
+    dst_type_t *dst;										\
+    mask_type_t solid_mask;									\
+    const mask_type_t *mask = &solid_mask;							\
+    int src_stride, mask_stride, dst_stride;							\
+												\
+    int src_width;										\
+    pixman_fixed_t src_width_fixed;								\
+    int max_x;											\
+    pixman_bool_t need_src_extension;								\
+												\
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1);	\
+    if (flags & FLAG_HAVE_SOLID_MASK)								\
+    {												\
+	solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);	\
+	mask_stride = 0;									\
+    }												\
+    else if (flags & FLAG_HAVE_NON_SOLID_MASK)							\
+    {												\
+	PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t,				\
+			       mask_stride, mask_line, 1);					\
+    }												\
+												\
+    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
+     * transformed from destination space to source space */					\
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\
+												\
+    /* reference point is the center of the pixel */						\
+    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\
+    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\
+    v.vector[2] = pixman_fixed_1;								\
+												\
+    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\
+	return;											\
+												\
+    unit_x = src_image->common.transform->matrix[0][0];						\
+    unit_y = src_image->common.transform->matrix[1][1];						\
+												\
+    v.vector[0] -= pixman_fixed_1 / 2;								\
+    v.vector[1] -= pixman_fixed_1 / 2;								\
+												\
+    vy = v.vector[1];										\
+												\
+    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD ||					\
+	PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)					\
+    {												\
+	bilinear_pad_repeat_get_scanline_bounds (src_image->bits.width, v.vector[0], unit_x,	\
+					&left_pad, &left_tz, &width, &right_tz, &right_pad);	\
+	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
+	{											\
+	    /* PAD repeat does not need special handling for 'transition zones' and */		\
+	    /* they can be combined with 'padding zones' safely */				\
+	    left_pad += left_tz;								\
+	    right_pad += right_tz;								\
+	    left_tz = right_tz = 0;								\
+	}											\
+	v.vector[0] += left_pad * unit_x;							\
+    }												\
+												\
+    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)					\
+    {												\
+	vx = v.vector[0];									\
+	repeat (PIXMAN_REPEAT_NORMAL, &vx, pixman_int_to_fixed(src_image->bits.width));		\
+	max_x = pixman_fixed_to_int (vx + (width - 1) * unit_x) + 1;				\
+												\
+	if (src_image->bits.width < REPEAT_NORMAL_MIN_WIDTH)					\
+	{											\
+	    src_width = 0;									\
+												\
+	    while (src_width < REPEAT_NORMAL_MIN_WIDTH && src_width <= max_x)			\
+		src_width += src_image->bits.width;						\
+												\
+	    need_src_extension = TRUE;								\
+	}											\
+	else											\
+	{											\
+	    src_width = src_image->bits.width;							\
+	    need_src_extension = FALSE;								\
+	}											\
+												\
+	src_width_fixed = pixman_int_to_fixed (src_width);					\
+    }												\
+												\
+    while (--height >= 0)									\
+    {												\
+	int weight1, weight2;									\
+	dst = dst_line;										\
+	dst_line += dst_stride;									\
+	vx = v.vector[0];									\
+	if (flags & FLAG_HAVE_NON_SOLID_MASK)							\
+	{											\
+	    mask = mask_line;									\
+	    mask_line += mask_stride;								\
+	}											\
+												\
+	y1 = pixman_fixed_to_int (vy);								\
+	weight2 = (vy >> 8) & 0xff;								\
+	if (weight2)										\
+	{											\
+	    /* normal case, both row weights are in 0-255 range and fit unsigned byte */	\
+	    y2 = y1 + 1;									\
+	    weight1 = 256 - weight2;								\
+	}											\
+	else											\
+	{											\
+	    /* set both top and bottom row to the same scanline, and weights to 128+128 */	\
+	    y2 = y1;										\
+	    weight1 = weight2 = 128;								\
+	}											\
+	vy += unit_y;										\
+	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
+	{											\
+	    src_type_t *src1, *src2;								\
+	    src_type_t buf1[2];									\
+	    src_type_t buf2[2];									\
+	    repeat (PIXMAN_REPEAT_PAD, &y1, src_image->bits.height);				\
+	    repeat (PIXMAN_REPEAT_PAD, &y2, src_image->bits.height);				\
+	    src1 = src_first_line + src_stride * y1;						\
+	    src2 = src_first_line + src_stride * y2;						\
+												\
+	    if (left_pad > 0)									\
+	    {											\
+		buf1[0] = buf1[1] = src1[0];							\
+		buf2[0] = buf2[1] = src2[0];							\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, FALSE);		\
+		dst += left_pad;								\
+		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
+		    mask += left_pad;								\
+	    }											\
+	    if (width > 0)									\
+	    {											\
+		scanline_func (dst, mask,							\
+			       src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);	\
+		dst += width;									\
+		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
+		    mask += width;								\
+	    }											\
+	    if (right_pad > 0)									\
+	    {											\
+		buf1[0] = buf1[1] = src1[src_image->bits.width - 1];				\
+		buf2[0] = buf2[1] = src2[src_image->bits.width - 1];				\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, FALSE);	\
+	    }											\
+	}											\
+	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)				\
+	{											\
+	    src_type_t *src1, *src2;								\
+	    src_type_t buf1[2];									\
+	    src_type_t buf2[2];									\
+	    /* handle top/bottom zero padding by just setting weights to 0 if needed */		\
+	    if (y1 < 0)										\
+	    {											\
+		weight1 = 0;									\
+		y1 = 0;										\
+	    }											\
+	    if (y1 >= src_image->bits.height)							\
+	    {											\
+		weight1 = 0;									\
+		y1 = src_image->bits.height - 1;						\
+	    }											\
+	    if (y2 < 0)										\
+	    {											\
+		weight2 = 0;									\
+		y2 = 0;										\
+	    }											\
+	    if (y2 >= src_image->bits.height)							\
+	    {											\
+		weight2 = 0;									\
+		y2 = src_image->bits.height - 1;						\
+	    }											\
+	    src1 = src_first_line + src_stride * y1;						\
+	    src2 = src_first_line + src_stride * y2;						\
+												\
+	    if (left_pad > 0)									\
+	    {											\
+		buf1[0] = buf1[1] = 0;								\
+		buf2[0] = buf2[1] = 0;								\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, TRUE);		\
+		dst += left_pad;								\
+		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
+		    mask += left_pad;								\
+	    }											\
+	    if (left_tz > 0)									\
+	    {											\
+		buf1[0] = 0;									\
+		buf1[1] = src1[0];								\
+		buf2[0] = 0;									\
+		buf2[1] = src2[0];								\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, left_tz, weight1, weight2,				\
+			       pixman_fixed_frac (vx), unit_x, 0, FALSE);			\
+		dst += left_tz;									\
+		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
+		    mask += left_tz;								\
+		vx += left_tz * unit_x;								\
+	    }											\
+	    if (width > 0)									\
+	    {											\
+		scanline_func (dst, mask,							\
+			       src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);	\
+		dst += width;									\
+		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
+		    mask += width;								\
+		vx += width * unit_x;								\
+	    }											\
+	    if (right_tz > 0)									\
+	    {											\
+		buf1[0] = src1[src_image->bits.width - 1];					\
+		buf1[1] = 0;									\
+		buf2[0] = src2[src_image->bits.width - 1];					\
+		buf2[1] = 0;									\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, right_tz, weight1, weight2,				\
+			       pixman_fixed_frac (vx), unit_x, 0, FALSE);			\
+		dst += right_tz;								\
+		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
+		    mask += right_tz;								\
+	    }											\
+	    if (right_pad > 0)									\
+	    {											\
+		buf1[0] = buf1[1] = 0;								\
+		buf2[0] = buf2[1] = 0;								\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, TRUE);		\
+	    }											\
+	}											\
+	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
+	{											\
+	    int32_t	    num_pixels;								\
+	    int32_t	    width_remain;							\
+	    src_type_t *    src_line_top;							\
+	    src_type_t *    src_line_bottom;							\
+	    src_type_t	    buf1[2];								\
+	    src_type_t	    buf2[2];								\
+	    src_type_t	    extended_src_line0[REPEAT_NORMAL_MIN_WIDTH*2];			\
+	    src_type_t	    extended_src_line1[REPEAT_NORMAL_MIN_WIDTH*2];			\
+	    int		    i, j;								\
+												\
+	    repeat (PIXMAN_REPEAT_NORMAL, &y1, src_image->bits.height);				\
+	    repeat (PIXMAN_REPEAT_NORMAL, &y2, src_image->bits.height);				\
+	    src_line_top = src_first_line + src_stride * y1;					\
+	    src_line_bottom = src_first_line + src_stride * y2;					\
+												\
+	    if (need_src_extension)								\
+	    {											\
+		for (i=0; i<src_width;)								\
+		{										\
+		    for (j=0; j<src_image->bits.width; j++, i++)				\
+		    {										\
+			extended_src_line0[i] = src_line_top[j];				\
+			extended_src_line1[i] = src_line_bottom[j];				\
+		    }										\
+		}										\
+												\
+		src_line_top = &extended_src_line0[0];						\
+		src_line_bottom = &extended_src_line1[0];					\
+	    }											\
+												\
+	    /* Top & Bottom wrap around buffer */						\
+	    buf1[0] = src_line_top[src_width - 1];						\
+	    buf1[1] = src_line_top[0];								\
+	    buf2[0] = src_line_bottom[src_width - 1];						\
+	    buf2[1] = src_line_bottom[0];							\
+												\
+	    width_remain = width;								\
+												\
+	    while (width_remain > 0)								\
+	    {											\
+		/* We use src_width_fixed because it can make vx in original source range */	\
+		repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);				\
+												\
+		/* Wrap around part */								\
+		if (pixman_fixed_to_int (vx) == src_width - 1)					\
+		{										\
+		    /* for positive unit_x							\
+		     * num_pixels = max(n) + 1, where vx + n*unit_x < src_width_fixed		\
+		     *										\
+		     * vx is in range [0, src_width_fixed - pixman_fixed_e]			\
+		     * So we are safe from overflow.						\
+		     */										\
+		    num_pixels = ((src_width_fixed - vx - pixman_fixed_e) / unit_x) + 1;	\
+												\
+		    if (num_pixels > width_remain)						\
+			num_pixels = width_remain;						\
+												\
+		    scanline_func (dst, mask, buf1, buf2, num_pixels,				\
+				   weight1, weight2, pixman_fixed_frac(vx),			\
+				   unit_x, src_width_fixed, FALSE);				\
+												\
+		    width_remain -= num_pixels;							\
+		    vx += num_pixels * unit_x;							\
+		    dst += num_pixels;								\
+												\
+		    if (flags & FLAG_HAVE_NON_SOLID_MASK)					\
+			mask += num_pixels;							\
+												\
+		    repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);			\
+		}										\
+												\
+		/* Normal scanline composite */							\
+		if (pixman_fixed_to_int (vx) != src_width - 1 && width_remain > 0)		\
+		{										\
+		    /* for positive unit_x							\
+		     * num_pixels = max(n) + 1, where vx + n*unit_x < (src_width_fixed - 1)	\
+		     *										\
+		     * vx is in range [0, src_width_fixed - pixman_fixed_e]			\
+		     * So we are safe from overflow here.					\
+		     */										\
+		    num_pixels = ((src_width_fixed - pixman_fixed_1 - vx - pixman_fixed_e)	\
+				  / unit_x) + 1;						\
+												\
+		    if (num_pixels > width_remain)						\
+			num_pixels = width_remain;						\
+												\
+		    scanline_func (dst, mask, src_line_top, src_line_bottom, num_pixels,	\
+				   weight1, weight2, vx, unit_x, src_width_fixed, FALSE);	\
+												\
+		    width_remain -= num_pixels;							\
+		    vx += num_pixels * unit_x;							\
+		    dst += num_pixels;								\
+												\
+		    if (flags & FLAG_HAVE_NON_SOLID_MASK)					\
+		        mask += num_pixels;							\
+		}										\
+	    }											\
+	}											\
+	else											\
+	{											\
+	    scanline_func (dst, mask, src_first_line + src_stride * y1,				\
+			   src_first_line + src_stride * y2, width,				\
+			   weight1, weight2, vx, unit_x, max_vx, FALSE);			\
+	}											\
+    }												\
+}
+
+/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */
+#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
+				  dst_type_t, repeat_mode, flags)				\
+	FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,\
+				  dst_type_t, repeat_mode, flags)
+
+#define SCALED_BILINEAR_FLAGS						\
+    (FAST_PATH_SCALE_TRANSFORM	|					\
+     FAST_PATH_NO_ALPHA_MAP	|					\
+     FAST_PATH_BILINEAR_FILTER	|					\
+     FAST_PATH_NO_ACCESSORS	|					\
+     FAST_PATH_NARROW_FORMAT)
+
+#define SIMPLE_BILINEAR_FAST_PATH_PAD(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_PAD_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_FAST_PATH_NONE(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_NONE_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_FAST_PATH_COVER(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_FAST_PATH_NORMAL(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_NORMAL_REPEAT	|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_PAD_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_NONE_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NORMAL(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_NORMAL_REPEAT	|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_PAD_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_NONE_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NORMAL(op,s,d,func)	\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_NORMAL_REPEAT	|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op,	\
+    }
+
+/* Prefer the use of 'cover' variant, because it is faster */
+#define SIMPLE_BILINEAR_FAST_PATH(op,s,d,func)				\
+    SIMPLE_BILINEAR_FAST_PATH_COVER (op,s,d,func),			\
+    SIMPLE_BILINEAR_FAST_PATH_NONE (op,s,d,func),			\
+    SIMPLE_BILINEAR_FAST_PATH_PAD (op,s,d,func),			\
+    SIMPLE_BILINEAR_FAST_PATH_NORMAL (op,s,d,func)
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH(op,s,d,func)			\
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER (op,s,d,func),		\
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE (op,s,d,func),		\
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD (op,s,d,func),		\
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NORMAL (op,s,d,func)
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH(op,s,d,func)		\
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER (op,s,d,func),		\
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE (op,s,d,func),		\
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD (op,s,d,func),		\
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NORMAL (op,s,d,func)
+
+#endif
diff --git a/pixman/pixman/pixman-noop.c b/pixman/pixman/pixman-noop.c
index 4f8c3a18e..906a491f5 100644
--- a/pixman/pixman/pixman-noop.c
+++ b/pixman/pixman/pixman-noop.c
@@ -28,7 +28,7 @@
 #include <stdlib.h>
 #include "pixman-private.h"
 #include "pixman-combine32.h"
-#include "pixman-fast-path.h"
+#include "pixman-inlines.h"
 
 static void
 noop_composite (pixman_implementation_t *imp,
diff --git a/pixman/pixman/pixman-private.h b/pixman/pixman/pixman-private.h
index 6a3935e4d..a25897d77 100644
--- a/pixman/pixman/pixman-private.h
+++ b/pixman/pixman/pixman-private.h
@@ -691,10 +691,13 @@ void *
 pixman_malloc_abc (unsigned int a, unsigned int b, unsigned int c);
 
 pixman_bool_t
-pixman_multiply_overflows_int (unsigned int a, unsigned int b);
+_pixman_multiply_overflows_size (size_t a, size_t b);
 
 pixman_bool_t
-pixman_addition_overflows_int (unsigned int a, unsigned int b);
+_pixman_multiply_overflows_int (unsigned int a, unsigned int b);
+
+pixman_bool_t
+_pixman_addition_overflows_int (unsigned int a, unsigned int b);
 
 /* Compositing utilities */
 void
diff --git a/pixman/pixman/pixman-region.c b/pixman/pixman/pixman-region.c
index 142493b77..47beb5238 100644
--- a/pixman/pixman/pixman-region.c
+++ b/pixman/pixman/pixman-region.c
@@ -102,7 +102,11 @@
 
 static const box_type_t PREFIX (_empty_box_) = { 0, 0, 0, 0 };
 static const region_data_type_t PREFIX (_empty_data_) = { 0, 0 };
+#if defined (__llvm__) && !defined (__clang__)
+static const volatile region_data_type_t PREFIX (_broken_data_) = { 0, 0 };
+#else
 static const region_data_type_t PREFIX (_broken_data_) = { 0, 0 };
+#endif
 
 static box_type_t *pixman_region_empty_box =
     (box_type_t *)&PREFIX (_empty_box_);
@@ -2086,6 +2090,40 @@ PIXMAN_EXPORT PREFIX (_inverse) (region_type_t *new_reg,  /* Destination region
     return TRUE;
 }
 
+/* In time O(log n), locate the first box whose y2 is greater than y.
+ * Return @end if no such box exists.
+ */
+static box_type_t *
+find_box_for_y (box_type_t *begin, box_type_t *end, int y)
+{
+    box_type_t *mid;
+
+    if (end == begin)
+	return end;
+
+    if (end - begin == 1)
+    {
+	if (begin->y2 > y)
+	    return begin;
+	else
+	    return end;
+    }
+
+    mid = begin + (end - begin) / 2;
+    if (mid->y2 > y)
+    {
+	/* If no box is found in [begin, mid], the function
+	 * will return @mid, which is then known to be the
+	 * correct answer.
+	 */
+	return find_box_for_y (begin, mid, y);
+    }
+    else
+    {
+	return find_box_for_y (mid, end, y);
+    }
+}
+
 /*
  *   rect_in(region, rect)
  *   This routine takes a pointer to a region and a pointer to a box
@@ -2102,7 +2140,6 @@ PIXMAN_EXPORT PREFIX (_inverse) (region_type_t *new_reg,  /* Destination region
  *   partially in the region) or is outside the region (we reached a band
  *   that doesn't overlap the box at all and part_in is false)
  */
-
 pixman_region_overlap_t
 PIXMAN_EXPORT PREFIX (_contains_rectangle) (region_type_t *  region,
                                             box_type_t *     prect)
@@ -2139,12 +2176,15 @@ PIXMAN_EXPORT PREFIX (_contains_rectangle) (region_type_t *  region,
 
     /* can stop when both part_out and part_in are TRUE, or we reach prect->y2 */
     for (pbox = PIXREGION_BOXPTR (region), pbox_end = pbox + numRects;
-         pbox != pbox_end;
-         pbox++)
+	 pbox != pbox_end;
+	 pbox++)
     {
-
-        if (pbox->y2 <= y)
-	    continue;   /* getting up to speed or skipping remainder of band */
+	/* getting up to speed or skipping remainder of band */
+	if (pbox->y2 <= y)
+	{
+	    if ((pbox = find_box_for_y (pbox, pbox_end, y)) == pbox_end)
+		break;
+	}
 
         if (pbox->y1 > y)
         {
@@ -2342,13 +2382,13 @@ PREFIX (_contains_point) (region_type_t * region,
         return(TRUE);
     }
 
-    for (pbox = PIXREGION_BOXPTR (region), pbox_end = pbox + numRects;
-	 pbox != pbox_end;
-	 pbox++)
-    {
-        if (y >= pbox->y2)
-	    continue;           /* not there yet */
+    pbox = PIXREGION_BOXPTR (region);
+    pbox_end = pbox + numRects;
 
+    pbox = find_box_for_y (pbox, pbox_end, y);
+
+    for (;pbox != pbox_end; pbox++)
+    {
         if ((y < pbox->y1) || (x < pbox->x1))
 	    break;              /* missed it */
 
diff --git a/pixman/pixman/pixman-sse2.c b/pixman/pixman/pixman-sse2.c
index 3d51c2fda..6689c53a2 100644
--- a/pixman/pixman/pixman-sse2.c
+++ b/pixman/pixman/pixman-sse2.c
@@ -34,7 +34,7 @@
 #include <emmintrin.h> /* for SSE2 intrinsics */
 #include "pixman-private.h"
 #include "pixman-combine32.h"
-#include "pixman-fast-path.h"
+#include "pixman-inlines.h"
 
 static __m128i mask_0080;
 static __m128i mask_00ff;
diff --git a/pixman/pixman/pixman-utils.c b/pixman/pixman/pixman-utils.c
index cb4e62199..49e3488aa 100644
--- a/pixman/pixman/pixman-utils.c
+++ b/pixman/pixman/pixman-utils.c
@@ -31,15 +31,19 @@
 #include "pixman-private.h"
 
 pixman_bool_t
-pixman_multiply_overflows_int (unsigned int a,
-                               unsigned int b)
+_pixman_multiply_overflows_size (size_t a, size_t b)
+{
+    return a >= SIZE_MAX / b;
+}
+
+pixman_bool_t
+_pixman_multiply_overflows_int (unsigned int a, unsigned int b)
 {
     return a >= INT32_MAX / b;
 }
 
 pixman_bool_t
-pixman_addition_overflows_int (unsigned int a,
-                               unsigned int b)
+_pixman_addition_overflows_int (unsigned int a, unsigned int b)
 {
     return a > INT32_MAX - b;
 }
diff --git a/pixman/test/Makefile.am b/pixman/test/Makefile.am
index 9dc72199e..9f61fc9e4 100644
--- a/pixman/test/Makefile.am
+++ b/pixman/test/Makefile.am
@@ -15,6 +15,7 @@ TESTPROGRAMS =			\
 	scaling-crash-test	\
 	scaling-helpers-test	\
 	gradient-crash-test	\
+	region-contains-test	\
 	alphamap		\
 	stress-test		\
 	composite-traps-test	\
@@ -26,6 +27,7 @@ TESTPROGRAMS =			\
 pdf_op_test_SOURCES = pdf-op-test.c utils.c utils.h
 region_test_SOURCES = region-test.c utils.c utils.h
 blitters_test_SOURCES = blitters-test.c utils.c utils.h
+region_contains_test_SOURCES = region-contains-test.c utils.c utils.h
 composite_traps_test_SOURCES = composite-traps-test.c utils.c utils.h
 scaling_test_SOURCES = scaling-test.c utils.c utils.h
 affine_test_SOURCES = affine-test.c utils.c utils.h
diff --git a/pixman/test/Makefile.win32 b/pixman/test/Makefile.win32
index c71afe187..a62b6fc35 100644
--- a/pixman/test/Makefile.win32
+++ b/pixman/test/Makefile.win32
@@ -35,6 +35,7 @@ SOURCES =			\
 	scaling-test.c		\
 	affine-test.c		\
 	composite.c		\
+	lowlevel-blt-bench.c	\
 	utils.c
 
 TESTS =						\
@@ -56,6 +57,8 @@ TESTS =						\
 	$(CFG_VAR)/affine-test.exe		\
 	$(CFG_VAR)/composite.exe
 
+BENCHMARKS =					\
+	$(CFG_VAR)/lowlevel-blt-bench.exe
 
 OBJECTS     = $(patsubst %.c, $(CFG_VAR)/%.obj, $(SOURCES))
 
@@ -66,7 +69,7 @@ $(CFG_VAR)/%.obj: %.c
 $(CFG_VAR)/%.exe: $(CFG_VAR)/%.obj
 	$(LINK) /NOLOGO /OUT:$@ $< $(CFG_VAR)/utils.obj $(TEST_LDADD)
 
-all: $(OBJECTS) $(TESTS)
+all: $(OBJECTS) $(TESTS) $(BENCHMARKS)
 	@exit 0
 
 clean:
diff --git a/pixman/test/lowlevel-blt-bench.c b/pixman/test/lowlevel-blt-bench.c
index d58587d51..099e434f0 100644
--- a/pixman/test/lowlevel-blt-bench.c
+++ b/pixman/test/lowlevel-blt-bench.c
@@ -22,16 +22,13 @@
  * DEALINGS IN THE SOFTWARE.
  */
 
-#include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
+#define PIXMAN_USE_INTERNAL_API
+#include <pixman.h>
 
-#include "pixman-private.h"
 #include "utils.h"
 
 #define SOLID_FLAG 1
diff --git a/pixman/test/region-contains-test.c b/pixman/test/region-contains-test.c
new file mode 100644
index 000000000..d761c4bdf
--- /dev/null
+++ b/pixman/test/region-contains-test.c
@@ -0,0 +1,169 @@
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+static void
+make_random_region (pixman_region32_t *region)
+{
+    int n_boxes;
+
+    pixman_region32_init (region);
+
+    n_boxes = lcg_rand_n (64);
+    while (n_boxes--)
+    {
+	int32_t x1, y1, x2, y2;
+
+	x1 = (int32_t)lcg_rand_u32();
+	y1 = (int32_t)lcg_rand_u32();
+	x2 = (int32_t)lcg_rand_u32();
+	y2 = (int32_t)lcg_rand_u32();
+
+	pixman_region32_union_rect (region, region, x1, y1, x2, y2);
+    }
+}
+
+static void
+print_box (pixman_box32_t *box)
+{
+    printf ("    %d %d %d %d\n", box->x1, box->y1, box->x2, box->y2);
+}
+
+static int32_t
+random_coord (pixman_region32_t *region, pixman_bool_t x)
+{
+    pixman_box32_t *b, *bb;
+    int n_boxes;
+    int begin, end;
+
+    if (lcg_rand_n (14))
+    {
+	bb = pixman_region32_rectangles (region, &n_boxes);
+	if (n_boxes == 0)
+	    goto use_extent;
+	b = bb + lcg_rand_n (n_boxes);
+    }
+    else
+    {
+    use_extent:
+	b = pixman_region32_extents (region);
+	n_boxes = 1;
+    }
+
+    if (x)
+    {
+	begin = b->x1;
+	end = b->x2;
+    }
+    else
+    {
+	begin = b->y1;
+	end = b->y2;
+    }
+
+    switch (lcg_rand_n (5))
+    {
+    case 0:
+	return begin - lcg_rand_u32();
+    case 1:
+	return end + lcg_rand_u32 ();
+    case 2:
+	return end;
+    case 3:
+	return begin;
+    default:
+	return (begin + end) / 2;
+    }
+    return 0;
+}
+
+static uint32_t
+compute_crc32_u32 (uint32_t crc32, uint32_t v)
+{
+    if (!is_little_endian())
+    {
+	v = ((v & 0xff000000) >> 24)	|
+	    ((v & 0x00ff0000) >> 8)	|
+	    ((v & 0x0000ff00) << 8)	|
+	    ((v & 0x000000ff) << 24);
+    }
+
+    return compute_crc32 (crc32, &v, sizeof (int32_t));
+}
+
+static uint32_t
+crc32_box32 (uint32_t crc32, pixman_box32_t *box)
+{
+    crc32 = compute_crc32_u32 (crc32, box->x1);
+    crc32 = compute_crc32_u32 (crc32, box->y1);
+    crc32 = compute_crc32_u32 (crc32, box->x2);
+    crc32 = compute_crc32_u32 (crc32, box->y2);
+
+    return crc32;
+}
+
+static uint32_t
+test_region_contains_rectangle (int i, int verbose)
+{
+    pixman_box32_t box;
+    pixman_box32_t rbox = { 0, 0, 0, 0 };
+    pixman_region32_t region;
+    uint32_t r, r1, r2, r3, r4, crc32;
+
+    lcg_srand (i);
+
+    make_random_region (&region);
+
+    box.x1 = random_coord (&region, TRUE);
+    box.x2 = box.x1 + lcg_rand_u32 ();
+    box.y1 = random_coord (&region, FALSE);
+    box.y2 = box.y1 + lcg_rand_u32 ();
+
+    if (verbose)
+    {
+	int n_rects;
+	pixman_box32_t *boxes;
+
+	boxes = pixman_region32_rectangles (&region, &n_rects);
+
+	printf ("region:\n");
+	while (n_rects--)
+	    print_box (boxes++);
+	printf ("box:\n");
+	print_box (&box);
+    }
+
+    crc32 = 0;
+
+    r1 = pixman_region32_contains_point (&region, box.x1, box.y1, &rbox);
+    crc32 = crc32_box32 (crc32, &rbox);
+    r2 = pixman_region32_contains_point (&region, box.x1, box.y2, &rbox);
+    crc32 = crc32_box32 (crc32, &rbox);
+    r3 = pixman_region32_contains_point (&region, box.x2, box.y1, &rbox);
+    crc32 = crc32_box32 (crc32, &rbox);
+    r4 = pixman_region32_contains_point (&region, box.x2, box.y2, &rbox);
+    crc32 = crc32_box32 (crc32, &rbox);
+
+    r = pixman_region32_contains_rectangle (&region, &box);
+    r = (i << 8) | (r << 4) | (r1 << 3) | (r2 << 2) | (r3 << 1) | (r4 << 0);
+
+    crc32 = compute_crc32_u32 (crc32, r);
+
+    if (verbose)
+	printf ("results: %d %d %d %d %d\n", (r & 0xf0) >> 4, r1, r2, r3, r4);
+
+    pixman_region32_fini (&region);
+
+    return crc32;
+}
+
+int
+main (int argc, const char *argv[])
+{
+    return fuzzer_test_main ("region_contains",
+			     1000000,
+			     0x86311506,
+			     test_region_contains_rectangle,
+			     argc, argv);
+}
diff --git a/pixman/test/scaling-helpers-test.c b/pixman/test/scaling-helpers-test.c
index c1861389b..a38cac544 100644
--- a/pixman/test/scaling-helpers-test.c
+++ b/pixman/test/scaling-helpers-test.c
@@ -4,7 +4,7 @@
 #include <stdio.h>
 #include <assert.h>
 #include "utils.h"
-#include "pixman-fast-path.h"
+#include "pixman-inlines.h"
 
 /* A trivial reference implementation for
  * 'bilinear_pad_repeat_get_scanline_bounds'
diff --git a/pixman/test/utils.c b/pixman/test/utils.c
index 402560227..44188ea90 100644
--- a/pixman/test/utils.c
+++ b/pixman/test/utils.c
@@ -130,6 +130,14 @@ compute_crc32 (uint32_t    in_crc32,
     return (crc32 ^ 0xFFFFFFFF);
 }
 
+pixman_bool_t
+is_little_endian (void)
+{
+    volatile uint16_t endian_check_var = 0x1234;
+
+    return (*(volatile uint8_t *)&endian_check_var == 0x34);
+}
+
 /* perform endian conversion of pixel data
  */
 void
@@ -142,8 +150,7 @@ image_endian_swap (pixman_image_t *img)
     int i, j;
 
     /* swap bytes only on big endian systems */
-    volatile uint16_t endian_check_var = 0x1234;
-    if (*(volatile uint8_t *)&endian_check_var != 0x12)
+    if (is_little_endian())
 	return;
 
     if (bpp == 8)
diff --git a/pixman/test/utils.h b/pixman/test/utils.h
index 615ad7841..f0c9c300c 100644
--- a/pixman/test/utils.h
+++ b/pixman/test/utils.h
@@ -44,10 +44,14 @@ lcg_rand_N (int max)
 static inline uint32_t
 lcg_rand_u32 (void)
 {
-    uint32_t lo = lcg_rand();
-    uint32_t hi = lcg_rand();
-
-    return (hi << 16) | lo;
+    /* This uses the 10/11 most significant bits from the 3 lcg results
+     * (and mixes them with the low from the adjacent one).
+     */
+    uint32_t lo = lcg_rand() >> -(32 - 15 - 11 * 2);
+    uint32_t mid = lcg_rand() << (32 - 15 - 11 * 1);
+    uint32_t hi = lcg_rand() << (32 - 15 - 11 * 0);
+
+    return (hi ^ mid ^ lo);
 }
 
 /* CRC 32 computation
@@ -57,6 +61,10 @@ compute_crc32 (uint32_t    in_crc32,
 	       const void *buf,
 	       size_t      buf_len);
 
+/* Returns TRUE if running on a little endian system */
+pixman_bool_t
+is_little_endian (void);
+
 /* perform endian conversion of pixel data
  */
 void
-- 
cgit v1.2.3